yzx_core/frame.rs
1/*
2 * Description: Encoding of Zstandard frame types.
3 *
4 * Copyright (C) 2025 d@nny mc² <dmc2@hypnicjerk.ai>
5 * SPDX-License-Identifier: AGPL-3.0-or-later
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Affero General Public License as published
9 * by the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Affero General Public License for more details.
16 *
17 * You should have received a copy of the GNU Affero General Public License
18 * along with this program. If not, see <https://www.gnu.org/licenses/>.
19 */
20
21//! Encoding of Zstandard frame types.
22//!
23//! A Zstandard data stream is composed of one or more *frames*, concatenated together
24//! without interruption. Frames may refer to data or contain metadata.
25//!
26//! Supported frame types are specified in section 3.1 of IETF RFC 8878[^sec-3.1]. They are similar to the
27//! format used for frames in LZ4[^lz4].
28//!
29//! [^sec-3.1]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1>
30//! [^lz4]: <https://github.com/lz4/lz4/blob/a018abc07a3018371625a265f195c4fafbf1f99d/doc/lz4_Frame_format.md>
31
/// Outcome of attempting to parse one structure from the front of a byte slice.
#[derive(Debug, Copy, Clone)]
pub enum ParseResult<Success, Error> {
    /// The structure was fully parsed.
    Success {
        /// The parsed value.
        result: Success,
        /// How many bytes of the input slice were consumed producing `result`.
        consumed_bytes: usize,
    },
    /// The input ended before the structure was complete; retry with more data.
    NeedsMoreInput {
        /// Lower bound on the additional bytes required; more may be needed once
        /// further length fields become readable.
        at_least_this_many_more_bytes: usize,
    },
    /// The input was structurally invalid.
    Error(Error),
}
43
/// A type that can be parsed from a leading prefix of a byte slice.
pub trait SliceParse {
    /// The parsed output; may borrow from the input slice.
    type Success<'data>;
    /// The error reported for structurally invalid input.
    type Error;

    /// Caller-provided context needed to parse (e.g. a size limit); `()` if none.
    type Arg;
    /// Attempt to parse a `Success` value from the front of `data`.
    fn parse<'data>(
        arg: Self::Arg,
        data: &'data [u8],
    ) -> ParseResult<Self::Success<'data>, Self::Error>
    where
        Self: Sized;
}
56
57/// Encoding of a Zstandard frame.
58///
59/// A "Zstandard frame" (somewhat confusingly) refers to a single specific type of frame which may
60/// appear in a Zstandard data stream. This frame type is also currently the only kind which is
61/// specified to produce decoded output data. Therefore, this codebase uses the term "data frame".
62///
63/// This frame type's encoding is specified in section 3.1.1 of IETF RFC 8878[^sec-3.1.1]:
64/// ```custom,{class=language-md}
65/// +--------------------+------------+
66/// | Magic_Number | 4 bytes |
67/// +--------------------+------------+
68/// | Frame_Header | 2-14 bytes |
69/// +--------------------+------------+
70/// | Data_Block | n bytes |
71/// +--------------------+------------+
72/// | [More Data_Blocks] | |
73/// +--------------------+------------+
74/// | [Content_Checksum] | 4 bytes |
75/// +--------------------+------------+
76/// ```
77///
78/// [^sec-3.1.1]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.1>
79pub mod data {
    /// The "magic number" identifying the start of a data frame.
    ///
    /// As per the RFC:
    /// > The magic number was selected to be less probable to find at the beginning of an arbitrary
    /// > file. It avoids trivial patterns (0x00, 0xFF, repeated bytes, increasing bytes, etc.),
    /// > contains byte values outside of the ASCII range, and doesn't map into UTF-8 space, all of
    /// > which reduce the likelihood of its appearance at the top of a text file.
    pub const MAGIC: u32 = 0xFD2FB528;
    /* This will represent the data we write to or read off the wire. */
    /* Little-endian, so the first byte on the wire is 0x28.
     * NOTE(review): not referenced within this file — presumably consumed by frame-level
     * scanning elsewhere in the crate; confirm. */
    const MAGIC_BYTES: [u8; 4] = MAGIC.to_le_bytes();
90
91 pub mod header {
92 use core::{error, fmt, num};
93
94 use crate::frame::{ParseResult, SliceParse};
95
        /// The decoded contents of a data frame's `Frame_Header`.
        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub struct FrameHeader {
            /// Decoding-buffer requirements for the frame (see [`WindowBounds`]).
            pub window: WindowBounds,
            /// `Dictionary_ID`, when the frame declares one.
            pub dictionary: Option<DictionaryID>,
            /// Whether a 4-byte `Content_Checksum` trails the frame's blocks.
            pub checksum: ChecksumBehavior,
        }

        /// How the frame communicates its decoding-buffer requirements.
        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub enum WindowBounds {
            /// `Single_Segment_Flag` was set: no `Window_Descriptor` byte is present,
            /// and the window size equals the (mandatory) `Frame_Content_Size`.
            SingleSegment {
                frame_content_size: u64,
            },
            /// A `Window_Descriptor` byte supplied the window size;
            /// `Frame_Content_Size` may or may not also be present.
            Windowed {
                window_size: num::NonZero<u64>,
                frame_content_size: Option<u64>,
            },
        }
113
114 impl WindowBounds {
115 #[inline(always)]
116 pub const fn window_size(&self) -> u64 {
117 match self {
118 Self::SingleSegment { frame_content_size } => *frame_content_size,
119 Self::Windowed { window_size, .. } => window_size.get(),
120 }
121 }
122
123 #[inline(always)]
124 const fn clamp_block_size(window_size: u64) -> u32 {
125 if window_size > (super::block::BLOCK_SIZE_MAX as u64) {
126 super::block::BLOCK_SIZE_MAX
127 } else {
128 window_size as u32
129 }
130 }
131
132 #[inline(always)]
133 pub const fn block_maximum_size(&self) -> u32 { Self::clamp_block_size(self.window_size()) }
134 }
135
136 #[repr(transparent)]
137 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
138 pub struct DictionaryID(u32);
139
140 impl DictionaryID {
141 #[inline(always)]
142 pub const fn new(x: u32) -> Self { Self(x) }
143
144 #[inline(always)]
145 pub const fn within_iana_reserved_ranges(&self) -> bool {
146 (self.0 <= 32767) || (self.0 >= 1 << 31)
147 }
148 }
149
        /// Whether the frame declares a trailing 4-byte `Content_Checksum`.
        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub enum ChecksumBehavior {
            /// `Content_Checksum_Flag` was set: a checksum follows the last block.
            HasContentChecksum,
            /// No checksum is present.
            None,
        }
155
156 /// The first byte of each frame header.
157 ///
158 /// As per the RFC:
159 /// > The first header's byte is called the `Frame_Header_Descriptor`. It describes which other
160 /// > fields are present. Decoding this byte is enough to tell the size of `Frame_Header`.
161 #[repr(transparent)]
162 struct FrameHeaderDescriptor(u8);
163
164 impl FrameHeaderDescriptor {
165 #[inline(always)]
166 const fn from_le(x: &u8) -> Self { Self(u8::from_le(*x)) }
167
168 #[inline]
169 const fn fcs_field_size(&self) -> Option<num::NonZero<u8>> {
170 let frame_content_size_flag: u8 = (self.0 & 0b1100_0000) >> 6;
171 let ret: u8 = if frame_content_size_flag == 0 && !self.single_segment_flag() {
172 0
173 } else {
174 0b1u8 << frame_content_size_flag
175 };
176 num::NonZero::new(ret)
177 }
178
179 #[inline(always)]
180 const fn single_segment_flag(&self) -> bool { (self.0 & 0b0010_0000) != 0 }
181
182 #[inline(always)]
183 const fn unused_bit_is_set(&self) -> bool { (self.0 & 0b0001_0000) != 0 }
184
185 #[inline(always)]
186 const fn reserved_bit_is_set(&self) -> bool { (self.0 & 0b0000_1000) != 0 }
187
188 #[inline(always)]
189 const fn content_checksum_flag(&self) -> bool { (self.0 & 0b0000_0100) != 0 }
190
191 #[inline(always)]
192 const fn did_field_size(&self) -> Option<num::NonZero<u8>> {
193 num::NonZero::new(match self.0 & 0b11 {
194 3 => 4,
195 x => x,
196 })
197 }
198 }
199
200 #[repr(transparent)]
201 struct WindowDescriptor(u8);
202
203 impl WindowDescriptor {
204 #[inline(always)]
205 const fn from_le(x: &u8) -> Self { Self(u8::from_le(*x)) }
206
207 #[inline(always)]
208 #[allow(non_snake_case)]
209 #[rustfmt::skip] /* This is the only way to retain the range bounds, sigh... */
210 const fn compute_window_size(&self) -> num::NonZero<u64> {
211 let Exponent: u64 = ((self.0 & 0b1111_1000) >> 3) as u64; /* [0, 31] */
212 let Mantissa: u64 = (self.0 & 0b0000_0111) as u64; /* [0, 7] */
213 let windowLog: u64 = 10 + Exponent; /* [10, 41] */
214 let windowBase: u64 = 1 << windowLog; /* [1<<10, 1<<41] */
215 let windowAdd: u64 = (windowBase / 8) * Mantissa; /* [0, 7x(1<<38)] */
216 /* LLVM optimizes out the panic branch on --release! */
217 num::NonZero::new(windowBase + windowAdd).unwrap() /* [1<<10, (1<<41)+7x(1<<38)] */
218 }
219 }
220
221 #[derive(Debug, Copy, Clone)]
222 pub enum HeaderParseError {
223 ReservedBitSet,
224 InvalidWindowBounds {
225 window_size: num::NonZero<u64>,
226 frame_content_size: u64,
227 },
228 }
229
230 impl fmt::Display for HeaderParseError {
231 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
232 match self {
233 Self::ReservedBitSet => write!(f, "reserved bit set in frame header descriptor"),
234 Self::InvalidWindowBounds {
235 window_size,
236 frame_content_size,
237 } => {
238 let window_size = window_size.get();
239 assert!(window_size > *frame_content_size);
240 write!(
241 f,
242 "invalid window bounds: Window_Size ({window_size}) \
243 was greater than Frame_Content_Size ({frame_content_size})"
244 )
245 },
246 }
247 }
248 }
249
250 impl error::Error for HeaderParseError {}
251
        impl SliceParse for FrameHeader {
            type Success<'data> = FrameHeader;
            type Error = HeaderParseError;

            type Arg = ();
            /* Parses the variable-length (2-14 byte) Frame_Header: the descriptor byte,
             * then the optional Window_Descriptor, Dictionary_ID, and Frame_Content_Size
             * fields, each gated by descriptor bits. */
            fn parse<'data>(
                _arg: Self::Arg,
                data: &'data [u8],
            ) -> ParseResult<Self::Success<'data>, Self::Error>
            where
                Self: Sized,
            {
                /* Remember the starting length so consumed_bytes can be computed at the end. */
                let original_length = data.len();
                /* Byte 0: the Frame_Header_Descriptor, which gates all later fields. */
                let (descriptor, data) = match data.split_first() {
                    None => {
                        return ParseResult::NeedsMoreInput {
                            at_least_this_many_more_bytes: 1,
                        };
                    },
                    Some((x, rest)) => (FrameHeaderDescriptor::from_le(x), rest),
                };
                /* The spec requires rejecting headers with the reserved bit set. */
                if descriptor.reserved_bit_is_set() {
                    return ParseResult::Error(HeaderParseError::ReservedBitSet);
                }

                /* Window_Descriptor is omitted when Single_Segment_Flag is set; the
                 * window is then derived from Frame_Content_Size instead. */
                let (window_descriptor, data): (Option<WindowDescriptor>, &[u8]) =
                    if descriptor.single_segment_flag() {
                        (None, data)
                    } else {
                        match data.split_first() {
                            None => {
                                return ParseResult::NeedsMoreInput {
                                    at_least_this_many_more_bytes: 1,
                                };
                            },
                            Some((x, rest)) => (Some(WindowDescriptor::from_le(x)), rest),
                        }
                    };

                /* Dictionary_ID: 0, 1, 2, or 4 little-endian bytes per did_field_size(). */
                let (dictionary, data): (Option<DictionaryID>, &[u8]) = match descriptor.did_field_size() {
                    None => (None, data),
                    Some(n) => match data.split_at_checked(n.get() as usize) {
                        None => {
                            /* split_at_checked() failed, so data.len() < n: no underflow. */
                            return ParseResult::NeedsMoreInput {
                                at_least_this_many_more_bytes: (n.get() as usize) - data.len(),
                            };
                        },
                        Some((dict_data, rest)) => {
                            let id: u32 = match dict_data.len() {
                                1 => u8::from_le(dict_data[0]) as u32,
                                2 => u16::from_le_bytes(dict_data.try_into().unwrap()) as u32,
                                4 => u32::from_le_bytes(dict_data.try_into().unwrap()),
                                _ => unreachable!("1, 2, and 4 are the only viable did_field_size results"),
                            };
                            (Some(DictionaryID::new(id)), rest)
                        },
                    },
                };

                /* Frame_Content_Size: 0, 1, 2, 4, or 8 little-endian bytes per fcs_field_size(). */
                let (frame_content_size, data): (Option<u64>, &[u8]) = match descriptor.fcs_field_size() {
                    None => (None, data),
                    Some(n) => match data.split_at_checked(n.get() as usize) {
                        None => {
                            /* split_at_checked() failed, so data.len() < n: no underflow. */
                            return ParseResult::NeedsMoreInput {
                                at_least_this_many_more_bytes: (n.get() as usize) - data.len(),
                            };
                        },
                        Some((frame_data, rest)) => {
                            let size: u64 = match frame_data.len() {
                                1 => u8::from_le(frame_data[0]) as u64,
                                /* NB: the specific case of 2-byte width adds an additional offset.
                                 * This is not explained in the specification for some reason. */
                                2 => (u16::from_le_bytes(frame_data.try_into().unwrap()) as u64) + 256,
                                4 => u32::from_le_bytes(frame_data.try_into().unwrap()) as u64,
                                8 => u64::from_le_bytes(frame_data.try_into().unwrap()),
                                _ => unreachable!("1, 2, 4, and 8 are the only viable fcs_field_size results"),
                            };
                            (Some(size), rest)
                        },
                    },
                };

                /* Combine the two optional fields into the WindowBounds invariant. */
                let window = match (window_descriptor, frame_content_size) {
                    (None, None) => {
                        /* fcs_field_size() is always Some when Single_Segment_Flag is set,
                         * and window_descriptor is always Some when it is not. */
                        unreachable!(
                            "Single_Segment_Flag modulates Window_Size and Frame_Content_Size, \
                             making this case impossible"
                        )
                    },
                    /* Single_Segment_Flag */
                    (None, Some(frame_content_size)) => WindowBounds::SingleSegment { frame_content_size },
                    (Some(window_descriptor), None) => WindowBounds::Windowed {
                        window_size: window_descriptor.compute_window_size(),
                        frame_content_size: None,
                    },
                    (Some(window_descriptor), Some(frame_content_size)) => {
                        let window_size: num::NonZero<u64> = window_descriptor.compute_window_size();
                        /* NOTE(review): this rejects Window_Size > Frame_Content_Size; the RFC
                         * itself does not mandate that relationship — confirm this is an
                         * intentional extra strictness of this library. */
                        if window_size.get() > frame_content_size {
                            return ParseResult::Error(HeaderParseError::InvalidWindowBounds {
                                window_size,
                                frame_content_size,
                            });
                        }
                        WindowBounds::Windowed {
                            window_size,
                            frame_content_size: Some(frame_content_size),
                        }
                    },
                };

                let checksum = if descriptor.content_checksum_flag() {
                    ChecksumBehavior::HasContentChecksum
                } else {
                    ChecksumBehavior::None
                };

                /* Whatever remains in `data` was not part of the header. */
                let consumed_bytes = original_length - data.len();
                ParseResult::Success {
                    result: FrameHeader {
                        window,
                        dictionary,
                        checksum,
                    },
                    consumed_bytes,
                }
            }
        }
379 }
380
381 pub mod block {
382 use core::{error, fmt};
383
384 use crate::frame::{ParseResult, SliceParse};
385
386 const HEADER_SIZE: usize = 3;
387
388 /* 128kb global limit to block size, regardless of window_size. */
389 pub(crate) const BLOCK_SIZE_MAX: u32 = 1 << 17;
390
        /// One parsed block from a data frame, borrowing its content from the input.
        #[derive(Debug, Copy, Clone)]
        pub struct Block<'data> {
            /// The block's payload, interpreted according to its `Block_Type`.
            pub content: BlockContent<'data>,
            /// Whether this block terminates the frame.
            pub sequence_behavior: SequenceBehavior,
        }

        /// A block's payload; one variant per decodable `Block_Type`.
        #[derive(Debug, Copy, Clone)]
        pub enum BlockContent<'data> {
            /// `Raw_Block`: bytes stored uncompressed.
            Raw { data: &'data [u8] },
            /// `RLE_Block`: one byte, repeated `block_size` times when decoded.
            RunLength { byte: &'data u8, block_size: u32 },
            /// `Compressed_Block`: Zstandard-compressed content, not yet decoded.
            Compressed { data: &'data [u8] },
        }

        /// Whether more blocks follow in the current frame (the `Last_Block` bit).
        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub enum SequenceBehavior {
            /// This block is the final one in the frame.
            LastBlock,
            /// At least one more block follows.
            Continue,
        }
409
410 #[repr(transparent)]
411 struct BlockHeader(u32);
412
413 impl BlockHeader {
414 #[inline(always)]
415 fn from_le_bytes(x: &[u8; HEADER_SIZE]) -> Self {
416 static_assertions::const_assert!(HEADER_SIZE <= core::mem::size_of::<u32>());
417 let ret: u32 = (0..HEADER_SIZE)
418 .map(|i| (x[i] as u32) << (u8::BITS * (i as u32)))
419 .sum();
420 Self(ret)
421 }
422
423 #[inline(always)]
424 const fn last_block(&self) -> bool { (self.0 & 0b1) != 0 }
425
426 #[inline(always)]
427 fn block_type(&self) -> BlockType {
428 match (self.0 & 0b110) >> 1 {
429 0 => BlockType::RawBlock,
430 1 => BlockType::RLEBlock,
431 2 => BlockType::CompressedBlock,
432 3 => BlockType::Reserved,
433 _ => unreachable!("block type is limited to two bits"),
434 }
435 }
436
437 #[inline(always)]
438 const fn block_size(&self) -> u32 { self.0 >> 3 }
439 }
440
        /// The two-bit `Block_Type` discriminant from the block header.
        #[repr(u8)]
        #[derive(Debug, Copy, Clone)]
        enum BlockType {
            /// Content is stored uncompressed.
            RawBlock = 0,
            /// Content is a single byte, repeated when decoded.
            RLEBlock = 1,
            /// Content is Zstandard-compressed.
            CompressedBlock = 2,
            /// Invalid; encountering this value is a parse error.
            Reserved = 3,
        }

        /// NOTE(review): mirrors [`crate::frame::ParseResult`] with an extra
        /// `is_last_block` flag, but is not referenced within this file — confirm
        /// whether callers elsewhere use it or whether it is vestigial.
        #[derive(Debug, Copy, Clone)]
        pub enum BlockParseResult<'data> {
            Success {
                result: Block<'data>,
                is_last_block: bool,
                consumed_bytes: usize,
            },
            NeedsMoreInput {
                at_least_this_many_more_bytes: usize,
            },
            Error(BlockParseError),
        }
462
463 #[derive(Debug, Copy, Clone)]
464 pub enum BlockParseError {
465 TooLarge { max_size: u32, block_size: u32 },
466 ReservedBlockType,
467 }
468
469 impl fmt::Display for BlockParseError {
470 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
471 match self {
472 Self::TooLarge {
473 max_size,
474 block_size,
475 } => {
476 assert!(block_size > max_size);
477 write!(
478 f,
479 "Block_Size ({block_size}) was greater than Block_Maximum_Size ({max_size})"
480 )
481 },
482 Self::ReservedBlockType => {
483 write!(f, "reserved block type specified in block header")
484 },
485 }
486 }
487 }
488
489 impl error::Error for BlockParseError {}
490
491 impl SliceParse for Block<'_> {
492 type Success<'data> = Block<'data>;
493 type Error = BlockParseError;
494
495 type Arg = u32;
496 /* NB: max_size should be the result of WindowBounds::block_maximum_size()! */
497 fn parse<'data>(
498 max_size: u32,
499 data: &'data [u8],
500 ) -> ParseResult<Self::Success<'data>, Self::Error> {
501 debug_assert!(max_size <= BLOCK_SIZE_MAX);
502 let original_length = data.len();
503 let (block_header, data) = match data.split_at_checked(HEADER_SIZE) {
504 None => {
505 return ParseResult::NeedsMoreInput {
506 at_least_this_many_more_bytes: data.len() - HEADER_SIZE,
507 };
508 },
509 Some((header, rest)) => {
510 let block_header = BlockHeader::from_le_bytes(header.try_into().unwrap());
511 (block_header, rest)
512 },
513 };
514
515 let block_size: u32 = {
516 let block_size = block_header.block_size();
517 if block_size > max_size {
518 return ParseResult::Error(BlockParseError::TooLarge {
519 max_size,
520 block_size,
521 });
522 }
523 block_size
524 };
525
526 let sequence_behavior = if block_header.last_block() {
527 SequenceBehavior::LastBlock
528 } else {
529 SequenceBehavior::Continue
530 };
531
532 let (block, data) = match block_header.block_type() {
533 BlockType::RawBlock => match data.split_at_checked(block_size as usize) {
534 None => {
535 return ParseResult::NeedsMoreInput {
536 at_least_this_many_more_bytes: data.len() - (block_size as usize),
537 };
538 },
539 Some((block, rest)) => {
540 let block = BlockContent::Raw { data: block };
541 (block, rest)
542 },
543 },
544 BlockType::RLEBlock => match data.split_first() {
545 None => {
546 return ParseResult::NeedsMoreInput {
547 at_least_this_many_more_bytes: 1,
548 };
549 },
550 Some((byte, rest)) => {
551 let block = BlockContent::RunLength { byte, block_size };
552 (block, rest)
553 },
554 },
555 BlockType::CompressedBlock => match data.split_at_checked(block_size as usize) {
556 None => {
557 return ParseResult::NeedsMoreInput {
558 at_least_this_many_more_bytes: data.len() - (block_size as usize),
559 };
560 },
561 Some((block, rest)) => {
562 let block = BlockContent::Compressed { data: block };
563 (block, rest)
564 },
565 },
566 BlockType::Reserved => {
567 return ParseResult::Error(BlockParseError::ReservedBlockType);
568 },
569 };
570
571 let consumed_bytes = original_length - data.len();
572 ParseResult::Success {
573 result: Block {
574 content: block,
575 sequence_behavior,
576 },
577 consumed_bytes,
578 }
579 }
580 }
581 }
582}
583
584/// Encoding of a skippable frame.
585///
586/// A skippable frame carries a fixed-size numeric identifier in its magic number as well as
587/// variable-sized arbitrary bytes. It does *not* decompose into any internal block format like
/// [`data`] frames. It also has no decoder behavior specified by RFC 8878, which instead explicitly
589/// clarifies its intent to support "user-defined metadata":
590/// > From a compliant decoder perspective, skippable frames simply need to be skipped, and their
591/// > content ignored, resuming decoding after the skippable frame.
592///
593/// This frame type's encoding is specified in section 3.1.2 of IETF RFC 8878[^sec-3.1.2]:
594/// ```custom,{class=language-md}
595/// +==============+============+===========+
596/// | Magic_Number | Frame_Size | User_Data |
597/// +==============+============+===========+
598/// | 4 bytes | 4 bytes | n bytes |
599/// +--------------+------------+-----------+
600/// ```
601///
602/// [^sec-3.1.2]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.2>
603///
604/// # Privacy Risk: Watermarking
605/// The IETF RFC *repeatedly* notes the potential for watermarking and other forms of tracking
606/// possible via skippable frames in this standard:
607/// > It should be noted that a skippable frame can be used to watermark a stream of concatenated
608/// > frames embedding any kind of tracking information (even just a Universally Unique Identifier
609/// > (UUID)). Users wary of such possibility should scan the stream of concatenated frames in an
610/// > attempt to detect such frames for analysis or removal.
611///
612/// Because the specification does not specify the behavior of skippable frames, this risk can go
613/// undetected unless the decoder explicitly handles such frames. Removing such frames will modify
614/// the resulting stream (which itself may impose its own risk of watermarking), but should make it
615/// possible for two independent implementations (or two independent users of this library) to avoid
616/// being individually watermarked by skippable frames alone if they were to reproduce a zstd stream
617/// from an untrusted source.
618///
619/// ## Data Frames Contain Hidden States
620/// However, the Zstandard stream format contains many further opportunities for individually
621/// watermarking a stream beyond skippable frames which are *not* mentioned in the spec, and which
622/// generally revolve around the immense flexibility of standard [`data`] frames.
623///
624/// These opportunities are almost too numerous to name, but take on a few broad categories:
625/// - **degenerate states:** *when decoded output is empty*
626/// - Like skippable frames, these also have no effect upon the decoded output, but can store
627/// arbitrary user data.
628/// - examples include:
629/// - when `Frame_Content_Size` is 0 (this also limits all subsequent `Window_Size` and
630/// `Block_Size`).
631/// - when `Block_Size` is 0 (for `Raw_Block` or `RLE_Block`).
632/// - when `Number_of_Sequences` is 0.
633/// - **TODO:** probably when a literal or offset is zero-length in sequence execution?
634/// - **synonymous/fungible states:** *when the same output data is representable with distinct byte
635/// strings*
636/// - This comes in three basic forms:
637/// 1. _using a more *complex* data structure than necessary_, e.g.:
638/// - a `Raw_Block` for a single repeating byte.
639/// - a `Compressed_Block` for uncompressible data.
640/// 2. _using a sequence of too *simple* data structures_, e.g.:
641/// - two consecutive `RLE_Block`s with `Block_Size == 1` vs `Raw_Block` with 2 bytes.
642/// - a `Raw_Block` for highly compressible data.
643/// 3. _using `Block_Type` vs `Literals_Block_Type`:_
644/// - `Block_Type` provides simpler forms of `RLE_Block` and `Raw_Literals_Block`, whereas
645/// the `Sequences_Section`[^seq-sec] from `Compressed_Block` can describe a program to
646/// execute a sequence of run-length literals or directly-copied bytes.
647/// - Note that "compressibility" is highly domain-specific, and decisions may be performed
648/// arbitrarily by the encoder.
649/// - *This therefore exposes the encoder to watermarking.*
650/// - **dict/literal encoding:** *when decisions are made regarding prefix data or symbol
651/// distributions*
652/// - **TODO:** it is still unclear how this works and the directions seem to contradict
653/// each other.
654/// - This technique can be supremely difficult to detect heuristically.
655/// - It may be possible through re-encoding to compare against a symbol distribution table
656/// built up by hand.
657/// - In general, the space of possible compression encodings is vast, and as compression is
658/// compared by both speed and size ratio, the decisions a compressor makes are hard to judge.
659/// - *However, this individuality streak makes encoders susceptible to watermarking too.*
660/// - **block index selection:** *when the encoder decides how to chunk up the stream*
661/// - As with dict encoding, this is generally considered an arbitrary decision by the encoder.
662/// - *As a result, encoding is also watermarkable.*
663///
664/// [^seq-sec]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.1.3.2>
665///
666/// ### "Unused Bit" is a Skippable Frame
667/// Also worth calling out in particular is **the "Unused Bit"
668/// from section 3.1.1.1.1.3[^sec-3.1.1.1.1.3]:**
669/// > A decoder compliant with this specification version shall not interpret this bit.
670///
671/// This is actually even *stronger* than a skippable frame, as it claims compliance *requires* not
672/// looking at the value of the bit, whereas skippable frames do not impose any interpretation
673/// (forbidding an interpretation is also an interpretation!). Luckily, as it states at the top:
674/// > This document is not an Internet Standards Track specification.
675///
676/// So for now we can do what it suggests:
677/// > An encoder compliant with this specification must set this bit to zero.
678///
679/// [^sec-3.1.1.1.1.3]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.1.1.1.3>
680///
681/// ## Timing Attacks on Decoding
/// Decoders can be deanonymized in yet another way, even just by downloading a Zstandard
683/// data stream: in particular, by their choice of internal buffering.
684///
685/// The spec makes it clear that decoders are *free to choose* their own buffer limits, saying this
686/// *two separate times*! In `Single_Segment_Flag`[^single-seg]:
687/// > For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
688/// > This is only a recommendation; each decoder is free to support higher or lower limits,
689/// > depending on local limitations.
690///
691/// [^single-seg]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.1.1.1.2>
692///
693/// And then in `Window_Descriptor`[^win-desc]:
694/// > For improved interoperability, it's recommended for decoders to support values of
695/// > `Window_Size` up to 8 MB and for encoders not to generate frames requiring a `Window_Size`
696/// > larger than 8 MB.
697/// > It's merely a recommendation though, and decoders are free to support higher or lower limits,
698/// > depending on local limitations.
699///
700/// [^win-desc]: <https://datatracker.ietf.org/doc/html/rfc8878#name-window-descriptor>
701///
702/// `curl` allows specifying a buffer size to receive output (including decompressed Zstandard
703/// stream data) into[^curl-buf-size], and this can be used to validate the effect of buffer
704/// selection (using `curl`'s internal buffer reallocation heuristics) upon remote latencies.
705///
706/// [^curl-buf-size]: <https://docs.rs/curl/latest/curl/easy/struct.Easy2.html#method.buffer_size>
707///
708/// ### Window Size is Fingerprintable Entropy
709/// Unfortunately, this freedom of choice in buffer size defines a fingerprintable time series,
710/// visible to the remote end through variable latency and packet size over the course of the
711/// download (the proof of this is left as an exercise to the reader).
712///
713/// To quote a tor browser developer[^tor-dev]:
714/// > Window dimensions are a big source of fingerprintable entropy on the web.
715///
716/// Analogously, the variable latency between reads from the network socket introduced by
717/// a particular buffer size can likely be used to fingerprint a decoder. Tor hidden services have
718/// been fingerprinted through a time series analysis of packet sizes in this way[^tor-fingerprint].
719///
720/// [^tor-dev]: <https://bugzilla.mozilla.org/show_bug.cgi?id=1407366>
721/// [^tor-fingerprint]: <https://www.informatik.tu-cottbus.de/~andriy/papers/acmccs-wpes17-hidden-services-fp.pdf>
722///
723/// ## How to Achieve Anonymity
724/// Given all of this uncertainty, how can a decoder expect to avoid fingerprinting?
725///
726/// **In general, this is simply not possible by merely scanning and discarding frames from the
727/// decoder alone (as the spec recommends).** The author of this library can identify three main
728/// strategies to mitigate the issues described above:
729/// 1. Fully read out Zstandard network streams to disk.
730/// - Instead of imparting backpressure by stream processing, decouple the Zstandard
731/// decompression from network operations.
732/// - Note that there are other forms of fingerprinting unrelated to Zstandard that may remain
733/// despite this mitigation.
734/// - It may be possible to select buffer sizes according to some degree of randomness to thwart
735/// fingerprinting, but that would require a much more thorough analysis to formalize
736/// and prove.
737/// 2. Fully decode each stream, then re-encode it.
738/// - Note that this inverts the threat model: instead of fingerprinting by correlating a stream
739/// sent to a particular individual to de-anonymize them, this now risks fingerprinting an
740/// individual by their choice of encoder settings.
741/// - However, this effectively breaks the link from received Zstandard data stream to
742/// recipient, so Zstandard data streams can be received from untrusted sources.
743/// - Also note that if the resulting Zstandard data stream is *never* going to be used
744/// anywhere else (if it completely stays on the local node or internal network), this
745/// mitigation is unnecessary.
746/// - Note that encryption is *not* a sufficient protection here if the Zstandard data stream
747/// is attacker-controlled! See the CRIME exploit[^crime] linked in the spec.
748/// 3. Re-encode using deterministic settings to avoid leaking machine-specific info.
749/// - **TODO:** this needs to be fleshed out when the encoder is built!
750/// - Especially consider how the translation of symbol frequency tables may incur rounding
751/// errors from machine precision boundaries and how this may induce deterministic differences.
752///
753/// [^crime]: <https://en.wikipedia.org/w/index.php?title=CRIME&oldid=844538656>
754pub mod skippable {}