yzx_core/frame.rs
1/*
2 * Description: Encoding of Zstandard frame types.
3 *
4 * Copyright (C) 2025 d@nny mc² <dmc2@hypnicjerk.ai>
5 * SPDX-License-Identifier: AGPL-3.0-or-later
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Affero General Public License as published
9 * by the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Affero General Public License for more details.
16 *
17 * You should have received a copy of the GNU Affero General Public License
18 * along with this program. If not, see <https://www.gnu.org/licenses/>.
19 */
20
21//! Encoding of Zstandard frame types.
22//!
23//! A Zstandard data stream is composed of one or more *frames*, concatenated together
24//! without interruption. Frames may refer to data or contain metadata.
25//!
26//! Supported frame types are specified in section 3.1 of IETF RFC 8878[^sec-3.1]. They are similar to the
27//! format used for frames in LZ4[^lz4].
28//!
29//! [^sec-3.1]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1>
30//! [^lz4]: <https://github.com/lz4/lz4/blob/a018abc07a3018371625a265f195c4fafbf1f99d/doc/lz4_Frame_format.md>
31
/// Outcome of attempting to parse one structure from the front of a byte slice.
#[derive(Debug, Copy, Clone)]
pub enum ParseResult<Success, Error> {
    /// The structure was fully parsed.
    Success {
        /// The parsed value.
        result: Success,
        /// How many bytes of the input slice were consumed producing `result`.
        consumed_bytes: usize,
    },
    /// The input ended before the structure was complete; retry with more data.
    NeedsMoreInput {
        /// Lower bound on the additional bytes required; more may be needed once
        /// further length fields become readable.
        at_least_this_many_more_bytes: usize,
    },
    /// The input was structurally invalid.
    Error(Error),
}
43
/// A type that can be parsed from a leading prefix of a byte slice.
pub trait SliceParse {
    /// The parsed output; may borrow from the input slice.
    type Success<'data>;
    /// The error reported for structurally invalid input.
    type Error;

    /// Caller-provided context needed to parse (e.g. a size limit); `()` if none.
    type Arg;
    /// Attempt to parse a `Success` value from the front of `data`.
    fn parse<'data>(
        arg: Self::Arg,
        data: &'data [u8],
    ) -> ParseResult<Self::Success<'data>, Self::Error>
    where
        Self: Sized;
}
56
57/// Encoding of a Zstandard frame.
58///
59/// A "Zstandard frame" (somewhat confusingly) refers to a single specific type of frame which may
60/// appear in a Zstandard data stream. This frame type is also currently the only kind which is
61/// specified to produce decoded output data. Therefore, this codebase uses the term "data frame".
62///
63/// This frame type's encoding is specified in section 3.1.1 of IETF RFC 8878[^sec-3.1.1]:
64/// ```custom,{class=language-md}
65/// +--------------------+------------+
66/// | Magic_Number | 4 bytes |
67/// +--------------------+------------+
68/// | Frame_Header | 2-14 bytes |
69/// +--------------------+------------+
70/// | Data_Block | n bytes |
71/// +--------------------+------------+
72/// | [More Data_Blocks] | |
73/// +--------------------+------------+
74/// | [Content_Checksum] | 4 bytes |
75/// +--------------------+------------+
76/// ```
77///
78/// [^sec-3.1.1]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.1>
79pub mod data {
    /// The "magic number" identifying the start of a data frame.
    ///
    /// As per the RFC:
    /// > The magic number was selected to be less probable to find at the beginning of an arbitrary
    /// > file. It avoids trivial patterns (0x00, 0xFF, repeated bytes, increasing bytes, etc.),
    /// > contains byte values outside of the ASCII range, and doesn't map into UTF-8 space, all of
    /// > which reduce the likelihood of its appearance at the top of a text file.
    pub const MAGIC: u32 = 0xFD2FB528;
    /* This will represent the data we write to or read off the wire. */
    /* Little-endian, so the first byte on the wire is 0x28.
     * NOTE(review): not referenced within this file — presumably consumed by frame-level
     * scanning elsewhere in the crate; confirm. */
    const MAGIC_BYTES: [u8; 4] = MAGIC.to_le_bytes();
90
91 pub mod header {
92 use core::{error, fmt, num};
93
94 use crate::frame::{ParseResult, SliceParse};
95
        /// The decoded contents of a data frame's `Frame_Header`.
        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub struct FrameHeader {
            /// Decoding-buffer requirements for the frame (see [`WindowBounds`]).
            pub window: WindowBounds,
            /// `Dictionary_ID`, when the frame declares one.
            pub dictionary: Option<DictionaryID>,
            /// Whether a 4-byte `Content_Checksum` trails the frame's blocks.
            pub checksum: ChecksumBehavior,
        }

        /// How the frame communicates its decoding-buffer requirements.
        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub enum WindowBounds {
            /// `Single_Segment_Flag` was set: no `Window_Descriptor` byte is present,
            /// and the window size equals the (mandatory) `Frame_Content_Size`.
            SingleSegment {
                frame_content_size: u64,
            },
            /// A `Window_Descriptor` byte supplied the window size;
            /// `Frame_Content_Size` may or may not also be present.
            Windowed {
                window_size: num::NonZero<u64>,
                frame_content_size: Option<u64>,
            },
        }
113
114 impl WindowBounds {
115 #[inline(always)]
116 pub const fn window_size(&self) -> u64 {
117 match self {
118 Self::SingleSegment { frame_content_size } => *frame_content_size,
119 Self::Windowed { window_size, .. } => window_size.get(),
120 }
121 }
122
123 #[inline(always)]
124 const fn clamp_block_size(window_size: u64) -> u32 {
125 if window_size > (super::block::BLOCK_SIZE_MAX as u64) {
126 super::block::BLOCK_SIZE_MAX
127 } else {
128 window_size as u32
129 }
130 }
131
132 #[inline(always)]
133 pub const fn block_maximum_size(&self) -> u32 { Self::clamp_block_size(self.window_size()) }
134 }
135
136 #[repr(transparent)]
137 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
138 pub struct DictionaryID(u32);
139
140 impl DictionaryID {
141 #[inline(always)]
142 pub const fn new(x: u32) -> Self { Self(x) }
143
144 #[inline(always)]
145 pub const fn within_iana_reserved_ranges(&self) -> bool {
146 (self.0 <= 32767) || (self.0 >= 1 << 31)
147 }
148 }
149
        /// Whether the frame declares a trailing 4-byte `Content_Checksum`.
        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub enum ChecksumBehavior {
            /// `Content_Checksum_Flag` was set: a checksum follows the last block.
            HasContentChecksum,
            /// No checksum is present.
            None,
        }
155
156 /// The first byte of each frame header.
157 ///
158 /// As per the RFC:
159 /// > The first header's byte is called the `Frame_Header_Descriptor`. It describes which other
160 /// > fields are present. Decoding this byte is enough to tell the size of `Frame_Header`.
161 #[repr(transparent)]
162 struct FrameHeaderDescriptor(u8);
163
164 impl FrameHeaderDescriptor {
165 #[inline(always)]
166 const fn from_le(x: &u8) -> Self { Self(u8::from_le(*x)) }
167
168 #[inline]
169 const fn fcs_field_size(&self) -> Option<num::NonZero<u8>> {
170 let frame_content_size_flag: u8 = (self.0 & 0b1100_0000) >> 6;
171 let ret: u8 = if frame_content_size_flag == 0 && !self.single_segment_flag() {
172 0
173 } else {
174 0b1u8 << frame_content_size_flag
175 };
176 num::NonZero::new(ret)
177 }
178
179 #[inline(always)]
180 const fn single_segment_flag(&self) -> bool { (self.0 & 0b0010_0000) != 0 }
181
182 #[inline(always)]
183 const fn unused_bit_is_set(&self) -> bool { (self.0 & 0b0001_0000) != 0 }
184
185 #[inline(always)]
186 const fn reserved_bit_is_set(&self) -> bool { (self.0 & 0b0000_1000) != 0 }
187
188 #[inline(always)]
189 const fn content_checksum_flag(&self) -> bool { (self.0 & 0b0000_0100) != 0 }
190
191 #[inline(always)]
192 const fn did_field_size(&self) -> Option<num::NonZero<u8>> {
193 num::NonZero::new(match self.0 & 0b11 {
194 3 => 4,
195 x => x,
196 })
197 }
198 }
199
200 #[repr(transparent)]
201 struct WindowDescriptor(u8);
202
203 impl WindowDescriptor {
204 #[inline(always)]
205 const fn from_le(x: &u8) -> Self { Self(u8::from_le(*x)) }
206
207 #[inline(always)]
208 #[allow(non_snake_case)]
209 #[rustfmt::skip] /* This is the only way to retain the range bounds, sigh... */
210 const fn compute_window_size(&self) -> num::NonZero<u64> {
211 let Exponent: u64 = ((self.0 & 0b1111_1000) >> 3) as u64; /* [0, 31] */
212 let Mantissa: u64 = (self.0 & 0b0000_0111) as u64; /* [0, 7] */
213 let windowLog: u64 = 10 + Exponent; /* [10, 41] */
214 let windowBase: u64 = 1 << windowLog; /* [1<<10, 1<<41] */
215 let windowAdd: u64 = (windowBase / 8) * Mantissa; /* [0, 7x(1<<38)] */
216 /* LLVM optimizes out the panic branch on --release! */
217 num::NonZero::new(windowBase + windowAdd).unwrap() /* [1<<10, (1<<41)+7x(1<<38)] */
218 }
219 }
220
221 #[derive(Debug, Copy, Clone)]
222 pub enum HeaderParseError {
223 ReservedBitSet,
224 InvalidWindowBounds {
225 window_size: num::NonZero<u64>,
226 frame_content_size: u64,
227 },
228 }
229
230 impl fmt::Display for HeaderParseError {
231 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
232 match self {
233 Self::ReservedBitSet => write!(f, "reserved bit set in frame header descriptor"),
234 Self::InvalidWindowBounds {
235 window_size,
236 frame_content_size,
237 } => {
238 let window_size = window_size.get();
239 assert!(window_size > *frame_content_size);
240 write!(
241 f,
242 "invalid window bounds: Window_Size ({window_size}) \
243 was greater than Frame_Content_Size ({frame_content_size})"
244 )
245 },
246 }
247 }
248 }
249
250 impl error::Error for HeaderParseError {}
251
        impl SliceParse for FrameHeader {
            type Success<'data> = FrameHeader;
            type Error = HeaderParseError;

            type Arg = ();
            /* Parses the variable-length (2-14 byte) Frame_Header: the descriptor byte,
             * then the optional Window_Descriptor, Dictionary_ID, and Frame_Content_Size
             * fields, each gated by descriptor bits. */
            fn parse<'data>(
                _arg: Self::Arg,
                data: &'data [u8],
            ) -> ParseResult<Self::Success<'data>, Self::Error>
            where
                Self: Sized,
            {
                /* Remember the starting length so consumed_bytes can be computed at the end. */
                let original_length = data.len();
                /* Byte 0: the Frame_Header_Descriptor, which gates all later fields. */
                let (descriptor, data) = match data.split_first() {
                    None => {
                        return ParseResult::NeedsMoreInput {
                            at_least_this_many_more_bytes: 1,
                        };
                    },
                    Some((x, rest)) => (FrameHeaderDescriptor::from_le(x), rest),
                };
                /* The spec requires rejecting headers with the reserved bit set. */
                if descriptor.reserved_bit_is_set() {
                    return ParseResult::Error(HeaderParseError::ReservedBitSet);
                }

                /* Window_Descriptor is omitted when Single_Segment_Flag is set; the
                 * window is then derived from Frame_Content_Size instead. */
                let (window_descriptor, data): (Option<WindowDescriptor>, &[u8]) =
                    if descriptor.single_segment_flag() {
                        (None, data)
                    } else {
                        match data.split_first() {
                            None => {
                                return ParseResult::NeedsMoreInput {
                                    at_least_this_many_more_bytes: 1,
                                };
                            },
                            Some((x, rest)) => (Some(WindowDescriptor::from_le(x)), rest),
                        }
                    };

                /* Dictionary_ID: 0, 1, 2, or 4 little-endian bytes per did_field_size(). */
                let (dictionary, data): (Option<DictionaryID>, &[u8]) = match descriptor.did_field_size() {
                    None => (None, data),
                    Some(n) => match data.split_at_checked(n.get() as usize) {
                        None => {
                            /* split_at_checked() failed, so data.len() < n: no underflow. */
                            return ParseResult::NeedsMoreInput {
                                at_least_this_many_more_bytes: (n.get() as usize) - data.len(),
                            };
                        },
                        Some((dict_data, rest)) => {
                            let id: u32 = match dict_data.len() {
                                1 => u8::from_le(dict_data[0]) as u32,
                                2 => u16::from_le_bytes(dict_data.try_into().unwrap()) as u32,
                                4 => u32::from_le_bytes(dict_data.try_into().unwrap()),
                                _ => unreachable!("1, 2, and 4 are the only viable did_field_size results"),
                            };
                            (Some(DictionaryID::new(id)), rest)
                        },
                    },
                };

                /* Frame_Content_Size: 0, 1, 2, 4, or 8 little-endian bytes per fcs_field_size(). */
                let (frame_content_size, data): (Option<u64>, &[u8]) = match descriptor.fcs_field_size() {
                    None => (None, data),
                    Some(n) => match data.split_at_checked(n.get() as usize) {
                        None => {
                            /* split_at_checked() failed, so data.len() < n: no underflow. */
                            return ParseResult::NeedsMoreInput {
                                at_least_this_many_more_bytes: (n.get() as usize) - data.len(),
                            };
                        },
                        Some((frame_data, rest)) => {
                            let size: u64 = match frame_data.len() {
                                1 => u8::from_le(frame_data[0]) as u64,
                                /* NB: the specific case of 2-byte width adds an additional offset.
                                 * This is not explained in the specification for some reason. */
                                2 => (u16::from_le_bytes(frame_data.try_into().unwrap()) as u64) + 256,
                                4 => u32::from_le_bytes(frame_data.try_into().unwrap()) as u64,
                                8 => u64::from_le_bytes(frame_data.try_into().unwrap()),
                                _ => unreachable!("1, 2, 4, and 8 are the only viable fcs_field_size results"),
                            };
                            (Some(size), rest)
                        },
                    },
                };

                /* Combine the two optional fields into the WindowBounds invariant. */
                let window = match (window_descriptor, frame_content_size) {
                    (None, None) => {
                        /* fcs_field_size() is always Some when Single_Segment_Flag is set,
                         * and window_descriptor is always Some when it is not. */
                        unreachable!(
                            "Single_Segment_Flag modulates Window_Size and Frame_Content_Size, \
                             making this case impossible"
                        )
                    },
                    /* Single_Segment_Flag */
                    (None, Some(frame_content_size)) => WindowBounds::SingleSegment { frame_content_size },
                    (Some(window_descriptor), None) => WindowBounds::Windowed {
                        window_size: window_descriptor.compute_window_size(),
                        frame_content_size: None,
                    },
                    (Some(window_descriptor), Some(frame_content_size)) => {
                        let window_size: num::NonZero<u64> = window_descriptor.compute_window_size();
                        /* NOTE(review): this rejects Window_Size > Frame_Content_Size; the RFC
                         * itself does not mandate that relationship — confirm this is an
                         * intentional extra strictness of this library. */
                        if window_size.get() > frame_content_size {
                            return ParseResult::Error(HeaderParseError::InvalidWindowBounds {
                                window_size,
                                frame_content_size,
                            });
                        }
                        WindowBounds::Windowed {
                            window_size,
                            frame_content_size: Some(frame_content_size),
                        }
                    },
                };

                let checksum = if descriptor.content_checksum_flag() {
                    ChecksumBehavior::HasContentChecksum
                } else {
                    ChecksumBehavior::None
                };

                /* Whatever remains in `data` was not part of the header. */
                let consumed_bytes = original_length - data.len();
                ParseResult::Success {
                    result: FrameHeader {
                        window,
                        dictionary,
                        checksum,
                    },
                    consumed_bytes,
                }
            }
        }
379 }
380
381 pub mod block {
382 use core::{error, fmt};
383
384 use crate::frame::{ParseResult, SliceParse};
385
386 const HEADER_SIZE: usize = 3;
387
388 /* 128kb global limit to block size, regardless of window_size. */
389 pub(crate) const BLOCK_SIZE_MAX: u32 = 1 << 17;
390
        /// One parsed block from a data frame, borrowing its content from the input.
        #[derive(Debug, Copy, Clone)]
        pub struct Block<'data> {
            /// The block's payload, interpreted according to its `Block_Type`.
            pub content: BlockContent<'data>,
            /// Whether this block terminates the frame.
            pub sequence_behavior: SequenceBehavior,
        }

        /// A block's payload; one variant per decodable `Block_Type`.
        #[derive(Debug, Copy, Clone)]
        pub enum BlockContent<'data> {
            /// `Raw_Block`: bytes stored uncompressed.
            Raw { data: &'data [u8] },
            /// `RLE_Block`: one byte, repeated `block_size` times when decoded.
            RunLength { byte: &'data u8, block_size: u32 },
            /// `Compressed_Block`: Zstandard-compressed content, not yet decoded.
            Compressed { data: &'data [u8] },
        }

        /// Whether more blocks follow in the current frame (the `Last_Block` bit).
        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub enum SequenceBehavior {
            /// This block is the final one in the frame.
            LastBlock,
            /// At least one more block follows.
            Continue,
        }
409
410 #[repr(transparent)]
411 struct BlockHeader(u32);
412
413 impl BlockHeader {
414 #[inline(always)]
415 fn from_le_bytes(x: &[u8; HEADER_SIZE]) -> Self {
416 static_assertions::const_assert!(HEADER_SIZE <= core::mem::size_of::<u32>());
417 let ret: u32 = (0..HEADER_SIZE)
418 .map(|i| (x[i] as u32) << (u8::BITS * (i as u32)))
419 .sum();
420 Self(ret)
421 }
422
423 #[inline(always)]
424 const fn last_block(&self) -> bool { (self.0 & 0b1) != 0 }
425
426 #[inline(always)]
427 fn block_type(&self) -> BlockType {
428 match (self.0 & 0b110) >> 1 {
429 0 => BlockType::RawBlock,
430 1 => BlockType::RLEBlock,
431 2 => BlockType::CompressedBlock,
432 3 => BlockType::Reserved,
433 _ => unreachable!("block type is limited to two bits"),
434 }
435 }
436
437 #[inline(always)]
438 const fn block_size(&self) -> u32 { self.0 >> 3 }
439 }
440
        /// The two-bit `Block_Type` discriminant from the block header.
        #[repr(u8)]
        #[derive(Debug, Copy, Clone)]
        enum BlockType {
            /// Content is stored uncompressed.
            RawBlock = 0,
            /// Content is a single byte, repeated when decoded.
            RLEBlock = 1,
            /// Content is Zstandard-compressed.
            CompressedBlock = 2,
            /// Invalid; encountering this value is a parse error.
            Reserved = 3,
        }

        /// NOTE(review): mirrors [`crate::frame::ParseResult`] with an extra
        /// `is_last_block` flag, but is not referenced within this file — confirm
        /// whether callers elsewhere use it or whether it is vestigial.
        #[derive(Debug, Copy, Clone)]
        pub enum BlockParseResult<'data> {
            Success {
                result: Block<'data>,
                is_last_block: bool,
                consumed_bytes: usize,
            },
            NeedsMoreInput {
                at_least_this_many_more_bytes: usize,
            },
            Error(BlockParseError),
        }
462
463 #[derive(Debug, Copy, Clone)]
464 pub enum BlockParseError {
465 TooLarge { max_size: u32, block_size: u32 },
466 ReservedBlockType,
467 }
468
469 impl fmt::Display for BlockParseError {
470 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
471 match self {
472 Self::TooLarge {
473 max_size,
474 block_size,
475 } => {
476 assert!(block_size > max_size);
477 write!(
478 f,
479 "Block_Size ({block_size}) was greater than Block_Maximum_Size ({max_size})"
480 )
481 },
482 Self::ReservedBlockType => {
483 write!(f, "reserved block type specified in block header")
484 },
485 }
486 }
487 }
488
489 impl error::Error for BlockParseError {}
490
491 impl SliceParse for Block<'_> {
492 type Success<'data> = Block<'data>;
493 type Error = BlockParseError;
494
495 type Arg = u32;
496 /* NB: max_size should be the result of WindowBounds::block_maximum_size()! */
497 fn parse<'data>(
498 max_size: u32,
499 data: &'data [u8],
500 ) -> ParseResult<Self::Success<'data>, Self::Error> {
501 debug_assert!(max_size <= BLOCK_SIZE_MAX);
502 let original_length = data.len();
503 let (block_header, data) = match data.split_at_checked(HEADER_SIZE) {
504 None => {
505 return ParseResult::NeedsMoreInput {
506 at_least_this_many_more_bytes: data.len() - HEADER_SIZE,
507 };
508 },
509 Some((header, rest)) => {
510 let block_header = BlockHeader::from_le_bytes(header.try_into().unwrap());
511 (block_header, rest)
512 },
513 };
514
515 let block_size: u32 = {
516 let block_size = block_header.block_size();
517 if block_size > max_size {
518 return ParseResult::Error(BlockParseError::TooLarge {
519 max_size,
520 block_size,
521 });
522 }
523 block_size
524 };
525
526 let sequence_behavior = if block_header.last_block() {
527 SequenceBehavior::LastBlock
528 } else {
529 SequenceBehavior::Continue
530 };
531
532 let (block, data) = match block_header.block_type() {
533 BlockType::RawBlock => match data.split_at_checked(block_size as usize) {
534 None => {
535 return ParseResult::NeedsMoreInput {
536 at_least_this_many_more_bytes: data.len() - (block_size as usize),
537 };
538 },
539 Some((block, rest)) => {
540 let block = BlockContent::Raw { data: block };
541 (block, rest)
542 },
543 },
544 BlockType::RLEBlock => match data.split_first() {
545 None => {
546 return ParseResult::NeedsMoreInput {
547 at_least_this_many_more_bytes: 1,
548 };
549 },
550 Some((byte, rest)) => {
551 let block = BlockContent::RunLength { byte, block_size };
552 (block, rest)
553 },
554 },
555 BlockType::CompressedBlock => match data.split_at_checked(block_size as usize) {
556 None => {
557 return ParseResult::NeedsMoreInput {
558 at_least_this_many_more_bytes: data.len() - (block_size as usize),
559 };
560 },
561 Some((block, rest)) => {
562 let block = BlockContent::Compressed { data: block };
563 (block, rest)
564 },
565 },
566 BlockType::Reserved => {
567 return ParseResult::Error(BlockParseError::ReservedBlockType);
568 },
569 };
570
571 let consumed_bytes = original_length - data.len();
572 ParseResult::Success {
573 result: Block {
574 content: block,
575 sequence_behavior,
576 },
577 consumed_bytes,
578 }
579 }
580 }
581 }
582}
583
584/// Encoding of a skippable frame.
585///
586/// A skippable frame carries a fixed-size numeric identifier in its magic number as well as
587/// variable-sized arbitrary bytes. It does *not* decompose into any internal block format like
/// [`data`] frames. It also has no decoder behavior specified by RFC 8878, which instead explicitly
589/// clarifies its intent to support "user-defined metadata":
590/// > From a compliant decoder perspective, skippable frames simply need to be skipped, and their
591/// > content ignored, resuming decoding after the skippable frame.
592///
593/// This frame type's encoding is specified in section 3.1.2 of IETF RFC 8878[^sec-3.1.2]:
594/// ```custom,{class=language-md}
595/// +==============+============+===========+
596/// | Magic_Number | Frame_Size | User_Data |
597/// +==============+============+===========+
598/// | 4 bytes | 4 bytes | n bytes |
599/// +--------------+------------+-----------+
600/// ```
601///
602/// [^sec-3.1.2]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.2>
603///
604/// # Privacy Risk: Watermarking
605/// The IETF RFC *repeatedly* notes the potential for watermarking and other forms of tracking
606/// possible via skippable frames in this standard:
607/// > It should be noted that a skippable frame can be used to watermark a stream of concatenated
608/// > frames embedding any kind of tracking information (even just a Universally Unique Identifier
609/// > (UUID)). Users wary of such possibility should scan the stream of concatenated frames in an
610/// > attempt to detect such frames for analysis or removal.
611///
612/// Because the specification does not specify the behavior of skippable frames, this risk can go
613/// undetected unless the decoder explicitly handles such frames. Removing such frames will modify
614/// the resulting stream (which itself may impose its own risk of watermarking), but should make it
615/// possible for two independent implementations (or two independent users of this library) to avoid
616/// being individually watermarked by skippable frames alone if they were to reproduce a zstd stream
617/// from an untrusted source.
618///
619/// ## Data Frames Contain Hidden States
620/// However, the Zstandard stream format contains many further opportunities for individually
621/// watermarking a stream beyond skippable frames which are *not* mentioned in the spec, and which
622/// generally revolve around the immense flexibility of standard [`data`] frames.
623///
624/// These opportunities are almost too numerous to name, but take on a few broad categories:
625/// - **degenerate states:** *when decoded output is empty*
626/// - Like skippable frames, these also have no effect upon the decoded output, but can store
627/// arbitrary user data.
628/// - examples include:
629/// - when `Frame_Content_Size` is 0 (this also limits all subsequent `Window_Size` and
630/// `Block_Size`).
631/// - when `Block_Size` is 0 (for `Raw_Block` or `RLE_Block`).
632/// - when `Number_of_Sequences` is 0.
633/// - **TODO:** probably when a literal or offset is zero-length in sequence execution?
634/// - **synonymous/fungible states:** *when the same output data is representable with distinct byte
635/// strings*
636/// - This comes in three basic forms:
637/// 1. _using a more *complex* data structure than necessary_, e.g.:
638/// - a `Raw_Block` for a single repeating byte.
639/// - a `Compressed_Block` for uncompressible data.
640/// 2. _using a sequence of too *simple* data structures_, e.g.:
641/// - two consecutive `RLE_Block`s with `Block_Size == 1` vs `Raw_Block` with 2 bytes.
642/// - a `Raw_Block` for highly compressible data.
643/// 3. _using `Block_Type` vs `Literals_Block_Type`:_
644/// - `Block_Type` provides simpler forms of `RLE_Block` and `Raw_Literals_Block`, whereas
645/// the `Sequences_Section`[^seq-sec] from `Compressed_Block` can describe a program to
646/// execute a sequence of run-length literals or directly-copied bytes.
647/// - Note that "compressibility" is highly domain-specific, and decisions may be performed
648/// arbitrarily by the encoder.
649/// - *This therefore exposes the encoder to watermarking.*
650/// - **dict/literal encoding:** *when decisions are made regarding prefix data or symbol
651/// distributions*
652/// - **TODO:** it is still unclear how this works and the directions seem to contradict
653/// each other.
654/// - This technique can be supremely difficult to detect heuristically.
655/// - It may be possible through re-encoding to compare against a symbol distribution table
656/// built up by hand.
657/// - In general, the space of possible compression encodings is vast, and as compression is
658/// compared by both speed and size ratio, the decisions a compressor makes are hard to judge.
659/// - *However, this individuality streak makes encoders susceptible to watermarking too.*
660/// - **block index selection:** *when the encoder decides how to chunk up the stream*
661/// - As with dict encoding, this is generally considered an arbitrary decision by the encoder.
662/// - *As a result, encoding is also watermarkable.*
663///
664/// [^seq-sec]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.1.3.2>
665///
666/// ### "Unused Bit" is a Skippable Frame
667/// Also worth calling out in particular is **the "Unused Bit"
668/// from section 3.1.1.1.1.3[^sec-3.1.1.1.1.3]:**
669/// > A decoder compliant with this specification version shall not interpret this bit.
670///
671/// This is actually even *stronger* than a skippable frame, as it claims compliance *requires* not
672/// looking at the value of the bit, whereas skippable frames do not impose any interpretation
673/// (forbidding an interpretation is also an interpretation!). Luckily, as it states at the top:
674/// > This document is not an Internet Standards Track specification.
675///
676/// So for now we can do what it suggests:
677/// > An encoder compliant with this specification must set this bit to zero.
678///
679/// [^sec-3.1.1.1.1.3]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.1.1.1.3>
680///
681/// ## Timing Attacks on Decoding
/// Decoders can be deanonymized in yet another way, even just by downloading a Zstandard
683/// data stream: in particular, by their choice of internal buffering.
684///
685/// The spec makes it clear that decoders are *free to choose* their own buffer limits, saying this
686/// *two separate times*! In `Single_Segment_Flag`[^single-seg]:
687/// > For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
688/// > This is only a recommendation; each decoder is free to support higher or lower limits,
689/// > depending on local limitations.
690///
691/// [^single-seg]: <https://datatracker.ietf.org/doc/html/rfc8878#section-3.1.1.1.1.2>
692///
693/// And then in `Window_Descriptor`[^win-desc]:
694/// > For improved interoperability, it's recommended for decoders to support values of
695/// > `Window_Size` up to 8 MB and for encoders not to generate frames requiring a `Window_Size`
696/// > larger than 8 MB.
697/// > It's merely a recommendation though, and decoders are free to support higher or lower limits,
698/// > depending on local limitations.
699///
700/// [^win-desc]: <https://datatracker.ietf.org/doc/html/rfc8878#name-window-descriptor>
701///
702/// `curl` allows specifying a buffer size to receive output (including decompressed Zstandard
703/// stream data) into[^curl-buf-size], and this can be used to validate the effect of buffer
704/// selection (using `curl`'s internal buffer reallocation heuristics) upon remote latencies.
705///
706/// [^curl-buf-size]: <https://docs.rs/curl/latest/curl/easy/struct.Easy2.html#method.buffer_size>
707///
708/// ### Window Size is Fingerprintable Entropy
709/// Unfortunately, this freedom of choice in buffer size defines a fingerprintable time series,
710/// visible to the remote end through variable latency and packet size over the course of the
711/// download (the proof of this is left as an exercise to the reader).
712///
713/// To quote a tor browser developer[^tor-dev]:
714/// > Window dimensions are a big source of fingerprintable entropy on the web.
715///
716/// Analogously, the variable latency between reads from the network socket introduced by
717/// a particular buffer size can likely be used to fingerprint a decoder. Tor hidden services have
718/// been fingerprinted through a time series analysis of packet sizes in this way[^tor-fingerprint].
719///
720/// [^tor-dev]: <https://bugzilla.mozilla.org/show_bug.cgi?id=1407366>
721/// [^tor-fingerprint]: <https://www.informatik.tu-cottbus.de/~andriy/papers/acmccs-wpes17-hidden-services-fp.pdf>
722///
723/// ## How to Achieve Anonymity
724/// Given all of this uncertainty, how can a decoder expect to avoid fingerprinting?
725///
726/// **In general, this is simply not possible by merely scanning and discarding frames from the
727/// decoder alone (as the spec recommends).** The author of this library can identify three main
728/// strategies to mitigate the issues described above:
729/// 1. Fully read out Zstandard network streams to disk.
730/// - Instead of imparting backpressure by stream processing, decouple the Zstandard
731/// decompression from network operations.
732/// - Note that there are other forms of fingerprinting unrelated to Zstandard that may remain
733/// despite this mitigation.
734/// - It may be possible to select buffer sizes according to some degree of randomness to thwart
735/// fingerprinting, but that would require a much more thorough analysis to formalize
736/// and prove.
737/// 2. Fully decode each stream, then re-encode it.
738/// - Note that this inverts the threat model: instead of fingerprinting by correlating a stream
739/// sent to a particular individual to de-anonymize them, this now risks fingerprinting an
740/// individual by their choice of encoder settings.
741/// - However, this effectively breaks the link from received Zstandard data stream to
742/// recipient, so Zstandard data streams can be received from untrusted sources.
743/// - Also note that if the resulting Zstandard data stream is *never* going to be used
744/// anywhere else (if it completely stays on the local node or internal network), this
745/// mitigation is unnecessary.
746/// - Note that encryption is *not* a sufficient protection here if the Zstandard data stream
747/// is attacker-controlled! See the CRIME exploit[^crime] linked in the spec.
748/// 3. Re-encode using deterministic settings to avoid leaking machine-specific info.
749/// - **TODO:** this needs to be fleshed out when the encoder is built!
750/// - Especially consider how the translation of symbol frequency tables may incur rounding
751/// errors from machine precision boundaries and how this may induce deterministic differences.
752///
753/// [^crime]: <https://en.wikipedia.org/w/index.php?title=CRIME&oldid=844538656>
754pub mod skippable {}