diff --git a/.gitignore b/.gitignore index 73fab07..a69c6b7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,9 @@ target/ # MSVC Windows builds of rustc generate these, which store debugging information *.pdb + +# Local Claude Code settings (machine-specific) +.claude/ + +# Unreferenced / scratch files +.unref/ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..87e889d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,73 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +> **Important:** This file (and all sub-project `CLAUDE.md` files) are checked into the +> repository. Only include information that is valid for **any** developer or machine: +> project conventions, architecture, commands, constraints. **Do not** add machine-specific +> paths, personal tool preferences, local environment settings, or anything that would +> not apply to every contributor. + +## Commands + +```bash +# Build +cargo build --all +cargo build --release --all + +# Test +cargo test --all +cargo test # Run a single test by name +cargo test -- --nocapture # Show test output + +# Lint and format +cargo fmt --check --all +cargo clippy +``` + +The CI runs on `windows-latest` and builds for multiple targets: `wasm32-wasip1`, `aarch64-unknown-linux-musl`, `x86_64-pc-windows-msvc`, `x86_64-unknown-linux-gnu`. + +The release build uses Spectre mitigations (`/Qspectre /sdl`) and produces `preflate_rs_0_7.dll` and `preflate_util.exe`. + +## Architecture + +**preflate-rs** analyzes DEFLATE-compressed streams, extracts the uncompressed data plus a compact set of reconstruction parameters, and later recreates the exact original DEFLATE bitstream. This enables re-compression with modern algorithms (Zstd, Brotli) while preserving binary-exact round-trip fidelity. 
The key insight is detecting which compressor (zlib, libdeflate, zlib-ng, miniz, Windows zlib) produced a stream and storing only the differences from what that compressor would predict. + +### Workspace layout + +| Crate | Output | Role | +|---|---|---| +| `preflate/` | library | Core DEFLATE analysis and reconstruction | +| `container/` | library | Scans binary files (ZIP, PNG, JPEG) for DEFLATE streams | +| `util/` | `preflate_util.exe` | CLI for testing on files/directories | +| `dll/` | `preflate_rs_0_7.dll` | C FFI wrapper for .NET interop | +| `fuzz/` | fuzz harnesses | libfuzzer targets | +| `tests/` | integration tests | End-to-end round-trip tests using `samples/` | + +### preflate crate (core) + +The processing pipeline in `preflate/src/stream_processor.rs`: +1. **`deflate/`** — Reads a DEFLATE bitstream into tokens (literals and length/distance back-references) and writes tokens back to DEFLATE with custom Huffman trees. +2. **`estimator/`** — Estimates the compressor's parameters (`TokenPredictorParameters`): hash algorithm, `nice_length`, `max_chain`, window bits, add policy, matching type. +3. **`token_predictor.rs`** — Replays the compression using estimated parameters and hash chains to predict what tokens the original compressor would have produced. +4. **`tree_predictor.rs`** — Predicts Huffman tree structure. +5. **`statistical_codec.rs` / `cabac_codec.rs`** — Encodes the *differences* from prediction using CABAC (Context Adaptive Binary Arithmetic Coding, shared with Lepton JPEG). +6. **`stream_processor.rs`** — Public API: `PreflateStreamProcessor::decompress()` and `RecreateStreamProcessor::recreate()`. + +Parameters are serialized via `bitcode`; corrections via CABAC. The format is chunked to bound memory use. + +### container crate + +- **`scan_deflate.rs`** — Scans raw bytes to locate DEFLATE stream boundaries, identifying stream type (raw deflate, zlib-wrapped, PNG IDAT, ZIP, JPEG, etc.). 
+- **`idat_parse.rs`** — Extracts and reassembles PNG IDAT chunks. +- **`container_processor.rs`** — Orchestrates scanning → preflate → Zstd (compress) and Zstd → recreate → reassembly (decompress). Zstd encode/decode is handled inline using a single persistent encoder. +- **`utils.rs`** — `process_limited_buffer()` and test helpers. +- **`scoped_read.rs`** — Bounded reader adapter. + +The optional `webp` feature (enabled by default) allows PNG images to be stored as WebP instead of losslessly. PDF streams are not scanned (pdf_parse was removed). + +### Code constraints + +- **No unsafe code** — enforced via `#![forbid(unsafe_code)]` in each crate. +- Minimum Rust version: **1.85**, Edition **2024**. +- `.cargo/config.toml` sets Windows MSVC linker flags (`/DYNAMICBASE`, `/CETCOMPAT`, `/guard:cf`). diff --git a/Cargo.lock b/Cargo.lock index 269ac3b..900124c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -551,7 +551,7 @@ dependencies = [ [[package]] name = "preflate-container" -version = "0.7.5" +version = "0.7.6" dependencies = [ "adler32", "byteorder", @@ -567,7 +567,7 @@ dependencies = [ [[package]] name = "preflate-rs" -version = "0.7.5" +version = "0.7.6" dependencies = [ "bitcode", "byteorder", @@ -589,7 +589,7 @@ dependencies = [ [[package]] name = "preflate-rs-root" -version = "0.0.0" +version = "0.7.6" dependencies = [ "libdeflate-sys", "libz-ng-sys", @@ -601,7 +601,7 @@ dependencies = [ [[package]] name = "preflate_rs_0_7" -version = "0.7.5" +version = "0.7.6" dependencies = [ "preflate-container", "preflate-rs", @@ -609,7 +609,7 @@ dependencies = [ [[package]] name = "preflate_util" -version = "0.0.0" +version = "0.7.6" dependencies = [ "clap", "cpu-time", diff --git a/Cargo.toml b/Cargo.toml index 0196a28..8b4cacc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,11 @@ # root project only exists to refer to the packages -# and run the end-to-end tests in the tests directory +# and run the end-to-end tests in the tests directory [package] name = 
"preflate-rs-root" -version = "0.0.0" -edition = "2024" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +rust-version.workspace = true [profile.release] debug = true @@ -14,6 +14,14 @@ debug = true members = ["preflate", "container", "dll", "util", "fuzz"] resolver = "2" +[workspace.package] +version = "0.7.6" +edition = "2024" +authors = ["Kristof Roomp "] +license = "Apache-2.0" +rust-version = "1.85" +repository = "https://github.com/microsoft/preflate-rs" + [dev-dependencies] preflate-rs = { path = "preflate" } preflate-container = { path = "container" } diff --git a/container/CLAUDE.md b/container/CLAUDE.md new file mode 100644 index 0000000..a752e85 --- /dev/null +++ b/container/CLAUDE.md @@ -0,0 +1,203 @@ +# container (preflate-container) + +Scans binary files (ZIP, PNG, JPEG) for DEFLATE streams, orchestrates the +preflate + Zstd pipeline, and reassembles the output. Only format version 2 exists +(v1 was removed). + +## Public API (`lib.rs`) + +```rust +// Compress a file/buffer containing embedded DEFLATE streams +PreflateContainerProcessor::new(config: &PreflateContainerConfig, level: i32, test_baseline: bool) -> Self +impl ProcessBuffer for PreflateContainerProcessor { ... } + +// Decompress a preflate container back to the original file +RecreateContainerProcessor::new(capacity: usize) -> Self +impl ProcessBuffer for RecreateContainerProcessor { ... 
} + +// Core trait — both processors implement this +pub trait ProcessBuffer { + fn process_buffer(&mut self, input: &[u8], input_complete: bool, writer: &mut impl Write) -> Result<()>; + fn stats(&self) -> PreflateStats { PreflateStats::default() } // default no-op; overridden by Compress + fn copy_to_end(&mut self, input: &mut impl BufRead, output: &mut impl Write) -> Result<()>; + fn copy_to_end_size(&mut self, input: &mut impl BufRead, output: &mut impl Write, chunk: usize) -> Result<()>; +} + +// DLL helper: writes to a fixed output buffer, spills overflow into a VecDeque +fn process_limited_buffer( + process: &mut impl ProcessBuffer, + input: &[u8], + input_complete: bool, + output_buffer: &mut [u8], + output_extra: &mut VecDeque, +) -> Result<(bool, usize)>; // (all_output_drained, bytes_written_to_output_buffer) +``` + +`PreflateContainerConfig` holds knobs: `min_chunk_size`, `max_chunk_size`, +`total_plain_text_limit`, `chunk_plain_text_limit`, `validate_compression`, `max_chain_length`. + +## Wire Format (v2 only) + +### Outer framing (always raw / uncompressed) + +``` +[0x02] ← COMPRESSED_WRAPPER_VERSION_2 (1 byte, raw) + +Repeat for each block: + [type] ← block type byte (1 byte, raw) — see bit-field below + [varint(content_len)] ← byte count of what follows (1–5 bytes, raw) + [content_bytes × content_len] ← meaning depends on type (see below) +``` + +All framing bytes (`type`, `varint`) are written directly to the output stream — +they are **never** inside the Zstd encoder. 
+ +### Block type byte bit-field + +Each block type byte encodes two fields: + +``` +Bit 7-6 BLOCK_COMPRESSION_* 00 = none/raw 01 = Zstd 10-11 = reserved +Bit 5-0 BLOCK_TYPE_* block content kind (0–63) +``` + +Mask constants (defined in `container_processor.rs`): + +| Constant | Value | Meaning | +|---|---|---| +| `BLOCK_COMPRESSION_MASK` | `0xC0` | extracts bits 7–6 | +| `BLOCK_TYPE_MASK` | `0x3F` | extracts bits 5–0 | +| `BLOCK_COMPRESSION_NONE` | `0x00` | content is raw (not Zstd) | +| `BLOCK_COMPRESSION_ZSTD` | `0x40` | content is a Zstd flush segment | + +### Block content kinds and combined wire values + +| `BLOCK_TYPE_*` | Value | Combined wire byte | Description | +|---|---|---|---| +| `BLOCK_TYPE_LITERAL` | `0x00` | `0x40` | Raw input bytes with no detectable DEFLATE stream | +| `BLOCK_TYPE_DEFLATE` | `0x01` | `0x41` | A raw/zlib DEFLATE stream (start of a new stream) | +| `BLOCK_TYPE_PNG` | `0x02` | `0x42` | A PNG IDAT stream stored without WebP | +| `BLOCK_TYPE_DEFLATE_CONTINUE` | `0x03` | `0x43` | Continuation of a DEFLATE stream that spanned a chunk boundary | +| `BLOCK_TYPE_JPEG_LEPTON` | `0x04` | `0x04` | JPEG re-compressed with Lepton; bypasses Zstd entirely | +| `BLOCK_TYPE_WEBP` | `0x05` | `0x05` | PNG image stored as WebP lossless; bypasses Zstd entirely | + +### Zstd encoder/decoder lifecycle + +- A **single persistent `zstd::stream::write::Encoder`** is created once and shared across + all Zstd-compressed blocks (compression bits `0x40`). +- After writing each block's inner payload into the encoder, `encoder.flush()` is called, + which emits a Zstd `ZSTD_e_flush` segment. Those bytes are what get stored as + `content_bytes` in the outer framing. +- Each flush segment is decodable in sequence: the decoder is a persistent + `zstd::stream::raw::Decoder` that maintains cross-block history, so compression + quality benefits from all previously seen blocks. +- The stream is terminated by EOF — there is no explicit end-of-stream block. 
+ +### Inner payload layout (inside Zstd, after decompression) + +**`BLOCK_TYPE_LITERAL` (wire `0x40`)** +``` +varint(data_len) +data[data_len] ← verbatim bytes from the original input +``` + +**`BLOCK_TYPE_DEFLATE` (wire `0x41`) and `BLOCK_TYPE_DEFLATE_CONTINUE` (wire `0x43`)** +``` +varint(corrections_len) +varint(plaintext_len) +corrections[corrections_len] ← CABAC-encoded differences from predicted tokens +plaintext[plaintext_len] ← uncompressed data +``` +`BLOCK_TYPE_DEFLATE_CONTINUE` has the same layout; the decoder reuses the +`RecreateStreamProcessor` state from the preceding `BLOCK_TYPE_DEFLATE` block. + +**`BLOCK_TYPE_PNG` (wire `0x42`) — non-WebP path** +``` +varint(corrections_len) +varint(plaintext_len) +IdatContents metadata: + varint(chunk_size_1) … varint(chunk_size_N) varint(0) ← IDAT chunk size list (0-terminated) + zlib_header[2] + addler32[4] + 0xFF ← sentinel: no png_header present +corrections[corrections_len] +plaintext[plaintext_len] ← raw unfiltered pixel data +``` + +### Raw block payload layout (outside Zstd) + +**`BLOCK_TYPE_JPEG_LEPTON` (wire `0x04`)** +``` +lepton_bytes[content_len] ← Lepton-compressed JPEG; decoded by lepton_jpeg::decode_lepton() +``` + +**`BLOCK_TYPE_WEBP` (wire `0x05`)** +``` +varint(corrections_len) +varint(webp_data_len) +IdatContents metadata: + varint(chunk_size_1) … varint(chunk_size_N) varint(0) + zlib_header[2] + addler32[4] + color_type[1] ← PngColorType (RGB=2, RGBA=6) + varint(width) + varint(height) +filters[height] ← PNG row filter bytes (one per row) +corrections[corrections_len] +webp_data[webp_data_len] ← WebP lossless encoded pixel data +``` +On decode, the WebP bytes are decompressed back to pixels, PNG filters are re-applied, +and the result is re-deflated using the corrections to recreate the original IDAT stream. + +## Idempotent Finalization (important bug history) + +`process_buffer` may be called with `input_complete=true` multiple times (DLL pattern). 
+The finalization block must guard against double-finalization: + +```rust +if input_complete && !self.input_complete { // NOT just `if input_complete` + self.input_complete = true; + // ... encoder.take().unwrap() +} +``` + +## Module Layout + +``` +src/ + lib.rs ← public types and re-exports + container_processor.rs ← PreflateContainerProcessor, RecreateContainerProcessor, + ProcessBuffer trait, MeasureWriteSink, + block-type constants, emit_compressed_block(), + write_chunk_block_v2(), write_varint(), read_varint() + scan_deflate.rs ← locates DEFLATE stream boundaries in raw bytes + identifies: raw deflate, zlib-wrapped, PNG IDAT, ZIP, JPEG + idat_parse.rs ← extracts / reassembles PNG IDAT chunks; parses IHDR + scoped_read.rs ← bounded reader adapter + utils.rs ← process_limited_buffer(), TakeReader, test helpers +``` + +## Key Internal Types + +| Type | Purpose | +|---|---| +| `MeasureWriteSink` | `pub(crate)` sink that counts bytes; used for baseline Zstd measurement | +| `PreflateStats` | pub struct: `deflate_compressed_size`, `zstd_compressed_size`, `uncompressed_size`, `overhead_bytes`, `hash_algorithm`, `zstd_baseline_size` | +| `TakeReader` | `pub` BufRead wrapper that reads at most N bytes (used in utils.rs) | + +## Features + +- `webp` (default-enabled) — allows PNG images to be stored as WebP instead of lossless PNG, + using the `webp` crate. + +## Dependencies of Note + +- `lepton_jpeg` (0.5.1) — JPEG blocks are recompressed with Lepton, bypassing Zstd entirely. +- `zstd` (0.13) — single persistent encoder across all non-JPEG/WebP blocks. +- `preflate-rs` — core analysis/reconstruction (path dependency). +- `webp` (0.3, optional, default-enabled) — PNG images can be stored as WebP lossless. + +## Constraints + +- `#![forbid(unsafe_code)]` enforced. +- `main.rs` exists but is a stub; this crate is a library. 
diff --git a/container/Cargo.toml b/container/Cargo.toml index ef285df..11826a7 100644 --- a/container/Cargo.toml +++ b/container/Cargo.toml @@ -1,15 +1,15 @@ [package] name = "preflate-container" -version = "0.7.5" -edition = "2024" -authors = ["Kristof Roomp "] -license = "Apache-2.0" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +rust-version.workspace = true +repository.workspace = true description = """ Scans binary files for zStd streams and uses Preflate-rs to decompress the stream and repack with -zStd compression. For PNG files, we use WEBP compression for RGB and RGBA to get better results. +zStd compression. For PNG files, we use WEBP compression for RGB and RGBA to get better results. """ -repository = "https://github.com/microsoft/preflate-rs" categories = ["compression"] keywords = ["gzip", "deflate", "zlib", "zip"] diff --git a/container/src/container_common.rs b/container/src/container_common.rs new file mode 100644 index 0000000..b41e771 --- /dev/null +++ b/container/src/container_common.rs @@ -0,0 +1,1085 @@ +use std::io::{BufRead, Read, Write}; + +use preflate_rs::{AddContext, HashAlgorithm, PreflateConfig, Result}; + +/// Configuration for the deflate process +#[derive(Debug, Clone)] +pub struct PreflateContainerConfig { + /// As we scan for deflate streams, we need to have a minimum memory + /// chunk to process. We scan this chunk for deflate streams and at least + /// deflate one block has to fit into a chunk for us to recognize it. + pub min_chunk_size: usize, + + /// The maximum size of a deflate or PNG compressed block we will consider. If + /// a deflate stream is larger than this, we will not decompress it and + /// just write it out as a literal block. + pub max_chunk_size: usize, + + /// The maximum overall size of plain text that we will compress. 
This is + /// global to the entire container and limits the amount of processing that + /// we will do to avoid running out of CPU time on a single file. Once we + /// hit this limit, we will stop looking for deflate streams and just write + /// out the rest of the data as literal blocks. + pub total_plain_text_limit: u64, + + /// The maximum size of a plain text chunk that we will decompress at a time. This limits + /// the memory usage of the decompression process. + pub chunk_plain_text_limit: usize, + + /// true if we should verify that the decompressed data can be recompressed to the same bytes. + /// This is important since there may be corner cases where the data may not yield the same bytes. + /// + /// If this is false, we will not verify the decompressed data and just write it out as is and it is + /// up to the caller to make sure the data is valid. In no case should you just assume that you + /// can get the same data back without verifying it. + pub validate_compression: bool, + + /// Maximum number of lookups we will do in the hash chain. This will limit the CPU time we spend + /// on deflate stream processing but also means that we won't be able to recompress deflate streams + /// that were compressed with a larger chain length (eg level 9 has 4096). 
+ pub max_chain_length: u32, +} + +impl Default for PreflateContainerConfig { + fn default() -> Self { + PreflateContainerConfig { + min_chunk_size: 1024 * 1024, + max_chunk_size: 64 * 1024 * 1024, + total_plain_text_limit: 512 * 1024 * 1024, + chunk_plain_text_limit: 128 * 1024 * 1024, + max_chain_length: 4096, + validate_compression: true, + } + } +} + +impl PreflateContainerConfig { + pub fn preflate_config(&self) -> PreflateConfig { + PreflateConfig { + max_chain_length: self.max_chain_length, + plain_text_limit: self.chunk_plain_text_limit, + verify_compression: self.validate_compression, + } + } +} + +pub(crate) const COMPRESSED_WRAPPER_VERSION_2: u8 = 2; + +// Bit-field masks for the block type byte +// Bits 7-6: compression algorithm Bits 5-0: block content kind +pub(crate) const BLOCK_COMPRESSION_MASK: u8 = 0xC0; +pub(crate) const BLOCK_TYPE_MASK: u8 = 0x3F; + +// Compression algorithms (top 2 bits) +pub(crate) const BLOCK_COMPRESSION_NONE: u8 = 0x00; +pub(crate) const BLOCK_COMPRESSION_ZSTD: u8 = 0x40; + +// Block content kinds (bottom 6 bits) +pub(crate) const BLOCK_TYPE_LITERAL: u8 = 0x00; +pub(crate) const BLOCK_TYPE_DEFLATE: u8 = 0x01; +pub(crate) const BLOCK_TYPE_PNG: u8 = 0x02; +pub(crate) const BLOCK_TYPE_DEFLATE_CONTINUE: u8 = 0x03; +pub(crate) const BLOCK_TYPE_JPEG_LEPTON: u8 = 0x04; +pub(crate) const BLOCK_TYPE_WEBP: u8 = 0x05; + +pub(crate) fn write_varint(destination: &mut impl Write, value: u32) -> std::io::Result<()> { + let mut value = value; + loop { + let mut byte = (value & 0x7F) as u8; + value >>= 7; + if value != 0 { + byte |= 0x80; + } + destination.write_all(&[byte])?; + if value == 0 { + break; + } + } + + Ok(()) +} + +pub(crate) fn read_varint(source: &mut impl Read) -> std::io::Result { + let mut result = 0; + let mut shift = 0; + loop { + let mut byte = [0u8; 1]; + source.read_exact(&mut byte)?; + let byte = byte[0]; + result |= ((byte & 0x7F) as u32) << shift; + shift += 7; + if byte & 0x80 == 0 { + break; + } + } + Ok(result) 
+} + +#[test] +fn test_variant_roundtrip() { + let values = [ + 0, 1, 127, 128, 255, 256, 16383, 16384, 2097151, 2097152, 268435455, 268435456, 4294967295, + ]; + + let mut buffer = Vec::new(); + for &v in values.iter() { + write_varint(&mut buffer, v).unwrap(); + } + + let mut buffer = &buffer[..]; + + for &v in values.iter() { + assert_eq!(v, read_varint(&mut buffer).unwrap()); + } +} + +/// Statistics about the preflate process +#[derive(Debug, Copy, Clone, Default)] +pub struct PreflateStats { + pub deflate_compressed_size: u64, + pub zstd_compressed_size: u64, + pub uncompressed_size: u64, + pub overhead_bytes: u64, + pub hash_algorithm: HashAlgorithm, + pub zstd_baseline_size: u64, +} + +/// Processes an input buffer and writes the output to a writer +pub trait ProcessBuffer { + fn process_buffer( + &mut self, + input: &[u8], + input_complete: bool, + writer: &mut impl Write, + ) -> Result<()>; + + #[cfg(test)] + fn process_vec(&mut self, input: &[u8]) -> Result> { + let mut writer = Vec::new(); + + self.copy_to_end(&mut std::io::Cursor::new(&input), &mut writer) + .context()?; + + Ok(writer) + } + + #[cfg(test)] + fn process_vec_size(&mut self, input: &[u8], read_chunk_size: usize) -> Result> { + let mut writer = Vec::new(); + + self.copy_to_end_size( + &mut std::io::Cursor::new(&input), + &mut writer, + read_chunk_size, + ) + .context()?; + + Ok(writer) + } + + /// Reads everything from input and writes it to the output. + /// Wraps calls to process buffer + fn copy_to_end(&mut self, input: &mut impl BufRead, output: &mut impl Write) -> Result<()> { + self.copy_to_end_size(input, output, 1024 * 1024) + } + + /// Reads everything from input and writes it to the output. 
+ /// Wraps calls to process buffer + fn copy_to_end_size( + &mut self, + input: &mut impl BufRead, + output: &mut impl Write, + read_chunk_size: usize, + ) -> Result<()> { + let mut input_complete = false; + loop { + let buffer: &[u8]; + if input_complete { + buffer = &[]; + } else { + buffer = input.fill_buf().context()?; + if buffer.len() == 0 { + input_complete = true + } + }; + + if input_complete { + self.process_buffer(&[], true, output).context()?; + break; + } else { + // process buffer a piece at a time to avoid overflowing memory + let mut amount_read = 0; + while amount_read < buffer.len() { + let chunk_size = (buffer.len() - amount_read).min(read_chunk_size); + + self.process_buffer( + &buffer[amount_read..amount_read + chunk_size], + false, + output, + ) + .context()?; + + amount_read += chunk_size; + } + + let buflen = buffer.len(); + input.consume(buflen); + } + } + + Ok(()) + } + + fn stats(&self) -> PreflateStats { + PreflateStats::default() + } +} + +#[cfg(test)] +pub(crate) mod test { + use std::io::Write; + + use preflate_rs::{AddContext, Result}; + + use crate::container_common::{ + BLOCK_COMPRESSION_MASK, BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_DEFLATE, + BLOCK_TYPE_DEFLATE_CONTINUE, BLOCK_TYPE_JPEG_LEPTON, BLOCK_TYPE_LITERAL, BLOCK_TYPE_MASK, + BLOCK_TYPE_PNG, BLOCK_TYPE_WEBP, COMPRESSED_WRAPPER_VERSION_2, PreflateContainerConfig, + ProcessBuffer, read_varint, + }; + use crate::container_read::RecreateContainerProcessor; + use crate::container_write::PreflateContainerProcessor; + + pub struct NopProcessBuffer {} + + impl ProcessBuffer for NopProcessBuffer { + fn process_buffer( + &mut self, + input: &[u8], + _input_complete: bool, + writer: &mut impl Write, + ) -> Result<()> { + writer.write_all(input).context()?; + + Ok(()) + } + } + + fn roundtrip_deflate_chunks(filename: &str) { + use crate::utils::assert_eq_array; + + let f = crate::utils::read_file(filename); + + println!("Processing file: {}", filename); + + let mut 
expanded = Vec::new(); + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + ctx.copy_to_end(&mut std::io::Cursor::new(&f), &mut expanded) + .unwrap(); + + println!("Recreating file: {}", filename); + + let mut destination = Vec::new(); + let mut ctx = RecreateContainerProcessor::new(usize::MAX); + ctx.copy_to_end(&mut std::io::Cursor::new(expanded), &mut destination) + .unwrap(); + + assert_eq_array(&destination, &f); + } + + #[test] + fn roundtrip_skip_length_crash() { + roundtrip_deflate_chunks("skiplengthcrash.bin"); + } + + #[test] + fn roundtrip_png_chunks() { + roundtrip_deflate_chunks("treegdi.png"); + } + + #[test] + fn roundtrip_zip_chunks() { + roundtrip_deflate_chunks("samplezip.zip"); + } + + #[test] + fn roundtrip_gz_chunks() { + roundtrip_deflate_chunks("sample1.bin.gz"); + } + + #[test] + fn roundtrip_png_chunks2() { + roundtrip_deflate_chunks("starcontrol.samplesave"); + } + + #[test] + fn roundtrip_small_chunk() { + use crate::utils::{assert_eq_array, read_file}; + + let original = read_file("pptxplaintext.zip"); + + let mut context = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 100000, + max_chunk_size: 100000, + total_plain_text_limit: u64::MAX, + ..Default::default() + }, + 1, + false, + ); + + let compressed = context.process_vec_size(&original, 20001).unwrap(); + + let mut context = RecreateContainerProcessor::new(usize::MAX); + let recreated = context.process_vec_size(&compressed, 20001).unwrap(); + + assert_eq_array(&original, &recreated); + } + + #[test] + fn roundtrip_small_plain_text() { + use crate::utils::{assert_eq_array, read_file}; + + let original = read_file("pptxplaintext.zip"); + + let mut context = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 100000, + max_chunk_size: 100000, + total_plain_text_limit: u64::MAX, + ..Default::default() + }, + 1, + false, + ); + + let compressed = context.process_vec_size(&original, 
2001).unwrap(); + + let mut context = RecreateContainerProcessor::new(usize::MAX); + let recreated = context.process_vec_size(&compressed, 2001).unwrap(); + + assert_eq_array(&original, &recreated); + } + + #[test] + fn roundtrip_zstd_per_block() { + use crate::utils::{assert_eq_array, read_file}; + + let original = read_file("samplezip.zip"); + + let mut context = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + + let compressed = context.process_vec(&original).unwrap(); + + let mut context = RecreateContainerProcessor::new(usize::MAX); + let recreated = context.process_vec(&compressed).unwrap(); + + assert_eq_array(&original, &recreated); + } + + // ── Block type bit-field tests ─────────────────────────────────────────────── + + /// Parse the outer framing of a v2 container and return each block's + /// (compression_bits, block_type_bits) in order, stopping at the 0xFF CRC end block. + fn parse_wire_block_types(data: &[u8]) -> Vec<(u8, u8)> { + use byteorder::ReadBytesExt; + let mut cursor = std::io::Cursor::new(data); + let version = cursor.read_u8().unwrap(); + assert_eq!(version, COMPRESSED_WRAPPER_VERSION_2); + let mut blocks = Vec::new(); + while (cursor.position() as usize) < data.len() { + let type_byte = cursor.read_u8().unwrap(); + if type_byte == 0xFF { + break; // CRC end block; 4 raw bytes follow but we stop here + } + let compression = type_byte & BLOCK_COMPRESSION_MASK; + let block_type = type_byte & BLOCK_TYPE_MASK; + blocks.push((compression, block_type)); + let size = read_varint(&mut cursor).unwrap() as u64; + cursor.set_position(cursor.position() + size); + } + blocks + } + + /// Feed `stream` to the decoder with input_complete=true and assert the + /// error exit code matches `expected`. 
+ fn assert_decoder_fails(stream: &[u8], expected: preflate_rs::ExitCode) { + let mut ctx = RecreateContainerProcessor::new(usize::MAX); + let mut out = Vec::new(); + let err = ctx + .process_buffer(stream, true, &mut out) + .expect_err("expected an error, but decoder returned Ok"); + assert_eq!( + err.exit_code(), + expected, + "wrong exit code for stream {stream:02X?}" + ); + } + + /// The two masks must partition the byte: non-overlapping and together covering all 8 bits. + /// Every content-kind constant must sit entirely within BLOCK_TYPE_MASK, and every + /// compression constant within BLOCK_COMPRESSION_MASK. + #[test] + fn test_bit_field_masks_partition_byte() { + assert_eq!( + BLOCK_COMPRESSION_MASK | BLOCK_TYPE_MASK, + 0xFF, + "masks do not cover all bits" + ); + assert_eq!( + BLOCK_COMPRESSION_MASK & BLOCK_TYPE_MASK, + 0x00, + "masks overlap" + ); + for kind in [ + BLOCK_TYPE_LITERAL, + BLOCK_TYPE_DEFLATE, + BLOCK_TYPE_PNG, + BLOCK_TYPE_DEFLATE_CONTINUE, + BLOCK_TYPE_JPEG_LEPTON, + BLOCK_TYPE_WEBP, + ] { + assert_eq!( + kind & BLOCK_COMPRESSION_MASK, + 0, + "BLOCK_TYPE 0x{kind:02X} bleeds into compression bits" + ); + } + for comp in [BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD] { + assert_eq!( + comp & BLOCK_TYPE_MASK, + 0, + "BLOCK_COMPRESSION 0x{comp:02X} bleeds into type bits" + ); + } + } + + /// The combined (compression | kind) wire bytes must match the expected values + /// documented in CLAUDE.md. This catches accidental constant drift. 
+ #[test] + fn test_combined_wire_values() { + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, 0x40); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE, 0x41); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_PNG, 0x42); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE_CONTINUE, 0x43); + assert_eq!(BLOCK_COMPRESSION_NONE | BLOCK_TYPE_JPEG_LEPTON, 0x04); + assert_eq!(BLOCK_COMPRESSION_NONE | BLOCK_TYPE_WEBP, 0x05); + } + + /// Reserved compression bits 0x80 (10xx_xxxx) must be rejected by the decoder. + #[test] + fn test_decoder_rejects_reserved_compression_bits_10() { + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, 0x80], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// Reserved compression bits 0xC0 (11xx_xxxx) must be rejected by the decoder. + #[test] + fn test_decoder_rejects_reserved_compression_bits_11() { + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, 0xC0], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// BLOCK_COMPRESSION_NONE | BLOCK_TYPE_LITERAL (0x00) must be rejected: + /// literal blocks are Zstd-only; there is no raw literal block type. + #[test] + fn test_decoder_rejects_raw_literal_block_type() { + let byte = BLOCK_COMPRESSION_NONE | BLOCK_TYPE_LITERAL; // == 0x00 + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, byte], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// Any BLOCK_COMPRESSION_NONE byte that is not JPEG_LEPTON or WEBP must be rejected. + #[test] + fn test_decoder_rejects_undefined_raw_block_types() { + // 0x10 is arbitrary: not 0x04 (JPEG) or 0x05 (WEBP) + let byte = BLOCK_COMPRESSION_NONE | 0x10; + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, byte], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// Compressing plain bytes (no embedded DEFLATE streams) must produce a stream + /// whose first block carries BLOCK_COMPRESSION_ZSTD and BLOCK_TYPE_LITERAL. 
+ #[test] + fn test_encoder_literal_block_carries_zstd_compression_bit() { + let input = vec![0xABu8; 512]; + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = ctx.process_vec(&input).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + assert!( + !blocks.is_empty(), + "expected at least one block in the output" + ); + assert_eq!( + blocks[0], + (BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_LITERAL), + "first block should be a Zstd literal block" + ); + } + + /// Every block type byte in a real compressed output must have compression bits + /// of either BLOCK_COMPRESSION_NONE or BLOCK_COMPRESSION_ZSTD — never the + /// reserved patterns 0x80 or 0xC0. + #[test] + fn test_encoder_never_emits_reserved_compression_bits() { + let input = crate::utils::read_file("samplezip.zip"); + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = ctx.process_vec(&input).unwrap(); + + for &(compression, _) in &parse_wire_block_types(&compressed) { + assert!( + compression == BLOCK_COMPRESSION_NONE || compression == BLOCK_COMPRESSION_ZSTD, + "found reserved compression bits 0x{compression:02X} in output" + ); + } + } + + /// Verify that the decoder extracts the lower 6 bits as block_type rather + /// than passing the full byte to process_compressed_block. If it passed the + /// full byte (0x41) instead of the kind bits (0x01), the match would fall + /// through to the error arm and the round-trip would fail. + #[test] + fn test_decoder_strips_compression_bits_before_dispatch() { + use crate::utils::{assert_eq_array, read_file}; + // A zip file exercises DEFLATE blocks (wire type 0x41 = ZSTD|DEFLATE). + // A successful round-trip proves the decoder is matching on 0x01, not 0x41. 
+ let original = read_file("samplezip.zip"); + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + // Confirm the stream actually contains DEFLATE blocks (type 0x41), + // so the test is meaningful and not trivially passing. + let has_deflate = parse_wire_block_types(&compressed) + .iter() + .any(|&(c, t)| c == BLOCK_COMPRESSION_ZSTD && t == BLOCK_TYPE_DEFLATE); + assert!( + has_deflate, + "test file produced no DEFLATE blocks — test is vacuous" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// A PNG file must produce at least one PNG or WebP IDAT block (not merely a DEFLATE + /// block), and must round-trip to the original bytes. The PNG code path in the encoder + /// is distinct from the plain DEFLATE path: it reconstructs IDAT framing and, when the + /// `webp` feature is enabled, may store pixels as WebP lossless instead of raw. 
+ #[test] + fn test_png_produces_idat_block_and_roundtrips() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("treegdi.png"); + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + let has_png_block = blocks + .iter() + .any(|&(_, t)| t == BLOCK_TYPE_PNG || t == BLOCK_TYPE_WEBP); + assert!( + has_png_block, + "PNG input should produce at least one PNG (0x02) or WebP (0x05) block, \ + got: {blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// A PDF containing embedded JPEG images must produce JPEG_LEPTON blocks (raw, + /// outside Zstd) as well as DEFLATE blocks for the PDF's own compressed object + /// streams. Both must survive a full round-trip. + #[test] + fn test_pdf_with_jpegs_produces_lepton_and_deflate_blocks_and_roundtrips() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("embedded-images.pdf"); + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + + let has_lepton = blocks + .iter() + .any(|&(c, t)| c == BLOCK_COMPRESSION_NONE && t == BLOCK_TYPE_JPEG_LEPTON); + assert!( + has_lepton, + "PDF with embedded JPEGs should produce at least one JPEG_LEPTON block" + ); + + let has_deflate = blocks.iter().any(|&(_, t)| t == BLOCK_TYPE_DEFLATE); + assert!( + has_deflate, + "PDF with embedded JPEGs should also produce DEFLATE blocks for compressed objects" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// DEFLATE_CONTINUE blocks are produced 
 when the compressed-data buffer is
+ /// truncated mid-stream: `DeflateParser::parse` reads to EOF and returns
+ /// `Ok` with `is_done()=false`, the encoder emits a DEFLATE block for the
+ /// plaintext decoded so far, saves the mid-stream state, and resumes on
+ /// subsequent calls via DEFLATE_CONTINUE blocks.
+ ///
+ /// `sample1.bin.gz` is a single gzip stream with ~418 KiB of uncompressed
+ /// content. Feeding it in partial slices (with `min_chunk_size = 0` so the
+ /// processor starts immediately) means the scanner first sees only a
+ /// truncated window of the compressed stream, forcing DEFLATE_CONTINUE
+ /// blocks that must all round-trip correctly.
+ #[test]
+ fn test_deflate_continue_blocks_appear_and_roundtrip() {
+ use crate::utils::{assert_eq_array, read_file};
+ let original = read_file("sample1.bin.gz");
+ // min_chunk_size: 0 so the loop processes data immediately after Start,
+ // letting Searching run with the first truncated chunk rather than waiting
+ // for an additional min_chunk_size bytes before beginning.
+ let mut enc = PreflateContainerProcessor::new(
+ &PreflateContainerConfig {
+ min_chunk_size: 0,
+ ..PreflateContainerConfig::default()
+ },
+ 1,
+ false,
+ );
+ // Feed the 263 KiB file in two pieces. The first piece (200 KiB) truncates
+ // the DEFLATE stream mid-way; decompress() hits EOF with at least one
+ // complete block already parsed, so it returns Ok(partial) / is_done()=false,
+ // causing the encoder to emit a DEFLATE block and enter DeflateContinue.
+ // The second piece completes the stream → DEFLATE_CONTINUE block. 
+ let mut compressed = Vec::new(); + { + let chunk1 = &original[..200_000.min(original.len())]; + enc.process_buffer(chunk1, false, &mut compressed).unwrap(); + if original.len() > 200_000 { + let chunk2 = &original[200_000..]; + enc.process_buffer(chunk2, false, &mut compressed).unwrap(); + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + } + + let blocks = parse_wire_block_types(&compressed); + let n_continue = blocks + .iter() + .filter(|&&(_, t)| t == BLOCK_TYPE_DEFLATE_CONTINUE) + .count(); + assert!( + n_continue > 0, + "200 KiB chunks on a ~263 KiB gzip should force at least one DEFLATE_CONTINUE block; \ + blocks seen: {blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec_size(&compressed, 10_000).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// When `total_plain_text_limit` is exceeded the encoder stops analysing + /// deflate streams and writes the remaining bytes as LITERAL blocks. The + /// decoder must still reproduce the original bytes exactly, including the + /// unprocessed portion. + #[test] + fn test_total_plain_text_limit_forces_literal_fallback_and_roundtrips() { + use crate::utils::{assert_eq_array, read_file}; + // samplezip.zip has several DEFLATE entries; setting the limit to 1 byte + // ensures that after the first DEFLATE entry's plaintext is accumulated, + // every subsequent scan sees total_plain_text_seen > limit and falls back + // to writing remaining content as a single LITERAL block. + let original = read_file("samplezip.zip"); + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + total_plain_text_limit: 1, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let compressed = enc.process_vec(&original).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + + // At least one LITERAL block must appear (the fallback content). 
+ let has_literal = blocks.iter().any(|&(_, t)| t == BLOCK_TYPE_LITERAL);
+ assert!(
+ has_literal,
+ "after total_plain_text_limit is exceeded, remaining content must be LITERAL"
+ );
+
+ // The stream must still decode back to the original bytes.
+ let mut dec = RecreateContainerProcessor::new(usize::MAX);
+ let recreated = dec.process_vec(&compressed).unwrap();
+ assert_eq_array(&original, &recreated);
+ }
+
+ // ── Multi-scheme fixture tests ───────────────────────────────────────────────
+
+ /// Helper: compress `data` in one shot and return `(compressed, blocks)`.
+ fn compress_default(data: &[u8]) -> (Vec<u8>, Vec<(u8, u8)>) {
+ let mut enc =
+ PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false);
+ let compressed = enc.process_vec(data).unwrap();
+ let blocks = parse_wire_block_types(&compressed);
+ (compressed, blocks)
+ }
+
+ /// Helper: full roundtrip assertion — compress then decompress, check byte equality.
+ fn assert_roundtrip(original: &[u8]) {
+ let (compressed, _) = compress_default(original);
+ let mut dec = RecreateContainerProcessor::new(usize::MAX);
+ let recreated = dec.process_vec(&compressed).unwrap();
+ crate::utils::assert_eq_array(original, &recreated);
+ }
+
+ /// Count how many blocks have a given block-type kind.
+ fn count_block_type(blocks: &[(u8, u8)], kind: u8) -> usize {
+ blocks.iter().filter(|&&(_, t)| t == kind).count()
+ }
+
+ /// Two concatenated gzip streams — each contains plaintext well above
+ /// MIN_BLOCKSIZE=1024, so the scanner must emit exactly two DEFLATE blocks. 
+ ///
+ /// Fixture: `test_two_gzip_streams.bin`
+ /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS
+ #[test]
+ fn test_two_gzip_streams_produce_two_deflate_blocks_and_roundtrip() {
+ use crate::utils::read_file;
+ let original = read_file("test_two_gzip_streams.bin");
+ let (compressed, blocks) = compress_default(&original);
+
+ assert_eq!(
+ count_block_type(&blocks, BLOCK_TYPE_DEFLATE),
+ 2,
+ "two consecutive gzip streams should each produce one DEFLATE block; blocks={blocks:?}"
+ );
+
+ let mut dec = RecreateContainerProcessor::new(usize::MAX);
+ let recreated = dec.process_vec(&compressed).unwrap();
+ crate::utils::assert_eq_array(&original, &recreated);
+ }
+
+ /// A gzip stream whose plaintext is below MIN_BLOCKSIZE (500 < 1024) must NOT
+ /// be promoted to a DEFLATE block — the whole file becomes a single literal chunk.
+ ///
+ /// Fixture: `test_tiny_gzip.bin`
+ /// Expected wire sequence: literal, EOS (no DEFLATE blocks)
+ #[test]
+ fn test_tiny_gzip_below_min_blocksize_becomes_literal_and_roundtrip() {
+ use crate::utils::read_file;
+ let original = read_file("test_tiny_gzip.bin");
+ let (compressed, blocks) = compress_default(&original);
+
+ assert_eq!(
+ count_block_type(&blocks, BLOCK_TYPE_DEFLATE),
+ 0,
+ "gzip with 500-byte plaintext (< MIN_BLOCKSIZE) must not produce a DEFLATE block; \
+ blocks={blocks:?}"
+ );
+
+ let mut dec = RecreateContainerProcessor::new(usize::MAX);
+ let recreated = dec.process_vec(&compressed).unwrap();
+ crate::utils::assert_eq_array(&original, &recreated);
+ }
+
+ /// A large gzip stream (plaintext > MIN_BLOCKSIZE) immediately followed by
+ /// a tiny gzip (plaintext < MIN_BLOCKSIZE). Only the large stream must become a
+ /// DEFLATE block; the small one stays literal. 
+ /// + /// Fixture: `test_big_then_small_gzip.bin` + /// Expected wire sequence: literal, deflate, literal, EOS (exactly 1 DEFLATE block) + #[test] + fn test_big_gzip_deflate_small_gzip_literal_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_big_then_small_gzip.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 1, + "only the large gzip stream should become a DEFLATE block; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A file with a valid gzip header but a deliberately corrupted DEFLATE body + /// (0xFF leading byte) must not crash. The scanner must gracefully abandon the + /// stream and encode the entire file as a literal block. + /// + /// Fixture: `test_corrupted_deflate.bin` + /// Expected wire sequence: literal, EOS (0 DEFLATE blocks) + #[test] + fn test_corrupted_deflate_body_falls_back_to_literal_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_corrupted_deflate.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 0, + "corrupted DEFLATE body must not produce a DEFLATE block; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A file containing padding bytes, then two zlib streams (each with plaintext + /// > MIN_BLOCKSIZE), then more padding. The scanner must find both zlib headers + /// and emit exactly two DEFLATE blocks. 
+ /// + /// Fixture: `test_two_zlib_streams.bin` + /// layout: 100 × `\xDE\xAD` | zlib(EEEE×6000) | 100 × `\xDE\xAD` | zlib(FFFF×6000) | 100 × `\xDE\xAD` + /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_two_zlib_streams_produce_two_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_two_zlib_streams.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 2, + "two zlib streams surrounded by literal bytes should each produce a DEFLATE block; \ + blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A ZIP file containing three DEFLATE-compressed entries must produce exactly three + /// DEFLATE blocks — one per entry — and round-trip correctly. + /// + /// Fixture: `test_zip_3entries.zip` (entries G×20000, H×20000, I×20000 bytes) + /// Expected wire sequence: literal, deflate, literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_zip_three_deflated_entries_produce_three_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_zip_3entries.zip"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 3, + "ZIP with 3 DEFLATED entries should produce 3 DEFLATE blocks; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A ZIP file with a STORED entry (method=0) followed by a DEFLATED entry (method=8). + /// `parse_zip_stream` returns `Err` for STORED entries so they become literal blocks; + /// only the DEFLATED entry is analysed and emitted as a DEFLATE block. 
+ /// + /// Fixture: `test_zip_stored_then_deflated.zip` (J×8000 STORED, K×20000 DEFLATED) + /// Expected wire sequence: literal, deflate, literal, EOS (exactly 1 DEFLATE block) + #[test] + fn test_zip_stored_entry_stays_literal_deflated_entry_becomes_deflate_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_zip_stored_then_deflated.zip"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 1, + "only the DEFLATED entry should become a DEFLATE block; STORED stays literal; \ + blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A buffer filled with pseudo-random bytes contains no recognisable DEFLATE/zlib/gzip + /// signatures. The entire file must be emitted as a single literal block with no + /// DEFLATE analysis. + /// + /// Fixture: `test_random_bytes.bin` (32 KiB pseudo-random) + /// Expected wire sequence: literal, EOS + #[test] + fn test_random_bytes_produce_no_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_random_bytes.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 0, + "random bytes contain no DEFLATE streams; blocks={blocks:?}" + ); + + // The literal block must survive the round-trip. + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// Two gzip streams separated by a 1000-byte null gap. Both streams have + /// plaintext > MIN_BLOCKSIZE, so both must produce DEFLATE blocks, and the gap + /// must appear as a literal block between them. 
+ /// + /// Fixture: `test_gzip_with_gap.bin` + /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_two_gzip_streams_with_null_gap_produce_two_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_gzip_with_gap.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 2, + "both gzip streams should become DEFLATE blocks; null gap stays literal; \ + blocks={blocks:?}" + ); + // There should be at least one literal block (the gap between the two streams). + assert!( + count_block_type(&blocks, BLOCK_TYPE_LITERAL) >= 1, + "null gap between gzip streams should produce at least one literal block; \ + blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// Feed a fixture containing two gzip streams in very small chunks (64 bytes at a + /// time) via the incremental `process_buffer` API to exercise boundary handling. + /// The round-trip result must be byte-exact regardless of where chunk boundaries fall. 
+ #[test] + fn test_two_gzip_streams_incremental_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_two_gzip_streams.bin"); + + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let mut compressed = Vec::new(); + let chunk_size = 64; + let mut pos = 0; + while pos < original.len() { + let end = (pos + chunk_size).min(original.len()); + enc.process_buffer(&original[pos..end], false, &mut compressed) + .unwrap(); + pos = end; + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// Feed `test_two_zlib_streams.bin` in small chunks (128 bytes) to confirm that + /// the incremental path handles mixed literal padding + zlib streams correctly. + #[test] + fn test_two_zlib_streams_incremental_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_two_zlib_streams.bin"); + + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let mut compressed = Vec::new(); + let chunk_size = 128; + let mut pos = 0; + while pos < original.len() { + let end = (pos + chunk_size).min(original.len()); + enc.process_buffer(&original[pos..end], false, &mut compressed) + .unwrap(); + pos = end; + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// Feed a ZIP fixture in small chunks (256 bytes) to check that chunk boundaries + /// inside the ZIP local-file headers and DEFLATE bodies are handled gracefully. 
+ #[test] + fn test_zip_three_entries_incremental_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_zip_3entries.zip"); + + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let mut compressed = Vec::new(); + let chunk_size = 256; + let mut pos = 0; + while pos < original.len() { + let end = (pos + chunk_size).min(original.len()); + enc.process_buffer(&original[pos..end], false, &mut compressed) + .unwrap(); + pos = end; + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// Verify that the decoder also handles the recreated stream correctly when fed in + /// small chunks, not just when given the entire buffer at once. + /// Uses `test_zip_stored_then_deflated.zip` (mixed STORED + DEFLATED entries). + #[test] + fn test_zip_stored_then_deflated_decoder_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_zip_stored_then_deflated.zip"); + + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + // Decompress in 512-byte chunks to exercise the incremental decoder. 
+ let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec_size(&compressed, 512).unwrap(); + assert_eq_array(&original, &recreated); + } +} diff --git a/container/src/container_processor.rs b/container/src/container_processor.rs deleted file mode 100644 index 1d2fe24..0000000 --- a/container/src/container_processor.rs +++ /dev/null @@ -1,1301 +0,0 @@ -use byteorder::ReadBytesExt; -use lepton_jpeg::{DEFAULT_THREAD_POOL, EnabledFeatures}; - -use std::{ - collections::VecDeque, - io::{BufRead, Cursor, Read, Write}, - usize, -}; - -use crate::{ - idat_parse::{IdatContents, PngHeader, recreate_idat}, - scan_deflate::{FindStreamResult, FoundStream, FoundStreamType, find_compressable_stream}, - scoped_read::ScopedRead, - utils::TakeReader, -}; - -use preflate_rs::{ - AddContext, ExitCode, HashAlgorithm, PreflateConfig, PreflateError, PreflateStreamProcessor, - RecreateStreamProcessor, Result, err_exit_code, recreate_whole_deflate_stream, -}; - -/// Configuration for the deflate process -#[derive(Debug, Clone)] -pub struct PreflateContainerConfig { - /// As we scan for deflate streams, we need to have a minimum memory - /// chunk to process. We scan this chunk for deflate streams and at least - /// deflate one block has to fit into a chunk for us to recognize it. - pub min_chunk_size: usize, - - /// The maximum size of a deflate or PNG compressed block we will consider. If - /// a deflate stream is larger than this, we will not decompress it and - /// just write it out as a literal block. - pub max_chunk_size: usize, - - /// The maximum overall size of plain text that we will compress. This is - /// global to the entire container and limits the amount of processing that - /// we will do to avoid running out of CPU time on a single file. Once we - /// hit this limit, we will stop looking for deflate streams and just write - /// out the rest of the data as literal blocks. 
- pub total_plain_text_limit: u64, - - /// The maximum size of a plain text chunk that we will decompress at a time. This limits - /// the memory usage of the decompression process. - pub chunk_plain_text_limit: usize, - - /// true if we should verify that the decompressed data can be recompressed to the same bytes. - /// This is important since there may be corner cases where the data may not yield the same bytes. - /// - /// If this is false, we will not verify the decompressed data and just write it out as is and it is - /// up to the caller to make sure the data is valid. In no case should you just assume that you - /// can get the same data back without verifying it. - pub validate_compression: bool, - - /// Maximum number of lookups we will do in the hash chain. This will limit the CPU time we spend - /// on deflate stream processing but also means that we won't be able to recompress deflate streams - /// that were compressed with a larger chain length (eg level 9 has 4096). - pub max_chain_length: u32, -} - -impl Default for PreflateContainerConfig { - fn default() -> Self { - PreflateContainerConfig { - min_chunk_size: 1024 * 1024, - max_chunk_size: 64 * 1024 * 1024, - total_plain_text_limit: 512 * 1024 * 1024, - chunk_plain_text_limit: 128 * 1024 * 1024, - max_chain_length: 4096, - validate_compression: true, - } - } -} - -impl PreflateContainerConfig { - pub fn preflate_config(&self) -> PreflateConfig { - PreflateConfig { - max_chain_length: self.max_chain_length, - plain_text_limit: self.chunk_plain_text_limit, - verify_compression: self.validate_compression, - } - } -} - -const COMPRESSED_WRAPPER_VERSION_1: u8 = 1; - -/// literal chunks are just copied to the output -const LITERAL_CHUNK: u8 = 0; - -/// zlib compressed chunks are zlib compressed -const DEFLATE_STREAM: u8 = 1; - -/// PNG chunks are IDAT chunks that are zlib compressed -const PNG_COMPRESSED: u8 = 2; - -/// deflate stream that continues the previous one with the same dictionary, bitstream 
etc -const DEFLATE_STREAM_CONTINUE: u8 = 3; - -/// JPEG Lepton compressed chunks are JPEG Lepton compressed -const JPEG_LEPTON_COMPRESSED: u8 = 4; - -pub(crate) fn write_varint(destination: &mut impl Write, value: u32) -> std::io::Result<()> { - let mut value = value; - loop { - let mut byte = (value & 0x7F) as u8; - value >>= 7; - if value != 0 { - byte |= 0x80; - } - destination.write_all(&[byte])?; - if value == 0 { - break; - } - } - - Ok(()) -} - -pub(crate) fn read_varint(source: &mut impl Read) -> std::io::Result { - let mut result = 0; - let mut shift = 0; - loop { - let mut byte = [0u8; 1]; - source.read_exact(&mut byte)?; - let byte = byte[0]; - result |= ((byte & 0x7F) as u32) << shift; - shift += 7; - if byte & 0x80 == 0 { - break; - } - } - Ok(result) -} - -#[test] -fn test_variant_roundtrip() { - let values = [ - 0, 1, 127, 128, 255, 256, 16383, 16384, 2097151, 2097152, 268435455, 268435456, 4294967295, - ]; - - let mut buffer = Vec::new(); - for &v in values.iter() { - write_varint(&mut buffer, v).unwrap(); - } - - let mut buffer = &buffer[..]; - - for &v in values.iter() { - assert_eq!(v, read_varint(&mut buffer).unwrap()); - } -} - -fn write_literal_block(content: &[u8], destination: &mut impl Write) -> Result<()> { - destination.write_all(&[LITERAL_CHUNK])?; - write_varint(destination, content.len() as u32)?; - destination.write_all(content)?; - Ok(()) -} - -fn write_chunk_block( - result: &mut impl Write, - chunk: FoundStream, - stats: &mut PreflateStats, -) -> Result> { - match chunk.chunk_type { - FoundStreamType::DeflateStream(parameters, state) => { - result.write_all(&[DEFLATE_STREAM])?; - - write_varint(result, chunk.corrections.len() as u32)?; - write_varint(result, state.plain_text().text().len() as u32)?; - - result.write_all(&chunk.corrections)?; - result.write_all(&state.plain_text().text())?; - - stats.overhead_bytes += chunk.corrections.len() as u64; - stats.uncompressed_size += state.plain_text().len() as u64; - stats.hash_algorithm 
= parameters.hash_algorithm; - - if !state.is_done() { - return Ok(Some(state)); - } - } - - FoundStreamType::IDATDeflate(parameters, mut idat, plain_text) => { - log::debug!( - "IDATDeflate param {:?} corrections {}", - parameters, - chunk.corrections.len() - ); - - if webp_compress(result, plain_text.text(), &chunk.corrections, &idat).is_err() { - log::debug!("non-Webp compressed {}", idat.total_chunk_length); - - result.write_all(&[PNG_COMPRESSED])?; - write_varint(result, chunk.corrections.len() as u32)?; - write_varint(result, plain_text.text().len() as u32)?; - - idat.png_header = None; - idat.write_to_bytestream(result)?; - - result.write_all(&chunk.corrections)?; - result.write_all(&plain_text.text())?; - } - - stats.uncompressed_size += plain_text.len() as u64; - stats.hash_algorithm = parameters.hash_algorithm; - stats.overhead_bytes += chunk.corrections.len() as u64; - } - FoundStreamType::JPEGLepton(data) => { - result.write_all(&[JPEG_LEPTON_COMPRESSED])?; - write_varint(result, data.len() as u32)?; - - result.write_all(&data)?; - - stats.uncompressed_size += data.len() as u64; - } - } - Ok(None) -} - -/// Scans for multiple deflate streams in an arbitrary binary file, decompresses the streams and -/// returns an uncompressed file that can then be recompressed using a better algorithm. -/// This can then be passed back into recreate_whole_from_container to recreate the exact original file. -/// -/// Note that the result is NOT compressed and has to be compressed by some other algorithm -/// in order to see any savings. -/// -/// This is a wrapper for PreflateContainerProcessor. 
-pub fn preflate_whole_into_container( - config: &PreflateContainerConfig, - compressed_data: &mut impl BufRead, - write: &mut impl Write, -) -> Result { - let mut context = PreflateContainerProcessor::new(&config); - context.copy_to_end(compressed_data, write).unwrap(); - - Ok(context.stats()) -} - -/// Takes the binary output of preflate_whole_into_container and recreates the original file. -/// -/// This is a wrapper for RecreateContainerProcessor. -pub fn recreate_whole_from_container( - source: &mut impl BufRead, - destination: &mut impl Write, -) -> Result<()> { - let mut recreate = RecreateContainerProcessor::new(usize::MAX); - recreate.copy_to_end(source, destination).context() -} - -#[cfg(test)] -fn read_chunk_block_slow( - source: &mut impl BufRead, - destination: &mut impl Write, -) -> std::result::Result<(), PreflateError> { - let mut p = RecreateContainerProcessor::new_single_chunk(usize::MAX); - p.copy_to_end_size(source, destination, 1).context() -} - -#[test] -fn roundtrip_chunk_block_literal() { - let mut buffer = Vec::new(); - - write_literal_block(b"hello", &mut buffer).unwrap(); - - let mut read_cursor = std::io::Cursor::new(buffer); - let mut destination = Vec::new(); - read_chunk_block_slow(&mut read_cursor, &mut destination).unwrap(); - - assert!(destination == b"hello"); -} - -#[test] -fn roundtrip_chunk_block_deflate() { - let contents = crate::utils::read_file("compressed_zlib_level1.deflate"); - - let mut stream_state = PreflateStreamProcessor::new(&PreflateConfig::default()); - let results = stream_state.decompress(&contents).unwrap(); - - let mut buffer = Vec::new(); - - let mut stats = PreflateStats::default(); - write_chunk_block( - &mut buffer, - FoundStream { - chunk_type: FoundStreamType::DeflateStream(results.parameters.unwrap(), stream_state), - corrections: results.corrections, - }, - &mut stats, - ) - .unwrap(); - - let mut read_cursor = std::io::Cursor::new(buffer); - let mut destination = Vec::new(); - 
read_chunk_block_slow(&mut read_cursor, &mut destination).unwrap(); - - assert!(destination == contents); -} - -#[test] -fn roundtrip_chunk_block_png() { - let f = crate::utils::read_file("treegdi.png"); - - // we know the first IDAT chunk starts at 83 (avoid testing the scan_deflate code in a unit teast) - let (idat_contents, deflate_stream) = crate::idat_parse::parse_idat(None, &f[83..]).unwrap(); - let mut stream = PreflateStreamProcessor::new(&PreflateConfig::default()); - let results = stream.decompress(&deflate_stream).unwrap(); - - let total_chunk_length = idat_contents.total_chunk_length; - - let mut buffer = Vec::new(); - - let mut stats = PreflateStats::default(); - write_chunk_block( - &mut buffer, - FoundStream { - chunk_type: FoundStreamType::IDATDeflate( - results.parameters.unwrap(), - idat_contents, - stream.detach_plain_text(), - ), - corrections: results.corrections, - }, - &mut stats, - ) - .unwrap(); - - let mut read_cursor = std::io::Cursor::new(buffer); - let mut destination = Vec::new(); - read_chunk_block_slow(&mut read_cursor, &mut destination).unwrap(); - - assert!(destination == &f[83..83 + total_chunk_length]); -} - -#[cfg(test)] -fn roundtrip_deflate_chunks(filename: &str) { - use crate::utils::assert_eq_array; - - let f = crate::utils::read_file(filename); - - println!("Processing file: {}", filename); - - let mut expanded = Vec::new(); - preflate_whole_into_container( - &PreflateContainerConfig::default(), - &mut std::io::Cursor::new(&f), - &mut expanded, - ) - .unwrap(); - - println!("Recreating file: {}", filename); - - let mut read_cursor = std::io::Cursor::new(expanded); - - let mut destination = Vec::new(); - recreate_whole_from_container(&mut read_cursor, &mut destination).unwrap(); - - assert_eq_array(&destination, &f); -} - -#[test] -fn roundtrip_skip_length_crash() { - roundtrip_deflate_chunks("skiplengthcrash.bin"); -} - -#[test] -fn roundtrip_png_chunks() { - roundtrip_deflate_chunks("treegdi.png"); -} - -#[test] -fn 
roundtrip_zip_chunks() { - roundtrip_deflate_chunks("samplezip.zip"); -} - -#[test] -fn roundtrip_gz_chunks() { - roundtrip_deflate_chunks("sample1.bin.gz"); -} - -#[test] -fn roundtrip_png_chunks2() { - roundtrip_deflate_chunks("starcontrol.samplesave"); -} - -#[test] -fn verify_zip_compress() { - use crate::utils::read_file; - let v = read_file("samplezip.zip"); - - let mut expanded = Vec::new(); - preflate_whole_into_container( - &PreflateContainerConfig::default(), - &mut std::io::Cursor::new(&v), - &mut expanded, - ) - .unwrap(); - - let mut recompressed = Vec::new(); - recreate_whole_from_container(&mut std::io::Cursor::new(expanded), &mut recompressed).unwrap(); - - assert!(v == recompressed); -} - -/// Statistics about the preflate process -#[derive(Debug, Copy, Clone, Default)] -pub struct PreflateStats { - pub deflate_compressed_size: u64, - pub zstd_compressed_size: u64, - pub uncompressed_size: u64, - pub overhead_bytes: u64, - pub hash_algorithm: HashAlgorithm, - pub zstd_baseline_size: u64, -} - -/// Processes an input buffer and writes the output to a writer -pub trait ProcessBuffer { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()>; - - #[cfg(test)] - fn process_vec(&mut self, input: &[u8]) -> Result> { - let mut writer = Vec::new(); - - self.copy_to_end(&mut std::io::Cursor::new(&input), &mut writer) - .context()?; - - Ok(writer) - } - - #[cfg(test)] - fn process_vec_size(&mut self, input: &[u8], read_chunk_size: usize) -> Result> { - let mut writer = Vec::new(); - - self.copy_to_end_size( - &mut std::io::Cursor::new(&input), - &mut writer, - read_chunk_size, - ) - .context()?; - - Ok(writer) - } - - /// Reads everything from input and writes it to the output. 
- /// Wraps calls to process buffer - fn copy_to_end(&mut self, input: &mut impl BufRead, output: &mut impl Write) -> Result<()> { - self.copy_to_end_size(input, output, 1024 * 1024) - } - - /// Reads everything from input and writes it to the output. - /// Wraps calls to process buffer - fn copy_to_end_size( - &mut self, - input: &mut impl BufRead, - output: &mut impl Write, - read_chunk_size: usize, - ) -> Result<()> { - let mut input_complete = false; - loop { - let buffer: &[u8]; - if input_complete { - buffer = &[]; - } else { - buffer = input.fill_buf().context()?; - if buffer.len() == 0 { - input_complete = true - } - }; - - if input_complete { - self.process_buffer(&[], true, output).context()?; - break; - } else { - // process buffer a piece at a time to avoid overflowing memory - let mut amount_read = 0; - while amount_read < buffer.len() { - let chunk_size = (buffer.len() - amount_read).min(read_chunk_size); - - self.process_buffer( - &buffer[amount_read..amount_read + chunk_size], - false, - output, - ) - .context()?; - - amount_read += chunk_size; - } - - let buflen = buffer.len(); - input.consume(buflen); - } - } - - Ok(()) - } - - fn stats(&self) -> PreflateStats { - PreflateStats::default() - } -} - -#[derive(Debug)] -enum ChunkParseState { - Start, - /// we are looking for a deflate stream or PNG chunk. The data of the PNG file - /// is stored later than the IHDR chunk that will tell us the dimensions of the image, - /// so we need to keep track of the IHDR chunk so we can use it later to properly - /// compress the PNG data. - Searching(Option), - DeflateContinue(PreflateStreamProcessor), -} - -/// Takes a sequence of bytes that may contain deflate streams, find -/// the streams, and emits a new stream that containus the decompressed -/// streams along with the corrections needed to recreate the original. 
-/// -/// This output can then be compressed with a better algorithm, like Zstandard -/// and achieve much better compression than if we tried to compress the -/// deflate stream directlyh. -pub struct PreflateContainerProcessor { - content: Vec, - compression_stats: PreflateStats, - input_complete: bool, - total_plain_text_seen: u64, - - /// used to track the last attempted chunk size, in case we - /// need more input to continue, we will collect at least min_chunk_size - /// more input before trying to process again until we reach max_chunk_size - last_attempt_chunk_size: usize, - - state: ChunkParseState, - config: PreflateContainerConfig, -} - -impl PreflateContainerProcessor { - pub fn new(config: &PreflateContainerConfig) -> Self { - PreflateContainerProcessor { - content: Vec::new(), - compression_stats: PreflateStats::default(), - input_complete: false, - state: ChunkParseState::Start, - total_plain_text_seen: 0, - last_attempt_chunk_size: 0, - config: config.clone(), - } - } -} - -impl ProcessBuffer for PreflateContainerProcessor { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - if self.input_complete && (input.len() > 0 || !input_complete) { - return Err(PreflateError::new( - ExitCode::InvalidParameter, - "more data provided after input_complete signaled", - )); - } - - if input.len() > 0 { - self.compression_stats.deflate_compressed_size += input.len() as u64; - self.content.extend_from_slice(input); - } - - loop { - // wait until we have at least min_chunk_size before we start processing - if self.content.is_empty() - || (!input_complete - && (self.content.len() - self.last_attempt_chunk_size) - < self.config.min_chunk_size - && self.content.len() <= self.config.max_chunk_size) - { - break; - } - - self.last_attempt_chunk_size = self.content.len(); - - match &mut self.state { - ChunkParseState::Start => { - writer.write_all(&[COMPRESSED_WRAPPER_VERSION_1])?; - self.state = 
ChunkParseState::Searching(None); - } - ChunkParseState::Searching(prev_ihdr) => { - if self.total_plain_text_seen > self.config.total_plain_text_limit { - // once we've exceeded our limit, we don't do any more compression - // this is to ensure we don't suck the CPU time for too long on - // a single file - write_literal_block(&self.content, writer)?; - - self.last_attempt_chunk_size = 0; - self.content.clear(); - break; - } - - // here we are looking for a deflate stream or PNG chunk - match find_compressable_stream( - &self.content, - prev_ihdr, - input_complete, - &self.config, - ) { - FindStreamResult::Found(next, chunk) => { - // the gap between the start and the beginning of the deflate stream - // is written out as a literal block - if next.start != 0 { - write_literal_block(&self.content[..next.start], writer)?; - } - - if let Some(mut state) = - write_chunk_block(writer, chunk, &mut self.compression_stats) - .context()? - { - self.total_plain_text_seen += state.plain_text().len() as u64; - state.shrink_to_dictionary(); - - self.state = ChunkParseState::DeflateContinue(state); - } - - self.content.drain(0..next.end); - self.last_attempt_chunk_size = self.content.len(); - } - FindStreamResult::ShortRead => { - if input_complete || self.content.len() > self.config.max_chunk_size { - // if we have too much data or have no more data, - // we just write it out as a literal block with everything we have - write_literal_block(&self.content, writer)?; - - self.content.clear(); - self.last_attempt_chunk_size = 0; - } else { - // we don't have enough data to process the stream, so we just - // wait for more data - break; - } - } - FindStreamResult::None => { - // couldn't find anything, just write the rest as a literal block - write_literal_block(&self.content, writer)?; - - self.content.clear(); - self.last_attempt_chunk_size = 0; - } - } - } - ChunkParseState::DeflateContinue(state) => { - // here we have a deflate stream that we need to continue - // right now we 
error out if the continuation cannot be processed - match state.decompress(&self.content) { - Err(_e) => { - // indicate that we got an error while trying to continue - // the compression of a previous chunk, this happens - // when the stream significantly diverged from the behavior we estimated - // in the first chunk that we saw - self.state = ChunkParseState::Searching(None); - - log::debug!("Error while trying to continue compression {:?}", _e); - } - Ok(res) => { - log::debug!( - "Deflate continue: {} -> {}", - state.plain_text().len(), - res.compressed_size - ); - - writer.write_all(&[DEFLATE_STREAM_CONTINUE])?; - - write_varint(writer, res.corrections.len() as u32)?; - write_varint(writer, state.plain_text().len() as u32)?; - - writer.write_all(&res.corrections)?; - writer.write_all(&state.plain_text().text())?; - - self.total_plain_text_seen += state.plain_text().len() as u64; - self.compression_stats.overhead_bytes += res.corrections.len() as u64; - self.compression_stats.uncompressed_size += - state.plain_text().len() as u64; - - self.content.drain(0..res.compressed_size); - self.last_attempt_chunk_size = self.content.len(); - - if state.is_done() { - self.state = ChunkParseState::Searching(None); - } else { - state.shrink_to_dictionary(); - } - } - } - } - } - } - - if input_complete { - self.input_complete = true; - - if self.content.len() > 0 { - write_literal_block(&self.content, writer)?; - } - self.content.clear(); - } - - Ok(()) - } - - fn stats(&self) -> PreflateStats { - self.compression_stats - } -} - -#[cfg(test)] -pub struct NopProcessBuffer {} - -#[cfg(test)] -impl ProcessBuffer for NopProcessBuffer { - fn process_buffer( - &mut self, - input: &[u8], - _input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - writer.write_all(input).context()?; - - Ok(()) - } -} - -enum DecompressionState { - Start, - StartSegment, - LiteralBlock(usize), - DeflateBlock(usize, usize), - PNGBlock { - correction_length: usize, - 
uncompressed_length: usize, - idat: IdatContents, - filters: Vec, - }, - JPEGBlock { - lepton_length: usize, - }, -} - -/// recreates the orignal content from the chunked data -pub struct RecreateContainerProcessor { - capacity: usize, - input: VecDeque, - input_complete: bool, - state: DecompressionState, - - /// state of the predictor and plain text if we need to contiune a deflate stream - /// if it was too big to complete in a single chunk - deflate_continue_state: Option, -} - -impl RecreateContainerProcessor { - pub fn new(capacity: usize) -> Self { - RecreateContainerProcessor { - input: VecDeque::new(), - capacity, - input_complete: false, - state: DecompressionState::Start, - deflate_continue_state: None, - } - } - - /// for testing reading a single chunk (skip header) - pub fn new_single_chunk(capacity: usize) -> Self { - RecreateContainerProcessor { - input: VecDeque::new(), - capacity, - input_complete: false, - state: DecompressionState::StartSegment, - deflate_continue_state: None, - } - } -} - -impl ProcessBuffer for RecreateContainerProcessor { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - if self.input_complete && (input.len() > 0 || !input_complete) { - return Err(PreflateError::new( - ExitCode::InvalidParameter, - "more data provided after input_complete signaled", - )); - } - - // we could have been passed a big buffer, so we need to process it in chunks - let mut amount_read = 0; - loop { - let amount_to_read = (input.len() - amount_read).min(self.capacity); - - // when we get to the end and we've read everything, we can signal that we are done - if amount_read + amount_to_read == input.len() && input_complete { - self.input_complete = true; - } - - self.input - .extend(&input[amount_read..amount_read + amount_to_read]); - - amount_read += amount_to_read; - - self.process_buffer_internal(writer)?; - - if amount_read == input.len() { - break; - } - } - - Ok(()) - } -} - 
-impl RecreateContainerProcessor { - fn process_buffer_internal(&mut self, writer: &mut impl Write) -> Result<()> { - loop { - match &mut self.state { - DecompressionState::Start => { - if !self.input_complete && self.input.len() == 0 { - break; - } - - let version = self.input.read_u8()?; - - if version != COMPRESSED_WRAPPER_VERSION_1 { - return err_exit_code( - ExitCode::InvalidCompressedWrapper, - format!("Invalid version {version}"), - ); - } - - self.state = DecompressionState::StartSegment; - } - DecompressionState::StartSegment => { - // here's a good place to stop if we run out of input - if self.input.len() == 0 { - break; - } - - // use scoped read so that if we run out of bytes we can undo the read and wait for more input - self.state = match self.input.scoped_read(|r| match r.read_u8()? { - LITERAL_CHUNK => { - let length = read_varint(r)? as usize; - - Ok(DecompressionState::LiteralBlock(length)) - } - DEFLATE_STREAM => { - let correction_length = read_varint(r)? as usize; - let uncompressed_length = read_varint(r)? as usize; - - // clear the deflate state if we are starting a new block - self.deflate_continue_state = None; - - Ok(DecompressionState::DeflateBlock( - correction_length, - uncompressed_length, - )) - } - DEFLATE_STREAM_CONTINUE => { - let correction_length = read_varint(r)? as usize; - let uncompressed_length = read_varint(r)? as usize; - - if self.deflate_continue_state.is_none() { - return err_exit_code( - ExitCode::InvalidCompressedWrapper, - "no deflate state to continue", - ); - } - - Ok(DecompressionState::DeflateBlock( - correction_length, - uncompressed_length, - )) - } - PNG_COMPRESSED => { - let correction_length = read_varint(r)? as usize; - let uncompressed_length = read_varint(r)? 
as usize; - let idat = IdatContents::read_from_bytestream(r)?; - - let mut filters = Vec::new(); - if let Some(png_header) = &idat.png_header { - filters.resize(png_header.height as usize, 0); - r.read_exact(&mut filters[..])?; - } - - Ok(DecompressionState::PNGBlock { - correction_length, - uncompressed_length, - idat, - filters, - }) - } - JPEG_LEPTON_COMPRESSED => { - let lepton_length = read_varint(r)? as usize; - - Ok(DecompressionState::JPEGBlock { lepton_length }) - } - - _ => Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "Invalid chunk", - )), - }) { - Ok(s) => s, - Err(e) => { - if !self.input_complete && e.exit_code() == ExitCode::ShortRead { - // wait for more input if we ran out of bytes here - break; - } else { - return Err(e); - } - } - } - } - - DecompressionState::LiteralBlock(length) => { - let source_size = self.input.len(); - if source_size < *length { - if self.input_complete { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "unexpected end of input", - )); - } - - std::io::copy(&mut self.input, writer).context()?; - *length -= source_size; - break; - } - - std::io::copy(&mut (&mut self.input).take(*length as u64), writer).context()?; - self.state = DecompressionState::StartSegment; - } - - DecompressionState::DeflateBlock(correction_length, uncompressed_length) => { - let source_size = self.input.len(); - let total_length = *correction_length + *uncompressed_length; - - if source_size < total_length { - if self.input_complete { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "unexpected end of input", - )); - } - break; - } - - let corrections: Vec = self.input.drain(0..*correction_length).collect(); - - if let Some(reconstruct) = &mut self.deflate_continue_state { - let (comp, _) = reconstruct - .recompress( - &mut TakeReader::new(&mut self.input, *uncompressed_length), - &corrections, - ) - .context()?; - - writer.write_all(&comp)?; - } else { - let mut reconstruct = 
RecreateStreamProcessor::new(); - let (comp, _) = reconstruct - .recompress( - &mut TakeReader::new(&mut self.input, *uncompressed_length), - &corrections, - ) - .context()?; - - writer.write_all(&comp)?; - - self.deflate_continue_state = Some(reconstruct); - } - - self.state = DecompressionState::StartSegment; - } - - DecompressionState::PNGBlock { - correction_length, - uncompressed_length, - idat, - filters, - } => { - let source_size = self.input.len(); - - let total_length = *correction_length + *uncompressed_length; - if source_size < total_length { - // wait till we have the full block - if self.input_complete { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "unexpected end of input", - )); - } - break; - } - - let corrections: Vec = self.input.drain(0..*correction_length).collect(); - - let plain_text; - - if let Some(header) = &idat.png_header { - let webp: Vec = self.input.drain(0..*uncompressed_length).collect(); - - plain_text = webp_decompress(filters, webp, header).context()?; - } else { - plain_text = self.input.drain(0..*uncompressed_length).collect(); - } - - let recompressed = - recreate_whole_deflate_stream(&plain_text, &corrections).context()?; - - recreate_idat(&idat, &recompressed[..], writer).context()?; - - self.state = DecompressionState::StartSegment; - } - DecompressionState::JPEGBlock { lepton_length } => { - let source_size = self.input.len(); - if source_size < *lepton_length { - if self.input_complete { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "unexpected end of input", - )); - } - break; - } - - let lepton_data: Vec = self.input.drain(0..*lepton_length).collect(); - - match lepton_jpeg::decode_lepton( - &mut Cursor::new(&lepton_data), - writer, - &EnabledFeatures::compat_lepton_vector_read(), - &DEFAULT_THREAD_POOL, - ) { - Err(e) => { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - format!("JPEG Lepton decode failed: {}", e), - )); - } - Ok(_) => {} - 
} - - self.state = DecompressionState::StartSegment; - } - } - } - - Ok(()) - } -} - -fn webp_compress( - result: &mut impl Write, - plain_text: &[u8], - corrections: &[u8], - idat: &IdatContents, -) -> Result<()> { - log::debug!("{:?}", idat); - - #[cfg(feature = "webp")] - if let Some(png_header) = idat.png_header { - use crate::idat_parse::{PngColorType, undo_png_filters}; - use std::ops::Deref; - - let bbp = png_header.color_type.bytes_per_pixel(); - let w = png_header.width as usize; - let h = png_header.height as usize; - - log::debug!( - "plain text compressing {} bytes ({}x{}x{})", - plain_text.len(), - w, - h, - bbp - ); - - // see if the bitmap looks like the way with think it should (bits per pixel map + 1 height worth of filter bytes) - if (bbp * w * h) + h == plain_text.len() { - let (bitmap, filters) = undo_png_filters(plain_text, w, h, bbp); - - let enc = webp::Encoder::new( - &bitmap, - match png_header.color_type { - PngColorType::RGB => webp::PixelLayout::Rgb, - PngColorType::RGBA => webp::PixelLayout::Rgba, - }, - png_header.width, - png_header.height, - ); - - let mut webpconfig = webp::WebPConfig::new().unwrap(); - webpconfig.lossless = 1; - webpconfig.alpha_compression = 0; - webpconfig.exact = 1; // undocumented option, but required to not throw away color if alpha channel is zero - - // this is the default quality setting for webp lossless, we could dial it up - // but the quality gains are marginal for the CPU cost, although the - // CPU decompression cost is the same. 
- webpconfig.quality = 75.0; // 0..100 higher is slower but better compression - webpconfig.method = 4; // 0..6 higher is slower but better compression - - let comp = match enc.encode_advanced(&webpconfig) { - Ok(c) => c, - Err(e) => { - return err_exit_code( - ExitCode::WebPDecodeError, - format!("Webp encode failed: {:?}", e), - ); - } - }; - - result.write_all(&[PNG_COMPRESSED])?; - - write_varint(result, corrections.len() as u32)?; - write_varint(result, comp.deref().len() as u32)?; - - log::debug!( - "Webp compressed {} bytes (vs {})", - comp.deref().len(), - idat.total_chunk_length - ); - - idat.write_to_bytestream(result)?; - result.write_all(&filters)?; - - result.write_all(&corrections)?; - result.write_all(comp.deref())?; - - return Ok(()); - } - } - - return err_exit_code( - ExitCode::InvalidCompressedWrapper, - "Webp compression not supported", - ); -} - -fn webp_decompress( - filters: &[u8], - webp: Vec, - header: &crate::idat_parse::PngHeader, -) -> Result> { - #[cfg(feature = "webp")] - match webp::Decoder::new(webp.as_slice()).decode() { - Some(result) => { - use crate::idat_parse::apply_png_filters_with_types; - use std::ops::Deref; - - let m = result.deref(); - - return Ok(apply_png_filters_with_types( - m, - header.width as usize, - header.height as usize, - if result.is_alpha() { 4 } else { 3 }, - header.color_type.bytes_per_pixel(), - &filters, - )); - } - _ => {} - } - return err_exit_code(ExitCode::InvalidCompressedWrapper, "Webp decode failed"); -} - -#[test] -fn test_baseline_calc() { - use crate::utils::read_file; - use crate::zstd_compression::ZstdCompressContext; - - let v = read_file("samplezip.zip"); - - let mut context = ZstdCompressContext::new( - PreflateContainerProcessor::new(&PreflateContainerConfig::default()), - 9, - true, - ); - - let _r = context.process_vec(&v).unwrap(); - - let stats = context.stats(); - - println!("stats: {:?}", stats); - - // these change if the compression algorithm is altered, update them - 
assert_eq!(stats.overhead_bytes, 463); - assert_eq!(stats.zstd_compressed_size, 12444); - assert_eq!(stats.uncompressed_size, 54871); - assert_eq!(stats.zstd_baseline_size, 13664); -} - -#[test] -fn roundtrip_small_chunk() { - use crate::utils::{assert_eq_array, read_file}; - - let original = read_file("pptxplaintext.zip"); - - let mut context = PreflateContainerProcessor::new(&PreflateContainerConfig { - min_chunk_size: 100000, - max_chunk_size: 100000, - total_plain_text_limit: u64::MAX, - ..Default::default() - }); - - let compressed = context.process_vec_size(&original, 20001).unwrap(); - - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, 20001).unwrap(); - - assert_eq_array(&original, &recreated); -} - -#[test] -fn roundtrip_small_plain_text() { - use crate::utils::{assert_eq_array, read_file}; - - let original = read_file("pptxplaintext.zip"); - - let mut context = PreflateContainerProcessor::new(&PreflateContainerConfig { - min_chunk_size: 100000, - max_chunk_size: 100000, - total_plain_text_limit: u64::MAX, - ..Default::default() - }); - - let compressed = context.process_vec_size(&original, 2001).unwrap(); - - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, 2001).unwrap(); - - assert_eq_array(&original, &recreated); -} - -#[test] -fn roundtrip_png_e2e() { - crate::init_logging(); - - use crate::utils::{assert_eq_array, read_file}; - - let original = read_file("figma.png"); - - println!("Compressing file"); - - let mut context = PreflateContainerProcessor::new(&PreflateContainerConfig { - min_chunk_size: 100000, - max_chunk_size: original.len(), - ..Default::default() - }); - - let compressed = context.process_vec_size(&original, 100100).unwrap(); - - println!("Recreating file"); - - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, 100100).unwrap(); 
- - assert_eq_array(&original, &recreated); -} - -#[test] -fn roundtrip_jpg() { - crate::init_logging(); - - use crate::utils::{assert_eq_array, read_file}; - - let original = read_file("embedded-images.pdf"); - - println!("Compressing file"); - - let mut context = PreflateContainerProcessor::new(&PreflateContainerConfig { - min_chunk_size: 1000000, - max_chunk_size: original.len(), - ..Default::default() - }); - - let compressed = context.process_vec_size(&original, usize::MAX).unwrap(); - - println!( - "Compressed size: {} vs {}", - compressed.len(), - original.len() - ); - - println!("Recreating file"); - - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, usize::MAX).unwrap(); - - assert_eq_array(&original, &recreated); -} diff --git a/container/src/container_read.rs b/container/src/container_read.rs new file mode 100644 index 0000000..795382a --- /dev/null +++ b/container/src/container_read.rs @@ -0,0 +1,535 @@ +use byteorder::ReadBytesExt; +use crc32fast::Hasher as CrcHasher; +use lepton_jpeg::{DEFAULT_THREAD_POOL, EnabledFeatures}; + +use std::{ + collections::VecDeque, + io::{Cursor, Read, Write}, +}; + +use crate::{ + container_common::{ + BLOCK_COMPRESSION_MASK, BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_DEFLATE, + BLOCK_TYPE_DEFLATE_CONTINUE, BLOCK_TYPE_JPEG_LEPTON, BLOCK_TYPE_LITERAL, BLOCK_TYPE_MASK, + BLOCK_TYPE_PNG, BLOCK_TYPE_WEBP, COMPRESSED_WRAPPER_VERSION_2, ProcessBuffer, read_varint, + }, + idat_parse::{IdatContents, recreate_idat}, + scoped_read::ScopedRead, +}; + +use preflate_rs::{ + AddContext, ExitCode, PreflateError, RecreateStreamProcessor, Result, err_exit_code, + recreate_whole_deflate_stream, +}; + +/// Write wrapper that computes a running CRC-32 of every byte written. 
+struct CrcWriter<'a, W: Write> { + inner: &'a mut W, + hasher: &'a mut CrcHasher, +} + +impl Write for CrcWriter<'_, W> { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let n = self.inner.write(buf)?; + self.hasher.update(&buf[..n]); + Ok(n) + } + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } +} + +enum DecompressionState { + Start, + StartSegment, + /// accumulate compressed_size bytes then decode and process the block immediately. + AccumulateBlock { + block_type: u8, + compressed_size: usize, + }, + /// accumulate lepton bytes then decode the JPEG block immediately. + JpegAccumulate { + lepton_length: usize, + }, + /// accumulate raw WebP-compressed PNG bytes then process the block immediately. + WebpAccumulate { + total_len: usize, + }, + /// 0xFF end block was parsed; CRC check deferred to process_buffer. + CrcCheck { + expected: u32, + }, +} + +/// recreates the orignal content from the chunked data +pub struct RecreateContainerProcessor { + capacity: usize, + input: VecDeque, + input_complete: bool, + state: DecompressionState, + + /// state of the predictor and plain text if we need to contiune a deflate stream + /// if it was too big to complete in a single chunk + deflate_continue_state: Option, + + /// persistent Zstd decoder — maintains the streaming context across blocks + zstd_decoder: zstd::stream::raw::Decoder<'static>, + + /// running CRC-32 of all output bytes, verified against the 0xFF end block + output_crc: CrcHasher, +} + +impl RecreateContainerProcessor { + pub fn new(capacity: usize) -> Self { + RecreateContainerProcessor { + input: VecDeque::new(), + capacity, + input_complete: false, + state: DecompressionState::Start, + deflate_continue_state: None, + zstd_decoder: zstd::stream::raw::Decoder::new().expect("failed to create zstd decoder"), + output_crc: CrcHasher::new(), + } + } +} + +impl ProcessBuffer for RecreateContainerProcessor { + fn process_buffer( + &mut self, + input: &[u8], + input_complete: 
bool, + writer: &mut impl Write, + ) -> Result<()> { + if self.input_complete && (input.len() > 0 || !input_complete) { + return Err(PreflateError::new( + ExitCode::InvalidParameter, + "more data provided after input_complete signaled", + )); + } + + // we could have been passed a big buffer, so we need to process it in chunks + let mut amount_read = 0; + loop { + let amount_to_read = (input.len() - amount_read).min(self.capacity); + + // when we get to the end and we've read everything, we can signal that we are done + if amount_read + amount_to_read == input.len() && input_complete { + self.input_complete = true; + } + + self.input + .extend(&input[amount_read..amount_read + amount_to_read]); + + amount_read += amount_to_read; + + self.with_crc_writer(writer, |this, crc_writer| { + this.process_buffer_internal(crc_writer) + })?; + + // If process_buffer_internal parsed the 0xFF end block, verify the CRC now + // that output_crc has been restored with all written bytes. + if let DecompressionState::CrcCheck { expected } = self.state { + let actual = self.output_crc.clone().finalize(); + if actual != expected { + return err_exit_code( + ExitCode::InvalidCompressedWrapper, + format!("CRC-32 mismatch: expected {expected:#010x}, got {actual:#010x}"), + ); + } + self.state = DecompressionState::StartSegment; + } + + if amount_read == input.len() { + break; + } + } + + Ok(()) + } +} + +impl RecreateContainerProcessor { + /// Runs `f` with a `CrcWriter` wrapping `writer`, then restores `self.output_crc`. + /// + /// `self.output_crc` must be borrowed mutably through the `CrcWriter`, but + /// `f` also needs `&mut self` to drive the state machine — a direct borrow + /// conflict. This helper resolves it by temporarily swapping `output_crc` + /// out of `self` with `mem::replace`, so the field is no longer part of the + /// active `&mut self` borrow while `f` runs. 
+ fn with_crc_writer(&mut self, writer: &mut W, f: F) -> Result<()> + where + F: FnOnce(&mut Self, &mut CrcWriter<'_, W>) -> Result<()>, + { + let mut hasher = std::mem::replace(&mut self.output_crc, CrcHasher::new()); + let result = { + let mut crc_writer = CrcWriter { + inner: writer, + hasher: &mut hasher, + }; + f(self, &mut crc_writer) + }; + self.output_crc = hasher; + result + } + + fn process_buffer_internal(&mut self, writer: &mut impl Write) -> Result<()> { + loop { + match &mut self.state { + DecompressionState::Start => { + if !self.input_complete && self.input.len() == 0 { + break; + } + + let version = self.input.read_u8()?; + + match version { + COMPRESSED_WRAPPER_VERSION_2 => { + self.state = DecompressionState::StartSegment; + } + _ => { + return err_exit_code( + ExitCode::InvalidCompressedWrapper, + format!("Invalid version {version}"), + ); + } + } + } + DecompressionState::StartSegment => { + // here's a good place to stop if we run out of input + if self.input.len() == 0 { + break; + } + + // read type byte, then dispatch + self.state = match self.input.scoped_read(|r| { + let type_byte = r.read_u8()?; + + // 0xFF is the CRC end block: 4 raw bytes, no varint. + if type_byte == 0xFF { + let mut buf = [0u8; 4]; + r.read_exact(&mut buf)?; + return Ok(DecompressionState::CrcCheck { + expected: u32::from_le_bytes(buf), + }); + } + + let compression = type_byte & BLOCK_COMPRESSION_MASK; + let block_type = type_byte & BLOCK_TYPE_MASK; + match compression { + BLOCK_COMPRESSION_NONE => match block_type { + BLOCK_TYPE_JPEG_LEPTON => { + let lepton_length = read_varint(r)? as usize; + Ok(DecompressionState::JpegAccumulate { lepton_length }) + } + BLOCK_TYPE_WEBP => { + let total_len = read_varint(r)? as usize; + Ok(DecompressionState::WebpAccumulate { total_len }) + } + _ => err_exit_code( + ExitCode::InvalidCompressedWrapper, + "unknown raw block type", + ), + }, + BLOCK_COMPRESSION_ZSTD => { + let compressed_size = read_varint(r)? 
as usize; + Ok(DecompressionState::AccumulateBlock { + block_type, + compressed_size, + }) + } + _ => err_exit_code( + ExitCode::InvalidCompressedWrapper, + "unknown compression algorithm", + ), + } + }) { + Ok(s) => s, + Err(e) => { + if !self.input_complete && e.exit_code() == ExitCode::ShortRead { + break; + } else { + return Err(e); + } + } + }; + } + + DecompressionState::AccumulateBlock { + block_type, + compressed_size, + } => { + if self.input.len() < *compressed_size { + if self.input_complete { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + "unexpected end of input in block", + )); + } + break; + } + + let block_type = *block_type; + let compressed_bytes: Vec = self.input.drain(0..*compressed_size).collect(); + let decoded = drain_zstd_block(&mut self.zstd_decoder, &compressed_bytes)?; + process_compressed_block( + block_type, + &mut Cursor::new(decoded), + &mut self.deflate_continue_state, + writer, + )?; + self.state = DecompressionState::StartSegment; + } + + DecompressionState::JpegAccumulate { lepton_length } => { + if self.input.len() < *lepton_length { + if self.input_complete { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + "unexpected end of input in jpeg block", + )); + } + break; + } + + let lepton_bytes: Vec = self.input.drain(0..*lepton_length).collect(); + match lepton_jpeg::decode_lepton( + &mut Cursor::new(&lepton_bytes), + writer, + &EnabledFeatures::compat_lepton_vector_read(), + &DEFAULT_THREAD_POOL, + ) { + Err(e) => { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + format!("JPEG Lepton decode failed: {}", e), + )); + } + Ok(_) => {} + } + self.state = DecompressionState::StartSegment; + } + + DecompressionState::WebpAccumulate { total_len } => { + if self.input.len() < *total_len { + if self.input_complete { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + "unexpected end of input in webp block", + )); + } + break; + } + + let 
webp_bytes: Vec = self.input.drain(0..*total_len).collect(); + // Payload is what webp_compress wrote after the BLOCK_TYPE_PNG type byte, + // so process_compressed_block can parse it directly. + process_compressed_block( + BLOCK_TYPE_PNG, + &mut Cursor::new(webp_bytes), + &mut self.deflate_continue_state, + writer, + )?; + self.state = DecompressionState::StartSegment; + } + + DecompressionState::CrcCheck { .. } => { + // CRC verification is handled in process_buffer after this returns. + break; + } + } + } + + Ok(()) + } +} + +/// Feeds `compressed` bytes into the persistent `decoder` and returns all decompressed output. +/// +/// Each call corresponds to one Zstd flush frame (written by the encoder via `flush()`). +/// After consuming all input bytes the decoder is drained until it produces no more output, +/// which is guaranteed because `ZSTD_e_flush` ensures all data is available to the decoder +/// before the next block starts. +fn drain_zstd_block( + decoder: &mut zstd::stream::raw::Decoder<'static>, + compressed: &[u8], +) -> Result> { + use zstd::stream::raw::{InBuffer, Operation, OutBuffer}; + + let mut output = Vec::new(); + let mut scratch = vec![0u8; 65536]; + let mut in_buf = InBuffer::around(compressed); + + loop { + let mut out_buf = OutBuffer::around(scratch.as_mut_slice()); + decoder.run(&mut in_buf, &mut out_buf).map_err(|e| { + PreflateError::new( + ExitCode::InvalidCompressedWrapper, + format!("zstd decode failed: {e}"), + ) + })?; + let produced = out_buf.pos(); + output.extend_from_slice(&scratch[..produced]); + + // Stop when all input has been consumed and the decoder produced no more output. + // zstd guarantees progress (either bytes_read > 0 or bytes_written > 0) so this + // loop always terminates. + if in_buf.pos() >= compressed.len() && produced == 0 { + break; + } + } + + Ok(output) +} + +/// Parses and processes a single non-JPEG/non-WebP block. 
+/// +/// `cursor` wraps the output of `drain_zstd_block` for compressed blocks, +/// or the raw WebP payload for `BLOCK_TYPE_PNG` blocks stored outside Zstd. +/// +/// Layout written by the encoder for each block type (block_type = lower 6 bits): +/// BLOCK_TYPE_LITERAL: varint(len) + data +/// BLOCK_TYPE_DEFLATE: varint(corrections_len) + varint(plaintext_len) + corrections + plaintext +/// BLOCK_TYPE_DEFLATE_CONTINUE: same as BLOCK_TYPE_DEFLATE +/// BLOCK_TYPE_PNG: varint(correction_length) + varint(uncompressed_length) + +/// IdatContents + [filters if png_header present] + +/// corrections + (webp_data or raw_plaintext) +fn process_compressed_block( + block_type: u8, + cursor: &mut Cursor>, + deflate_continue_state: &mut Option, + writer: &mut impl Write, +) -> Result<()> { + match block_type { + BLOCK_TYPE_LITERAL => { + let length = read_varint(cursor)? as usize; + let mut data = vec![0u8; length]; + cursor.read_exact(&mut data).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + writer.write_all(&data)?; + } + BLOCK_TYPE_DEFLATE => { + *deflate_continue_state = None; + + let correction_length = read_varint(cursor)? as usize; + let uncompressed_length = read_varint(cursor)? as usize; + + let mut corrections = vec![0u8; correction_length]; + cursor.read_exact(&mut corrections).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let mut plain_text_buf = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut plain_text_buf).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let mut reconstruct = RecreateStreamProcessor::new(); + let (comp, _) = reconstruct + .recompress(&mut Cursor::new(&plain_text_buf), &corrections) + .context()?; + + writer.write_all(&comp)?; + *deflate_continue_state = Some(reconstruct); + } + BLOCK_TYPE_DEFLATE_CONTINUE => { + let correction_length = read_varint(cursor)? 
as usize; + let uncompressed_length = read_varint(cursor)? as usize; + + let mut corrections = vec![0u8; correction_length]; + cursor.read_exact(&mut corrections).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let mut plain_text_buf = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut plain_text_buf).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let reconstruct = deflate_continue_state.as_mut().ok_or_else(|| { + PreflateError::new( + ExitCode::InvalidCompressedWrapper, + "no deflate state to continue", + ) + })?; + + let (comp, _) = reconstruct + .recompress(&mut Cursor::new(&plain_text_buf), &corrections) + .context()?; + + writer.write_all(&comp)?; + } + BLOCK_TYPE_PNG => { + let correction_length = read_varint(cursor)? as usize; + let uncompressed_length = read_varint(cursor)? as usize; + let idat = IdatContents::read_from_bytestream(cursor)?; + + let mut filters = Vec::new(); + if let Some(png_header) = &idat.png_header { + filters.resize(png_header.height as usize, 0); + cursor.read_exact(&mut filters[..]).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + } + + let mut corrections = vec![0u8; correction_length]; + cursor.read_exact(&mut corrections).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let plain_text; + if let Some(header) = &idat.png_header { + let mut webp = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut webp).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + plain_text = webp_decompress(&filters, webp, header).context()?; + } else { + let mut raw = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut raw).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + plain_text = raw; + } + + let recompressed = + recreate_whole_deflate_stream(&plain_text, 
&corrections).context()?; + + recreate_idat(&idat, &recompressed[..], writer).context()?; + } + _ => { + return err_exit_code( + ExitCode::InvalidCompressedWrapper, + format!("Unknown block type {block_type}"), + ); + } + } + Ok(()) +} + +fn webp_decompress( + filters: &[u8], + webp: Vec, + header: &crate::idat_parse::PngHeader, +) -> Result> { + #[cfg(feature = "webp")] + match webp::Decoder::new(webp.as_slice()).decode() { + Some(result) => { + use crate::idat_parse::apply_png_filters_with_types; + use std::ops::Deref; + + let m = result.deref(); + + return Ok(apply_png_filters_with_types( + m, + header.width as usize, + header.height as usize, + if result.is_alpha() { 4 } else { 3 }, + header.color_type.bytes_per_pixel(), + &filters, + )); + } + _ => {} + } + return err_exit_code(ExitCode::InvalidCompressedWrapper, "Webp decode failed"); +} diff --git a/container/src/container_write.rs b/container/src/container_write.rs new file mode 100644 index 0000000..2e9c5e5 --- /dev/null +++ b/container/src/container_write.rs @@ -0,0 +1,534 @@ +use crc32fast::Hasher as CrcHasher; + +use std::io::Write; + +use crate::{ + container_common::{ + BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_DEFLATE, + BLOCK_TYPE_DEFLATE_CONTINUE, BLOCK_TYPE_JPEG_LEPTON, BLOCK_TYPE_LITERAL, BLOCK_TYPE_PNG, + BLOCK_TYPE_WEBP, PreflateContainerConfig, PreflateStats, ProcessBuffer, write_varint, + }, + idat_parse::{IdatContents, PngHeader}, + scan_deflate::{FindStreamResult, FoundStream, FoundStreamType, find_compressable_stream}, +}; + +use preflate_rs::{AddContext, ExitCode, PreflateError, PreflateStreamProcessor, Result}; + +/// used to measure the length of the output without storing it +pub(crate) struct MeasureWriteSink { + pub length: usize, +} + +impl Write for MeasureWriteSink { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.length += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +#[derive(Debug)] +pub(crate) 
enum ChunkParseState { + Start, + /// we are looking for a deflate stream or PNG chunk. The data of the PNG file + /// is stored later than the IHDR chunk that will tell us the dimensions of the image, + /// so we need to keep track of the IHDR chunk so we can use it later to properly + /// compress the PNG data. + Searching(Option), + DeflateContinue(PreflateStreamProcessor), +} + +/// V2 variant of write_chunk_block: block content goes through the persistent Zstd encoder. +/// JPEG blocks are written raw to writer (bypass encoder). +/// Returns (total compressed bytes written, optional continue state). +pub(crate) fn write_chunk_block_v2( + encoder: &mut zstd::stream::write::Encoder<'static, Vec>, + writer: &mut impl Write, + chunk: FoundStream, + stats: &mut PreflateStats, +) -> Result<(usize, Option)> { + match chunk.chunk_type { + FoundStreamType::DeflateStream(parameters, state) => { + write_varint(encoder, chunk.corrections.len() as u32)?; + write_varint(encoder, state.plain_text().text().len() as u32)?; + encoder.write_all(&chunk.corrections)?; + encoder.write_all(&state.plain_text().text())?; + + let compressed_size = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE, + encoder, + writer, + )?; + + stats.overhead_bytes += chunk.corrections.len() as u64; + stats.uncompressed_size += state.plain_text().len() as u64; + stats.hash_algorithm = parameters.hash_algorithm; + + if !state.is_done() { + return Ok((compressed_size, Some(state))); + } + Ok((compressed_size, None)) + } + + FoundStreamType::IDATDeflate(parameters, mut idat, plain_text) => { + log::debug!( + "IDATDeflate param {:?} corrections {}", + parameters, + chunk.corrections.len() + ); + + let mut temp_vec = Vec::new(); + + if webp_compress(&mut temp_vec, plain_text.text(), &chunk.corrections, &idat).is_ok() { + // WebP is already compressed — write raw, bypassing the Zstd encoder. + // temp_vec[0] is the BLOCK_TYPE_PNG placeholder byte; temp_vec[1..] is the payload. 
+ let payload = &temp_vec[1..]; + writer.write_all(&[BLOCK_COMPRESSION_NONE | BLOCK_TYPE_WEBP])?; + write_varint(writer, payload.len() as u32)?; + writer.write_all(payload)?; + + stats.uncompressed_size += plain_text.len() as u64; + stats.hash_algorithm = parameters.hash_algorithm; + stats.overhead_bytes += chunk.corrections.len() as u64; + + Ok((payload.len(), None)) + } else { + // Non-WebP PNG: corrections + plaintext are compressible, send through Zstd. + log::debug!("non-Webp compressed {}", idat.total_chunk_length); + write_varint(encoder, chunk.corrections.len() as u32)?; + write_varint(encoder, plain_text.text().len() as u32)?; + idat.png_header = None; + idat.write_to_bytestream(encoder)?; + encoder.write_all(&chunk.corrections)?; + encoder.write_all(plain_text.text())?; + + let compressed_size = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_PNG, + encoder, + writer, + )?; + + stats.uncompressed_size += plain_text.len() as u64; + stats.hash_algorithm = parameters.hash_algorithm; + stats.overhead_bytes += chunk.corrections.len() as u64; + + Ok((compressed_size, None)) + } + } + + FoundStreamType::JPEGLepton(data) => { + // JPEG is written raw (bypasses the encoder entirely) + writer.write_all(&[BLOCK_COMPRESSION_NONE | BLOCK_TYPE_JPEG_LEPTON])?; + write_varint(writer, data.len() as u32)?; + writer.write_all(&data)?; + + stats.uncompressed_size += data.len() as u64; + Ok((0, None)) + } + } +} + +/// Takes a sequence of bytes that may contain deflate streams, find +/// the streams, and emits a new stream that containus the decompressed +/// streams along with the corrections needed to recreate the original. +/// +/// This output can then be compressed with a better algorithm, like Zstandard +/// and achieve much better compression than if we tried to compress the +/// deflate stream directlyh. 
+pub struct PreflateContainerProcessor { + content: Vec, + compression_stats: PreflateStats, + input_complete: bool, + total_plain_text_seen: u64, + + /// used to track the last attempted chunk size, in case we + /// need more input to continue, we will collect at least min_chunk_size + /// more input before trying to process again until we reach max_chunk_size + last_attempt_chunk_size: usize, + + state: ChunkParseState, + config: PreflateContainerConfig, + + /// running CRC-32 of all input bytes, written as the final block + input_crc: CrcHasher, + + /// each block is individually compressed with this encoder (v2 format) + encoder: Option>>, + + /// when present, all raw input is also fed to this encoder so we can measure + /// baseline Zstd compression (without preflate processing) + baseline_encoder: Option>, +} + +impl PreflateContainerProcessor { + /// Creates a processor that uses v2 format with a persistent Zstd encoder shared + /// across all non-JPEG blocks. JPEG blocks bypass the encoder entirely. 
+ pub fn new(config: &PreflateContainerConfig, level: i32, test_baseline: bool) -> Self { + PreflateContainerProcessor { + content: Vec::new(), + compression_stats: PreflateStats::default(), + input_complete: false, + state: ChunkParseState::Start, + total_plain_text_seen: 0, + last_attempt_chunk_size: 0, + config: config.clone(), + input_crc: CrcHasher::new(), + encoder: Some(zstd::stream::write::Encoder::new(Vec::new(), level).unwrap()), + baseline_encoder: if test_baseline { + Some( + zstd::stream::write::Encoder::new(MeasureWriteSink { length: 0 }, level) + .unwrap(), + ) + } else { + None + }, + } + } +} + +impl ProcessBuffer for PreflateContainerProcessor { + fn process_buffer( + &mut self, + input: &[u8], + input_complete: bool, + writer: &mut impl Write, + ) -> Result<()> { + use crate::container_common::COMPRESSED_WRAPPER_VERSION_2; + + if self.input_complete && (input.len() > 0 || !input_complete) { + return Err(PreflateError::new( + ExitCode::InvalidParameter, + "more data provided after input_complete signaled", + )); + } + + if input.len() > 0 { + self.compression_stats.deflate_compressed_size += input.len() as u64; + self.input_crc.update(input); + self.content.extend_from_slice(input); + + if let Some(encoder) = &mut self.baseline_encoder { + encoder.write_all(input).context()?; + } + } + + loop { + // wait until we have at least min_chunk_size before we start processing + if self.content.is_empty() + || (!input_complete + && (self.content.len() - self.last_attempt_chunk_size) + < self.config.min_chunk_size + && self.content.len() <= self.config.max_chunk_size) + { + break; + } + + self.last_attempt_chunk_size = self.content.len(); + + match &mut self.state { + ChunkParseState::Start => { + writer.write_all(&[COMPRESSED_WRAPPER_VERSION_2])?; + self.state = ChunkParseState::Searching(None); + } + ChunkParseState::Searching(prev_ihdr) => { + if self.total_plain_text_seen > self.config.total_plain_text_limit { + // once we've exceeded our limit, we 
don't do any more compression + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + + self.last_attempt_chunk_size = 0; + self.content.clear(); + break; + } + + // here we are looking for a deflate stream or PNG chunk + match find_compressable_stream( + &self.content, + prev_ihdr, + input_complete, + &self.config, + ) { + FindStreamResult::Found(next, chunk) => { + // the gap between the start and the beginning of the deflate stream + // is written out as a literal block + if next.start != 0 { + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, next.start as u32)?; + encoder.write_all(&self.content[..next.start])?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + } + + let (compressed_size, next_state) = write_chunk_block_v2( + self.encoder.as_mut().unwrap(), + writer, + chunk, + &mut self.compression_stats, + ) + .context()?; + self.compression_stats.zstd_compressed_size += compressed_size as u64; + + if let Some(mut state) = next_state { + self.total_plain_text_seen += state.plain_text().len() as u64; + state.shrink_to_dictionary(); + self.state = ChunkParseState::DeflateContinue(state); + } + + self.content.drain(0..next.end); + self.last_attempt_chunk_size = self.content.len(); + } + FindStreamResult::ShortRead => { + if input_complete || self.content.len() > self.config.max_chunk_size { + // if we have too much data or have no more data, + // we just write it out as a literal block with everything we have + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = 
emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + + self.content.clear(); + self.last_attempt_chunk_size = 0; + } else { + // we don't have enough data to process the stream, so we just + // wait for more data + break; + } + } + FindStreamResult::None => { + // couldn't find anything, just write the rest as a literal block + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + + self.content.clear(); + self.last_attempt_chunk_size = 0; + } + } + } + ChunkParseState::DeflateContinue(state) => { + // here we have a deflate stream that we need to continue + match state.decompress(&self.content) { + Err(ref e) + if e.exit_code() == ExitCode::ShortRead + && !input_complete + && self.content.len() <= self.config.max_chunk_size => + { + // Not enough data to complete the next block yet; wait for more. + break; + } + Err(_e) => { + // Stream analysis diverged or no more data is coming; give up on + // continuation and fall back to treating the remaining bytes as raw. 
+ self.state = ChunkParseState::Searching(None); + + log::debug!("Error while trying to continue compression {:?}", _e); + } + Ok(res) => { + log::debug!( + "Deflate continue: {} -> {}", + state.plain_text().len(), + res.compressed_size + ); + + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, res.corrections.len() as u32)?; + write_varint(encoder, state.plain_text().len() as u32)?; + encoder.write_all(&res.corrections)?; + encoder.write_all(&state.plain_text().text())?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE_CONTINUE, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + + self.total_plain_text_seen += state.plain_text().len() as u64; + self.compression_stats.overhead_bytes += res.corrections.len() as u64; + self.compression_stats.uncompressed_size += + state.plain_text().len() as u64; + + self.content.drain(0..res.compressed_size); + self.last_attempt_chunk_size = self.content.len(); + + if state.is_done() { + self.state = ChunkParseState::Searching(None); + } else { + state.shrink_to_dictionary(); + } + } + } + } + } + } + + if input_complete && !self.input_complete { + self.input_complete = true; + + if self.content.len() > 0 { + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + } + self.content.clear(); + + // Finalize the Zstd encoder; finish bytes are discarded since each block + // was already flushed and the decoder relies on EOF as the stream terminator. + let encoder = self.encoder.take().unwrap(); + let _ = encoder.finish(); + + // Write the CRC-32 end block: 0xFF sentinel + 4-byte LE CRC of original input. 
+ let crc = self.input_crc.clone().finalize(); + writer.write_all(&[0xFF])?; + writer.write_all(&crc.to_le_bytes())?; + + // Finalize baseline encoder for stats + if let Some(mut encoder) = self.baseline_encoder.take() { + encoder.flush().context()?; + encoder.do_finish().context()?; + self.compression_stats.zstd_baseline_size = encoder.get_mut().length as u64; + } + } + + Ok(()) + } + + fn stats(&self) -> PreflateStats { + self.compression_stats + } +} + +/// Flushes the encoder, writes [block_type][varint(compressed_size)][compressed_bytes] to +/// destination, clears the encoder's inner buffer, and returns the compressed byte count. +fn emit_compressed_block( + block_type: u8, + encoder: &mut zstd::stream::write::Encoder<'static, Vec>, + destination: &mut impl Write, +) -> Result { + encoder.flush().context()?; + let compressed = encoder.get_mut(); + let len = compressed.len(); + destination.write_all(&[block_type])?; + write_varint(destination, len as u32)?; + destination.write_all(compressed)?; + compressed.clear(); + Ok(len) +} + +fn webp_compress( + result: &mut impl Write, + plain_text: &[u8], + corrections: &[u8], + idat: &IdatContents, +) -> Result<()> { + use crate::container_common::BLOCK_TYPE_PNG; + log::debug!("{:?}", idat); + + #[cfg(feature = "webp")] + if let Some(png_header) = idat.png_header { + use crate::idat_parse::{PngColorType, undo_png_filters}; + use std::ops::Deref; + + let bbp = png_header.color_type.bytes_per_pixel(); + let w = png_header.width as usize; + let h = png_header.height as usize; + + log::debug!( + "plain text compressing {} bytes ({}x{}x{})", + plain_text.len(), + w, + h, + bbp + ); + + // see if the bitmap looks like the way with think it should (bits per pixel map + 1 height worth of filter bytes) + if (bbp * w * h) + h == plain_text.len() { + let (bitmap, filters) = undo_png_filters(plain_text, w, h, bbp); + + let enc = webp::Encoder::new( + &bitmap, + match png_header.color_type { + PngColorType::RGB => 
webp::PixelLayout::Rgb, + PngColorType::RGBA => webp::PixelLayout::Rgba, + }, + png_header.width, + png_header.height, + ); + + let mut webpconfig = webp::WebPConfig::new().unwrap(); + webpconfig.lossless = 1; + webpconfig.alpha_compression = 0; + webpconfig.exact = 1; // undocumented option, but required to not throw away color if alpha channel is zero + + // this is the default quality setting for webp lossless, we could dial it up + // but the quality gains are marginal for the CPU cost, although the + // CPU decompression cost is the same. + webpconfig.quality = 75.0; // 0..100 higher is slower but better compression + webpconfig.method = 4; // 0..6 higher is slower but better compression + + let comp = match enc.encode_advanced(&webpconfig) { + Ok(c) => c, + Err(e) => { + return preflate_rs::err_exit_code( + ExitCode::WebPDecodeError, + format!("Webp encode failed: {:?}", e), + ); + } + }; + + result.write_all(&[BLOCK_TYPE_PNG])?; // placeholder — caller skips this byte + + write_varint(result, corrections.len() as u32)?; + write_varint(result, comp.deref().len() as u32)?; + + log::debug!( + "Webp compressed {} bytes (vs {})", + comp.deref().len(), + idat.total_chunk_length + ); + + idat.write_to_bytestream(result)?; + result.write_all(&filters)?; + + result.write_all(&corrections)?; + result.write_all(comp.deref())?; + + return Ok(()); + } + } + + return preflate_rs::err_exit_code( + ExitCode::InvalidCompressedWrapper, + "Webp compression not supported", + ); +} diff --git a/container/src/idat_parse.rs b/container/src/idat_parse.rs index cc56363..ed225b0 100644 --- a/container/src/idat_parse.rs +++ b/container/src/idat_parse.rs @@ -4,7 +4,7 @@ use byteorder::{ReadBytesExt, WriteBytesExt}; use preflate_rs::{ExitCode, Result, err_exit_code}; -use crate::container_processor::{read_varint, write_varint}; +use crate::container_common::{read_varint, write_varint}; /// The contents of a PNG IDat stream. 
These are treated specially since they /// contain a Zlib stream that is split into multiple chunks and would be diff --git a/container/src/lib.rs b/container/src/lib.rs index 5a1f638..9b0593e 100644 --- a/container/src/lib.rs +++ b/container/src/lib.rs @@ -20,23 +20,17 @@ #![forbid(macro_use_extern_crate)] #![forbid(missing_unsafe_on_extern)] -mod container_processor; +mod container_common; +mod container_read; +mod container_write; mod idat_parse; mod scan_deflate; mod scoped_read; mod utils; -mod zstd_compression; -pub use zstd_compression::{ - zstd_preflate_whole_deflate_stream, zstd_recreate_whole_deflate_stream, -}; -pub use container_processor::{PreflateContainerConfig, PreflateStats}; -pub use container_processor::{ - PreflateContainerProcessor, ProcessBuffer, RecreateContainerProcessor, - preflate_whole_into_container, recreate_whole_from_container, -}; - -pub use zstd_compression::{ZstdCompressContext, ZstdDecompressContext}; +pub use container_common::{PreflateContainerConfig, PreflateStats, ProcessBuffer}; +pub use container_read::RecreateContainerProcessor; +pub use container_write::PreflateContainerProcessor; pub use utils::process_limited_buffer; diff --git a/container/src/utils.rs b/container/src/utils.rs index 0278ad0..8d345f5 100644 --- a/container/src/utils.rs +++ b/container/src/utils.rs @@ -1,48 +1,12 @@ use std::{ collections::VecDeque, - io::{BufRead, Read, Write}, + io::{Read, Write}, }; use preflate_rs::Result; use crate::ProcessBuffer; -/// A BufRead implementation that reads at most `limit` bytes from the underlying reader. 
-pub struct TakeReader { - inner: T, - amount_left: usize, -} - -impl TakeReader { - pub fn new(inner: T, limit: usize) -> Self { - TakeReader { - inner, - amount_left: limit, - } - } -} - -impl BufRead for TakeReader { - fn fill_buf(&mut self) -> std::io::Result<&[u8]> { - let buf = self.inner.fill_buf()?; - Ok(&buf[..buf.len().min(self.amount_left)]) - } - - fn consume(&mut self, amt: usize) { - self.amount_left -= amt; - self.inner.consume(amt); - } -} - -impl Read for TakeReader { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - let len = buf.len().min(self.amount_left); - let read = self.inner.read(&mut buf[..len])?; - self.amount_left -= read; - Ok(read) - } -} - #[allow(dead_code)] #[cfg(test)] pub fn write_file(filename: &str, data: &[u8]) { @@ -174,7 +138,7 @@ pub fn process_limited_buffer( #[test] fn test_process_limited_buffer() { - let mut p = crate::container_processor::NopProcessBuffer {}; + let mut p = crate::container_common::test::NopProcessBuffer {}; let input = b"Hello, world!"; let mut output = [0u8; 5]; diff --git a/container/src/zstd_compression.rs b/container/src/zstd_compression.rs deleted file mode 100644 index c0f1e69..0000000 --- a/container/src/zstd_compression.rs +++ /dev/null @@ -1,277 +0,0 @@ -//! Implements processors for Zstandard compression and decompression using -//! the ProcessBuffer model. These are designed to be chained together with -//! the other ProcessBuffer implementations to create a full compression or -//! decompression pipeline. - -use std::io::{BufRead, Write}; - -use crate::{ - PreflateContainerProcessor, PreflateStats, ProcessBuffer, RecreateContainerProcessor, - container_processor::PreflateContainerConfig, -}; - -use preflate_rs::{AddContext, ExitCode, PreflateError, Result}; - -/// processor that compresses the input using Zstandard -/// -/// Designed to wrap around the PreflateChunkProcessor. 
-pub struct ZstdCompressContext { - zstd_compress: zstd::stream::write::Encoder<'static, Vec>, - input_complete: bool, - internal: D, - - /// if set, the encoder will write all the input to a null zstd encoder to see how much - /// compression we would get if we just used Zstandard without any Preflate processing. - /// - /// This gives a fairer comparison of the compression ratio of Preflate + Zstandard vs. Zstandard - /// since Zstd does compress the data a bit, especially if there is a lot of non-Deflate streams - /// in the file. - test_baseline: Option>, - - zstd_baseline_size: u64, - zstd_compressed_size: u64, -} - -impl ZstdCompressContext { - pub fn new(internal: D, compression_level: i32, test_baseline: bool) -> Self { - ZstdCompressContext { - zstd_compress: zstd::stream::write::Encoder::new(Vec::new(), compression_level) - .unwrap(), - input_complete: false, - internal, - zstd_baseline_size: 0, - zstd_compressed_size: 0, - test_baseline: if test_baseline { - Some( - zstd::stream::write::Encoder::new( - MeasureWriteSink { length: 0 }, - compression_level, - ) - .unwrap(), - ) - } else { - None - }, - } - } -} - -impl ProcessBuffer for ZstdCompressContext { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - if self.input_complete && (input.len() > 0 || !input_complete) { - return Err(PreflateError::new( - ExitCode::InvalidParameter, - "more data provided after input_complete signaled", - )); - } - - if input.len() > 0 { - if let Some(encoder) = &mut self.test_baseline { - encoder.write_all(input).context()?; - } - } - - self.internal - .process_buffer(input, input_complete, &mut self.zstd_compress) - .context()?; - - if input_complete && !self.input_complete { - self.input_complete = true; - - self.zstd_compress.flush().context()?; - - if let Some(encoder) = &mut self.test_baseline { - encoder.flush()?; - encoder.do_finish()?; - self.zstd_baseline_size = encoder.get_mut().length as 
u64; - } - } - - let compressed = self.zstd_compress.get_mut(); - writer.write_all(compressed).context()?; - self.zstd_compressed_size += compressed.len() as u64; - compressed.drain(..); - - Ok(()) - } - - fn stats(&self) -> PreflateStats { - PreflateStats { - zstd_compressed_size: self.zstd_compressed_size, - zstd_baseline_size: self.zstd_baseline_size, - ..self.internal.stats() - } - } -} - -/// used to measure the length of the output without storing it anyway -struct MeasureWriteSink { - pub length: usize, -} - -impl Write for MeasureWriteSink { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.length += buf.len(); - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } -} - -/// Processor that decompresses the input using Zstandard -/// -/// Designed to wrap around the RecreateContainerProcessor. -pub struct ZstdDecompressContext { - zstd_decompress: zstd::stream::write::Decoder<'static, AcceptWrite>>, -} - -/// used to accept the output from the Zstandard decoder and write it to the output buffer. -/// Since the plain text is significantly larger than the compressed version, we want -/// to avoid buffering the output in memory, so we send it directly to the recreator. 
-struct AcceptWrite { - internal: D, - output: O, - input_complete: bool, -} - -impl Write for AcceptWrite { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.internal - .process_buffer(buf, self.input_complete, &mut self.output)?; - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } -} - -impl ZstdDecompressContext { - pub fn new(internal: D) -> Self { - ZstdDecompressContext { - zstd_decompress: zstd::stream::write::Decoder::new(AcceptWrite { - internal: internal, - output: Vec::new(), - input_complete: false, - }) - .unwrap(), - } - } -} - -impl ProcessBuffer for ZstdDecompressContext { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - if self.zstd_decompress.get_mut().input_complete && (input.len() > 0 || !input_complete) { - return Err(PreflateError::new( - ExitCode::InvalidParameter, - "more data provided after input_complete signaled", - )); - } - - if input.len() > 0 { - self.zstd_decompress.write_all(input).context()?; - } - - if input_complete && !self.zstd_decompress.get_mut().input_complete { - self.zstd_decompress.flush().context()?; - self.zstd_decompress.get_mut().input_complete = true; - } - - let a = self.zstd_decompress.get_mut(); - writer.write_all(&a.output).context()?; - a.output.clear(); - - Ok(()) - } -} - -/// Expands the Zlib compressed streams in the data and then recompresses the result -/// with Zstd with the given level. -pub fn zstd_preflate_whole_deflate_stream( - config: &PreflateContainerConfig, - input: &mut impl BufRead, - output: &mut impl Write, - compression_level: i32, -) -> Result { - let mut ctx = ZstdCompressContext::new( - PreflateContainerProcessor::new(config), - compression_level, - false, - ); - - ctx.copy_to_end(input, output).context()?; - - Ok(ctx.stats()) -} - -/// Decompresses the Zstd compressed data and then recompresses the result back -/// to the original Zlib compressed streams. 
-pub fn zstd_recreate_whole_deflate_stream( - input: &mut impl BufRead, - output: &mut impl Write, -) -> Result<()> { - let mut ctx = ZstdDecompressContext::::new( - RecreateContainerProcessor::new(1024 * 1024 * 128), - ); - - ctx.copy_to_end(input, output).context()?; - - Ok(()) -} - -#[test] -fn verify_zip_compress_zstd() { - use crate::utils::read_file; - let v = read_file("samplezip.zip"); - - let mut compressed = Vec::new(); - let stats = zstd_preflate_whole_deflate_stream( - &PreflateContainerConfig::default(), - &mut std::io::Cursor::new(&v), - &mut compressed, - 1, // for testing use a lower level to save CPU - ) - .unwrap(); - - let mut recreated = Vec::new(); - zstd_recreate_whole_deflate_stream(&mut std::io::Cursor::new(&compressed), &mut recreated) - .unwrap(); - - assert!(v == recreated); - println!( - "original zip = {} bytes, expanded = {} bytes recompressed zip = {} bytes", - v.len(), - stats.uncompressed_size, - compressed.len() - ); -} - -/// tests zstd compression buffer processing without involving preflate code -#[test] -fn roundtrip_zstd_only_contexts() { - use crate::container_processor::NopProcessBuffer; - use crate::utils::{assert_eq_array, read_file}; - use crate::zstd_compression::{ZstdCompressContext, ZstdDecompressContext}; - - let original = read_file("samplezip.zip"); - - let mut context = ZstdCompressContext::new(NopProcessBuffer {}, 9, false); - let compressed = context.process_vec_size(&original, 997).unwrap(); - - let mut context = ZstdDecompressContext::new(NopProcessBuffer {}); - let recreated = context.process_vec_size(&compressed, 997).unwrap(); - - assert_eq_array(&original, &recreated); -} diff --git a/dll/CLAUDE.md b/dll/CLAUDE.md new file mode 100644 index 0000000..28ca341 --- /dev/null +++ b/dll/CLAUDE.md @@ -0,0 +1,73 @@ +# dll (preflate_rs_0_7) + +C-compatible DLL for .NET interop. Exposes a streaming compress/decompress API as +`extern "C"` functions. 
The version number is baked into the crate name +(`preflate_rs_0_7`) for binary compatibility. + +## Exported C API (`src/unmanaged_api.rs`) + +### Compression + +```c +void* create_compression_context(uint32_t flags); +void free_compression_context(void* context); +int32_t compress_buffer( + void* context, + const uint8_t* input, size_t input_size, + bool input_complete, + uint8_t* output, size_t output_size, + size_t* result_size, + char* error_string, size_t error_string_buffer_len +); +void get_compression_stats(void* context, /* stat out-params */); +``` + +`flags` encoding: +- bits 0–4: Zstd compression level +- bit 5: `test_baseline` +- bit 6: `verify` + +Return value of `compress_buffer`: `0` = more output available, `1` = done, `<0` = error. + +### Decompression + +```c +void* create_decompression_context(uint32_t flags, size_t capacity); +void free_decompression_context(void* context); +int32_t decompress_buffer( + void* context, + const uint8_t* input, size_t input_size, + bool input_complete, + uint8_t* output, size_t output_size, + size_t* result_size, + char* error_string, size_t error_string_buffer_len +); +``` + +## Internal Structs + +```rust +struct CompressionContext { + magic: u32, // MAGIC_COMPRESSION_CONTEXT = 0x4B3CFF2E + internal: PreflateContainerProcessor, + output_extra: VecDeque<u8>, // buffers overflow when C buffer is too small +} +struct DecompressionContext { + magic: u32, // MAGIC_DECOMPRESSION_CONTEXT = 0x053D2AB1 + internal: RecreateContainerProcessor, + output_extra: VecDeque<u8>, +} +``` + +Magic numbers are validated on every call to catch dangling/wrong pointer bugs. + +## Safety Notes + +- Uses `#[unsafe(no_mangle)]` on exported functions — the only place in the workspace + where `unsafe` appears (required for C FFI entry points). +- `catch_unwind_result()` wraps every entry point to prevent panics crossing the FFI boundary. +- All other code in the crate remains safe Rust. 
+ +## Build Output + +`cdylib` — produces `preflate_rs_0_7.dll` on Windows. diff --git a/dll/Cargo.toml b/dll/Cargo.toml index e4f06a0..a4ef4ad 100644 --- a/dll/Cargo.toml +++ b/dll/Cargo.toml @@ -3,12 +3,12 @@ # this makes sure that we can keep old versions around to decode old encodings since the format # is complicated enough that maintaining backwards compat is hard (even minor changes in the # predictor will break the format) -version = "0.7.5" name = "preflate_rs_0_7" -edition = "2024" -authors = ["Kristof Roomp "] -license = "Apache-2.0" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +rust-version.workspace = true [dependencies] preflate-rs = { path = "../preflate" } diff --git a/dll/src/unmanaged_api.rs b/dll/src/unmanaged_api.rs index 99b8701..4d117c7 100644 --- a/dll/src/unmanaged_api.rs +++ b/dll/src/unmanaged_api.rs @@ -6,7 +6,7 @@ use std::{ use preflate_container::{ PreflateContainerConfig, PreflateContainerProcessor, ProcessBuffer, RecreateContainerProcessor, - ZstdCompressContext, ZstdDecompressContext, process_limited_buffer, + process_limited_buffer, }; use preflate_rs::{ExitCode, PreflateError}; @@ -198,7 +198,7 @@ pub unsafe extern "C" fn get_compression_stats( struct CompressionContext { magic: u32, - internal: ZstdCompressContext, + internal: PreflateContainerProcessor, output_extra: VecDeque, } @@ -221,12 +221,12 @@ impl CompressionContext { fn new(verify: bool, compression_level: i32, test_baseline: bool) -> Self { CompressionContext { magic: MAGIC_COMPRESSION_CONTEXT, - internal: ZstdCompressContext::new( - PreflateContainerProcessor::new(&PreflateContainerConfig { + internal: PreflateContainerProcessor::new( + &PreflateContainerConfig { validate_compression: verify, max_chain_length: 1024, // lower max chain to avoid excessive CPU usage ..PreflateContainerConfig::default() - }), + }, compression_level, test_baseline, ), @@ -237,7 +237,7 @@ impl 
CompressionContext { struct DecompressionContext { magic: u32, - internal: ZstdDecompressContext, + internal: RecreateContainerProcessor, output_extra: VecDeque, } @@ -255,11 +255,9 @@ impl DecompressionContext { } fn new(capacity: usize) -> Self { - let internal = ZstdDecompressContext::new(RecreateContainerProcessor::new(capacity)); - DecompressionContext { magic: MAGIC_DECOMRESSION_CONTEXT, - internal, + internal: RecreateContainerProcessor::new(capacity), output_extra: VecDeque::new(), } } diff --git a/fuzz/CLAUDE.md b/fuzz/CLAUDE.md new file mode 100644 index 0000000..608fd4c --- /dev/null +++ b/fuzz/CLAUDE.md @@ -0,0 +1,30 @@ +# fuzz (preflate-rs-fuzz) + +libfuzzer harnesses for fuzzing the core and container APIs. Not published; requires +the `fuzzing` cargo feature. + +## Harnesses + +### `fuzz_target_1` — core round-trip + +Feeds arbitrary bytes to `preflate_whole_deflate_stream()` as a raw DEFLATE stream, +then attempts `recreate_whole_deflate_stream()` on the result. Verifies no crash or panic. + +### `fuzz_container` — container round-trip + +Feeds arbitrary bytes (minimum 1 byte) to `preflate_whole_into_container()`, then +`recreate_whole_from_container()`, and asserts the output matches the original input. + +## Running Fuzz Tests + +```bash +# Requires nightly and cargo-fuzz +cargo +nightly fuzz run fuzz_target_1 +cargo +nightly fuzz run fuzz_container +``` + +## Notes + +- Edition 2021 (older than the main workspace crates which use 2024). +- `libfuzzer-sys` (0.4) provides the fuzzing harness glue. +- Corpus and artifacts are stored under `fuzz/corpus/` and `fuzz/artifacts/` (gitignored). diff --git a/preflate/CLAUDE.md b/preflate/CLAUDE.md new file mode 100644 index 0000000..e02dddd --- /dev/null +++ b/preflate/CLAUDE.md @@ -0,0 +1,83 @@ +# preflate (core library) + +Core DEFLATE analysis and reconstruction. 
Analyzes a DEFLATE bitstream, extracts the +uncompressed plaintext plus a compact set of reconstruction parameters, and later recreates +the bit-exact original bitstream. + +## Public API (`lib.rs`) + +```rust +// Compress: analyze a DEFLATE stream and produce plaintext + correction data +PreflateStreamProcessor::new(config) -> Self +PreflateStreamProcessor::decompress(input: &[u8]) -> Result + +// Recreate: given plaintext + correction data, reproduce the original DEFLATE stream +RecreateStreamProcessor::new() -> Self +RecreateStreamProcessor::recreate(chunk: PreflateStreamChunkResult) -> Result> + +// One-shot helpers +preflate_whole_deflate_stream(input, config) -> Result<(Vec, Vec)> +recreate_whole_deflate_stream(plaintext, correction_data) -> Result> +``` + +`PreflateConfig` controls `max_chain_length`, `plain_text_limit`, and `verify_compression`. + +## Processing Pipeline + +``` +DEFLATE bytes + └─ deflate/deflate_reader.rs → tokens (literals + back-refs) + └─ estimator/ → TokenPredictorParameters (hash algo, nice_len, max_chain…) + └─ token_predictor.rs → predicted tokens (replaying the original compressor) + └─ tree_predictor.rs → predicted Huffman trees + └─ cabac_codec.rs → encode *differences* from prediction → correction bytes +``` + +Reconstruction runs the same pipeline in reverse. 
+ +## Key Types + +| Type | Where | Purpose | +|---|---|---| +| `PlainText` | `preflate_input.rs` | Wraps uncompressed data | +| `TokenPredictorParameters` | `token_predictor.rs` | Compressor fingerprint | +| `HashAlgorithm` | `hash_algorithm.rs` | Zlib / Miniz / Libdeflate / zlib-ng / … | +| `PreflateError` / `ExitCode` | `preflate_error.rs` | 29 error variants with context | +| `DeflateToken` | `deflate/deflate_token.rs` | Literal or length/distance match | + +## Module Layout + +``` +src/ + lib.rs ← public API, PreflateConfig + stream_processor.rs ← PreflateStreamProcessor, RecreateStreamProcessor + deflate/ + deflate_reader.rs ← DEFLATE bitstream → tokens + deflate_writer.rs ← tokens → DEFLATE bitstream + bit_reader.rs / bit_writer.rs + huffman_calc.rs / huffman_encoding.rs + deflate_token.rs / deflate_constants.rs + estimator/ + preflate_parameter_estimator.rs ← main estimator entry point + complevel_estimator.rs + depth_estimator.rs + add_policy_estimator.rs + preflate_parse_config.rs + preflate_stream_info.rs + token_predictor.rs + tree_predictor.rs + statistical_codec.rs + cabac_codec.rs + hash_algorithm.rs + hash_chain.rs / hash_chain_holder.rs + preflate_input.rs + preflate_error.rs + bit_helper.rs / utils.rs +``` + +## Constraints + +- `#![forbid(unsafe_code)]` — strictly enforced. +- Serialization: parameters via `bitcode`; correction data via CABAC (`cabac` crate). +- The format is chunked to bound peak memory use. +- `#![deny(trivial_casts, non_ascii_idents)]` also set. 
diff --git a/preflate/Cargo.toml b/preflate/Cargo.toml index cf89cf9..6310898 100644 --- a/preflate/Cargo.toml +++ b/preflate/Cargo.toml @@ -1,17 +1,17 @@ [package] name = "preflate-rs" -version = "0.7.5" -edition = "2024" -authors = ["Kristof Roomp "] -license = "Apache-2.0" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +rust-version.workspace = true +repository.workspace = true description = """ Decompresses existing DEFLATE and PNG streams to allow for better with a more state-of-the-art compression -(eg with ZStandard, Brotli) while allowing the exact original binary DEFLATE stream to be recreated +(eg with ZStandard, Brotli) while allowing the exact original binary DEFLATE stream to be recreated by detecting the parameters used during compression. """ readme = "../README.md" -repository = "https://github.com/microsoft/preflate-rs" categories = ["compression"] keywords = ["gzip", "deflate", "zlib", "zip", "png"] diff --git a/preflate/src/stream_processor.rs b/preflate/src/stream_processor.rs index 813abe6..717ca49 100644 --- a/preflate/src/stream_processor.rs +++ b/preflate/src/stream_processor.rs @@ -876,3 +876,115 @@ fn verify_partial_blocks() { ); } } + +/// Replicates exactly what `scan_deflate::find_compressable_stream` does when it +/// encounters a gzip stream in a truncated 200 KiB content buffer: +/// +/// 1. strip the 10-byte gzip header (and 8-byte footer) to obtain the raw DEFLATE body +/// 2. call `decompress` with only the first ~190 KB of that body (what fits in the +/// 200 KiB content window after the header) +/// 3. assert the call returns `Ok` and `is_done() == false` ← DeflateContinue requires this +/// 4. call `decompress` again with the remainder of the body, assert `is_done() == true` +/// 5. 
reconstruct the original DEFLATE bytes and assert roundtrip identity +#[test] +fn verify_decompress_partial_gzip_deflate_body_roundtrip() { + crate::init_logging(); + + // sample1.bin.gz: 263 972 bytes total; gzip header = 10 bytes, footer = 8 bytes + // → raw DEFLATE body = 263 954 bytes + let gzip = crate::utils::read_file("sample1.bin.gz"); + assert!(gzip.len() > 18, "gzip file too short"); + + // Gzip header for this file has no extra flags, so it is exactly 10 bytes. + let deflate_start: usize = 10; + let deflate_end: usize = gzip.len() - 8; + let deflate_body = &gzip[deflate_start..deflate_end]; + + // Mimic what find_compressable_stream does: content buffer contains the first + // 200 000 bytes of the gzip file, and the DEFLATE body starts at offset 10, + // so the slice passed to decompress() is bytes [10..200000] = 199 990 bytes. + let content_window: usize = 200_000 - deflate_start; // 199 990 + assert!( + content_window < deflate_body.len(), + "content window must truncate the DEFLATE body (window={content_window}, body={})", + deflate_body.len() + ); + + let mut state = PreflateStreamProcessor::new(&PreflateConfig::default()); + + // ── First call: truncated slice ────────────────────────────────────────── + let r1 = state.decompress(&deflate_body[..content_window]); + let r1 = match r1 { + Ok(r) => r, + Err(e) => panic!( + "decompress on truncated DEFLATE body ({content_window} B of {} B) returned \ + Err({e:?}); expected Ok(partial) with is_done()=false", + deflate_body.len() + ), + }; + assert!( + !state.is_done(), + "is_done() must be false after truncated first call; compressed_size={}", + r1.compressed_size + ); + assert!( + r1.compressed_size > 0, + "at least one block must have been consumed" + ); + assert!( + r1.compressed_size <= content_window, + "compressed_size ({}) must not exceed the slice length ({content_window})", + r1.compressed_size + ); + println!( + "first call: compressed_size={} / {content_window} blocks={}", + 
r1.compressed_size, + r1.blocks.len() + ); + + let first_corrections = r1.corrections.clone(); + let first_plain_text = state.plain_text().text().to_vec(); + state.shrink_to_dictionary(); + + // ── Second call: remainder ──────────────────────────────────────────────── + let offset = r1.compressed_size; + let r2 = state.decompress(&deflate_body[offset..]); + let r2 = match r2 { + Ok(r) => r, + Err(e) => panic!( + "decompress on remainder ({} B) returned Err({e:?})", + deflate_body.len() - offset + ), + }; + assert!( + state.is_done(), + "is_done() must be true after consuming the full body" + ); + println!( + "second call: compressed_size={} blocks={}", + r2.compressed_size, + r2.blocks.len() + ); + + let second_corrections = r2.corrections.clone(); + let second_plain_text = state.plain_text().text().to_vec(); + + // ── Roundtrip: reconstruct the original bytes ───────────────────────────── + let mut reconstruct = RecreateStreamProcessor::new(); + let (mut recompressed, _) = reconstruct + .recompress( + &mut std::io::Cursor::new(&first_plain_text), + &first_corrections, + ) + .expect("recompress chunk 1 failed"); + + let (mut rest, _) = reconstruct + .recompress( + &mut std::io::Cursor::new(&second_plain_text), + &second_corrections, + ) + .expect("recompress chunk 2 failed"); + + recompressed.append(&mut rest); + crate::utils::assert_eq_array(deflate_body, &recompressed); +} diff --git a/samples/test_big_then_small_gzip.bin b/samples/test_big_then_small_gzip.bin new file mode 100644 index 0000000..c3618df Binary files /dev/null and b/samples/test_big_then_small_gzip.bin differ diff --git a/samples/test_corrupted_deflate.bin b/samples/test_corrupted_deflate.bin new file mode 100644 index 0000000..8ae9d48 Binary files /dev/null and b/samples/test_corrupted_deflate.bin differ diff --git a/samples/test_gzip_with_gap.bin b/samples/test_gzip_with_gap.bin new file mode 100644 index 0000000..1cdd72c Binary files /dev/null and b/samples/test_gzip_with_gap.bin differ diff 
--git a/samples/test_random_bytes.bin b/samples/test_random_bytes.bin new file mode 100644 index 0000000..f1ab9b1 Binary files /dev/null and b/samples/test_random_bytes.bin differ diff --git a/samples/test_tiny_gzip.bin b/samples/test_tiny_gzip.bin new file mode 100644 index 0000000..d3f8415 Binary files /dev/null and b/samples/test_tiny_gzip.bin differ diff --git a/samples/test_two_gzip_streams.bin b/samples/test_two_gzip_streams.bin new file mode 100644 index 0000000..aafddc4 Binary files /dev/null and b/samples/test_two_gzip_streams.bin differ diff --git a/samples/test_two_zlib_streams.bin b/samples/test_two_zlib_streams.bin new file mode 100644 index 0000000..ffd8032 Binary files /dev/null and b/samples/test_two_zlib_streams.bin differ diff --git a/samples/test_zip_3entries.zip b/samples/test_zip_3entries.zip new file mode 100644 index 0000000..e5965a2 Binary files /dev/null and b/samples/test_zip_3entries.zip differ diff --git a/samples/test_zip_stored_then_deflated.zip b/samples/test_zip_stored_then_deflated.zip new file mode 100644 index 0000000..bfc6f74 Binary files /dev/null and b/samples/test_zip_stored_then_deflated.zip differ diff --git a/tests/CLAUDE.md b/tests/CLAUDE.md new file mode 100644 index 0000000..e105e7f --- /dev/null +++ b/tests/CLAUDE.md @@ -0,0 +1,37 @@ +# tests (integration tests) + +End-to-end round-trip tests against real sample files in `samples/`. + +## What Is Tested + +- **Core round-trip**: decompress a `.deflate` file with `preflate_whole_deflate_stream`, + recompress with `recreate_whole_deflate_stream`, assert bitwise identical output. +- **Container round-trip**: compress a ZIP / PNG / DOCX / PDF through + `PreflateContainerProcessor`, decompress with `RecreateContainerProcessor`, + assert the output matches the original file byte-for-byte. 
+ +## Sample Files (`samples/`) + +The `samples/` directory contains real-world compressed files used as test fixtures: +deflate streams, zlib streams, PNGs, ZIPs, PDFs, DOCX, JPEG, WebP, and binary blobs +from various compressors (zlib, zlib-ng, libdeflate, miniz, Windows zlib). + +These files are checked into the repository. Do not remove or alter them without +updating the corresponding tests. + +## Running + +```bash +cargo test --all # all integration tests +cargo test --package preflate-rs # core tests only +cargo test --package preflate-container # container tests only +cargo test <test_name> # single test by name +cargo test -- --nocapture # show println! output +``` + +## Notes + +- Tests live in `tests/end_to_end.rs` at the workspace root. +- Some tests use `libdeflate-sys` to generate reference compressions on the fly. +- Test failures often mean a regression in the estimator or token predictor; check + those modules first. diff --git a/tests/end_to_end.rs b/tests/end_to_end.rs index a98010b..0458039 100644 --- a/tests/end_to_end.rs +++ b/tests/end_to_end.rs @@ -10,7 +10,9 @@ use std::path::Path; use std::{mem, ptr}; use libdeflate_sys::{libdeflate_alloc_compressor, libdeflate_deflate_compress}; -use preflate_container::{zstd_preflate_whole_deflate_stream, zstd_recreate_whole_deflate_stream}; +use preflate_container::{ + PreflateContainerConfig, PreflateContainerProcessor, ProcessBuffer, RecreateContainerProcessor, +}; use preflate_rs::{PreflateConfig, preflate_whole_deflate_stream, recreate_whole_deflate_stream}; #[cfg(test)] @@ -77,18 +79,15 @@ fn test_container(filename: &str) { let v = read_file(filename); let mut c = Vec::new(); - - let stats = zstd_preflate_whole_deflate_stream( - &preflate_container::PreflateContainerConfig::default(), - &mut std::io::Cursor::new(&v), - &mut c, - 4, // use lower level to save CPU on testing - ) - .unwrap(); + let mut ctx = PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 4, false); + ctx.copy_to_end(&mut 
std::io::Cursor::new(&v), &mut c) + .unwrap(); + let stats = ctx.stats(); let mut r = Vec::new(); - - zstd_recreate_whole_deflate_stream(&mut std::io::Cursor::new(&c), &mut r).unwrap(); + let mut ctx = RecreateContainerProcessor::new(128 * 1024 * 1024); + ctx.copy_to_end(&mut std::io::Cursor::new(&c), &mut r) + .unwrap(); assert!(v == r); println!( diff --git a/util/CLAUDE.md b/util/CLAUDE.md new file mode 100644 index 0000000..bfa5ba6 --- /dev/null +++ b/util/CLAUDE.md @@ -0,0 +1,38 @@ +# util (preflate_util CLI) + +CLI tool for manually testing preflate compression on files and directories. + +## Usage + +``` +preflate_util [OPTIONS] <PATH> + +Options: + --max-chain <N> Hash chain depth limit (default: 4096) + -c, --level <LEVEL> Zstd compression level 0-14 (default: 9) + --loglevel <LOGLEVEL> Log level (default: Error) + --verify Round-trip verify after compress (default: true) + --baseline Measure baseline Zstd-only size (default: false) +``` + +`<PATH>` may be a single file or a directory (scanned recursively). + +## What It Does + +1. For each file, calls `PreflateContainerProcessor` to compress. +2. Optionally calls `RecreateContainerProcessor` to decompress and byte-compares the result. +3. Prints per-file and aggregate statistics: compressed size, baseline size, CPU time. + +## Source + +Single file: `src/main.rs` (~193 lines). + +Helper `assert_eq_array()` provides detailed positional diff output for debugging +mismatches during verification. 
+ +## Dependencies + +- `clap` (4, derive) — argument parsing +- `cpu-time` (1) — CPU time measurement +- `preflate-rs` and `preflate-container` — core logic +- `env_logger` / `log` — logging diff --git a/util/Cargo.toml b/util/Cargo.toml index 5c5b026..a8f4a4a 100644 --- a/util/Cargo.toml +++ b/util/Cargo.toml @@ -1,9 +1,10 @@ [package] name = "preflate_util" -edition = "2024" -authors = ["Kristof Roomp "] -license = "Apache-2.0" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +rust-version.workspace = true [dependencies] preflate-rs = { path = "../preflate" } diff --git a/util/src/main.rs b/util/src/main.rs index 244daeb..5d8cca4 100644 --- a/util/src/main.rs +++ b/util/src/main.rs @@ -11,7 +11,6 @@ use std::{ use preflate_container::{ PreflateContainerConfig, PreflateContainerProcessor, ProcessBuffer, RecreateContainerProcessor, - ZstdCompressContext, ZstdDecompressContext, }; #[derive(Parser)] @@ -99,11 +98,7 @@ fn main() { // open file for reading let original = fs::read(&entry).unwrap(); - let mut ctx = ZstdCompressContext::new( - PreflateContainerProcessor::new(&config), - cli.level as i32, - cli.baseline, - ); + let mut ctx = PreflateContainerProcessor::new(&config, cli.level as i32, cli.baseline); let compress_start = ProcessTime::now(); @@ -140,9 +135,7 @@ fn main() { let start = ProcessTime::now(); let mut recreated = Vec::new(); - let mut decomp = ZstdDecompressContext::new(RecreateContainerProcessor::new( - config.chunk_plain_text_limit, - )); + let mut decomp = RecreateContainerProcessor::new(config.chunk_plain_text_limit); if let Err(e) = decomp.copy_to_end_size( &mut Cursor::new(&preflate_compressed),