From 9db650c4f7b2a8a1d450f05f01c8281fee7e7ab5 Mon Sep 17 00:00:00 2001 From: Kristof Date: Tue, 24 Feb 2026 23:32:31 +0100 Subject: [PATCH 1/8] v1 of refactor --- container/src/container_processor.rs | 943 +++++++++++++-------------- container/src/lib.rs | 7 - container/src/zstd_compression.rs | 277 -------- dll/src/unmanaged_api.rs | 16 +- tests/end_to_end.rs | 19 +- util/src/main.rs | 9 +- 6 files changed, 487 insertions(+), 784 deletions(-) delete mode 100644 container/src/zstd_compression.rs diff --git a/container/src/container_processor.rs b/container/src/container_processor.rs index 1d2fe24..b086ace 100644 --- a/container/src/container_processor.rs +++ b/container/src/container_processor.rs @@ -11,7 +11,6 @@ use crate::{ idat_parse::{IdatContents, PngHeader, recreate_idat}, scan_deflate::{FindStreamResult, FoundStream, FoundStreamType, find_compressable_stream}, scoped_read::ScopedRead, - utils::TakeReader, }; use preflate_rs::{ @@ -80,7 +79,7 @@ impl PreflateContainerConfig { } } -const COMPRESSED_WRAPPER_VERSION_1: u8 = 1; +const COMPRESSED_WRAPPER_VERSION_2: u8 = 2; /// literal chunks are just copied to the output const LITERAL_CHUNK: u8 = 0; @@ -97,6 +96,12 @@ const DEFLATE_STREAM_CONTINUE: u8 = 3; /// JPEG Lepton compressed chunks are JPEG Lepton compressed const JPEG_LEPTON_COMPRESSED: u8 = 4; +/// PNG chunk stored as WebP lossless — already compressed, written raw (bypasses Zstd) +const WEBP_COMPRESSED: u8 = 5; + +/// V2 end-of-stream marker that carries the final Zstd finish bytes +const ZSTD_END_OF_STREAM: u8 = 0xFF; + pub(crate) fn write_varint(destination: &mut impl Write, value: u32) -> std::io::Result<()> { let mut value = value; loop { @@ -148,35 +153,49 @@ fn test_variant_roundtrip() { } } -fn write_literal_block(content: &[u8], destination: &mut impl Write) -> Result<()> { - destination.write_all(&[LITERAL_CHUNK])?; - write_varint(destination, content.len() as u32)?; - destination.write_all(content)?; - Ok(()) +/// Flushes the encoder, writes [block_type][varint(compressed_size)][compressed_bytes] to +/// destination, clears the encoder's inner buffer, and returns the compressed byte count. +fn emit_compressed_block( + block_type: u8, + encoder: &mut zstd::stream::write::Encoder<'static, Vec>, + destination: &mut impl Write, +) -> Result { + encoder.flush().context()?; + let compressed = encoder.get_mut(); + let len = compressed.len(); + destination.write_all(&[block_type])?; + write_varint(destination, len as u32)?; + destination.write_all(compressed)?; + compressed.clear(); + Ok(len) } -fn write_chunk_block( - result: &mut impl Write, +/// V2 variant of write_chunk_block: block content goes through the persistent Zstd encoder. +/// JPEG blocks are written raw to writer (bypass encoder). +/// Returns (total compressed bytes written, optional continue state). 
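+///
+/// A sketch of the emitted layout (sizes illustrative, per emit_compressed_block): a deflate
+/// stream whose corrections plus plain text Zstd-flush to N bytes is written as
+/// [DEFLATE_STREAM][varint(N)][N zstd bytes]; a PNG converted to WebP is written raw as
+/// [WEBP_COMPRESSED][varint(len)][payload]; a JPEG is written raw as
+/// [JPEG_LEPTON_COMPRESSED][varint(len)][lepton bytes].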
+fn write_chunk_block_v2(
+    encoder: &mut zstd::stream::write::Encoder<'static, Vec<u8>>,
+    writer: &mut impl Write,
     chunk: FoundStream,
     stats: &mut PreflateStats,
-) -> Result<Option<PreflateStreamProcessor>> {
+) -> Result<(usize, Option<PreflateStreamProcessor>)> {
     match chunk.chunk_type {
         FoundStreamType::DeflateStream(parameters, state) => {
-            result.write_all(&[DEFLATE_STREAM])?;
+            write_varint(encoder, chunk.corrections.len() as u32)?;
+            write_varint(encoder, state.plain_text().text().len() as u32)?;
+            encoder.write_all(&chunk.corrections)?;
+            encoder.write_all(&state.plain_text().text())?;
 
-            write_varint(result, chunk.corrections.len() as u32)?;
-            write_varint(result, state.plain_text().text().len() as u32)?;
-
-            result.write_all(&chunk.corrections)?;
-            result.write_all(&state.plain_text().text())?;
+            let compressed_size = emit_compressed_block(DEFLATE_STREAM, encoder, writer)?;
 
             stats.overhead_bytes += chunk.corrections.len() as u64;
             stats.uncompressed_size += state.plain_text().len() as u64;
             stats.hash_algorithm = parameters.hash_algorithm;
 
             if !state.is_done() {
-                return Ok(Some(state));
+                return Ok((compressed_size, Some(state)));
             }
+            Ok((compressed_size, None))
         }
 
         FoundStreamType::IDATDeflate(parameters, mut idat, plain_text) => {
@@ -186,218 +205,67 @@ fn write_chunk_block(
                 chunk.corrections.len()
             );
 
-            if webp_compress(result, plain_text.text(), &chunk.corrections, &idat).is_err() {
-                log::debug!("non-Webp compressed {}", idat.total_chunk_length);
+            let mut temp_vec = Vec::new();
 
-                result.write_all(&[PNG_COMPRESSED])?;
-                write_varint(result, chunk.corrections.len() as u32)?;
-                write_varint(result, plain_text.text().len() as u32)?;
+            if webp_compress(&mut temp_vec, plain_text.text(), &chunk.corrections, &idat).is_ok() {
+                // WebP is already compressed — write raw, bypassing the Zstd encoder.
+                // temp_vec[0] is the PNG_COMPRESSED type byte; temp_vec[1..] is the payload.
+                let payload = &temp_vec[1..];
+                writer.write_all(&[WEBP_COMPRESSED])?;
+                write_varint(writer, payload.len() as u32)?;
+                writer.write_all(payload)?;
 
+                stats.uncompressed_size += plain_text.len() as u64;
+                stats.hash_algorithm = parameters.hash_algorithm;
+                stats.overhead_bytes += chunk.corrections.len() as u64;
+
+                Ok((payload.len(), None))
+            } else {
+                // Non-WebP PNG: corrections + plaintext are compressible, send through Zstd.
+ log::debug!("non-Webp compressed {}", idat.total_chunk_length); + write_varint(encoder, chunk.corrections.len() as u32)?; + write_varint(encoder, plain_text.text().len() as u32)?; idat.png_header = None; - idat.write_to_bytestream(result)?; + idat.write_to_bytestream(encoder)?; + encoder.write_all(&chunk.corrections)?; + encoder.write_all(plain_text.text())?; - result.write_all(&chunk.corrections)?; - result.write_all(&plain_text.text())?; - } + let compressed_size = emit_compressed_block(PNG_COMPRESSED, encoder, writer)?; - stats.uncompressed_size += plain_text.len() as u64; - stats.hash_algorithm = parameters.hash_algorithm; - stats.overhead_bytes += chunk.corrections.len() as u64; + stats.uncompressed_size += plain_text.len() as u64; + stats.hash_algorithm = parameters.hash_algorithm; + stats.overhead_bytes += chunk.corrections.len() as u64; + + Ok((compressed_size, None)) + } } - FoundStreamType::JPEGLepton(data) => { - result.write_all(&[JPEG_LEPTON_COMPRESSED])?; - write_varint(result, data.len() as u32)?; - result.write_all(&data)?; + FoundStreamType::JPEGLepton(data) => { + // JPEG is written raw (bypasses the encoder entirely) + writer.write_all(&[JPEG_LEPTON_COMPRESSED])?; + write_varint(writer, data.len() as u32)?; + writer.write_all(&data)?; stats.uncompressed_size += data.len() as u64; + Ok((0, None)) } } - Ok(None) -} - -/// Scans for multiple deflate streams in an arbitrary binary file, decompresses the streams and -/// returns an uncompressed file that can then be recompressed using a better algorithm. -/// This can then be passed back into recreate_whole_from_container to recreate the exact original file. -/// -/// Note that the result is NOT compressed and has to be compressed by some other algorithm -/// in order to see any savings. -/// -/// This is a wrapper for PreflateContainerProcessor. -pub fn preflate_whole_into_container( - config: &PreflateContainerConfig, - compressed_data: &mut impl BufRead, - write: &mut impl Write, -) -> Result { - let mut context = PreflateContainerProcessor::new(&config); - context.copy_to_end(compressed_data, write).unwrap(); - - Ok(context.stats()) -} - -/// Takes the binary output of preflate_whole_into_container and recreates the original file. -/// -/// This is a wrapper for RecreateContainerProcessor. 
-pub fn recreate_whole_from_container( - source: &mut impl BufRead, - destination: &mut impl Write, -) -> Result<()> { - let mut recreate = RecreateContainerProcessor::new(usize::MAX); - recreate.copy_to_end(source, destination).context() -} - -#[cfg(test)] -fn read_chunk_block_slow( - source: &mut impl BufRead, - destination: &mut impl Write, -) -> std::result::Result<(), PreflateError> { - let mut p = RecreateContainerProcessor::new_single_chunk(usize::MAX); - p.copy_to_end_size(source, destination, 1).context() -} - -#[test] -fn roundtrip_chunk_block_literal() { - let mut buffer = Vec::new(); - - write_literal_block(b"hello", &mut buffer).unwrap(); - - let mut read_cursor = std::io::Cursor::new(buffer); - let mut destination = Vec::new(); - read_chunk_block_slow(&mut read_cursor, &mut destination).unwrap(); - - assert!(destination == b"hello"); -} - -#[test] -fn roundtrip_chunk_block_deflate() { - let contents = crate::utils::read_file("compressed_zlib_level1.deflate"); - - let mut stream_state = PreflateStreamProcessor::new(&PreflateConfig::default()); - let results = stream_state.decompress(&contents).unwrap(); - - let mut buffer = Vec::new(); - - let mut stats = PreflateStats::default(); - write_chunk_block( - &mut buffer, - FoundStream { - chunk_type: FoundStreamType::DeflateStream(results.parameters.unwrap(), stream_state), - corrections: results.corrections, - }, - &mut stats, - ) - .unwrap(); - - let mut read_cursor = std::io::Cursor::new(buffer); - let mut destination = Vec::new(); - read_chunk_block_slow(&mut read_cursor, &mut destination).unwrap(); - - assert!(destination == contents); -} - -#[test] -fn roundtrip_chunk_block_png() { - let f = crate::utils::read_file("treegdi.png"); - - // we know the first IDAT chunk starts at 83 (avoid testing the scan_deflate code in a unit teast) - let (idat_contents, deflate_stream) = crate::idat_parse::parse_idat(None, &f[83..]).unwrap(); - let mut stream = PreflateStreamProcessor::new(&PreflateConfig::default()); - let results = stream.decompress(&deflate_stream).unwrap(); - - let total_chunk_length = idat_contents.total_chunk_length; - - let mut buffer = Vec::new(); - - let mut stats = PreflateStats::default(); - write_chunk_block( - &mut buffer, - FoundStream { - chunk_type: FoundStreamType::IDATDeflate( - results.parameters.unwrap(), - idat_contents, - stream.detach_plain_text(), - ), - corrections: results.corrections, - }, - &mut stats, - ) - .unwrap(); - - let mut read_cursor = std::io::Cursor::new(buffer); - let mut destination = Vec::new(); - read_chunk_block_slow(&mut read_cursor, &mut destination).unwrap(); - - assert!(destination == &f[83..83 + total_chunk_length]); -} - -#[cfg(test)] -fn roundtrip_deflate_chunks(filename: &str) { - use crate::utils::assert_eq_array; - - let f = crate::utils::read_file(filename); - - println!("Processing file: {}", filename); - - let mut expanded = Vec::new(); - preflate_whole_into_container( - &PreflateContainerConfig::default(), - &mut std::io::Cursor::new(&f), - &mut expanded, - ) - .unwrap(); - - println!("Recreating file: {}", filename); - - let mut read_cursor = std::io::Cursor::new(expanded); - - let mut destination = Vec::new(); - recreate_whole_from_container(&mut read_cursor, &mut destination).unwrap(); - - assert_eq_array(&destination, &f); -} - -#[test] -fn roundtrip_skip_length_crash() { - roundtrip_deflate_chunks("skiplengthcrash.bin"); -} - -#[test] -fn roundtrip_png_chunks() { - roundtrip_deflate_chunks("treegdi.png"); -} - -#[test] -fn roundtrip_zip_chunks() { - 
roundtrip_deflate_chunks("samplezip.zip"); } -#[test] -fn roundtrip_gz_chunks() { - roundtrip_deflate_chunks("sample1.bin.gz"); -} - -#[test] -fn roundtrip_png_chunks2() { - roundtrip_deflate_chunks("starcontrol.samplesave"); +/// used to measure the length of the output without storing it +struct MeasureWriteSink { + pub length: usize, } -#[test] -fn verify_zip_compress() { - use crate::utils::read_file; - let v = read_file("samplezip.zip"); - - let mut expanded = Vec::new(); - preflate_whole_into_container( - &PreflateContainerConfig::default(), - &mut std::io::Cursor::new(&v), - &mut expanded, - ) - .unwrap(); - - let mut recompressed = Vec::new(); - recreate_whole_from_container(&mut std::io::Cursor::new(expanded), &mut recompressed).unwrap(); +impl Write for MeasureWriteSink { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.length += buf.len(); + Ok(buf.len()) + } - assert!(v == recompressed); + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } } /// Statistics about the preflate process @@ -533,10 +401,19 @@ pub struct PreflateContainerProcessor { state: ChunkParseState, config: PreflateContainerConfig, + + /// each block is individually compressed with this encoder (v2 format) + encoder: Option>>, + + /// when present, all raw input is also fed to this encoder so we can measure + /// baseline Zstd compression (without preflate processing) + baseline_encoder: Option>, } impl PreflateContainerProcessor { - pub fn new(config: &PreflateContainerConfig) -> Self { + /// Creates a processor that uses v2 format with a persistent Zstd encoder shared + /// across all non-JPEG blocks. JPEG blocks bypass the encoder entirely. + pub fn new(config: &PreflateContainerConfig, level: i32, test_baseline: bool) -> Self { PreflateContainerProcessor { content: Vec::new(), compression_stats: PreflateStats::default(), @@ -545,6 +422,17 @@ impl PreflateContainerProcessor { total_plain_text_seen: 0, last_attempt_chunk_size: 0, config: config.clone(), + encoder: Some( + zstd::stream::write::Encoder::new(Vec::new(), level).unwrap(), + ), + baseline_encoder: if test_baseline { + Some( + zstd::stream::write::Encoder::new(MeasureWriteSink { length: 0 }, level) + .unwrap(), + ) + } else { + None + }, } } } @@ -566,6 +454,10 @@ impl ProcessBuffer for PreflateContainerProcessor { if input.len() > 0 { self.compression_stats.deflate_compressed_size += input.len() as u64; self.content.extend_from_slice(input); + + if let Some(encoder) = &mut self.baseline_encoder { + encoder.write_all(input).context()?; + } } loop { @@ -583,15 +475,17 @@ impl ProcessBuffer for PreflateContainerProcessor { match &mut self.state { ChunkParseState::Start => { - writer.write_all(&[COMPRESSED_WRAPPER_VERSION_1])?; + writer.write_all(&[COMPRESSED_WRAPPER_VERSION_2])?; self.state = ChunkParseState::Searching(None); } ChunkParseState::Searching(prev_ihdr) => { if self.total_plain_text_seen > self.config.total_plain_text_limit { // once we've exceeded our limit, we don't do any more compression - // this is to ensure we don't suck the CPU time for too long on - // a single file - write_literal_block(&self.content, writer)?; + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + self.compression_stats.zstd_compressed_size += sz as u64; self.last_attempt_chunk_size = 0; self.content.clear(); @@ -609,16 +503,21 @@ impl ProcessBuffer for PreflateContainerProcessor { // the gap 
between the start and the beginning of the deflate stream // is written out as a literal block if next.start != 0 { - write_literal_block(&self.content[..next.start], writer)?; + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, next.start as u32)?; + encoder.write_all(&self.content[..next.start])?; + let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + self.compression_stats.zstd_compressed_size += sz as u64; } - if let Some(mut state) = - write_chunk_block(writer, chunk, &mut self.compression_stats) - .context()? - { + let (compressed_size, next_state) = + write_chunk_block_v2(self.encoder.as_mut().unwrap(), writer, chunk, &mut self.compression_stats) + .context()?; + self.compression_stats.zstd_compressed_size += compressed_size as u64; + + if let Some(mut state) = next_state { self.total_plain_text_seen += state.plain_text().len() as u64; state.shrink_to_dictionary(); - self.state = ChunkParseState::DeflateContinue(state); } @@ -629,7 +528,11 @@ impl ProcessBuffer for PreflateContainerProcessor { if input_complete || self.content.len() > self.config.max_chunk_size { // if we have too much data or have no more data, // we just write it out as a literal block with everything we have - write_literal_block(&self.content, writer)?; + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + self.compression_stats.zstd_compressed_size += sz as u64; self.content.clear(); self.last_attempt_chunk_size = 0; @@ -641,7 +544,11 @@ impl ProcessBuffer for PreflateContainerProcessor { } FindStreamResult::None => { // couldn't find anything, just write the rest as a literal block - write_literal_block(&self.content, writer)?; + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + self.compression_stats.zstd_compressed_size += sz as u64; self.content.clear(); self.last_attempt_chunk_size = 0; @@ -668,13 +575,13 @@ impl ProcessBuffer for PreflateContainerProcessor { res.compressed_size ); - writer.write_all(&[DEFLATE_STREAM_CONTINUE])?; - - write_varint(writer, res.corrections.len() as u32)?; - write_varint(writer, state.plain_text().len() as u32)?; - - writer.write_all(&res.corrections)?; - writer.write_all(&state.plain_text().text())?; + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, res.corrections.len() as u32)?; + write_varint(encoder, state.plain_text().len() as u32)?; + encoder.write_all(&res.corrections)?; + encoder.write_all(&state.plain_text().text())?; + let sz = emit_compressed_block(DEFLATE_STREAM_CONTINUE, encoder, writer)?; + self.compression_stats.zstd_compressed_size += sz as u64; self.total_plain_text_seen += state.plain_text().len() as u64; self.compression_stats.overhead_bytes += res.corrections.len() as u64; @@ -695,13 +602,32 @@ impl ProcessBuffer for PreflateContainerProcessor { } } - if input_complete { + if input_complete && !self.input_complete { self.input_complete = true; if self.content.len() > 0 { - write_literal_block(&self.content, writer)?; + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + self.compression_stats.zstd_compressed_size += sz as u64; } 
self.content.clear(); + + // Finalize the Zstd encoder and write the end-of-stream marker + let encoder = self.encoder.take().unwrap(); + let finish_bytes = encoder.finish().context()?; + writer.write_all(&[ZSTD_END_OF_STREAM])?; + write_varint(writer, finish_bytes.len() as u32)?; + writer.write_all(&finish_bytes)?; + self.compression_stats.zstd_compressed_size += finish_bytes.len() as u64; + + // Finalize baseline encoder for stats + if let Some(mut encoder) = self.baseline_encoder.take() { + encoder.flush().context()?; + encoder.do_finish().context()?; + self.compression_stats.zstd_baseline_size = encoder.get_mut().length as u64; + } } Ok(()) @@ -732,17 +658,33 @@ impl ProcessBuffer for NopProcessBuffer { enum DecompressionState { Start, StartSegment, - LiteralBlock(usize), - DeflateBlock(usize, usize), - PNGBlock { - correction_length: usize, - uncompressed_length: usize, - idat: IdatContents, - filters: Vec, + /// accumulate compressed_size bytes into compressed_data, then record block_type. + AccumulateBlock { + block_type: u8, + compressed_size: usize, }, - JPEGBlock { + /// accumulate lepton bytes and store as BlockInfo::Jpeg (processed at end). + JpegAccumulate { lepton_length: usize, }, + /// accumulate raw WebP-compressed PNG bytes (stored directly, bypass Zstd). + WebpAccumulate { + total_len: usize, + }, + /// accumulate final Zstd finish bytes, then batch-decode the whole stream. + DecodeAll { + final_size: usize, + }, +} + +/// Describes a single block in the encoded stream, used to replay processing after batch decode. +enum BlockInfo { + /// A non-JPEG block; its content comes from the batch-decoded Zstd output. + Compressed(u8), + /// A JPEG/Lepton block stored raw; bytes are kept here and decoded directly. + Jpeg(Vec), + /// A WebP-compressed PNG block stored raw; bytes are kept here and decoded directly. 
+ RawWebp(Vec), } /// recreates the orignal content from the chunked data @@ -755,6 +697,12 @@ pub struct RecreateContainerProcessor { /// state of the predictor and plain text if we need to contiune a deflate stream /// if it was too big to complete in a single chunk deflate_continue_state: Option, + + /// ordered list of all blocks seen so far + blocks: Vec, + + /// concatenated Zstd-compressed bytes from all non-JPEG blocks + compressed_data: Vec, } impl RecreateContainerProcessor { @@ -765,17 +713,8 @@ impl RecreateContainerProcessor { input_complete: false, state: DecompressionState::Start, deflate_continue_state: None, - } - } - - /// for testing reading a single chunk (skip header) - pub fn new_single_chunk(capacity: usize) -> Self { - RecreateContainerProcessor { - input: VecDeque::new(), - capacity, - input_complete: false, - state: DecompressionState::StartSegment, - deflate_continue_state: None, + blocks: Vec::new(), + compressed_data: Vec::new(), } } } @@ -831,14 +770,17 @@ impl RecreateContainerProcessor { let version = self.input.read_u8()?; - if version != COMPRESSED_WRAPPER_VERSION_1 { - return err_exit_code( - ExitCode::InvalidCompressedWrapper, - format!("Invalid version {version}"), - ); + match version { + COMPRESSED_WRAPPER_VERSION_2 => { + self.state = DecompressionState::StartSegment; + } + _ => { + return err_exit_code( + ExitCode::InvalidCompressedWrapper, + format!("Invalid version {version}"), + ); + } } - - self.state = DecompressionState::StartSegment; } DecompressionState::StartSegment => { // here's a good place to stop if we run out of input @@ -846,209 +788,157 @@ impl RecreateContainerProcessor { break; } - // use scoped read so that if we run out of bytes we can undo the read and wait for more input - self.state = match self.input.scoped_read(|r| match r.read_u8()? { - LITERAL_CHUNK => { - let length = read_varint(r)? as usize; - - Ok(DecompressionState::LiteralBlock(length)) - } - DEFLATE_STREAM => { - let correction_length = read_varint(r)? as usize; - let uncompressed_length = read_varint(r)? as usize; - - // clear the deflate state if we are starting a new block - self.deflate_continue_state = None; - - Ok(DecompressionState::DeflateBlock( - correction_length, - uncompressed_length, - )) - } - DEFLATE_STREAM_CONTINUE => { - let correction_length = read_varint(r)? as usize; - let uncompressed_length = read_varint(r)? as usize; - - if self.deflate_continue_state.is_none() { - return err_exit_code( - ExitCode::InvalidCompressedWrapper, - "no deflate state to continue", - ); + // read type byte, then dispatch + self.state = match self.input.scoped_read(|r| { + let type_byte = r.read_u8()?; + match type_byte { + JPEG_LEPTON_COMPRESSED => { + let lepton_length = read_varint(r)? as usize; + Ok(DecompressionState::JpegAccumulate { lepton_length }) } - - Ok(DecompressionState::DeflateBlock( - correction_length, - uncompressed_length, - )) - } - PNG_COMPRESSED => { - let correction_length = read_varint(r)? as usize; - let uncompressed_length = read_varint(r)? as usize; - let idat = IdatContents::read_from_bytestream(r)?; - - let mut filters = Vec::new(); - if let Some(png_header) = &idat.png_header { - filters.resize(png_header.height as usize, 0); - r.read_exact(&mut filters[..])?; + WEBP_COMPRESSED => { + let total_len = read_varint(r)? as usize; + Ok(DecompressionState::WebpAccumulate { total_len }) + } + ZSTD_END_OF_STREAM => { + let final_size = read_varint(r)? 
as usize; + Ok(DecompressionState::DecodeAll { final_size }) + } + other => { + let compressed_size = read_varint(r)? as usize; + Ok(DecompressionState::AccumulateBlock { + block_type: other, + compressed_size, + }) } - - Ok(DecompressionState::PNGBlock { - correction_length, - uncompressed_length, - idat, - filters, - }) - } - JPEG_LEPTON_COMPRESSED => { - let lepton_length = read_varint(r)? as usize; - - Ok(DecompressionState::JPEGBlock { lepton_length }) } - - _ => Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "Invalid chunk", - )), }) { Ok(s) => s, Err(e) => { if !self.input_complete && e.exit_code() == ExitCode::ShortRead { - // wait for more input if we ran out of bytes here break; } else { return Err(e); } } - } + }; } - DecompressionState::LiteralBlock(length) => { - let source_size = self.input.len(); - if source_size < *length { + DecompressionState::AccumulateBlock { + block_type, + compressed_size, + } => { + if self.input.len() < *compressed_size { if self.input_complete { return Err(PreflateError::new( ExitCode::InvalidCompressedWrapper, - "unexpected end of input", + "unexpected end of input in block", )); } - - std::io::copy(&mut self.input, writer).context()?; - *length -= source_size; break; } - std::io::copy(&mut (&mut self.input).take(*length as u64), writer).context()?; + let block_type = *block_type; + let compressed_size = *compressed_size; + self.compressed_data + .extend(self.input.drain(0..compressed_size)); + self.blocks.push(BlockInfo::Compressed(block_type)); self.state = DecompressionState::StartSegment; } - DecompressionState::DeflateBlock(correction_length, uncompressed_length) => { - let source_size = self.input.len(); - let total_length = *correction_length + *uncompressed_length; - - if source_size < total_length { + DecompressionState::JpegAccumulate { lepton_length } => { + if self.input.len() < *lepton_length { if self.input_complete { return Err(PreflateError::new( ExitCode::InvalidCompressedWrapper, - "unexpected end of input", + "unexpected end of input in jpeg block", )); } break; } - let corrections: Vec = self.input.drain(0..*correction_length).collect(); - - if let Some(reconstruct) = &mut self.deflate_continue_state { - let (comp, _) = reconstruct - .recompress( - &mut TakeReader::new(&mut self.input, *uncompressed_length), - &corrections, - ) - .context()?; - - writer.write_all(&comp)?; - } else { - let mut reconstruct = RecreateStreamProcessor::new(); - let (comp, _) = reconstruct - .recompress( - &mut TakeReader::new(&mut self.input, *uncompressed_length), - &corrections, - ) - .context()?; - - writer.write_all(&comp)?; - - self.deflate_continue_state = Some(reconstruct); - } - + let lepton_bytes: Vec = self.input.drain(0..*lepton_length).collect(); + self.blocks.push(BlockInfo::Jpeg(lepton_bytes)); self.state = DecompressionState::StartSegment; } - DecompressionState::PNGBlock { - correction_length, - uncompressed_length, - idat, - filters, - } => { - let source_size = self.input.len(); - - let total_length = *correction_length + *uncompressed_length; - if source_size < total_length { - // wait till we have the full block + DecompressionState::WebpAccumulate { total_len } => { + if self.input.len() < *total_len { if self.input_complete { return Err(PreflateError::new( ExitCode::InvalidCompressedWrapper, - "unexpected end of input", + "unexpected end of input in webp block", )); } break; } - let corrections: Vec = self.input.drain(0..*correction_length).collect(); - - let plain_text; - - if let Some(header) = 
&idat.png_header { - let webp: Vec = self.input.drain(0..*uncompressed_length).collect(); - - plain_text = webp_decompress(filters, webp, header).context()?; - } else { - plain_text = self.input.drain(0..*uncompressed_length).collect(); - } - - let recompressed = - recreate_whole_deflate_stream(&plain_text, &corrections).context()?; - - recreate_idat(&idat, &recompressed[..], writer).context()?; - + let webp_bytes: Vec = self.input.drain(0..*total_len).collect(); + self.blocks.push(BlockInfo::RawWebp(webp_bytes)); self.state = DecompressionState::StartSegment; } - DecompressionState::JPEGBlock { lepton_length } => { - let source_size = self.input.len(); - if source_size < *lepton_length { + + DecompressionState::DecodeAll { final_size } => { + if self.input.len() < *final_size { if self.input_complete { return Err(PreflateError::new( ExitCode::InvalidCompressedWrapper, - "unexpected end of input", + "unexpected end of input in end-of-stream", )); } break; } - let lepton_data: Vec = self.input.drain(0..*lepton_length).collect(); - - match lepton_jpeg::decode_lepton( - &mut Cursor::new(&lepton_data), - writer, - &EnabledFeatures::compat_lepton_vector_read(), - &DEFAULT_THREAD_POOL, - ) { - Err(e) => { - return Err(PreflateError::new( + // Collect final finish bytes and batch-decode the entire Zstd stream. + self.compressed_data + .extend(self.input.drain(0..*final_size)); + let decoded = zstd::decode_all(Cursor::new(&self.compressed_data)) + .map_err(|e| { + PreflateError::new( ExitCode::InvalidCompressedWrapper, - format!("JPEG Lepton decode failed: {}", e), - )); + format!("zstd decode failed: {e}"), + ) + })?; + + let mut cursor = Cursor::new(decoded); + let blocks = std::mem::take(&mut self.blocks); + for block_info in blocks { + match block_info { + BlockInfo::Compressed(block_type) => { + process_compressed_block( + block_type, + &mut cursor, + &mut self.deflate_continue_state, + writer, + )?; + } + BlockInfo::Jpeg(lepton_data) => { + match lepton_jpeg::decode_lepton( + &mut Cursor::new(&lepton_data), + writer, + &EnabledFeatures::compat_lepton_vector_read(), + &DEFAULT_THREAD_POOL, + ) { + Err(e) => { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + format!("JPEG Lepton decode failed: {}", e), + )); + } + Ok(_) => {} + } + } + BlockInfo::RawWebp(webp_bytes) => { + // Payload is what webp_compress wrote after the PNG_COMPRESSED + // type byte, so process_compressed_block can parse it directly. + process_compressed_block( + PNG_COMPRESSED, + &mut Cursor::new(webp_bytes), + &mut self.deflate_continue_state, + writer, + )?; + } } - Ok(_) => {} } self.state = DecompressionState::StartSegment; @@ -1060,6 +950,129 @@ impl RecreateContainerProcessor { } } +/// Parses and processes a single non-JPEG block from a cursor over the batch-decoded output. +/// +/// The encoded layout (as written by the encoder) for each block type is: +/// LITERAL_CHUNK: varint(len) + data +/// DEFLATE_STREAM: varint(corrections_len) + varint(plaintext_len) + corrections + plaintext +/// DEFLATE_STREAM_CONTINUE: same layout as DEFLATE_STREAM +/// PNG_COMPRESSED: varint(correction_length) + varint(uncompressed_length) + +/// IdatContents + [filters if png_header present] + +/// corrections + (webp_data or raw_plaintext) +fn process_compressed_block( + block_type: u8, + cursor: &mut Cursor>, + deflate_continue_state: &mut Option, + writer: &mut impl Write, +) -> Result<()> { + match block_type { + LITERAL_CHUNK => { + let length = read_varint(cursor)? 
as usize; + let mut data = vec![0u8; length]; + cursor.read_exact(&mut data).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + writer.write_all(&data)?; + } + DEFLATE_STREAM => { + *deflate_continue_state = None; + + let correction_length = read_varint(cursor)? as usize; + let uncompressed_length = read_varint(cursor)? as usize; + + let mut corrections = vec![0u8; correction_length]; + cursor.read_exact(&mut corrections).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let mut plain_text_buf = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut plain_text_buf).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let mut reconstruct = RecreateStreamProcessor::new(); + let (comp, _) = reconstruct + .recompress(&mut Cursor::new(&plain_text_buf), &corrections) + .context()?; + + writer.write_all(&comp)?; + *deflate_continue_state = Some(reconstruct); + } + DEFLATE_STREAM_CONTINUE => { + let correction_length = read_varint(cursor)? as usize; + let uncompressed_length = read_varint(cursor)? as usize; + + let mut corrections = vec![0u8; correction_length]; + cursor.read_exact(&mut corrections).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let mut plain_text_buf = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut plain_text_buf).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let reconstruct = deflate_continue_state.as_mut().ok_or_else(|| { + PreflateError::new( + ExitCode::InvalidCompressedWrapper, + "no deflate state to continue", + ) + })?; + + let (comp, _) = reconstruct + .recompress(&mut Cursor::new(&plain_text_buf), &corrections) + .context()?; + + writer.write_all(&comp)?; + } + PNG_COMPRESSED => { + let correction_length = read_varint(cursor)? as usize; + let uncompressed_length = read_varint(cursor)? 
as usize; + let idat = IdatContents::read_from_bytestream(cursor)?; + + let mut filters = Vec::new(); + if let Some(png_header) = &idat.png_header { + filters.resize(png_header.height as usize, 0); + cursor.read_exact(&mut filters[..]).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + } + + let mut corrections = vec![0u8; correction_length]; + cursor.read_exact(&mut corrections).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let plain_text; + if let Some(header) = &idat.png_header { + let mut webp = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut webp).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + plain_text = webp_decompress(&filters, webp, header).context()?; + } else { + let mut raw = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut raw).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + plain_text = raw; + } + + let recompressed = + recreate_whole_deflate_stream(&plain_text, &corrections).context()?; + + recreate_idat(&idat, &recompressed[..], writer).context()?; + } + _ => { + return err_exit_code( + ExitCode::InvalidCompressedWrapper, + format!("Unknown block type {block_type}"), + ); + } + } + Ok(()) +} + fn webp_compress( result: &mut impl Write, plain_text: &[u8], @@ -1174,30 +1187,50 @@ fn webp_decompress( return err_exit_code(ExitCode::InvalidCompressedWrapper, "Webp decode failed"); } -#[test] -fn test_baseline_calc() { - use crate::utils::read_file; - use crate::zstd_compression::ZstdCompressContext; +#[cfg(test)] +fn roundtrip_deflate_chunks(filename: &str) { + use crate::utils::assert_eq_array; - let v = read_file("samplezip.zip"); + let f = crate::utils::read_file(filename); - let mut context = ZstdCompressContext::new( - PreflateContainerProcessor::new(&PreflateContainerConfig::default()), - 9, - true, - ); + println!("Processing file: {}", filename); - let _r = context.process_vec(&v).unwrap(); + let mut expanded = Vec::new(); + let mut ctx = PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + ctx.copy_to_end(&mut std::io::Cursor::new(&f), &mut expanded).unwrap(); + + println!("Recreating file: {}", filename); - let stats = context.stats(); + let mut destination = Vec::new(); + let mut ctx = RecreateContainerProcessor::new(usize::MAX); + ctx.copy_to_end(&mut std::io::Cursor::new(expanded), &mut destination).unwrap(); + + assert_eq_array(&destination, &f); +} + +#[test] +fn roundtrip_skip_length_crash() { + roundtrip_deflate_chunks("skiplengthcrash.bin"); +} + +#[test] +fn roundtrip_png_chunks() { + roundtrip_deflate_chunks("treegdi.png"); +} + +#[test] +fn roundtrip_zip_chunks() { + roundtrip_deflate_chunks("samplezip.zip"); +} - println!("stats: {:?}", stats); +#[test] +fn roundtrip_gz_chunks() { + roundtrip_deflate_chunks("sample1.bin.gz"); +} - // these change if the compression algorithm is altered, update them - assert_eq!(stats.overhead_bytes, 463); - assert_eq!(stats.zstd_compressed_size, 12444); - assert_eq!(stats.uncompressed_size, 54871); - assert_eq!(stats.zstd_baseline_size, 13664); +#[test] +fn roundtrip_png_chunks2() { + roundtrip_deflate_chunks("starcontrol.samplesave"); } #[test] @@ -1211,7 +1244,7 @@ fn roundtrip_small_chunk() { max_chunk_size: 100000, total_plain_text_limit: u64::MAX, ..Default::default() - }); + }, 1, false); let compressed = context.process_vec_size(&original, 20001).unwrap(); @@ -1232,7 +1265,7 @@ fn 
roundtrip_small_plain_text() { max_chunk_size: 100000, total_plain_text_limit: u64::MAX, ..Default::default() - }); + }, 1, false); let compressed = context.process_vec_size(&original, 2001).unwrap(); @@ -1243,59 +1276,21 @@ fn roundtrip_small_plain_text() { } #[test] -fn roundtrip_png_e2e() { - crate::init_logging(); - +fn roundtrip_zstd_per_block() { use crate::utils::{assert_eq_array, read_file}; - let original = read_file("figma.png"); - - println!("Compressing file"); - - let mut context = PreflateContainerProcessor::new(&PreflateContainerConfig { - min_chunk_size: 100000, - max_chunk_size: original.len(), - ..Default::default() - }); - - let compressed = context.process_vec_size(&original, 100100).unwrap(); - - println!("Recreating file"); + let original = read_file("samplezip.zip"); - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, 100100).unwrap(); - - assert_eq_array(&original, &recreated); -} - -#[test] -fn roundtrip_jpg() { - crate::init_logging(); - - use crate::utils::{assert_eq_array, read_file}; - - let original = read_file("embedded-images.pdf"); - - println!("Compressing file"); - - let mut context = PreflateContainerProcessor::new(&PreflateContainerConfig { - min_chunk_size: 1000000, - max_chunk_size: original.len(), - ..Default::default() - }); - - let compressed = context.process_vec_size(&original, usize::MAX).unwrap(); - - println!( - "Compressed size: {} vs {}", - compressed.len(), - original.len() + let mut context = PreflateContainerProcessor::new( + &PreflateContainerConfig::default(), + 1, + false, ); - println!("Recreating file"); + let compressed = context.process_vec(&original).unwrap(); let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, usize::MAX).unwrap(); + let recreated = context.process_vec(&compressed).unwrap(); assert_eq_array(&original, &recreated); } diff --git a/container/src/lib.rs b/container/src/lib.rs index 5a1f638..c368df2 100644 --- a/container/src/lib.rs +++ b/container/src/lib.rs @@ -25,19 +25,12 @@ mod idat_parse; mod scan_deflate; mod scoped_read; mod utils; -mod zstd_compression; -pub use zstd_compression::{ - zstd_preflate_whole_deflate_stream, zstd_recreate_whole_deflate_stream, -}; pub use container_processor::{PreflateContainerConfig, PreflateStats}; pub use container_processor::{ PreflateContainerProcessor, ProcessBuffer, RecreateContainerProcessor, - preflate_whole_into_container, recreate_whole_from_container, }; -pub use zstd_compression::{ZstdCompressContext, ZstdDecompressContext}; - pub use utils::process_limited_buffer; #[cfg(test)] diff --git a/container/src/zstd_compression.rs b/container/src/zstd_compression.rs deleted file mode 100644 index c0f1e69..0000000 --- a/container/src/zstd_compression.rs +++ /dev/null @@ -1,277 +0,0 @@ -//! Implements processors for Zstandard compression and decompression using -//! the ProcessBuffer model. These are designed to be chained together with -//! the other ProcessBuffer implementations to create a full compression or -//! decompression pipeline. - -use std::io::{BufRead, Write}; - -use crate::{ - PreflateContainerProcessor, PreflateStats, ProcessBuffer, RecreateContainerProcessor, - container_processor::PreflateContainerConfig, -}; - -use preflate_rs::{AddContext, ExitCode, PreflateError, Result}; - -/// processor that compresses the input using Zstandard -/// -/// Designed to wrap around the PreflateChunkProcessor. 
-pub struct ZstdCompressContext { - zstd_compress: zstd::stream::write::Encoder<'static, Vec>, - input_complete: bool, - internal: D, - - /// if set, the encoder will write all the input to a null zstd encoder to see how much - /// compression we would get if we just used Zstandard without any Preflate processing. - /// - /// This gives a fairer comparison of the compression ratio of Preflate + Zstandard vs. Zstandard - /// since Zstd does compress the data a bit, especially if there is a lot of non-Deflate streams - /// in the file. - test_baseline: Option>, - - zstd_baseline_size: u64, - zstd_compressed_size: u64, -} - -impl ZstdCompressContext { - pub fn new(internal: D, compression_level: i32, test_baseline: bool) -> Self { - ZstdCompressContext { - zstd_compress: zstd::stream::write::Encoder::new(Vec::new(), compression_level) - .unwrap(), - input_complete: false, - internal, - zstd_baseline_size: 0, - zstd_compressed_size: 0, - test_baseline: if test_baseline { - Some( - zstd::stream::write::Encoder::new( - MeasureWriteSink { length: 0 }, - compression_level, - ) - .unwrap(), - ) - } else { - None - }, - } - } -} - -impl ProcessBuffer for ZstdCompressContext { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - if self.input_complete && (input.len() > 0 || !input_complete) { - return Err(PreflateError::new( - ExitCode::InvalidParameter, - "more data provided after input_complete signaled", - )); - } - - if input.len() > 0 { - if let Some(encoder) = &mut self.test_baseline { - encoder.write_all(input).context()?; - } - } - - self.internal - .process_buffer(input, input_complete, &mut self.zstd_compress) - .context()?; - - if input_complete && !self.input_complete { - self.input_complete = true; - - self.zstd_compress.flush().context()?; - - if let Some(encoder) = &mut self.test_baseline { - encoder.flush()?; - encoder.do_finish()?; - self.zstd_baseline_size = encoder.get_mut().length as u64; - } - } - - let compressed = self.zstd_compress.get_mut(); - writer.write_all(compressed).context()?; - self.zstd_compressed_size += compressed.len() as u64; - compressed.drain(..); - - Ok(()) - } - - fn stats(&self) -> PreflateStats { - PreflateStats { - zstd_compressed_size: self.zstd_compressed_size, - zstd_baseline_size: self.zstd_baseline_size, - ..self.internal.stats() - } - } -} - -/// used to measure the length of the output without storing it anyway -struct MeasureWriteSink { - pub length: usize, -} - -impl Write for MeasureWriteSink { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.length += buf.len(); - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } -} - -/// Processor that decompresses the input using Zstandard -/// -/// Designed to wrap around the RecreateContainerProcessor. -pub struct ZstdDecompressContext { - zstd_decompress: zstd::stream::write::Decoder<'static, AcceptWrite>>, -} - -/// used to accept the output from the Zstandard decoder and write it to the output buffer. -/// Since the plain text is significantly larger than the compressed version, we want -/// to avoid buffering the output in memory, so we send it directly to the recreator. 
-struct AcceptWrite { - internal: D, - output: O, - input_complete: bool, -} - -impl Write for AcceptWrite { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.internal - .process_buffer(buf, self.input_complete, &mut self.output)?; - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } -} - -impl ZstdDecompressContext { - pub fn new(internal: D) -> Self { - ZstdDecompressContext { - zstd_decompress: zstd::stream::write::Decoder::new(AcceptWrite { - internal: internal, - output: Vec::new(), - input_complete: false, - }) - .unwrap(), - } - } -} - -impl ProcessBuffer for ZstdDecompressContext { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - if self.zstd_decompress.get_mut().input_complete && (input.len() > 0 || !input_complete) { - return Err(PreflateError::new( - ExitCode::InvalidParameter, - "more data provided after input_complete signaled", - )); - } - - if input.len() > 0 { - self.zstd_decompress.write_all(input).context()?; - } - - if input_complete && !self.zstd_decompress.get_mut().input_complete { - self.zstd_decompress.flush().context()?; - self.zstd_decompress.get_mut().input_complete = true; - } - - let a = self.zstd_decompress.get_mut(); - writer.write_all(&a.output).context()?; - a.output.clear(); - - Ok(()) - } -} - -/// Expands the Zlib compressed streams in the data and then recompresses the result -/// with Zstd with the given level. -pub fn zstd_preflate_whole_deflate_stream( - config: &PreflateContainerConfig, - input: &mut impl BufRead, - output: &mut impl Write, - compression_level: i32, -) -> Result { - let mut ctx = ZstdCompressContext::new( - PreflateContainerProcessor::new(config), - compression_level, - false, - ); - - ctx.copy_to_end(input, output).context()?; - - Ok(ctx.stats()) -} - -/// Decompresses the Zstd compressed data and then recompresses the result back -/// to the original Zlib compressed streams. 
-pub fn zstd_recreate_whole_deflate_stream( - input: &mut impl BufRead, - output: &mut impl Write, -) -> Result<()> { - let mut ctx = ZstdDecompressContext::::new( - RecreateContainerProcessor::new(1024 * 1024 * 128), - ); - - ctx.copy_to_end(input, output).context()?; - - Ok(()) -} - -#[test] -fn verify_zip_compress_zstd() { - use crate::utils::read_file; - let v = read_file("samplezip.zip"); - - let mut compressed = Vec::new(); - let stats = zstd_preflate_whole_deflate_stream( - &PreflateContainerConfig::default(), - &mut std::io::Cursor::new(&v), - &mut compressed, - 1, // for testing use a lower level to save CPU - ) - .unwrap(); - - let mut recreated = Vec::new(); - zstd_recreate_whole_deflate_stream(&mut std::io::Cursor::new(&compressed), &mut recreated) - .unwrap(); - - assert!(v == recreated); - println!( - "original zip = {} bytes, expanded = {} bytes recompressed zip = {} bytes", - v.len(), - stats.uncompressed_size, - compressed.len() - ); -} - -/// tests zstd compression buffer processing without involving preflate code -#[test] -fn roundtrip_zstd_only_contexts() { - use crate::container_processor::NopProcessBuffer; - use crate::utils::{assert_eq_array, read_file}; - use crate::zstd_compression::{ZstdCompressContext, ZstdDecompressContext}; - - let original = read_file("samplezip.zip"); - - let mut context = ZstdCompressContext::new(NopProcessBuffer {}, 9, false); - let compressed = context.process_vec_size(&original, 997).unwrap(); - - let mut context = ZstdDecompressContext::new(NopProcessBuffer {}); - let recreated = context.process_vec_size(&compressed, 997).unwrap(); - - assert_eq_array(&original, &recreated); -} diff --git a/dll/src/unmanaged_api.rs b/dll/src/unmanaged_api.rs index 99b8701..4d117c7 100644 --- a/dll/src/unmanaged_api.rs +++ b/dll/src/unmanaged_api.rs @@ -6,7 +6,7 @@ use std::{ use preflate_container::{ PreflateContainerConfig, PreflateContainerProcessor, ProcessBuffer, RecreateContainerProcessor, - ZstdCompressContext, ZstdDecompressContext, process_limited_buffer, + process_limited_buffer, }; use preflate_rs::{ExitCode, PreflateError}; @@ -198,7 +198,7 @@ pub unsafe extern "C" fn get_compression_stats( struct CompressionContext { magic: u32, - internal: ZstdCompressContext, + internal: PreflateContainerProcessor, output_extra: VecDeque, } @@ -221,12 +221,12 @@ impl CompressionContext { fn new(verify: bool, compression_level: i32, test_baseline: bool) -> Self { CompressionContext { magic: MAGIC_COMPRESSION_CONTEXT, - internal: ZstdCompressContext::new( - PreflateContainerProcessor::new(&PreflateContainerConfig { + internal: PreflateContainerProcessor::new( + &PreflateContainerConfig { validate_compression: verify, max_chain_length: 1024, // lower max chain to avoid excessive CPU usage ..PreflateContainerConfig::default() - }), + }, compression_level, test_baseline, ), @@ -237,7 +237,7 @@ impl CompressionContext { struct DecompressionContext { magic: u32, - internal: ZstdDecompressContext, + internal: RecreateContainerProcessor, output_extra: VecDeque, } @@ -255,11 +255,9 @@ impl DecompressionContext { } fn new(capacity: usize) -> Self { - let internal = ZstdDecompressContext::new(RecreateContainerProcessor::new(capacity)); - DecompressionContext { magic: MAGIC_DECOMRESSION_CONTEXT, - internal, + internal: RecreateContainerProcessor::new(capacity), output_extra: VecDeque::new(), } } diff --git a/tests/end_to_end.rs b/tests/end_to_end.rs index a98010b..d7ce310 100644 --- a/tests/end_to_end.rs +++ b/tests/end_to_end.rs @@ -10,7 +10,9 @@ use std::path::Path; 
use std::{mem, ptr}; use libdeflate_sys::{libdeflate_alloc_compressor, libdeflate_deflate_compress}; -use preflate_container::{zstd_preflate_whole_deflate_stream, zstd_recreate_whole_deflate_stream}; +use preflate_container::{ + PreflateContainerConfig, PreflateContainerProcessor, ProcessBuffer, RecreateContainerProcessor, +}; use preflate_rs::{PreflateConfig, preflate_whole_deflate_stream, recreate_whole_deflate_stream}; #[cfg(test)] @@ -77,18 +79,13 @@ fn test_container(filename: &str) { let v = read_file(filename); let mut c = Vec::new(); - - let stats = zstd_preflate_whole_deflate_stream( - &preflate_container::PreflateContainerConfig::default(), - &mut std::io::Cursor::new(&v), - &mut c, - 4, // use lower level to save CPU on testing - ) - .unwrap(); + let mut ctx = PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 4, false); + ctx.copy_to_end(&mut std::io::Cursor::new(&v), &mut c).unwrap(); + let stats = ctx.stats(); let mut r = Vec::new(); - - zstd_recreate_whole_deflate_stream(&mut std::io::Cursor::new(&c), &mut r).unwrap(); + let mut ctx = RecreateContainerProcessor::new(128 * 1024 * 1024); + ctx.copy_to_end(&mut std::io::Cursor::new(&c), &mut r).unwrap(); assert!(v == r); println!( diff --git a/util/src/main.rs b/util/src/main.rs index 244daeb..b63943c 100644 --- a/util/src/main.rs +++ b/util/src/main.rs @@ -11,7 +11,6 @@ use std::{ use preflate_container::{ PreflateContainerConfig, PreflateContainerProcessor, ProcessBuffer, RecreateContainerProcessor, - ZstdCompressContext, ZstdDecompressContext, }; #[derive(Parser)] @@ -99,8 +98,8 @@ fn main() { // open file for reading let original = fs::read(&entry).unwrap(); - let mut ctx = ZstdCompressContext::new( - PreflateContainerProcessor::new(&config), + let mut ctx = PreflateContainerProcessor::new( + &config, cli.level as i32, cli.baseline, ); @@ -140,9 +139,7 @@ fn main() { let start = ProcessTime::now(); let mut recreated = Vec::new(); - let mut decomp = ZstdDecompressContext::new(RecreateContainerProcessor::new( - config.chunk_plain_text_limit, - )); + let mut decomp = RecreateContainerProcessor::new(config.chunk_plain_text_limit); if let Err(e) = decomp.copy_to_end_size( &mut Cursor::new(&preflate_compressed), From 82c7b15fe72a725e102daeda93e37850e37614a7 Mon Sep 17 00:00:00 2001 From: Kristof Date: Wed, 25 Feb 2026 12:15:24 +0100 Subject: [PATCH 2/8] work and tests --- container/src/container_processor.rs | 932 ++++++++++++++++++++------- container/src/pdf_parse.rs | 473 ++++++++++++++ container/src/utils.rs | 2 +- preflate/src/stream_processor.rs | 112 ++++ tests/end_to_end.rs | 6 +- util/src/main.rs | 6 +- 6 files changed, 1274 insertions(+), 257 deletions(-) create mode 100644 container/src/pdf_parse.rs diff --git a/container/src/container_processor.rs b/container/src/container_processor.rs index b086ace..b703162 100644 --- a/container/src/container_processor.rs +++ b/container/src/container_processor.rs @@ -81,26 +81,23 @@ impl PreflateContainerConfig { const COMPRESSED_WRAPPER_VERSION_2: u8 = 2; -/// literal chunks are just copied to the output -const LITERAL_CHUNK: u8 = 0; - -/// zlib compressed chunks are zlib compressed -const DEFLATE_STREAM: u8 = 1; - -/// PNG chunks are IDAT chunks that are zlib compressed -const PNG_COMPRESSED: u8 = 2; - -/// deflate stream that continues the previous one with the same dictionary, bitstream etc -const DEFLATE_STREAM_CONTINUE: u8 = 3; - -/// JPEG Lepton compressed chunks are JPEG Lepton compressed -const JPEG_LEPTON_COMPRESSED: u8 = 4; - -/// PNG chunk stored as 
WebP lossless — already compressed, written raw (bypasses Zstd) -const WEBP_COMPRESSED: u8 = 5; - -/// V2 end-of-stream marker that carries the final Zstd finish bytes -const ZSTD_END_OF_STREAM: u8 = 0xFF; +// Bit-field masks for the block type byte +// Bits 7-6: compression algorithm Bits 5-0: block content kind +const BLOCK_COMPRESSION_MASK: u8 = 0xC0; +const BLOCK_TYPE_MASK: u8 = 0x3F; + +// Compression algorithms (top 2 bits) +const BLOCK_COMPRESSION_NONE: u8 = 0x00; +const BLOCK_COMPRESSION_ZSTD: u8 = 0x40; + +// Block content kinds (bottom 6 bits) +const BLOCK_TYPE_LITERAL: u8 = 0x00; +const BLOCK_TYPE_DEFLATE: u8 = 0x01; +const BLOCK_TYPE_PNG: u8 = 0x02; +const BLOCK_TYPE_DEFLATE_CONTINUE: u8 = 0x03; +const BLOCK_TYPE_JPEG_LEPTON: u8 = 0x04; +const BLOCK_TYPE_WEBP: u8 = 0x05; +const BLOCK_TYPE_EOS: u8 = 0x3F; // end-of-stream pub(crate) fn write_varint(destination: &mut impl Write, value: u32) -> std::io::Result<()> { let mut value = value; @@ -186,7 +183,11 @@ fn write_chunk_block_v2( encoder.write_all(&chunk.corrections)?; encoder.write_all(&state.plain_text().text())?; - let compressed_size = emit_compressed_block(DEFLATE_STREAM, encoder, writer)?; + let compressed_size = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE, + encoder, + writer, + )?; stats.overhead_bytes += chunk.corrections.len() as u64; stats.uncompressed_size += state.plain_text().len() as u64; @@ -209,9 +210,9 @@ fn write_chunk_block_v2( if webp_compress(&mut temp_vec, plain_text.text(), &chunk.corrections, &idat).is_ok() { // WebP is already compressed — write raw, bypassing the Zstd encoder. - // temp_vec[0] is the PNG_COMPRESSED type byte; temp_vec[1..] is the payload. + // temp_vec[0] is the BLOCK_TYPE_PNG placeholder byte; temp_vec[1..] is the payload. 
let payload = &temp_vec[1..]; - writer.write_all(&[WEBP_COMPRESSED])?; + writer.write_all(&[BLOCK_COMPRESSION_NONE | BLOCK_TYPE_WEBP])?; write_varint(writer, payload.len() as u32)?; writer.write_all(payload)?; @@ -230,7 +231,11 @@ fn write_chunk_block_v2( encoder.write_all(&chunk.corrections)?; encoder.write_all(plain_text.text())?; - let compressed_size = emit_compressed_block(PNG_COMPRESSED, encoder, writer)?; + let compressed_size = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_PNG, + encoder, + writer, + )?; stats.uncompressed_size += plain_text.len() as u64; stats.hash_algorithm = parameters.hash_algorithm; @@ -242,7 +247,7 @@ fn write_chunk_block_v2( FoundStreamType::JPEGLepton(data) => { // JPEG is written raw (bypasses the encoder entirely) - writer.write_all(&[JPEG_LEPTON_COMPRESSED])?; + writer.write_all(&[BLOCK_COMPRESSION_NONE | BLOCK_TYPE_JPEG_LEPTON])?; write_varint(writer, data.len() as u32)?; writer.write_all(&data)?; @@ -422,9 +427,7 @@ impl PreflateContainerProcessor { total_plain_text_seen: 0, last_attempt_chunk_size: 0, config: config.clone(), - encoder: Some( - zstd::stream::write::Encoder::new(Vec::new(), level).unwrap(), - ), + encoder: Some(zstd::stream::write::Encoder::new(Vec::new(), level).unwrap()), baseline_encoder: if test_baseline { Some( zstd::stream::write::Encoder::new(MeasureWriteSink { length: 0 }, level) @@ -484,7 +487,11 @@ impl ProcessBuffer for PreflateContainerProcessor { let encoder = self.encoder.as_mut().unwrap(); write_varint(encoder, self.content.len() as u32)?; encoder.write_all(&self.content)?; - let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; self.compression_stats.zstd_compressed_size += sz as u64; self.last_attempt_chunk_size = 0; @@ -506,13 +513,21 @@ impl ProcessBuffer for PreflateContainerProcessor { let encoder = self.encoder.as_mut().unwrap(); write_varint(encoder, next.start as u32)?; encoder.write_all(&self.content[..next.start])?; - let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; self.compression_stats.zstd_compressed_size += sz as u64; } - let (compressed_size, next_state) = - write_chunk_block_v2(self.encoder.as_mut().unwrap(), writer, chunk, &mut self.compression_stats) - .context()?; + let (compressed_size, next_state) = write_chunk_block_v2( + self.encoder.as_mut().unwrap(), + writer, + chunk, + &mut self.compression_stats, + ) + .context()?; self.compression_stats.zstd_compressed_size += compressed_size as u64; if let Some(mut state) = next_state { @@ -531,7 +546,11 @@ impl ProcessBuffer for PreflateContainerProcessor { let encoder = self.encoder.as_mut().unwrap(); write_varint(encoder, self.content.len() as u32)?; encoder.write_all(&self.content)?; - let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; self.compression_stats.zstd_compressed_size += sz as u64; self.content.clear(); @@ -547,7 +566,11 @@ impl ProcessBuffer for PreflateContainerProcessor { let encoder = self.encoder.as_mut().unwrap(); write_varint(encoder, self.content.len() as u32)?; encoder.write_all(&self.content)?; - let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, 
+ )?; self.compression_stats.zstd_compressed_size += sz as u64; self.content.clear(); @@ -557,13 +580,17 @@ impl ProcessBuffer for PreflateContainerProcessor { } ChunkParseState::DeflateContinue(state) => { // here we have a deflate stream that we need to continue - // right now we error out if the continuation cannot be processed match state.decompress(&self.content) { + Err(ref e) if e.exit_code() == ExitCode::ShortRead + && !input_complete + && self.content.len() <= self.config.max_chunk_size => + { + // Not enough data to complete the next block yet; wait for more. + break; + } Err(_e) => { - // indicate that we got an error while trying to continue - // the compression of a previous chunk, this happens - // when the stream significantly diverged from the behavior we estimated - // in the first chunk that we saw + // Stream analysis diverged or no more data is coming; give up on + // continuation and fall back to treating the remaining bytes as raw. self.state = ChunkParseState::Searching(None); log::debug!("Error while trying to continue compression {:?}", _e); @@ -580,7 +607,11 @@ impl ProcessBuffer for PreflateContainerProcessor { write_varint(encoder, state.plain_text().len() as u32)?; encoder.write_all(&res.corrections)?; encoder.write_all(&state.plain_text().text())?; - let sz = emit_compressed_block(DEFLATE_STREAM_CONTINUE, encoder, writer)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE_CONTINUE, + encoder, + writer, + )?; self.compression_stats.zstd_compressed_size += sz as u64; self.total_plain_text_seen += state.plain_text().len() as u64; @@ -609,7 +640,11 @@ impl ProcessBuffer for PreflateContainerProcessor { let encoder = self.encoder.as_mut().unwrap(); write_varint(encoder, self.content.len() as u32)?; encoder.write_all(&self.content)?; - let sz = emit_compressed_block(LITERAL_CHUNK, encoder, writer)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; self.compression_stats.zstd_compressed_size += sz as u64; } self.content.clear(); @@ -617,7 +652,7 @@ impl ProcessBuffer for PreflateContainerProcessor { // Finalize the Zstd encoder and write the end-of-stream marker let encoder = self.encoder.take().unwrap(); let finish_bytes = encoder.finish().context()?; - writer.write_all(&[ZSTD_END_OF_STREAM])?; + writer.write_all(&[BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_EOS])?; write_varint(writer, finish_bytes.len() as u32)?; writer.write_all(&finish_bytes)?; self.compression_stats.zstd_compressed_size += finish_bytes.len() as u64; @@ -638,55 +673,28 @@ impl ProcessBuffer for PreflateContainerProcessor { } } -#[cfg(test)] -pub struct NopProcessBuffer {} - -#[cfg(test)] -impl ProcessBuffer for NopProcessBuffer { - fn process_buffer( - &mut self, - input: &[u8], - _input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - writer.write_all(input).context()?; - - Ok(()) - } -} - enum DecompressionState { Start, StartSegment, - /// accumulate compressed_size bytes into compressed_data, then record block_type. + /// accumulate compressed_size bytes then decode and process the block immediately. AccumulateBlock { block_type: u8, compressed_size: usize, }, - /// accumulate lepton bytes and store as BlockInfo::Jpeg (processed at end). + /// accumulate lepton bytes then decode the JPEG block immediately. JpegAccumulate { lepton_length: usize, }, - /// accumulate raw WebP-compressed PNG bytes (stored directly, bypass Zstd). 
+ /// accumulate raw WebP-compressed PNG bytes then process the block immediately. WebpAccumulate { total_len: usize, }, - /// accumulate final Zstd finish bytes, then batch-decode the whole stream. - DecodeAll { + /// accumulate the final Zstd finish bytes to close the frame cleanly. + ZstdEndOfStream { final_size: usize, }, } -/// Describes a single block in the encoded stream, used to replay processing after batch decode. -enum BlockInfo { - /// A non-JPEG block; its content comes from the batch-decoded Zstd output. - Compressed(u8), - /// A JPEG/Lepton block stored raw; bytes are kept here and decoded directly. - Jpeg(Vec), - /// A WebP-compressed PNG block stored raw; bytes are kept here and decoded directly. - RawWebp(Vec), -} - /// recreates the orignal content from the chunked data pub struct RecreateContainerProcessor { capacity: usize, @@ -698,11 +706,8 @@ pub struct RecreateContainerProcessor { /// if it was too big to complete in a single chunk deflate_continue_state: Option, - /// ordered list of all blocks seen so far - blocks: Vec, - - /// concatenated Zstd-compressed bytes from all non-JPEG blocks - compressed_data: Vec, + /// persistent Zstd decoder — maintains the streaming context across blocks + zstd_decoder: zstd::stream::raw::Decoder<'static>, } impl RecreateContainerProcessor { @@ -713,8 +718,7 @@ impl RecreateContainerProcessor { input_complete: false, state: DecompressionState::Start, deflate_continue_state: None, - blocks: Vec::new(), - compressed_data: Vec::new(), + zstd_decoder: zstd::stream::raw::Decoder::new().expect("failed to create zstd decoder"), } } } @@ -791,26 +795,40 @@ impl RecreateContainerProcessor { // read type byte, then dispatch self.state = match self.input.scoped_read(|r| { let type_byte = r.read_u8()?; - match type_byte { - JPEG_LEPTON_COMPRESSED => { - let lepton_length = read_varint(r)? as usize; - Ok(DecompressionState::JpegAccumulate { lepton_length }) - } - WEBP_COMPRESSED => { - let total_len = read_varint(r)? as usize; - Ok(DecompressionState::WebpAccumulate { total_len }) - } - ZSTD_END_OF_STREAM => { - let final_size = read_varint(r)? as usize; - Ok(DecompressionState::DecodeAll { final_size }) - } - other => { - let compressed_size = read_varint(r)? as usize; - Ok(DecompressionState::AccumulateBlock { - block_type: other, - compressed_size, - }) - } + let compression = type_byte & BLOCK_COMPRESSION_MASK; + let block_type = type_byte & BLOCK_TYPE_MASK; + match compression { + BLOCK_COMPRESSION_NONE => match block_type { + BLOCK_TYPE_JPEG_LEPTON => { + let lepton_length = read_varint(r)? as usize; + Ok(DecompressionState::JpegAccumulate { lepton_length }) + } + BLOCK_TYPE_WEBP => { + let total_len = read_varint(r)? as usize; + Ok(DecompressionState::WebpAccumulate { total_len }) + } + _ => err_exit_code( + ExitCode::InvalidCompressedWrapper, + "unknown raw block type", + ), + }, + BLOCK_COMPRESSION_ZSTD => match block_type { + BLOCK_TYPE_EOS => { + let final_size = read_varint(r)? as usize; + Ok(DecompressionState::ZstdEndOfStream { final_size }) + } + other => { + let compressed_size = read_varint(r)? 
as usize; + Ok(DecompressionState::AccumulateBlock { + block_type: other, + compressed_size, + }) + } + }, + _ => err_exit_code( + ExitCode::InvalidCompressedWrapper, + "unknown compression algorithm", + ), } }) { Ok(s) => s, @@ -839,10 +857,14 @@ impl RecreateContainerProcessor { } let block_type = *block_type; - let compressed_size = *compressed_size; - self.compressed_data - .extend(self.input.drain(0..compressed_size)); - self.blocks.push(BlockInfo::Compressed(block_type)); + let compressed_bytes: Vec = self.input.drain(0..*compressed_size).collect(); + let decoded = drain_zstd_block(&mut self.zstd_decoder, &compressed_bytes)?; + process_compressed_block( + block_type, + &mut Cursor::new(decoded), + &mut self.deflate_continue_state, + writer, + )?; self.state = DecompressionState::StartSegment; } @@ -858,7 +880,20 @@ impl RecreateContainerProcessor { } let lepton_bytes: Vec = self.input.drain(0..*lepton_length).collect(); - self.blocks.push(BlockInfo::Jpeg(lepton_bytes)); + match lepton_jpeg::decode_lepton( + &mut Cursor::new(&lepton_bytes), + writer, + &EnabledFeatures::compat_lepton_vector_read(), + &DEFAULT_THREAD_POOL, + ) { + Err(e) => { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + format!("JPEG Lepton decode failed: {}", e), + )); + } + Ok(_) => {} + } self.state = DecompressionState::StartSegment; } @@ -874,11 +909,18 @@ impl RecreateContainerProcessor { } let webp_bytes: Vec = self.input.drain(0..*total_len).collect(); - self.blocks.push(BlockInfo::RawWebp(webp_bytes)); + // Payload is what webp_compress wrote after the BLOCK_TYPE_PNG type byte, + // so process_compressed_block can parse it directly. + process_compressed_block( + BLOCK_TYPE_PNG, + &mut Cursor::new(webp_bytes), + &mut self.deflate_continue_state, + writer, + )?; self.state = DecompressionState::StartSegment; } - DecompressionState::DecodeAll { final_size } => { + DecompressionState::ZstdEndOfStream { final_size } => { if self.input.len() < *final_size { if self.input_complete { return Err(PreflateError::new( @@ -889,57 +931,10 @@ impl RecreateContainerProcessor { break; } - // Collect final finish bytes and batch-decode the entire Zstd stream. - self.compressed_data - .extend(self.input.drain(0..*final_size)); - let decoded = zstd::decode_all(Cursor::new(&self.compressed_data)) - .map_err(|e| { - PreflateError::new( - ExitCode::InvalidCompressedWrapper, - format!("zstd decode failed: {e}"), - ) - })?; - - let mut cursor = Cursor::new(decoded); - let blocks = std::mem::take(&mut self.blocks); - for block_info in blocks { - match block_info { - BlockInfo::Compressed(block_type) => { - process_compressed_block( - block_type, - &mut cursor, - &mut self.deflate_continue_state, - writer, - )?; - } - BlockInfo::Jpeg(lepton_data) => { - match lepton_jpeg::decode_lepton( - &mut Cursor::new(&lepton_data), - writer, - &EnabledFeatures::compat_lepton_vector_read(), - &DEFAULT_THREAD_POOL, - ) { - Err(e) => { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - format!("JPEG Lepton decode failed: {}", e), - )); - } - Ok(_) => {} - } - } - BlockInfo::RawWebp(webp_bytes) => { - // Payload is what webp_compress wrote after the PNG_COMPRESSED - // type byte, so process_compressed_block can parse it directly. - process_compressed_block( - PNG_COMPRESSED, - &mut Cursor::new(webp_bytes), - &mut self.deflate_continue_state, - writer, - )?; - } - } - } + // Feed the finish bytes to cleanly close the Zstd frame. 
+ // No decompressed output is expected since the encoder flushes after each block. + let finish_bytes: Vec<u8> = self.input.drain(0..*final_size).collect(); + drain_zstd_block(&mut self.zstd_decoder, &finish_bytes)?; self.state = DecompressionState::StartSegment; } @@ -950,15 +945,56 @@ impl RecreateContainerProcessor { } } -/// Parses and processes a single non-JPEG block from a cursor over the batch-decoded output. +/// Feeds `compressed` bytes into the persistent `decoder` and returns all decompressed output. /// -/// The encoded layout (as written by the encoder) for each block type is: -/// LITERAL_CHUNK: varint(len) + data -/// DEFLATE_STREAM: varint(corrections_len) + varint(plaintext_len) + corrections + plaintext -/// DEFLATE_STREAM_CONTINUE: same layout as DEFLATE_STREAM -/// PNG_COMPRESSED: varint(correction_length) + varint(uncompressed_length) + -/// IdatContents + [filters if png_header present] + -/// corrections + (webp_data or raw_plaintext) +/// Each call corresponds to one Zstd flush frame (written by the encoder via `flush()`). +/// After consuming all input bytes the decoder is drained until it produces no more output, +/// which is guaranteed because `ZSTD_e_flush` ensures all data is available to the decoder +/// before the next block starts. +fn drain_zstd_block( + decoder: &mut zstd::stream::raw::Decoder<'static>, + compressed: &[u8], +) -> Result<Vec<u8>> { + use zstd::stream::raw::{InBuffer, Operation, OutBuffer}; + + let mut output = Vec::new(); + let mut scratch = vec![0u8; 65536]; + let mut in_buf = InBuffer::around(compressed); + + loop { + let mut out_buf = OutBuffer::around(scratch.as_mut_slice()); + decoder.run(&mut in_buf, &mut out_buf).map_err(|e| { + PreflateError::new( + ExitCode::InvalidCompressedWrapper, + format!("zstd decode failed: {e}"), + ) + })?; + let produced = out_buf.pos(); + output.extend_from_slice(&scratch[..produced]); + + // Stop when all input has been consumed and the decoder produced no more output. + // zstd guarantees progress (either bytes_read > 0 or bytes_written > 0) so this + // loop always terminates. + if in_buf.pos() >= compressed.len() && produced == 0 { + break; + } + } + + Ok(output) +} + +/// Parses and processes a single non-JPEG/non-WebP block. +/// +/// `cursor` wraps the output of `drain_zstd_block` for compressed blocks, +/// or the raw WebP payload for `BLOCK_TYPE_PNG` blocks stored outside Zstd. +/// +/// Layout written by the encoder for each block type (block_type = lower 6 bits): +/// BLOCK_TYPE_LITERAL: varint(len) + data +/// BLOCK_TYPE_DEFLATE: varint(corrections_len) + varint(plaintext_len) + corrections + plaintext +/// BLOCK_TYPE_DEFLATE_CONTINUE: same as BLOCK_TYPE_DEFLATE +/// BLOCK_TYPE_PNG: varint(correction_length) + varint(uncompressed_length) + +/// IdatContents + [filters if png_header present] + +/// corrections + (webp_data or raw_plaintext) fn process_compressed_block( block_type: u8, cursor: &mut Cursor<Vec<u8>>, @@ -966,7 +1002,7 @@ fn process_compressed_block( writer: &mut impl Write, ) -> Result<()> { match block_type { - LITERAL_CHUNK => { + BLOCK_TYPE_LITERAL => { let length = read_varint(cursor)? as usize; let mut data = vec![0u8; length]; cursor.read_exact(&mut data).map_err(|e| { @@ -974,7 +1010,7 @@ fn process_compressed_block( })?; writer.write_all(&data)?; } - DEFLATE_STREAM => { + BLOCK_TYPE_DEFLATE => { *deflate_continue_state = None; let correction_length = read_varint(cursor)?
as usize; @@ -998,7 +1034,7 @@ fn process_compressed_block( writer.write_all(&comp)?; *deflate_continue_state = Some(reconstruct); } - DEFLATE_STREAM_CONTINUE => { + BLOCK_TYPE_DEFLATE_CONTINUE => { let correction_length = read_varint(cursor)? as usize; let uncompressed_length = read_varint(cursor)? as usize; @@ -1025,7 +1061,7 @@ fn process_compressed_block( writer.write_all(&comp)?; } - PNG_COMPRESSED => { + BLOCK_TYPE_PNG => { let correction_length = read_varint(cursor)? as usize; let uncompressed_length = read_varint(cursor)? as usize; let idat = IdatContents::read_from_bytestream(cursor)?; @@ -1133,7 +1169,7 @@ fn webp_compress( } }; - result.write_all(&[PNG_COMPRESSED])?; + result.write_all(&[BLOCK_TYPE_PNG])?; // placeholder — caller skips this byte write_varint(result, corrections.len() as u32)?; write_varint(result, comp.deref().len() as u32)?; @@ -1188,109 +1224,507 @@ fn webp_decompress( } #[cfg(test)] -fn roundtrip_deflate_chunks(filename: &str) { - use crate::utils::assert_eq_array; +pub(crate) mod test { + use super::*; - let f = crate::utils::read_file(filename); + pub struct NopProcessBuffer {} - println!("Processing file: {}", filename); + impl ProcessBuffer for NopProcessBuffer { + fn process_buffer( + &mut self, + input: &[u8], + _input_complete: bool, + writer: &mut impl Write, + ) -> Result<()> { + writer.write_all(input).context()?; - let mut expanded = Vec::new(); - let mut ctx = PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - ctx.copy_to_end(&mut std::io::Cursor::new(&f), &mut expanded).unwrap(); + Ok(()) + } + } - println!("Recreating file: {}", filename); + fn roundtrip_deflate_chunks(filename: &str) { + use crate::utils::assert_eq_array; - let mut destination = Vec::new(); - let mut ctx = RecreateContainerProcessor::new(usize::MAX); - ctx.copy_to_end(&mut std::io::Cursor::new(expanded), &mut destination).unwrap(); + let f = crate::utils::read_file(filename); - assert_eq_array(&destination, &f); -} + println!("Processing file: {}", filename); -#[test] -fn roundtrip_skip_length_crash() { - roundtrip_deflate_chunks("skiplengthcrash.bin"); -} + let mut expanded = Vec::new(); + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + ctx.copy_to_end(&mut std::io::Cursor::new(&f), &mut expanded) + .unwrap(); -#[test] -fn roundtrip_png_chunks() { - roundtrip_deflate_chunks("treegdi.png"); -} + println!("Recreating file: {}", filename); -#[test] -fn roundtrip_zip_chunks() { - roundtrip_deflate_chunks("samplezip.zip"); -} + let mut destination = Vec::new(); + let mut ctx = RecreateContainerProcessor::new(usize::MAX); + ctx.copy_to_end(&mut std::io::Cursor::new(expanded), &mut destination) + .unwrap(); -#[test] -fn roundtrip_gz_chunks() { - roundtrip_deflate_chunks("sample1.bin.gz"); -} + assert_eq_array(&destination, &f); + } -#[test] -fn roundtrip_png_chunks2() { - roundtrip_deflate_chunks("starcontrol.samplesave"); -} + #[test] + fn roundtrip_skip_length_crash() { + roundtrip_deflate_chunks("skiplengthcrash.bin"); + } -#[test] -fn roundtrip_small_chunk() { - use crate::utils::{assert_eq_array, read_file}; + #[test] + fn roundtrip_png_chunks() { + roundtrip_deflate_chunks("treegdi.png"); + } - let original = read_file("pptxplaintext.zip"); + #[test] + fn roundtrip_zip_chunks() { + roundtrip_deflate_chunks("samplezip.zip"); + } - let mut context = PreflateContainerProcessor::new(&PreflateContainerConfig { - min_chunk_size: 100000, - max_chunk_size: 100000, - total_plain_text_limit: u64::MAX, - 
..Default::default() - }, 1, false); + #[test] + fn roundtrip_gz_chunks() { + roundtrip_deflate_chunks("sample1.bin.gz"); + } - let compressed = context.process_vec_size(&original, 20001).unwrap(); + #[test] + fn roundtrip_png_chunks2() { + roundtrip_deflate_chunks("starcontrol.samplesave"); + } - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, 20001).unwrap(); + #[test] + fn roundtrip_small_chunk() { + use crate::utils::{assert_eq_array, read_file}; - assert_eq_array(&original, &recreated); -} + let original = read_file("pptxplaintext.zip"); -#[test] -fn roundtrip_small_plain_text() { - use crate::utils::{assert_eq_array, read_file}; + let mut context = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 100000, + max_chunk_size: 100000, + total_plain_text_limit: u64::MAX, + ..Default::default() + }, + 1, + false, + ); - let original = read_file("pptxplaintext.zip"); + let compressed = context.process_vec_size(&original, 20001).unwrap(); - let mut context = PreflateContainerProcessor::new(&PreflateContainerConfig { - min_chunk_size: 100000, - max_chunk_size: 100000, - total_plain_text_limit: u64::MAX, - ..Default::default() - }, 1, false); + let mut context = RecreateContainerProcessor::new(usize::MAX); + let recreated = context.process_vec_size(&compressed, 20001).unwrap(); - let compressed = context.process_vec_size(&original, 2001).unwrap(); + assert_eq_array(&original, &recreated); + } - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, 2001).unwrap(); + #[test] + fn roundtrip_small_plain_text() { + use crate::utils::{assert_eq_array, read_file}; - assert_eq_array(&original, &recreated); -} + let original = read_file("pptxplaintext.zip"); -#[test] -fn roundtrip_zstd_per_block() { - use crate::utils::{assert_eq_array, read_file}; + let mut context = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 100000, + max_chunk_size: 100000, + total_plain_text_limit: u64::MAX, + ..Default::default() + }, + 1, + false, + ); - let original = read_file("samplezip.zip"); + let compressed = context.process_vec_size(&original, 2001).unwrap(); - let mut context = PreflateContainerProcessor::new( - &PreflateContainerConfig::default(), - 1, - false, - ); + let mut context = RecreateContainerProcessor::new(usize::MAX); + let recreated = context.process_vec_size(&compressed, 2001).unwrap(); + + assert_eq_array(&original, &recreated); + } - let compressed = context.process_vec(&original).unwrap(); + #[test] + fn roundtrip_zstd_per_block() { + use crate::utils::{assert_eq_array, read_file}; - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec(&compressed).unwrap(); + let original = read_file("samplezip.zip"); - assert_eq_array(&original, &recreated); + let mut context = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + + let compressed = context.process_vec(&original).unwrap(); + + let mut context = RecreateContainerProcessor::new(usize::MAX); + let recreated = context.process_vec(&compressed).unwrap(); + + assert_eq_array(&original, &recreated); + } + + // ── Block type bit-field tests ─────────────────────────────────────────────── + + /// Parse the outer framing of a v2 container and return each block's + /// (compression_bits, block_type_bits) in order, stopping after EOS. 
+ fn parse_wire_block_types(data: &[u8]) -> Vec<(u8, u8)> { + let mut cursor = std::io::Cursor::new(data); + let version = cursor.read_u8().unwrap(); + assert_eq!(version, COMPRESSED_WRAPPER_VERSION_2); + let mut blocks = Vec::new(); + while (cursor.position() as usize) < data.len() { + let type_byte = cursor.read_u8().unwrap(); + let compression = type_byte & BLOCK_COMPRESSION_MASK; + let block_type = type_byte & BLOCK_TYPE_MASK; + blocks.push((compression, block_type)); + let size = read_varint(&mut cursor).unwrap() as u64; + cursor.set_position(cursor.position() + size); + if compression == BLOCK_COMPRESSION_ZSTD && block_type == BLOCK_TYPE_EOS { + break; + } + } + blocks + } + + /// Feed `stream` to the decoder with input_complete=true and assert the + /// error exit code matches `expected`. + fn assert_decoder_fails(stream: &[u8], expected: preflate_rs::ExitCode) { + let mut ctx = RecreateContainerProcessor::new(usize::MAX); + let mut out = Vec::new(); + let err = ctx + .process_buffer(stream, true, &mut out) + .expect_err("expected an error, but decoder returned Ok"); + assert_eq!( + err.exit_code(), + expected, + "wrong exit code for stream {stream:02X?}" + ); + } + + /// The two masks must partition the byte: non-overlapping and together covering all 8 bits. + /// Every content-kind constant must sit entirely within BLOCK_TYPE_MASK, and every + /// compression constant within BLOCK_COMPRESSION_MASK. + #[test] + fn test_bit_field_masks_partition_byte() { + assert_eq!( + BLOCK_COMPRESSION_MASK | BLOCK_TYPE_MASK, + 0xFF, + "masks do not cover all bits" + ); + assert_eq!( + BLOCK_COMPRESSION_MASK & BLOCK_TYPE_MASK, + 0x00, + "masks overlap" + ); + for kind in [ + BLOCK_TYPE_LITERAL, + BLOCK_TYPE_DEFLATE, + BLOCK_TYPE_PNG, + BLOCK_TYPE_DEFLATE_CONTINUE, + BLOCK_TYPE_JPEG_LEPTON, + BLOCK_TYPE_WEBP, + BLOCK_TYPE_EOS, + ] { + assert_eq!( + kind & BLOCK_COMPRESSION_MASK, + 0, + "BLOCK_TYPE 0x{kind:02X} bleeds into compression bits" + ); + } + for comp in [BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD] { + assert_eq!( + comp & BLOCK_TYPE_MASK, + 0, + "BLOCK_COMPRESSION 0x{comp:02X} bleeds into type bits" + ); + } + } + + /// The combined (compression | kind) wire bytes must match the expected values + /// documented in CLAUDE.md. This catches accidental constant drift. + #[test] + fn test_combined_wire_values() { + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, 0x40); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE, 0x41); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_PNG, 0x42); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE_CONTINUE, 0x43); + assert_eq!(BLOCK_COMPRESSION_NONE | BLOCK_TYPE_JPEG_LEPTON, 0x04); + assert_eq!(BLOCK_COMPRESSION_NONE | BLOCK_TYPE_WEBP, 0x05); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_EOS, 0x7F); + } + + /// Reserved compression bits 0x80 (10xx_xxxx) must be rejected by the decoder. + #[test] + fn test_decoder_rejects_reserved_compression_bits_10() { + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, 0x80], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// Reserved compression bits 0xC0 (11xx_xxxx) must be rejected by the decoder. + #[test] + fn test_decoder_rejects_reserved_compression_bits_11() { + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, 0xC0], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// BLOCK_COMPRESSION_NONE | BLOCK_TYPE_LITERAL (0x00) must be rejected: + /// literal blocks are Zstd-only; there is no raw literal block type. 
+ #[test] + fn test_decoder_rejects_raw_literal_block_type() { + let byte = BLOCK_COMPRESSION_NONE | BLOCK_TYPE_LITERAL; // == 0x00 + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, byte], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// Any BLOCK_COMPRESSION_NONE byte that is not JPEG_LEPTON or WEBP must be rejected. + #[test] + fn test_decoder_rejects_undefined_raw_block_types() { + // 0x10 is arbitrary: not 0x04 (JPEG) or 0x05 (WEBP) + let byte = BLOCK_COMPRESSION_NONE | 0x10; + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, byte], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// Compressing plain bytes (no embedded DEFLATE streams) must produce a stream + /// whose first block carries BLOCK_COMPRESSION_ZSTD and BLOCK_TYPE_LITERAL. + #[test] + fn test_encoder_literal_block_carries_zstd_compression_bit() { + let input = vec![0xABu8; 512]; + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = ctx.process_vec(&input).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + assert!( + !blocks.is_empty(), + "expected at least one block in the output" + ); + assert_eq!( + blocks[0], + (BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_LITERAL), + "first block should be a Zstd literal block" + ); + } + + /// The EOS block that closes the Zstd frame must always use BLOCK_COMPRESSION_ZSTD. + #[test] + fn test_encoder_eos_uses_zstd_compression_bit() { + // Plain bytes with no DEFLATE streams → [version][literal][EOS]. + let input = vec![0xABu8; 64]; + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = ctx.process_vec(&input).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + assert_eq!( + blocks.last(), + Some(&(BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_EOS)), + "last block must be the Zstd EOS marker" + ); + } + + /// Every block type byte in a real compressed output must have compression bits + /// of either BLOCK_COMPRESSION_NONE or BLOCK_COMPRESSION_ZSTD — never the + /// reserved patterns 0x80 or 0xC0. + #[test] + fn test_encoder_never_emits_reserved_compression_bits() { + let input = crate::utils::read_file("samplezip.zip"); + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = ctx.process_vec(&input).unwrap(); + + for &(compression, _) in &parse_wire_block_types(&compressed) { + assert!( + compression == BLOCK_COMPRESSION_NONE || compression == BLOCK_COMPRESSION_ZSTD, + "found reserved compression bits 0x{compression:02X} in output" + ); + } + } + + /// Verify that the decoder extracts the lower 6 bits as block_type rather + /// than passing the full byte to process_compressed_block. If it passed the + /// full byte (0x41) instead of the kind bits (0x01), the match would fall + /// through to the error arm and the round-trip would fail. + #[test] + fn test_decoder_strips_compression_bits_before_dispatch() { + use crate::utils::{assert_eq_array, read_file}; + // A zip file exercises DEFLATE blocks (wire type 0x41 = ZSTD|DEFLATE). + // A successful round-trip proves the decoder is matching on 0x01, not 0x41. + let original = read_file("samplezip.zip"); + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + // Confirm the stream actually contains DEFLATE blocks (type 0x41), + // so the test is meaningful and not trivially passing. 
+ let has_deflate = parse_wire_block_types(&compressed) + .iter() + .any(|&(c, t)| c == BLOCK_COMPRESSION_ZSTD && t == BLOCK_TYPE_DEFLATE); + assert!( + has_deflate, + "test file produced no DEFLATE blocks — test is vacuous" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// A PNG file must produce at least one PNG or WebP IDAT block (not merely a DEFLATE + /// block), and must round-trip to the original bytes. The PNG code path in the encoder + /// is distinct from the plain DEFLATE path: it reconstructs IDAT framing and, when the + /// `webp` feature is enabled, may store pixels as WebP lossless instead of raw. + #[test] + fn test_png_produces_idat_block_and_roundtrips() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("treegdi.png"); + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + let has_png_block = blocks + .iter() + .any(|&(_, t)| t == BLOCK_TYPE_PNG || t == BLOCK_TYPE_WEBP); + assert!( + has_png_block, + "PNG input should produce at least one PNG (0x02) or WebP (0x05) block, \ + got: {blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// A PDF containing embedded JPEG images must produce JPEG_LEPTON blocks (raw, + /// outside Zstd) as well as DEFLATE blocks for the PDF's own compressed object + /// streams. Both must survive a full round-trip. + #[test] + fn test_pdf_with_jpegs_produces_lepton_and_deflate_blocks_and_roundtrips() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("embedded-images.pdf"); + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + + let has_lepton = blocks + .iter() + .any(|&(c, t)| c == BLOCK_COMPRESSION_NONE && t == BLOCK_TYPE_JPEG_LEPTON); + assert!( + has_lepton, + "PDF with embedded JPEGs should produce at least one JPEG_LEPTON block" + ); + + let has_deflate = blocks + .iter() + .any(|&(_, t)| t == BLOCK_TYPE_DEFLATE); + assert!( + has_deflate, + "PDF with embedded JPEGs should also produce DEFLATE blocks for compressed objects" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// DEFLATE_CONTINUE blocks are produced when the compressed-data buffer is + /// truncated mid-stream: `DeflateParser::parse` reads to EOF and returns + /// `Ok` with `is_done()=false`, the encoder emits a DEFLATE block for the + /// plaintext decoded so far, saves the mid-stream state, and resumes on + /// subsequent calls via DEFLATE_CONTINUE blocks. + /// + /// `sample1.bin.gz` is a single gzip stream with ~418 KiB of uncompressed + /// content. Feeding it in 10 KiB slices (with `min_chunk_size=5000` so the + /// processor starts immediately) means the scanner always sees only a + /// partial window of the compressed stream, forcing many DEFLATE_CONTINUE + /// blocks that must all round-trip correctly. 
+ #[test] + fn test_deflate_continue_blocks_appear_and_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("sample1.bin.gz"); + // min_chunk_size: 0 so the loop processes data immediately after Start, + // letting Searching run with the first truncated chunk rather than waiting + // for an additional min_chunk_size bytes before beginning. + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + // Feed the 263 KiB file in two pieces. The first piece (200 KiB) truncates + // the DEFLATE stream mid-way; decompress() hits EOF with at least one + // complete block already parsed, so it returns Ok(partial) / is_done()=false, + // causing the encoder to emit a DEFLATE block and enter DeflateContinue. + // The second piece completes the stream → DEFLATE_CONTINUE block. + let mut compressed = Vec::new(); + { + let chunk1 = &original[..200_000.min(original.len())]; + enc.process_buffer(chunk1, false, &mut compressed).unwrap(); + if original.len() > 200_000 { + let chunk2 = &original[200_000..]; + enc.process_buffer(chunk2, false, &mut compressed).unwrap(); + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + } + + let blocks = parse_wire_block_types(&compressed); + let n_continue = blocks + .iter() + .filter(|&&(_, t)| t == BLOCK_TYPE_DEFLATE_CONTINUE) + .count(); + assert!( + n_continue > 0, + "200 KiB chunks on a ~263 KiB gzip should force at least one DEFLATE_CONTINUE block; \ + blocks seen: {blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec_size(&compressed, 10_000).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// When `total_plain_text_limit` is exceeded the encoder stops analysing + /// deflate streams and writes the remaining bytes as LITERAL blocks. The + /// decoder must still reproduce the original bytes exactly, including the + /// unprocessed portion. + #[test] + fn test_total_plain_text_limit_forces_literal_fallback_and_roundtrips() { + use crate::utils::{assert_eq_array, read_file}; + // samplezip.zip has several DEFLATE entries; setting the limit to 1 byte + // ensures that after the first DEFLATE entry's plaintext is accumulated, + // every subsequent scan sees total_plain_text_seen > limit and falls back + // to writing remaining content as a single LITERAL block. + let original = read_file("samplezip.zip"); + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + total_plain_text_limit: 1, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let compressed = enc.process_vec(&original).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + + // At least one LITERAL block must appear (the fallback content). + let has_literal = blocks.iter().any(|&(_, t)| t == BLOCK_TYPE_LITERAL); + assert!( + has_literal, + "after total_plain_text_limit is exceeded, remaining content must be LITERAL" + ); + + // The stream must still decode back to the original bytes. 
+ let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } } diff --git a/container/src/pdf_parse.rs b/container/src/pdf_parse.rs new file mode 100644 index 0000000..be3a83a --- /dev/null +++ b/container/src/pdf_parse.rs @@ -0,0 +1,473 @@ +use preflate_rs::{PreflateError, Result, err_exit_code}; + +pub fn pdf_to_utf8(input: &[u8]) -> String { + input + .iter() + .map(|&b| { + match b { + 0x00..=0x7F => b as char, // ASCII range same + 0x80 => '\u{20AC}', // EURO SIGN + 0x81..=0x8C | 0x8E..=0x9F => '\u{FFFD}', // Undefined mappings → replacement char + 0xA0..=0xFF => { + // Map selectively or fallback to Latin1 + match b { + 0xA9 => '\u{00A9}', // © + 0xAD => '\u{2013}', // en dash + 0xAF => '\u{2014}', // em dash + 0xD0 => '\u{2020}', // dagger + 0xD1 => '\u{2021}', // double dagger + 0xD2 => '\u{2022}', // bullet + 0xD3 => '\u{2026}', // ellipsis + 0xFE => '\u{00A0}', // non-breaking space + 0xFF => '\u{2028}', // line separator + _ => b as char, // Latin-1 fallback for others + } + } + _ => '\u{FFFD}', // replacement char for unmapped + } + }) + .collect() +} + +fn decode_pdf_string(data: &[u8]) -> Result { + if data.len() >= 2 && data[0] == 0xFE && data[1] == 0xFF { + // UTF-16BE with BOM + if (data.len() - 2) % 2 != 0 { + return err_exit_code( + preflate_rs::ExitCode::InvalidIDat, + "Invalid UTF-16BE string length", + ); + } + let utf16_data: Vec = data[2..] + .chunks(2) + .map(|chunk| (chunk[0] as u16) << 8 | (chunk[1] as u16)) + .collect(); + + String::from_utf16(&utf16_data).map_err(|e| { + PreflateError::new( + preflate_rs::ExitCode::InvalidIDat, + format!("UTF-16 decode error: {}", e), + ) + }) + } else { + // PDFDocEncoding fallback + Ok(pdf_to_utf8(data)) + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum PdfValue { + Name(String), + String(String), + Number(f64), + Boolean(bool), + Null, + // Optional: add later + // Array(Vec), + // Dictionary(HashMap), +} + +use std::collections::HashMap; + +pub fn parse_pdf_dictionary(input: &[u8]) -> Result> { + let mut result = HashMap::new(); + let mut pos = 0; + + // Skip leading '<<' + if input.starts_with(b"<<") { + pos += 2; + } + + while pos < input.len() { + skip_whitespace(input, &mut pos); + + if pos >= input.len() || input[pos] != b'/' { + break; + } + + // Parse key + let key_start = pos + 1; + let mut key_end = key_start; + while key_end < input.len() + && !is_whitespace(input[key_end]) + && !is_delimiter(input[key_end]) + { + key_end += 1; + } + + let key_bytes = &input[key_start..key_end]; + let key = pdf_to_utf8(key_bytes); + pos = key_end; + + skip_whitespace(input, &mut pos); + + // Parse value + let value = match input.get(pos) { + Some(b'/') => { + let (name, consumed) = parse_name(&input[pos..]); + pos += consumed; + PdfValue::Name(name) + } + Some(b'(') => { + let (bytes, consumed) = parse_literal_string(&input[pos..])?; + pos += consumed; + PdfValue::String(decode_pdf_string(&bytes)?) 
+ } + Some(b'-') | Some(b'+') | Some(b'0'..=b'9') => { + let (number, consumed) = parse_number(&input[pos..])?; + pos += consumed; + PdfValue::Number(number) + } + Some(b't') if input.len() >= pos + 4 && &input[pos..pos + 4] == b"true" => { + pos += 4; + PdfValue::Boolean(true) + } + Some(b'f') if input.len() >= pos + 5 && &input[pos..pos + 5] == b"false" => { + pos += 5; + PdfValue::Boolean(false) + } + Some(b'n') if input.len() >= pos + 4 && &input[pos..pos + 4] == b"null" => { + pos += 4; + PdfValue::Null + } + _ => { + // Unknown or unsupported type — skip + break; + } + }; + + result.insert(key, value); + } + + Ok(result) +} + +fn skip_whitespace(input: &[u8], pos: &mut usize) { + while *pos < input.len() && is_whitespace(input[*pos]) { + *pos += 1; + } +} + +fn parse_literal_string(input: &[u8]) -> Result<(Vec, usize)> { + let mut output = Vec::new(); + let mut pos = 1; // skip '(' + let mut depth = 1; + let mut escape = false; + + while pos < input.len() { + let byte = input[pos]; + pos += 1; + + if escape { + match byte { + b'n' => output.push(b'\n'), + b'r' => output.push(b'\r'), + b't' => output.push(b'\t'), + b'b' => output.push(0x08), + b'f' => output.push(0x0C), + b'(' => output.push(b'('), + b')' => output.push(b')'), + b'\\' => output.push(b'\\'), + b'0'..=b'7' => { + let mut octal = vec![byte]; + for _ in 0..2 { + if let Some(&next) = input.get(pos) { + if next >= b'0' && next <= b'7' { + octal.push(next); + pos += 1; + } else { + break; + } + } + } + if let Ok(val) = + u8::from_str_radix(std::str::from_utf8(&octal).unwrap_or("0"), 8) + { + output.push(val); + } + } + other => output.push(other), + } + escape = false; + } else if byte == b'\\' { + escape = true; + } else if byte == b'(' { + depth += 1; + output.push(b'('); + } else if byte == b')' { + depth -= 1; + if depth == 0 { + break; + } + output.push(b')'); + } else { + output.push(byte); + } + } + + Ok((output, pos)) +} + +fn parse_name(input: &[u8]) -> (String, usize) { + let mut end = 1; // skip '/' + while end < input.len() && !is_whitespace(input[end]) && !is_delimiter(input[end]) { + end += 1; + } + let name_bytes = &input[1..end]; + (pdf_to_utf8(name_bytes), end) +} + +fn parse_number(input: &[u8]) -> Result<(f64, usize)> { + let mut end = 0; + while end < input.len() + && (input[end] == b'.' 
+ || input[end] == b'-' + || input[end] == b'+' + || input[end].is_ascii_digit()) + { + end += 1; + } + let number_str = std::str::from_utf8(&input[..end])?; + let number = number_str.parse()?; + Ok((number, end)) +} + +fn is_whitespace(b: u8) -> bool { + matches!(b, b'\x00' | b'\x09' | b'\x0A' | b'\x0C' | b'\x0D' | b' ') +} + +fn is_delimiter(b: u8) -> bool { + matches!(b, b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'/' | b'%') +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_empty_dictionary() { + let input = b"<< >>"; + let result = parse_pdf_dictionary(input).unwrap(); + assert!(result.is_empty()); + } + + #[test] + fn test_string_value() { + let input = b"<< /Title (RustLang) >>"; + let result = parse_pdf_dictionary(input).unwrap(); + assert_eq!( + result.get("Title"), + Some(&PdfValue::String("RustLang".to_string())) + ); + } + + #[test] + fn test_string_with_escape_sequences() { + let input = b"<< /Note (Line\\nBreak\\tTabbed\\rReturn) >>"; + let result = parse_pdf_dictionary(input).unwrap(); + assert_eq!( + result.get("Note"), + Some(&PdfValue::String("Line\nBreak\tTabbed\rReturn".to_string())) + ); + } + + #[test] + fn test_string_with_octal_escape() { + let input = b"<< /Data (Hello\\040World) >>"; + let result = parse_pdf_dictionary(input).unwrap(); + assert_eq!( + result.get("Data"), + Some(&PdfValue::String("Hello World".to_string())) + ); + } + + #[test] + fn test_nested_parentheses() { + let input = b"<< /Comment (This (is) nested) >>"; + let result = parse_pdf_dictionary(input).unwrap(); + assert_eq!( + result.get("Comment"), + Some(&PdfValue::String("This (is) nested".to_string())) + ); + } + + #[test] + fn test_name_value() { + let input = b"<< /Author /Alice >>"; + let result = parse_pdf_dictionary(input).unwrap(); + assert_eq!( + result.get("Author"), + Some(&PdfValue::Name("Alice".to_string())) + ); + } + + #[test] + fn test_utf16_value() + { + let input = b"<< /Title (\xFE\xFF\x00R\x00u\x00s\x00t) >>"; // "Rust" in UTF-16BE with BOM + let result = parse_pdf_dictionary(input).unwrap(); + assert_eq!( + result.get("Title"), + Some(&PdfValue::String("Rust".to_string())) + ); + } + + #[test] + fn test_boolean_values() { + let input = b"<< /Enabled true /Visible false >>"; + let result = parse_pdf_dictionary(input).unwrap(); + assert_eq!(result.get("Enabled"), Some(&PdfValue::Boolean(true))); + assert_eq!(result.get("Visible"), Some(&PdfValue::Boolean(false))); + } + + #[test] + fn test_null_value() { + let input = b"<< /Deleted null >>"; + let result = parse_pdf_dictionary(input).unwrap(); + assert_eq!(result.get("Deleted"), Some(&PdfValue::Null)); + } + + #[test] + fn test_number_values() { + let input = b"<< /Count 42 /Negative -7 /Float 3.14 >>"; + let result = parse_pdf_dictionary(input).unwrap(); + assert_eq!(result.get("Count"), Some(&PdfValue::Number(42.0))); + assert_eq!(result.get("Negative"), Some(&PdfValue::Number(-7.0))); + assert_eq!(result.get("Float"), Some(&PdfValue::Number(3.14))); + } + + #[test] + fn test_multiple_mixed_values() { + let input = b"<< /Title (Rust) /Author /Bob /Pages 100 /Active true /Removed null >>"; + let result = parse_pdf_dictionary(input).unwrap(); + + assert_eq!( + result.get("Title"), + Some(&PdfValue::String("Rust".to_string())) + ); + assert_eq!( + result.get("Author"), + Some(&PdfValue::Name("Bob".to_string())) + ); + assert_eq!(result.get("Pages"), Some(&PdfValue::Number(100.0))); + assert_eq!(result.get("Active"), Some(&PdfValue::Boolean(true))); + assert_eq!(result.get("Removed"), Some(&PdfValue::Null)); 
+ } + + #[test] + fn test_invalid_key_skips_parsing() { + let input = b"<< Title (MissingSlash) >>"; // missing '/' + let result = parse_pdf_dictionary(input).unwrap(); + assert!(result.is_empty()); + } + + #[test] + fn test_incomplete_string_does_not_panic() { + let input = b"<< /Broken (This is incomplete >>"; + let _ = parse_pdf_dictionary(input).unwrap(); // shouldn't panic + } + + #[test] + fn test_ascii_identity() { + let input = b"Hello, World!"; + let expected = "Hello, World!"; + assert_eq!(pdf_to_utf8(input), expected); + } + + #[test] + fn test_copyright_symbol() { + let input = &[0xA9]; // © + let expected = "\u{00A9}"; + assert_eq!(pdf_to_utf8(input), expected); + } + + #[test] + fn test_en_and_em_dash() { + let input = &[0xAD, 0xAF]; // en dash, em dash + let expected = "\u{2013}\u{2014}"; // –— + assert_eq!(pdf_to_utf8(input), expected); + } + + #[test] + fn test_typographic_characters() { + let input = &[0xD0, 0xD1, 0xD2, 0xD3]; // †‡•… + let expected = "\u{2020}\u{2021}\u{2022}\u{2026}"; + assert_eq!(pdf_to_utf8(input), expected); + } + + #[test] + fn test_euro_sign() { + let input = &[0x80]; // € + let expected = "\u{20AC}"; + assert_eq!(pdf_to_utf8(input), expected); + } + + #[test] + fn test_nonbreaking_space() { + let input = &[0xFE]; + let expected = "\u{00A0}"; + assert_eq!(pdf_to_utf8(input), expected); + } + + #[test] + fn test_line_separator() { + let input = &[0xFF]; + let expected = "\u{2028}"; + assert_eq!(pdf_to_utf8(input), expected); + } + + #[test] + fn test_unknown_byte_gives_replacement_char() { + let input = &[0x90]; // Undefined in PDFDocEncoding + let expected = "\u{FFFD}"; // Replacement character + assert_eq!(pdf_to_utf8(input), expected); + } + + #[test] + fn test_mixed_ascii_and_pdfdoc_chars() { + let input = &[b'H', b'i', b' ', 0xA9, b' ', 0x80]; // "Hi © €" + let expected = "Hi \u{00A9} \u{20AC}"; + assert_eq!(pdf_to_utf8(input), expected); + } + + #[test] + fn test_parse_dictionary_with_pdfdoc_encoding_characters() { + let input = b"<< + /Title (Rust Programming \xA9 2025) + /Note (\x80 price - valid until \xD3) + /Dash (\xAD\xAF) + /Fancy (\xD0\xD1\xD2) + /SpaceTest (\xFE\xFF) >>"; + + let result = parse_pdf_dictionary(input).unwrap(); + + assert_eq!( + result.get("Title"), + Some(&PdfValue::String( + "Rust Programming \u{00A9} 2025".to_string() + )) + ); + + assert_eq!( + result.get("Note"), + Some(&PdfValue::String( + "\u{20AC} price - valid until \u{2026}".to_string() + )) + ); + + assert_eq!( + result.get("Dash"), + Some(&PdfValue::String("\u{2013}\u{2014}".to_string())) + ); + + assert_eq!( + result.get("Fancy"), + Some(&PdfValue::String("\u{2020}\u{2021}\u{2022}".to_string())) + ); + + assert_eq!( + result.get("SpaceTest"), + Some(&PdfValue::String("\u{00A0}\u{2028}".to_string())) + ); + } +} diff --git a/container/src/utils.rs b/container/src/utils.rs index 0278ad0..763b415 100644 --- a/container/src/utils.rs +++ b/container/src/utils.rs @@ -174,7 +174,7 @@ pub fn process_limited_buffer( #[test] fn test_process_limited_buffer() { - let mut p = crate::container_processor::NopProcessBuffer {}; + let mut p = crate::container_processor::test::NopProcessBuffer {}; let input = b"Hello, world!"; let mut output = [0u8; 5]; diff --git a/preflate/src/stream_processor.rs b/preflate/src/stream_processor.rs index 813abe6..717ca49 100644 --- a/preflate/src/stream_processor.rs +++ b/preflate/src/stream_processor.rs @@ -876,3 +876,115 @@ fn verify_partial_blocks() { ); } } + +/// Replicates exactly what `scan_deflate::find_compressable_stream` does when 
it +/// encounters a gzip stream in a truncated 200 KiB content buffer: +/// +/// 1. strip the 10-byte gzip header (and 8-byte footer) to obtain the raw DEFLATE body +/// 2. call `decompress` with only the first ~190 KB of that body (what fits in the +/// 200 KiB content window after the header) +/// 3. assert the call returns `Ok` and `is_done() == false` ← DeflateContinue requires this +/// 4. call `decompress` again with the remainder of the body, assert `is_done() == true` +/// 5. reconstruct the original DEFLATE bytes and assert roundtrip identity +#[test] +fn verify_decompress_partial_gzip_deflate_body_roundtrip() { + crate::init_logging(); + + // sample1.bin.gz: 263 972 bytes total; gzip header = 10 bytes, footer = 8 bytes + // → raw DEFLATE body = 263 954 bytes + let gzip = crate::utils::read_file("sample1.bin.gz"); + assert!(gzip.len() > 18, "gzip file too short"); + + // Gzip header for this file has no extra flags, so it is exactly 10 bytes. + let deflate_start: usize = 10; + let deflate_end: usize = gzip.len() - 8; + let deflate_body = &gzip[deflate_start..deflate_end]; + + // Mimic what find_compressable_stream does: content buffer contains the first + // 200 000 bytes of the gzip file, and the DEFLATE body starts at offset 10, + // so the slice passed to decompress() is bytes [10..200000] = 199 990 bytes. + let content_window: usize = 200_000 - deflate_start; // 199 990 + assert!( + content_window < deflate_body.len(), + "content window must truncate the DEFLATE body (window={content_window}, body={})", + deflate_body.len() + ); + + let mut state = PreflateStreamProcessor::new(&PreflateConfig::default()); + + // ── First call: truncated slice ────────────────────────────────────────── + let r1 = state.decompress(&deflate_body[..content_window]); + let r1 = match r1 { + Ok(r) => r, + Err(e) => panic!( + "decompress on truncated DEFLATE body ({content_window} B of {} B) returned \ + Err({e:?}); expected Ok(partial) with is_done()=false", + deflate_body.len() + ), + }; + assert!( + !state.is_done(), + "is_done() must be false after truncated first call; compressed_size={}", + r1.compressed_size + ); + assert!( + r1.compressed_size > 0, + "at least one block must have been consumed" + ); + assert!( + r1.compressed_size <= content_window, + "compressed_size ({}) must not exceed the slice length ({content_window})", + r1.compressed_size + ); + println!( + "first call: compressed_size={} / {content_window} blocks={}", + r1.compressed_size, + r1.blocks.len() + ); + + let first_corrections = r1.corrections.clone(); + let first_plain_text = state.plain_text().text().to_vec(); + state.shrink_to_dictionary(); + + // ── Second call: remainder ──────────────────────────────────────────────── + let offset = r1.compressed_size; + let r2 = state.decompress(&deflate_body[offset..]); + let r2 = match r2 { + Ok(r) => r, + Err(e) => panic!( + "decompress on remainder ({} B) returned Err({e:?})", + deflate_body.len() - offset + ), + }; + assert!( + state.is_done(), + "is_done() must be true after consuming the full body" + ); + println!( + "second call: compressed_size={} blocks={}", + r2.compressed_size, + r2.blocks.len() + ); + + let second_corrections = r2.corrections.clone(); + let second_plain_text = state.plain_text().text().to_vec(); + + // ── Roundtrip: reconstruct the original bytes ───────────────────────────── + let mut reconstruct = RecreateStreamProcessor::new(); + let (mut recompressed, _) = reconstruct + .recompress( + &mut std::io::Cursor::new(&first_plain_text), + 
&first_corrections, + ) + .expect("recompress chunk 1 failed"); + + let (mut rest, _) = reconstruct + .recompress( + &mut std::io::Cursor::new(&second_plain_text), + &second_corrections, + ) + .expect("recompress chunk 2 failed"); + + recompressed.append(&mut rest); + crate::utils::assert_eq_array(deflate_body, &recompressed); +} diff --git a/tests/end_to_end.rs b/tests/end_to_end.rs index d7ce310..0458039 100644 --- a/tests/end_to_end.rs +++ b/tests/end_to_end.rs @@ -80,12 +80,14 @@ fn test_container(filename: &str) { let mut c = Vec::new(); let mut ctx = PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 4, false); - ctx.copy_to_end(&mut std::io::Cursor::new(&v), &mut c).unwrap(); + ctx.copy_to_end(&mut std::io::Cursor::new(&v), &mut c) + .unwrap(); let stats = ctx.stats(); let mut r = Vec::new(); let mut ctx = RecreateContainerProcessor::new(128 * 1024 * 1024); - ctx.copy_to_end(&mut std::io::Cursor::new(&c), &mut r).unwrap(); + ctx.copy_to_end(&mut std::io::Cursor::new(&c), &mut r) + .unwrap(); assert!(v == r); println!( diff --git a/util/src/main.rs b/util/src/main.rs index b63943c..5d8cca4 100644 --- a/util/src/main.rs +++ b/util/src/main.rs @@ -98,11 +98,7 @@ fn main() { // open file for reading let original = fs::read(&entry).unwrap(); - let mut ctx = PreflateContainerProcessor::new( - &config, - cli.level as i32, - cli.baseline, - ); + let mut ctx = PreflateContainerProcessor::new(&config, cli.level as i32, cli.baseline); let compress_start = ProcessTime::now(); From f2394a69ba9085245c84dd4ce40c011ff9050e3a Mon Sep 17 00:00:00 2001 From: Kristof Date: Wed, 25 Feb 2026 16:32:41 +0100 Subject: [PATCH 3/8] update with claude help --- .gitignore | 6 + CLAUDE.md | 73 +++++ container/CLAUDE.md | 196 ++++++++++++ container/src/container_processor.rs | 352 ++++++++++++++++++++++ dll/CLAUDE.md | 73 +++++ fuzz/CLAUDE.md | 30 ++ preflate/CLAUDE.md | 83 +++++ samples/test_big_then_small_gzip.bin | Bin 0 -> 84 bytes samples/test_corrupted_deflate.bin | Bin 0 -> 67 bytes samples/test_gzip_with_gap.bin | Bin 0 -> 1116 bytes samples/test_random_bytes.bin | Bin 0 -> 32768 bytes samples/test_tiny_gzip.bin | Bin 0 -> 26 bytes samples/test_two_gzip_streams.bin | Bin 0 -> 116 bytes samples/test_two_zlib_streams.bin | Bin 0 -> 558 bytes samples/test_zip_3entries.zip | Bin 0 -> 628 bytes samples/test_zip_stored_then_deflated.zip | Bin 0 -> 8255 bytes tests/CLAUDE.md | 37 +++ util/CLAUDE.md | 38 +++ 18 files changed, 888 insertions(+) create mode 100644 CLAUDE.md create mode 100644 container/CLAUDE.md create mode 100644 dll/CLAUDE.md create mode 100644 fuzz/CLAUDE.md create mode 100644 preflate/CLAUDE.md create mode 100644 samples/test_big_then_small_gzip.bin create mode 100644 samples/test_corrupted_deflate.bin create mode 100644 samples/test_gzip_with_gap.bin create mode 100644 samples/test_random_bytes.bin create mode 100644 samples/test_tiny_gzip.bin create mode 100644 samples/test_two_gzip_streams.bin create mode 100644 samples/test_two_zlib_streams.bin create mode 100644 samples/test_zip_3entries.zip create mode 100644 samples/test_zip_stored_then_deflated.zip create mode 100644 tests/CLAUDE.md create mode 100644 util/CLAUDE.md diff --git a/.gitignore b/.gitignore index 73fab07..a69c6b7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,9 @@ target/ # MSVC Windows builds of rustc generate these, which store debugging information *.pdb + +# Local Claude Code settings (machine-specific) +.claude/ + +# Unreferenced / scratch files +.unref/ diff --git a/CLAUDE.md b/CLAUDE.md new file 
mode 100644 index 0000000..89acbcb --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,73 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +> **Important:** This file (and all sub-project `CLAUDE.md` files) are checked into the +> repository. Only include information that is valid for **any** developer or machine: +> project conventions, architecture, commands, constraints. **Do not** add machine-specific +> paths, personal tool preferences, local environment settings, or anything that would +> not apply to every contributor. + +## Commands + +```bash +# Build +cargo build --all +cargo build --release --all + +# Test +cargo test --all +cargo test # Run a single test by name +cargo test -- --nocapture # Show test output + +# Lint and format +cargo fmt --check --all +cargo clippy +``` + +The CI runs on `windows-latest` and builds for multiple targets: `wasm32-wasip1`, `aarch64-unknown-linux-musl`, `x86_64-pc-windows-msvc`, `x86_64-unknown-linux-gnu`. + +The release build uses Spectre mitigations (`/Qspectre /sdl`) and produces `preflate_rs_0_7.dll` and `preflate_util.exe`. + +## Architecture + +**preflate-rs** analyzes DEFLATE-compressed streams, extracts the uncompressed data plus a compact set of reconstruction parameters, and later recreates the exact original DEFLATE bitstream. This enables re-compression with modern algorithms (Zstd, Brotli) while preserving binary-exact round-trip fidelity. The key insight is detecting which compressor (zlib, libdeflate, zlib-ng, miniz, Windows zlib) produced a stream and storing only the differences from what that compressor would predict. + +### Workspace layout + +| Crate | Output | Role | +|---|---|---| +| `preflate/` | library | Core DEFLATE analysis and reconstruction | +| `container/` | library | Scans binary files (ZIP, PNG, PDF) for DEFLATE streams | +| `util/` | `preflate_util.exe` | CLI for testing on files/directories | +| `dll/` | `preflate_rs_0_7.dll` | C FFI wrapper for .NET interop | +| `fuzz/` | fuzz harnesses | libfuzzer targets | +| `tests/` | integration tests | End-to-end round-trip tests using `samples/` | + +### preflate crate (core) + +The processing pipeline in `preflate/src/stream_processor.rs`: +1. **`deflate/`** — Reads a DEFLATE bitstream into tokens (literals and length/distance back-references) and writes tokens back to DEFLATE with custom Huffman trees. +2. **`estimator/`** — Estimates the compressor's parameters (`TokenPredictorParameters`): hash algorithm, `nice_length`, `max_chain`, window bits, add policy, matching type. +3. **`token_predictor.rs`** — Replays the compression using estimated parameters and hash chains to predict what tokens the original compressor would have produced. +4. **`tree_predictor.rs`** — Predicts Huffman tree structure. +5. **`statistical_codec.rs` / `cabac_codec.rs`** — Encodes the *differences* from prediction using CABAC (Context Adaptive Binary Arithmetic Coding, shared with Lepton JPEG). +6. **`stream_processor.rs`** — Public API: `PreflateStreamProcessor::decompress()` and `RecreateStreamProcessor::recreate()`. + +Parameters are serialized via `bitcode`; corrections via CABAC. The format is chunked to bound memory use. + +### container crate + +- **`scan_deflate.rs`** — Scans raw bytes to locate DEFLATE stream boundaries, identifying stream type (raw deflate, zlib-wrapped, PNG IDAT, etc.). +- **`idat_parse.rs`** — Extracts and reassembles PNG IDAT chunks. +- **`pdf_parse.rs`** — Detects PDF compressed object streams. 
+- **`zstd_compression.rs`** — Pipelines preflate output through Zstd for final storage.
+- **`container_processor.rs`** — Orchestrates scanning → preflate → Zstd (compress) and Zstd → recreate → reassembly (decompress).
+
+The optional `webp` feature (enabled by default) allows PNG images to be stored as lossless WebP instead of as recompressed PNG IDAT data.
+
+### Code constraints
+
+- **No unsafe code** — enforced via `#![forbid(unsafe_code)]` in each crate.
+- Minimum Rust version: **1.85**, Edition **2024**.
+- `.cargo/config.toml` sets Windows MSVC linker flags (`/DYNAMICBASE`, `/CETCOMPAT`, `/guard:cf`).
diff --git a/container/CLAUDE.md b/container/CLAUDE.md
new file mode 100644
index 0000000..5544e3e
--- /dev/null
+++ b/container/CLAUDE.md
@@ -0,0 +1,196 @@
+# container (preflate-container)
+
+Scans binary files (ZIP, PNG, PDF, JPEG) for DEFLATE streams, orchestrates the
+preflate + Zstd pipeline, and reassembles the output. Only format version 2 exists
+(v1 was removed).
+
+## Public API (`lib.rs`)
+
+```rust
+// Compress a file/buffer containing embedded DEFLATE streams
+PreflateContainerProcessor::new(config, level, test_baseline) -> Self
+impl ProcessBuffer for PreflateContainerProcessor {
+    fn process_buffer(&mut self, input: &[u8], input_complete: bool) -> Result<Vec<u8>>;
+}
+
+// Decompress a preflate container back to the original file
+RecreateContainerProcessor::new(capacity) -> Self
+impl ProcessBuffer for RecreateContainerProcessor {
+    fn process_buffer(&mut self, input: &[u8], input_complete: bool) -> Result<Vec<u8>>;
+}
+
+// DLL helper: wraps process_buffer to respect a max output size per call
+fn process_limited_buffer(processor, input, input_complete, max_output) -> Result<(Vec<u8>, bool)>;
+
+// Stats after compression
+PreflateContainerProcessor::get_stats() -> &PreflateStats
+```
+
+`PreflateContainerConfig` holds knobs like `max_chain_length`, `verify_compression`, etc.
+
+## Wire Format (v2 only)
+
+### Outer framing (always raw / uncompressed)
+
+```
+[0x02]                           ← COMPRESSED_WRAPPER_VERSION_2 (1 byte, raw)
+
+Repeat for each block:
+  [type]                         ← block type byte (1 byte, raw) — see bit-field below
+  [varint(content_len)]          ← byte count of what follows (1–5 bytes, raw)
+  [content_bytes × content_len]  ← meaning depends on type (see below)
+```
+
+All framing bytes (`type`, `varint`) are written directly to the output stream —
+they are **never** inside the Zstd encoder.
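+
+As a quick orientation (not the crate's actual decoder), the outer framing can be walked
+with a loop like the one below. `read_varint` is a hypothetical helper, and the varint
+layout (little-endian base-128, high bit = continuation) is an assumption:
+
+```rust
+use std::io::{Read, Result};
+
+/// Hypothetical helper: decodes the 1-5 byte varint used for content_len.
+fn read_varint(r: &mut impl Read) -> Result<u32> {
+    let (mut value, mut shift) = (0u32, 0);
+    loop {
+        let mut b = [0u8; 1];
+        r.read_exact(&mut b)?;
+        value |= ((b[0] & 0x7F) as u32) << shift;
+        if b[0] & 0x80 == 0 {
+            return Ok(value);
+        }
+        shift += 7;
+    }
+}
+
+/// Walks the outer framing: [0x02] then repeated [type][varint(len)][content].
+fn walk_outer_framing(r: &mut impl Read) -> Result<()> {
+    let mut version = [0u8; 1];
+    r.read_exact(&mut version)?; // expect COMPRESSED_WRAPPER_VERSION_2 (0x02)
+
+    loop {
+        let mut block_type = [0u8; 1];
+        r.read_exact(&mut block_type)?;
+        let len = read_varint(r)? as usize;
+        let mut content = vec![0u8; len];
+        r.read_exact(&mut content)?; // raw bytes, a Zstd flush segment, Lepton or WebP data
+        // dispatch on block_type[0] (see the bit-field table below)
+        if block_type[0] & 0x3F == 0x3F {
+            break; // BLOCK_TYPE_EOS closes the container
+        }
+    }
+    Ok(())
+}
+```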
+ +### Block type byte bit-field + +Each block type byte encodes two fields: + +``` +Bit 7-6 BLOCK_COMPRESSION_* 00 = none/raw 01 = Zstd 10-11 = reserved +Bit 5-0 BLOCK_TYPE_* block content kind (0–63) +``` + +Mask constants (defined in `container_processor.rs`): + +| Constant | Value | Meaning | +|---|---|---| +| `BLOCK_COMPRESSION_MASK` | `0xC0` | extracts bits 7–6 | +| `BLOCK_TYPE_MASK` | `0x3F` | extracts bits 5–0 | +| `BLOCK_COMPRESSION_NONE` | `0x00` | content is raw (not Zstd) | +| `BLOCK_COMPRESSION_ZSTD` | `0x40` | content is a Zstd flush segment | + +### Block content kinds and combined wire values + +| `BLOCK_TYPE_*` | Value | Combined wire byte | Description | +|---|---|---|---| +| `BLOCK_TYPE_LITERAL` | `0x00` | `0x40` | Raw input bytes with no detectable DEFLATE stream | +| `BLOCK_TYPE_DEFLATE` | `0x01` | `0x41` | A raw/zlib DEFLATE stream (start of a new stream) | +| `BLOCK_TYPE_PNG` | `0x02` | `0x42` | A PNG IDAT stream stored without WebP | +| `BLOCK_TYPE_DEFLATE_CONTINUE` | `0x03` | `0x43` | Continuation of a DEFLATE stream that spanned a chunk boundary | +| `BLOCK_TYPE_JPEG_LEPTON` | `0x04` | `0x04` | JPEG re-compressed with Lepton; bypasses Zstd entirely | +| `BLOCK_TYPE_WEBP` | `0x05` | `0x05` | PNG image stored as WebP lossless; bypasses Zstd entirely | +| `BLOCK_TYPE_EOS` | `0x3F` | `0x7F` | The `encoder.finish()` bytes that close the Zstd stream | + +### Zstd encoder/decoder lifecycle + +- A **single persistent `zstd::stream::write::Encoder`** is created once and shared across + all Zstd-compressed blocks (compression bits `0x40`). +- After writing each block's inner payload into the encoder, `encoder.flush()` is called, + which emits a Zstd `ZSTD_e_flush` segment. Those bytes are what get stored as + `content_bytes` in the outer framing. +- Each flush segment is decodable in sequence: the decoder is a persistent + `zstd::stream::raw::Decoder` that maintains cross-block history, so compression + quality benefits from all previously seen blocks. +- The `BLOCK_TYPE_EOS` (`0x7F`) end-of-stream block carries the `encoder.finish()` output + that closes the Zstd frame cleanly. No decompressed bytes are expected from it. + +### Inner payload layout (inside Zstd, after decompression) + +**`BLOCK_TYPE_LITERAL` (wire `0x40`)** +``` +varint(data_len) +data[data_len] ← verbatim bytes from the original input +``` + +**`BLOCK_TYPE_DEFLATE` (wire `0x41`) and `BLOCK_TYPE_DEFLATE_CONTINUE` (wire `0x43`)** +``` +varint(corrections_len) +varint(plaintext_len) +corrections[corrections_len] ← CABAC-encoded differences from predicted tokens +plaintext[plaintext_len] ← uncompressed data +``` +`BLOCK_TYPE_DEFLATE_CONTINUE` has the same layout; the decoder reuses the +`RecreateStreamProcessor` state from the preceding `BLOCK_TYPE_DEFLATE` block. 
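+
+For the two DEFLATE payloads just described, splitting the Zstd-decompressed bytes back
+into corrections and plaintext is mechanical. A minimal sketch, reusing the hypothetical
+`read_varint` helper from the framing example above (a real reader would also bounds-check
+both lengths before slicing):
+
+```rust
+/// Splits a decompressed BLOCK_TYPE_DEFLATE / _CONTINUE payload into
+/// (corrections, plaintext) following the layout above.
+fn split_deflate_payload(mut payload: &[u8]) -> std::io::Result<(Vec<u8>, Vec<u8>)> {
+    let corrections_len = read_varint(&mut payload)? as usize;
+    let plaintext_len = read_varint(&mut payload)? as usize;
+
+    let corrections = payload[..corrections_len].to_vec();
+    let plaintext = payload[corrections_len..corrections_len + plaintext_len].to_vec();
+    Ok((corrections, plaintext))
+}
+```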
+ +**`BLOCK_TYPE_PNG` (wire `0x42`) — non-WebP path** +``` +varint(corrections_len) +varint(plaintext_len) +IdatContents metadata: + varint(chunk_size_1) … varint(chunk_size_N) varint(0) ← IDAT chunk size list (0-terminated) + zlib_header[2] + addler32[4] + 0xFF ← sentinel: no png_header present +corrections[corrections_len] +plaintext[plaintext_len] ← raw unfiltered pixel data +``` + +### Raw block payload layout (outside Zstd) + +**`BLOCK_TYPE_JPEG_LEPTON` (wire `0x04`)** +``` +lepton_bytes[content_len] ← Lepton-compressed JPEG; decoded by lepton_jpeg::decode_lepton() +``` + +**`BLOCK_TYPE_WEBP` (wire `0x05`)** +``` +varint(corrections_len) +varint(webp_data_len) +IdatContents metadata: + varint(chunk_size_1) … varint(chunk_size_N) varint(0) + zlib_header[2] + addler32[4] + color_type[1] ← PngColorType (RGB=2, RGBA=6) + varint(width) + varint(height) +filters[height] ← PNG row filter bytes (one per row) +corrections[corrections_len] +webp_data[webp_data_len] ← WebP lossless encoded pixel data +``` +On decode, the WebP bytes are decompressed back to pixels, PNG filters are re-applied, +and the result is re-deflated using the corrections to recreate the original IDAT stream. + +## Idempotent Finalization (important bug history) + +`process_buffer` may be called with `input_complete=true` multiple times (DLL pattern). +The finalization block must guard against double-finalization: + +```rust +if input_complete && !self.input_complete { // NOT just `if input_complete` + self.input_complete = true; + // ... encoder.take().unwrap() +} +``` + +## Module Layout + +``` +src/ + lib.rs ← public types and re-exports + container_processor.rs ← PreflateContainerProcessor, RecreateContainerProcessor, + V2BlockInfo enum, process_v2_compressed_block() + scan_deflate.rs ← locates DEFLATE stream boundaries in raw bytes + identifies: raw deflate, zlib-wrapped, PNG IDAT, ZIP, JPEG, PDF + idat_parse.rs ← extracts / reassembles PNG IDAT chunks; parses IHDR + pdf_parse.rs ← detects PDF FlateDecode compressed object streams + zstd_compression.rs ← ZstdCompressContext / ZstdDecompressContext (internal) + scoped_read.rs ← bounded reader adapter + utils.rs ← process_limited_buffer() and other helpers +``` + +## Key Internal Types + +| Type | Purpose | +|---|---| +| `V2BlockInfo` | `Compressed(u8)` or `Jpeg(Vec)` — one entry per scanned block | +| `MeasureWriteSink` | `pub(crate)` sink that counts bytes; used for baseline measurement | +| `PreflateStats` | `deflate_compressed_size`, `zstd_compressed_size`, `zstd_baseline_size`, … | + +## Features + +- `webp` (default-enabled) — allows PNG images to be stored as WebP instead of lossless PNG, + using the `webp` crate. + +## Dependencies of Note + +- `lepton_jpeg` (0.5.1) — JPEG blocks are recompressed with Lepton, not Zstd. +- `zstd` (0.13) — single persistent encoder across all non-JPEG blocks. +- `preflate-rs` — core analysis/reconstruction (path dependency). + +## Constraints + +- `#![forbid(unsafe_code)]` enforced. +- `main.rs` exists but is a stub; this crate is a library. 
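+
+## Example: splitting a block type byte
+
+A worked illustration of the bit-field from the Wire Format section; the constants mirror
+the table above, but the function itself is a sketch rather than the crate's code:
+
+```rust
+const BLOCK_COMPRESSION_MASK: u8 = 0xC0; // bits 7-6
+const BLOCK_COMPRESSION_ZSTD: u8 = 0x40;
+const BLOCK_TYPE_MASK: u8 = 0x3F; // bits 5-0
+
+/// Returns (content is a Zstd flush segment, block content kind).
+fn split_block_byte(b: u8) -> (bool, u8) {
+    (
+        (b & BLOCK_COMPRESSION_MASK) == BLOCK_COMPRESSION_ZSTD,
+        b & BLOCK_TYPE_MASK,
+    )
+}
+
+// From the table: 0x41 → (true, 0x01) DEFLATE, 0x05 → (false, 0x05) WebP,
+// 0x7F → (true, 0x3F) end-of-stream.
+```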
diff --git a/container/src/container_processor.rs b/container/src/container_processor.rs
index b703162..4246931 100644
--- a/container/src/container_processor.rs
+++ b/container/src/container_processor.rs
@@ -1727,4 +1727,356 @@ pub(crate) mod test {
         let recreated = dec.process_vec(&compressed).unwrap();
         assert_eq_array(&original, &recreated);
     }
+
+    // ── Multi-scheme fixture tests ───────────────────────────────────────────────
+
+    /// Helper: compress `data` in one shot and return `(compressed, blocks)`.
+    fn compress_default(data: &[u8]) -> (Vec<u8>, Vec<(u8, u8)>) {
+        let mut enc =
+            PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false);
+        let compressed = enc.process_vec(data).unwrap();
+        let blocks = parse_wire_block_types(&compressed);
+        (compressed, blocks)
+    }
+
+    /// Helper: full roundtrip assertion — compress then decompress, check byte equality.
+    fn assert_roundtrip(original: &[u8]) {
+        let (compressed, _) = compress_default(original);
+        let mut dec = RecreateContainerProcessor::new(usize::MAX);
+        let recreated = dec.process_vec(&compressed).unwrap();
+        crate::utils::assert_eq_array(original, &recreated);
+    }
+
+    /// Count how many blocks have a given block-type kind.
+    fn count_block_type(blocks: &[(u8, u8)], kind: u8) -> usize {
+        blocks.iter().filter(|&&(_, t)| t == kind).count()
+    }
+
+    /// Two concatenated gzip streams — each contains plaintext well above
+    /// MIN_BLOCKSIZE=1024, so the scanner must emit exactly two DEFLATE blocks.
+    ///
+    /// Fixture: `test_two_gzip_streams.bin`
+    /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS
+    #[test]
+    fn test_two_gzip_streams_produce_two_deflate_blocks_and_roundtrip() {
+        use crate::utils::read_file;
+        let original = read_file("test_two_gzip_streams.bin");
+        let (compressed, blocks) = compress_default(&original);
+
+        assert_eq!(
+            count_block_type(&blocks, BLOCK_TYPE_DEFLATE),
+            2,
+            "two consecutive gzip streams should each produce one DEFLATE block; blocks={blocks:?}"
+        );
+
+        let mut dec = RecreateContainerProcessor::new(usize::MAX);
+        let recreated = dec.process_vec(&compressed).unwrap();
+        crate::utils::assert_eq_array(&original, &recreated);
+    }
+
+    /// A gzip stream whose plaintext is below MIN_BLOCKSIZE (500 < 1024) must NOT
+    /// be promoted to a DEFLATE block — the whole file becomes a single literal chunk.
+    ///
+    /// Fixture: `test_tiny_gzip.bin`
+    /// Expected wire sequence: literal, EOS (no DEFLATE blocks)
+    #[test]
+    fn test_tiny_gzip_below_min_blocksize_becomes_literal_and_roundtrip() {
+        use crate::utils::read_file;
+        let original = read_file("test_tiny_gzip.bin");
+        let (compressed, blocks) = compress_default(&original);
+
+        assert_eq!(
+            count_block_type(&blocks, BLOCK_TYPE_DEFLATE),
+            0,
+            "gzip with 500-byte plaintext (below MIN_BLOCKSIZE) must stay a literal chunk; blocks={blocks:?}"
+        );
+
+        let mut dec = RecreateContainerProcessor::new(usize::MAX);
+        let recreated = dec.process_vec(&compressed).unwrap();
+        crate::utils::assert_eq_array(&original, &recreated);
+    }
+
+    /// A big gzip stream (plaintext > MIN_BLOCKSIZE) immediately followed by
+    /// a tiny gzip (plaintext < MIN_BLOCKSIZE). Only the large stream must become a
+    /// DEFLATE block; the small one stays literal.
+ /// + /// Fixture: `test_big_then_small_gzip.bin` + /// Expected wire sequence: literal, deflate, literal, EOS (exactly 1 DEFLATE block) + #[test] + fn test_big_gzip_deflate_small_gzip_literal_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_big_then_small_gzip.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 1, + "only the large gzip stream should become a DEFLATE block; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A file with a valid gzip header but a deliberately corrupted DEFLATE body + /// (0xFF leading byte) must not crash. The scanner must gracefully abandon the + /// stream and encode the entire file as a literal block. + /// + /// Fixture: `test_corrupted_deflate.bin` + /// Expected wire sequence: literal, EOS (0 DEFLATE blocks) + #[test] + fn test_corrupted_deflate_body_falls_back_to_literal_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_corrupted_deflate.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 0, + "corrupted DEFLATE body must not produce a DEFLATE block; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A file containing padding bytes, then two zlib streams (each with plaintext + /// > MIN_BLOCKSIZE), then more padding. The scanner must find both zlib headers + /// and emit exactly two DEFLATE blocks. + /// + /// Fixture: `test_two_zlib_streams.bin` + /// layout: 100 × `\xDE\xAD` | zlib(EEEE×6000) | 100 × `\xDE\xAD` | zlib(FFFF×6000) | 100 × `\xDE\xAD` + /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_two_zlib_streams_produce_two_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_two_zlib_streams.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 2, + "two zlib streams surrounded by literal bytes should each produce a DEFLATE block; \ + blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A ZIP file containing three DEFLATE-compressed entries must produce exactly three + /// DEFLATE blocks — one per entry — and round-trip correctly. 
+ /// + /// Fixture: `test_zip_3entries.zip` (entries G×20000, H×20000, I×20000 bytes) + /// Expected wire sequence: literal, deflate, literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_zip_three_deflated_entries_produce_three_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_zip_3entries.zip"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 3, + "ZIP with 3 DEFLATED entries should produce 3 DEFLATE blocks; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A ZIP file with a STORED entry (method=0) followed by a DEFLATED entry (method=8). + /// `parse_zip_stream` returns `Err` for STORED entries so they become literal blocks; + /// only the DEFLATED entry is analysed and emitted as a DEFLATE block. + /// + /// Fixture: `test_zip_stored_then_deflated.zip` (J×8000 STORED, K×20000 DEFLATED) + /// Expected wire sequence: literal, deflate, literal, EOS (exactly 1 DEFLATE block) + #[test] + fn test_zip_stored_entry_stays_literal_deflated_entry_becomes_deflate_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_zip_stored_then_deflated.zip"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 1, + "only the DEFLATED entry should become a DEFLATE block; STORED stays literal; \ + blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A buffer filled with pseudo-random bytes contains no recognisable DEFLATE/zlib/gzip + /// signatures. The entire file must be emitted as a single literal block with no + /// DEFLATE analysis. + /// + /// Fixture: `test_random_bytes.bin` (32 KiB pseudo-random) + /// Expected wire sequence: literal, EOS + #[test] + fn test_random_bytes_produce_no_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_random_bytes.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 0, + "random bytes contain no DEFLATE streams; blocks={blocks:?}" + ); + + // The literal block must survive the round-trip. + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// Two gzip streams separated by a 1000-byte null gap. Both streams have + /// plaintext > MIN_BLOCKSIZE, so both must produce DEFLATE blocks, and the gap + /// must appear as a literal block between them. + /// + /// Fixture: `test_gzip_with_gap.bin` + /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_two_gzip_streams_with_null_gap_produce_two_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_gzip_with_gap.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 2, + "both gzip streams should become DEFLATE blocks; null gap stays literal; \ + blocks={blocks:?}" + ); + // There should be at least one literal block (the gap between the two streams). 
+ assert!( + count_block_type(&blocks, BLOCK_TYPE_LITERAL) >= 1, + "null gap between gzip streams should produce at least one literal block; \ + blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// Feed a fixture containing two gzip streams in very small chunks (64 bytes at a + /// time) via the incremental `process_buffer` API to exercise boundary handling. + /// The round-trip result must be byte-exact regardless of where chunk boundaries fall. + #[test] + fn test_two_gzip_streams_incremental_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_two_gzip_streams.bin"); + + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let mut compressed = Vec::new(); + let chunk_size = 64; + let mut pos = 0; + while pos < original.len() { + let end = (pos + chunk_size).min(original.len()); + enc.process_buffer(&original[pos..end], false, &mut compressed) + .unwrap(); + pos = end; + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// Feed `test_two_zlib_streams.bin` in small chunks (128 bytes) to confirm that + /// the incremental path handles mixed literal padding + zlib streams correctly. + #[test] + fn test_two_zlib_streams_incremental_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_two_zlib_streams.bin"); + + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let mut compressed = Vec::new(); + let chunk_size = 128; + let mut pos = 0; + while pos < original.len() { + let end = (pos + chunk_size).min(original.len()); + enc.process_buffer(&original[pos..end], false, &mut compressed) + .unwrap(); + pos = end; + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// Feed a ZIP fixture in small chunks (256 bytes) to check that chunk boundaries + /// inside the ZIP local-file headers and DEFLATE bodies are handled gracefully. 
+ #[test] + fn test_zip_three_entries_incremental_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_zip_3entries.zip"); + + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let mut compressed = Vec::new(); + let chunk_size = 256; + let mut pos = 0; + while pos < original.len() { + let end = (pos + chunk_size).min(original.len()); + enc.process_buffer(&original[pos..end], false, &mut compressed) + .unwrap(); + pos = end; + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// Verify that the decoder also handles the recreated stream correctly when fed in + /// small chunks, not just when given the entire buffer at once. + /// Uses `test_zip_stored_then_deflated.zip` (mixed STORED + DEFLATED entries). + #[test] + fn test_zip_stored_then_deflated_decoder_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_zip_stored_then_deflated.zip"); + + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + // Decompress in 512-byte chunks to exercise the incremental decoder. + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec_size(&compressed, 512).unwrap(); + assert_eq_array(&original, &recreated); + } } diff --git a/dll/CLAUDE.md b/dll/CLAUDE.md new file mode 100644 index 0000000..28ca341 --- /dev/null +++ b/dll/CLAUDE.md @@ -0,0 +1,73 @@ +# dll (preflate_rs_0_7) + +C-compatible DLL for .NET interop. Exposes a streaming compress/decompress API as +`extern "C"` functions. The version number is baked into the crate name +(`preflate_rs_0_7`) for binary compatibility. + +## Exported C API (`src/unmanaged_api.rs`) + +### Compression + +```c +void* create_compression_context(uint32_t flags); +void free_compression_context(void* context); +int32_t compress_buffer( + void* context, + const uint8_t* input, size_t input_size, + bool input_complete, + uint8_t* output, size_t output_size, + size_t* result_size, + char* error_string, size_t error_string_buffer_len +); +void get_compression_stats(void* context, /* stat out-params */); +``` + +`flags` encoding: +- bits 0–4: Zstd compression level +- bit 5: `test_baseline` +- bit 6: `verify` + +Return value of `compress_buffer`: `0` = more output available, `1` = done, `<0` = error. 
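+
+A small Rust sketch of how a caller-side `flags` word lines up with that layout (shown in
+Rust for consistency with the rest of these docs; real callers are typically C or C#, and
+the helper name here is made up):
+
+```rust
+/// Packs the documented flag layout: bits 0-4 = Zstd level,
+/// bit 5 = test_baseline, bit 6 = verify.
+fn pack_compression_flags(level: u32, test_baseline: bool, verify: bool) -> u32 {
+    (level & 0x1F) | ((test_baseline as u32) << 5) | ((verify as u32) << 6)
+}
+
+// e.g. level 9 with verification enabled: pack_compression_flags(9, false, true) == 0x49
+```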
+
+### Decompression
+
+```c
+void* create_decompression_context(uint32_t flags, size_t capacity);
+void free_decompression_context(void* context);
+int32_t decompress_buffer(
+    void* context,
+    const uint8_t* input, size_t input_size,
+    bool input_complete,
+    uint8_t* output, size_t output_size,
+    size_t* result_size,
+    char* error_string, size_t error_string_buffer_len
+);
+```
+
+## Internal Structs
+
+```rust
+struct CompressionContext {
+    magic: u32,                    // MAGIC_COMPRESSION_CONTEXT = 0x4B3CFF2E
+    internal: PreflateContainerProcessor,
+    output_extra: VecDeque<u8>,    // buffers overflow when C buffer is too small
+}
+struct DecompressionContext {
+    magic: u32,                    // MAGIC_DECOMPRESSION_CONTEXT = 0x053D2AB1
+    internal: RecreateContainerProcessor,
+    output_extra: VecDeque<u8>,
+}
+```
+
+Magic numbers are validated on every call to catch dangling/wrong pointer bugs.
+
+## Safety Notes
+
+- Uses `#[unsafe(no_mangle)]` on exported functions — the only place in the workspace
+  where `unsafe` appears (required for C FFI entry points).
+- `catch_unwind_result()` wraps every entry point to prevent panics crossing the FFI boundary.
+- All other code in the crate remains safe Rust.
+
+## Build Output
+
+`cdylib` — produces `preflate_rs_0_7.dll` on Windows.
diff --git a/fuzz/CLAUDE.md b/fuzz/CLAUDE.md
new file mode 100644
index 0000000..608fd4c
--- /dev/null
+++ b/fuzz/CLAUDE.md
@@ -0,0 +1,30 @@
+# fuzz (preflate-rs-fuzz)
+
+libfuzzer harnesses for fuzzing the core and container APIs. Not published; requires
+the `fuzzing` cargo feature.
+
+## Harnesses
+
+### `fuzz_target_1` — core round-trip
+
+Feeds arbitrary bytes to `preflate_whole_deflate_stream()` as a raw DEFLATE stream,
+then attempts `recreate_whole_deflate_stream()` on the result. Verifies no crash or panic.
+
+### `fuzz_container` — container round-trip
+
+Feeds arbitrary bytes (minimum 1 byte) to `preflate_whole_into_container()`, then
+`recreate_whole_from_container()`, and asserts the output matches the original input.
+
+## Running Fuzz Tests
+
+```bash
+# Requires nightly and cargo-fuzz
+cargo +nightly fuzz run fuzz_target_1
+cargo +nightly fuzz run fuzz_container
+```
+
+## Notes
+
+- Edition 2021 (older than the main workspace crates which use 2024).
+- `libfuzzer-sys` (0.4) provides the fuzzing harness glue.
+- Corpus and artifacts are stored under `fuzz/corpus/` and `fuzz/artifacts/` (gitignored).
diff --git a/preflate/CLAUDE.md b/preflate/CLAUDE.md
new file mode 100644
index 0000000..c39e5ea
--- /dev/null
+++ b/preflate/CLAUDE.md
@@ -0,0 +1,83 @@
+# preflate (core library)
+
+Core DEFLATE analysis and reconstruction. Analyzes a DEFLATE bitstream, extracts the
+uncompressed plaintext plus a compact set of reconstruction parameters, and later recreates
+the bit-exact original bitstream.
+
+## Public API (`lib.rs`)
+
+```rust
+// Compress: analyze a DEFLATE stream and produce plaintext + correction data
+PreflateStreamProcessor::new(config) -> Self
+PreflateStreamProcessor::decompress(input: &[u8]) -> Result<PreflateStreamChunkResult>
+
+// Recreate: given plaintext + correction data, reproduce the original DEFLATE stream
+RecreateStreamProcessor::new(capacity) -> Self
+RecreateStreamProcessor::recreate(chunk: PreflateStreamChunkResult) -> Result<Vec<u8>>
+
+// One-shot helpers
+preflate_whole_deflate_stream(input, config) -> Result<(Vec<u8>, Vec<u8>)>
+recreate_whole_deflate_stream(plaintext, correction_data) -> Result<Vec<u8>>
+```
+
+`PreflateConfig` controls `max_chain_length`, `plain_text_limit`, and `verify_compression`.
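+
+A minimal round-trip sketch using the one-shot helpers. The signatures above are
+abbreviated, so the exact argument and return conventions used here (borrowing, tuple
+order, `PreflateConfig::default()`) are assumptions, not the crate's verbatim API:
+
+```rust
+use preflate_rs::{PreflateConfig, preflate_whole_deflate_stream, recreate_whole_deflate_stream};
+
+fn roundtrip(deflate_bytes: &[u8]) -> bool {
+    let config = PreflateConfig::default();
+
+    // Analysis: plaintext plus the correction data needed to rebuild the bitstream.
+    // (The order of the returned pair is assumed here.)
+    let (plaintext, corrections) =
+        preflate_whole_deflate_stream(deflate_bytes, &config).unwrap();
+
+    // Reconstruction must be bit-exact with the original DEFLATE stream.
+    let rebuilt = recreate_whole_deflate_stream(&plaintext, &corrections).unwrap();
+    rebuilt == deflate_bytes
+}
+```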
+ +## Processing Pipeline + +``` +DEFLATE bytes + └─ deflate/deflate_reader.rs → tokens (literals + back-refs) + └─ estimator/ → TokenPredictorParameters (hash algo, nice_len, max_chain…) + └─ token_predictor.rs → predicted tokens (replaying the original compressor) + └─ tree_predictor.rs → predicted Huffman trees + └─ cabac_codec.rs → encode *differences* from prediction → correction bytes +``` + +Reconstruction runs the same pipeline in reverse. + +## Key Types + +| Type | Where | Purpose | +|---|---|---| +| `PlainText` | `preflate_input.rs` | Wraps uncompressed data | +| `TokenPredictorParameters` | `token_predictor.rs` | Compressor fingerprint | +| `HashAlgorithm` | `hash_algorithm.rs` | Zlib / Miniz / Libdeflate / zlib-ng / … | +| `PreflateError` / `ExitCode` | `preflate_error.rs` | 29 error variants with context | +| `DeflateToken` | `deflate/deflate_token.rs` | Literal or length/distance match | + +## Module Layout + +``` +src/ + lib.rs ← public API, PreflateConfig + stream_processor.rs ← PreflateStreamProcessor, RecreateStreamProcessor + deflate/ + deflate_reader.rs ← DEFLATE bitstream → tokens + deflate_writer.rs ← tokens → DEFLATE bitstream + bit_reader.rs / bit_writer.rs + huffman_calc.rs / huffman_encoding.rs + deflate_token.rs / deflate_constants.rs + estimator/ + preflate_parameter_estimator.rs ← main estimator entry point + complevel_estimator.rs + depth_estimator.rs + add_policy_estimator.rs + preflate_parse_config.rs + preflate_stream_info.rs + token_predictor.rs + tree_predictor.rs + statistical_codec.rs + cabac_codec.rs + hash_algorithm.rs + hash_chain.rs / hash_chain_holder.rs + preflate_input.rs + preflate_error.rs + bit_helper.rs / utils.rs +``` + +## Constraints + +- `#![forbid(unsafe_code)]` — strictly enforced. +- Serialization: parameters via `bitcode`; correction data via CABAC (`cabac` crate). +- The format is chunked to bound peak memory use. +- `#![deny(trivial_casts, non_ascii_idents)]` also set. 
diff --git a/samples/test_big_then_small_gzip.bin b/samples/test_big_then_small_gzip.bin new file mode 100644 index 0000000000000000000000000000000000000000..c3618dfcba76207f040f61e212058a9e2ff35870 GIT binary patch literal 84 zcmb2|=3oE;rvGmbGV%f$hZYpPzkl*3vj$iglgM+bThxCbmVrSYu5OeRLnFga809L>K7* literal 0 HcmV?d00001 diff --git a/samples/test_random_bytes.bin b/samples/test_random_bytes.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1ab9b1ffcacf5787c81275c86c9fcb02f93b4e2 GIT binary patch literal 32768 zcmV(nK=Qvi42*qpM>H(h5DzbJcMWCMaOI8zQP{4IPj0LLQiYltLQeN*hhig2bJbr7M=i6LyEZ2!vYNry*%hTF`!a7O$t7&uN* z*dWEj?|=n5h^{%7-caZPhFdn6Wlq0R0Hyi~Ik}yB9eXG%_#;3;^Hhh}Y-XR#zu@OR zesdtL3UzP;Bp!7l5UC`2j`(du@_h2pVK6b=wb;_{8!{ixtv7#SVdY2IU5@O3CgL&~ z2QPV5(D?Rk%^p<30mp~vlGyoAVV7?mAD|r>^iUlsUn3%T&OLqAnwPkB>X(+pdx(tn6iH^UR~ac1kYoOE&wV$7(HBaXAosSZ1ZTZa=j z-mswGJH=sw7T*A5y4)kFpg)SU#CF#^-#Qn^CO9D38a9Zsde?I9%?V{q^{Pt_IioHr=`FPTpFI z_wL_`rF|@p=6vnKtPcIAUHNRagszW`1z{_D(E5Ax`{f8PmT}K~oV@a|*{QVXikyIU zJz}9fUSe+djgT{Amvp#Wng>9P7aNs8`!oiT@b%%YUK)Y)Iw8&bB^ymln=5}j)pNq; z;+Wf<9x`fdhbd5cSSC;b(dO`#5O$N0=p=WCW!XA&Oo&J%9#ox^;6Lx1&WrruDHPqM zfDJBn3W*e1@ZyeK-}^4`veBxfHBzY3{gcfS=q;gvr8~|5?9&@yx&PuCXoy3T;P^fc zdQqPT&@Xd3>^x5Ym5bE(@qFn@#9>7vjnuA)0g#ybOyu{ls=?``V0y%N(iWof!^KVg z5H{=c)2pXw;#)mVLs?CYzi5wd_ZXnEVw z&()MV&kCkj>$xr8HhkaBD9WC#ajI4)I%Bf4OL#O&gJoDwCR+UM<)xub;Ue(jn~WT^ zC7$f(6CJ#gCoaV773N^1^GzS-G_NaI6n@~~QM$g6#?h@Bt0AZ`#gTzbswqgbp2$Kg zp1`Oq@Z|z134?A-*f^7Unm!nM z*6JWQ{|I|YmD>Qcd(~enxFQP2@D|3ufe5xDuzNMCL>9EIS?%>*MIh=DmS+}b6``KX z@_`azlC~JBi$A>I%;CkYU;h5Qh$`7A++3EVG$^N+oaEY7GHl*HD5QKz6`QUCA%o&F0h z$?AJM0h8Z*<-x!yM6x_v$w6`f3z+TM!gYS@u}_U)IS|$edubvw5#$(s z7R}&yZyZOIccY|vnn`ifnT#J<+Wi=?!=u&FOqsx5@_TccN$nle)kMa8f@rBI=Djar z8i~OaC17v$YO0$DY+*e0d}zyhq>#eyx1EK!{_ZmHqiDUL(-(0{2!+*`PGqQc!hQd& zgZlgHRG#wJ)Z{29?vsD|}YMR!51N ziTp|+Ro;!)nfE8SfPVdC>NZ821IntxsNt*JMVX{#^`KhIqm}!7rofNMx;kcD4eV>+ zl_SyrN1J%&FWq1u}$japb?q?M<)@_cfQ)D?3QCpReg@ z_p~_yyU){R6a7M4&u+oI72=NXFJi#Moc?SXKIyM_-}l8@B}bMPtPM(ficn`c*0qaFq~@bCXu; z{W&^`yA|PY*))Eb5ax!mEZ{tOYqQ%_M`>vuvyWD#m6V7uMbaLPKtq2@q<+czNQuM}Cbf ze~}HEX<*2g8+!f4IeY&k2DqX_#@D-8@g)c%2ZnAG9?O!{*v%?Vjw+rgXiuleA)eoQ z9epg`JLnvqSv>|u1yla9gjK#ki98Rh+lwHEC;sWU9RF3kP=Yi&bO9CyeHZb%#7tWf z)^enO)uW0hzC9XSau>}b?vS6SEa?QiW|fhmg`vr(qUVcuCTB%APc~HM?x4^;w`%8|?T0-sP@5EPr%Nt> zw#pfqg;@{J=YKHlG)F!LA8|ru&AuyyCJP~D<3TT}J{9a4R^7_}55wA6wQXOFkK{}d zeV6~z^&c#c#75R;e+k34_4u%)#)@1Gqi_$89r3q!Q8Rvfh#YXnwOCqtqPGsTNMQV) zS^E<%9CvUE^Z==6Lath+9t%rIIJzaj$}_B*rA1KR`J!32a#_KhmO)wo$PitkaW(!( zsV8}{qg5@d=LS>IT+1)J#h=HZzj2 z(uc(O@)KB%&PmM^%Bgu7^NGF+tec;Y`!aMync}30)h%G+Y|!|>98n>k&P5JbVrCtG z5a}7XWsJP-%Rdi$zWU@^^0Tm0gf5W|#Tip=adT;j(Fz3eLs$fIft-xkz_ROsZ|<7Q zGy}hom^bzytc|9Pg_vMWdmYZBM-Bd$gx1f-5@D}#^S7B-T$VBML4;kaEpfGSo!6

6eG?88cZFl(?YUJ1`dn zMpvk};E6B3uU$IL%;}58^w(86Lmp8z)%W0B!nY1W_xU-};TIr34cDC^_pE&t@9JpKtZ&eIYAE}LoTuDTm?=&AC$=Q=TQ0j{-` zzcv?C{72C?f@EMl$!_XltvA#cKj!L0{{k4@_gYv%HO8dG)_;u4ua@UTM$?&~Z)R(h zutQs=JGAxHlGp||48uwr9A=18l7LDdmuL{T=QMK1d6O8!#?_!327qVcbG&3@mgQO| zUsyfC6x)u6LQ;YPsqRk{PNf=}`?n>ca9$!&)Xp!mac!xoxR!781Uj+gdwdk6!#c%- zkPN@%`4*RAq$)9010ic|*)?h2Bu)(LsS_+E8(V-RbcE2x=Fi=s2**4FAQvUKHk6ty zkl@405AslOMaY*)n7;)`KNSMWE1YYvY|}_mUU7kAI$Rs%&M5*=cOuNW(ltk?=%en=6 zR7d1dTT1u06ma==Acwto6=}G}@aPv5r8JC&S-`=&CxF0m&$O`l_5(-ZRCOnhY)oHJ zxjc2$sXF%c^=nen3w#zqQbZI>994u|&k2z$ZsqazS8&ovU58b#=p%K(#X~m;XVy$H zTJsY;FZMzJ+}FJp+jaKCvohI4g7kMPk)3gMJf0HlFuO?gb zptWy_U19%bczcIwb!UIJPc}U$`~wyCzM=e#`(2OAaX}l{^boPhS?WFOp#%nN!ZW71 zgkOeviSe(yUjNx^*1o#UkLFxMbu)^1-NR%`O!5R;*Z)MJ`I&$i%Q90QZi4eM=BtIc zxP!0?(mu0gS&_A-`Q#|WlI==Wtk=&ZC{nLWr@R}K-jrt|NM<3(EO-k!1cbS>hYyVHeFn*{(xu#)fu`}COv z+?93%zhKhe%70VG!ElBs*m{eRiPK;sT}6~Q>F104V5A9684F)DkcmLx1a~e?2mEiZ z*Ps8s?GULcN*svDB=B5ffWrnpf5qZ{5oPNzYVyaQrBM|=f%ceXxEgOcc;2Q5RgFE$ zd;}HqPrfaGQ2#Ei9v-$%HbD7zqOss&3cL25S2#?;p6-yd8)XB%3F0F*A0> zVPbz<)Hb*3+BH>N&8$%3w4tWH)7+(dZNP%K=jF=)5s?T*ngf|}ZJ`T|PcRefwogWf zyMK3snJ>(5C#LsU`7oIi!zdi9vfkA_bOpm3oE4)?r*Tdh$>c6^-IU2oL#!(E8O(+& z%-%vp_GS6XWPW@kh}@+uIn&s?vSFmK44hW8;!5EbcV@p&D>)l{OOpq$IGpD33#n^t zXoUAj3BVSDS00gOiW}|(BF2iiHwl%+cEU2D_$^;$>I)QpoT~cCPzkZLwVHU1AA`mZ zk8@3~$wn;0bk%!K+MlT9Lup25MxI67N>HLAJYJ$o52Mr4o8pX3U^YWu4TyGjb`7^E z^hEH#UdH?KMpI&Q5YY#U({X2LAzI-2rs{s$&lSR;$rCr{=@os-6vi20mEp^qlI&I3 zmK#Xz=Po&1Mak2p<_uftXXq|~rBgpHI_9}6#8JV*7FVlGQ%YMJ)}5Z@lUzJQX90l* z>i+)%{CRKl=+WXR1AUHa8D2{qvJy%Vn5iroH`QBGky`1v5-*vhyuu>dA(in&K8&c- zYik`hEccVzqJ*_V{{H-2ca&oQJPhtUYShygsXdwc-_gO!_ULsY*_P0V8po9PLn`H1tbVb`QJq=M%3xqT6#ICys$bkBox{ zIay8Qy;+5L_*@XLD@soN)5Lk@`g&@29xE(p%pQ;-HWS= zh%$o6?0aLZT%ZL6A_mejHRQ8!k$_?m3@;sTK0Pc#z+O0BftK$k=#AQ?gHk1pAH&9~ z9)yQm;UoKI$y@t3N!)KX;)FC2B_x343I0p~6UT^6sK^2P^xw`Sa{QSgRmCj7Zd0y;R_Q6ht$?Pzd=c0|Q$vk=64+X-SIa z(t24FT>mYavep84VmvUpaP4Z18)G?4^8Q*;78|;eYup4^mrz%AEYkXJ^6ZWW z+kSqh-f>I<)fN=+5!uQ0GUxq!DhAMRL${ZDE(rCUQtJ%p?MBaDz3l)b#i|gO+KOr^ zs~ox-IK;c|6uY}wB4|dddOGxLbELG0*o_+?zOd)`KJb<2ydhxfEGfualP5P+jX6vH zO=t_2)kE^J*W(J6U?2;2tJ1Hh_~YuEL!4KIboaiK7)!K~=~YsaU0F_2c{>%NK;ebG zUTG1cuuHz>Q<^>u&;Ls5qfY{w>7V0s(3IO?BlEml2KzP2;43{z%FQ%ZA-ZL)X3jo$+5e7<%(nB-_>N2H zH$H&+{&qwUG8-cQ4O_#fPMa|&{O)I#{`-Zm2@u>o#O9&E;_DI+TANh++(e(p`8fOV z^H3bfhW){`k)$6E7eTwS4X(m5+pxCMvmAYXc%LgFoI0#l3Z@h5*w=^(@C%~B$E7m* zGij{;tyP)2-)H_NZIg!>n9Xx9FszFr^V(~%Cz_evpk~y~faqKaU;l0IiQXP&y6j7P zO_|=(qabM0h>}u+c+Yw|Okxgwp6&SG*b-+^i$xBEj#?G?3LGk+m>|%)F3U%FIX!^> zE_ptnsKE7ZJQ$-#08 z3w15*WrkK6&D(`cExcDe2j1xYJaTAZh?D)pLfmO?kYLta(Mk@KhIrg*%`YLI01q+>nNcpx;!X`%U2+ zU`!izT@vtxQ}5q9{S@N1`0D&QOFyzNXpvl2i);d1)hB-cFsKcYKFPl}rA`hh>-Fe~ z0;m?}rb28Okfj_X!!OK&TYAhb3Tjiv&XG<%nETzHL$@)!X8PpD^NH9l*t}of+AkEY z?Q&23DwOZvDD;_3L79Krml!7(H`&uL?SZv|DF{03TV2T)D%F^v=;nv6Uu<3|gf`v@ z0?IoESM=-8`J1@Pnk+y?2Vpq~U=3^bN_*d7ESl|0^4JSIRZiph@e1aAXO_qsssuFM z)cuqa`ONtFb{`4Yp&*IX@;m*p<=+ZYlVb3&;C67W??aZy)wuCeYLG1VLRe-{`!m%1 zn|Ny&3Acr|QCScmtlTNqwu6*W~nX}M69;{P&Ex~oP$XW-NW zee$l7S1X9V)VO_3*K|hP)6wd=58_G%BP*?;%~akW$r4Q+m)6ajps)mwOU~qJ;;_$U znky2k$3T&7FQ2Ya3l4;Po7gxWzJ#?u7(eBh0_d%Hce9z1N5OtIr((G1GE3RgDK8({ zLi26pC%w+fhyU2&`qfiOLHO!IJZ$Om5Kz55(KtQ5loU=pzMQ|CmiP`yNY@ygk#1f3 z0Y0)K7S;Bi=d8i=Lq_Z7E;=$+l1W{HEl!=#RvxBY4IP-rGNVY-rG&se8fSWW{KYznAENEhA@>f@ zcL|9zen6Yh^8y4|)B4FNClBL4w7|E>{8wXH zAweNQgmjEz3xVhX+8U+Z(2PFr{kax$7>?KFp#sQbt0@d~*d~8uRH&aIF%I?J46A^C zH}ZO}8>)FF;4tJrD<2WTpdmpY;vAaYAv7Esmcz|4(su>u#O>QNtqtg?VmCAfB4a>< zwb%CJUwlRu9zmbqx1UN&@Sd-gyaY-ec^lZUWM*e0u3+=;sjHdRs8coQ75#7la&_)) 
zdiJTnY0O>&xdkP+4?0Mc;bXWK4`pSMS#X9~t$kZ$3tEOV3P!>>t!td=dcD3^-XT0m zLSbUTzoOfH7hsl5r+yocK>BMb@F7*kzyIx9q`&w5@`=q7YH(-gc*<{_*}+i1f-6KT zm*cc`74EFs=I`Cny#At#!wg&EcZVjoDwZgXbTT|7xD#foWT=3ejA2HF1euVr&d&z% zOv`k1Yw~yR?5-=IVB6Je;N}m%M?stf|Fx}i*4fUPrV9{9PN~SlXviSNVDmSq=bJBA zHChn*4PRv=#gS=ul_TgKyaiXM(#TLaLe}&n$7w=op560No@eTfX9G#(Ak2QK`|Io! zE|cl8n8;bwe_P$^#9c7S7J5*Ol_GEa(muQ=HKx(#RDo(1>`O^%M%5FIz^}L z`0OGoeBej%j#5_%Gg_8E$W|u>i%#LUTW%3<;uti?o#Pq;7{OSS6aVcL}mG1IOc3$W*%K)gwf$QRVbD#!9R^4uMc?i z_%j3_hyxWdYy-fE`0}jVti7+HdbrHN9k!`60qM<04L*;5yyx@NU;(5ExEH45clgXH zvfJ}B#g=195JCPdn(tQRJNA2gY*#Hd-VRcy#E)=;^FKxVE~;`NuffCLcXEU~lq@+d zu3$U7L7>6_pC4hy*rU2YaW={X>!C`QaR2E`kxCJxo*d~I7!&A)ZDuUTbXe5bZzEjptq29s` zekY1QIM5-eExD?TN^^__GQn}dH9uLL`Y^JPiZU2iqQ_jY&h$Zq1W!Oo!H65Zf7a?< z;I;3PjoWpcES%7wE+{8@FH{{XlKchPMfLta&j(=x*|2>T{|#_m+ok0m*Ni67BgNwgc;^n6)+nvtP}%1T)dNOkF(Z(qZnNj02-uh zlJSo-++KnEx~rQxF$O}ZxVgT0jcdd#HbOlWVPfPmCa;?EXHFJDV%CANyX8FJBKOt{ zRg6NE7aw8E{U%i4#K+BxX(C#pQSD;7;#+pApJOq!CiHH!bdhbI9{OQTE24@PyP&@+ z)8>X19eo5(HI%?n%q4Qh>@TwJ@9sA%TsO*F8VySEs~z3Xur{?2$vx5)`ZmBk(ZPnv9~c%qXDz7|EFE{64=l z4ssX;M>VD2b5|{4yaF187C0E2CWolPidRPKJwT&N)8syT^!ksa_j50YmnyV;X(uNA z9&83~@5NyGHLhx)pXg3DLU6jp=V#1dBaYZqoit1AHaL3->tBct^XIgOJS(-_Y)#AD zsvF^py)qfY*khJp&RUK9-8UJ3IB+@}5{P;sGmUf4$jLKsiT!C;Xt9%%0WeISr$s`ojH3QN*( zNVfhDecZTdEn~qqjOJN~vZ=_`jhC_$JrJpXnY zsS9$HfXBBYP?IKmpUfRN@rDl&t=ENSs~$&as2_OzB@(eD)hYEtiWM~y{&Tb3^qX`F z91Uc`3HBPs!9d*FMmyjvo|N>IVX8&roPl9+K@>Mr?{)o zzwnk1QoI(FObs?{ps;Si5KO};Cvw(48u8(xiDQ<0sC>D0>7hG&-G| zKz-0gg_5qIJSoxFOZ{bB=m%9j)`sviKoQ&wRV72l*tJ2;dzL(`tTz`WwdK*~-<0%3 zT`Eljz39TE;98F{c9{IG7}}Dh#G_kD6>Hp%pfa(MR6LW6`QUO^lY};Qjz}?SRt^+n z1NsfUXpq|DK%W{hAc*oJGCfrCZ|xVjp!wz6WueSVG;y=M!;}nhwx4KU;dCo zWi4$1-Xi;ITNBoX!g#XxT$y&2`1;7U_LnhmIg{4prheHwTJ57cvCBdC`qFp~l!O38Ed&^P|9_bQopIwZhr<63N*|3 z6;P0x40V1Hi$B55Ly9XWCDaNBO)$t~pQYIgnvCjaGl%iOmD?|I(2nEmfzqF>O%kNg0*Pz-|DbrJ?Dr`(*(bn322ws% zW#Bp!+5K1fN7l>)mptBZvdv?@%JM zbFc?ujFH-$yv~AWCr&hOk6VT`K*Xty@@rNBc5W_9NmTOyr6(Fj;o0%0nml{S!Nu~m zpQ0#jRgOvD;`)F+-m+T^yY8{sZDGT|TLom|xMKpIs4%>N28$7l^~)B4j2#d0==)gm zH50uKPFh1M);%2KAD}UgV+g9ER=Wqz)gK{ZONc61;}(r&(O3bWjI%th?kw#sJ1$); zU!n?D1#iFWDSn+{L!KMLl3R{Pq0@6lWMPtE^CBu z?}ZRMU-L)rrM|%tr8Tz%QP=aUtTSJ?oKx>#3*hPA%<_DJzpVGonG|<7*V>8A4pUW` zWk*7)8!nzmvFgfH>i|n;Sel(FUU+EVE`3hF0`yU^Gx5o6@xbkTF~&inM9im<*~uia z`VDbgc)Yo;0Lv$JM*mU`=HC#Sv>OEbeLteV)(3todL-8*81n|v9+@^0UZUyT>cT#<_c&}s?P zLX3#)qA_uE38rx%lk3;0$MXiSuA{NzP3-?`SI2_R5GeVuDr&u(m7(YF4^LIJ+|&z^ z`*^@Z!9Ov|Z)`Y}TT#u?v~Sjqqq#Ps$aBv9?ULqDTFjFHtL<1a#Mai*U=DIdDbWk2 ze^v1=y$i=;QtG^VS@CA2LzW>AY3%EL<9k}{IH^&6>>~pZ(D211)rCC_;PxJs%M`lI z;oCnoN`^UByVI>meD`aIF{mZ#`+bG4m*c7Cgp3kLOOp+8y}6a0QJnMPmQO)TmBfQ| z4s4rC286oL^jEM?pufTRR`yxErW;;&s}H+F5EoTKqHnQKp_P-4ngP*A4w)rob6@teI$oT{~< z=AGHe)dJ2O(X~hEj0tCIgO0 zC8pC&O_^pb;C-isl?z8Apt$61UZn4I>$$3Bgn|o4J<{6B`4}F*2;UH=`;e+4CNDWr zEbn0@+>P$%O{pbat?1nf&6Wel2?TSc+D(LVr|M?4PGC!Eom?{w6hX2hr=fLfXPZ;m zig$0d4MyQVzzySS!Wrb?%w=i9KQCE6Q)@l?x1!WxYc9C-f;s631H#wLGqt#$A>NsF zE64E&jHMSye_8}VPeNq`@K(U?Y-zr_pV@Pob zWwlKhxph^)Q8QF?P1$FT8O@i+*YoeBMVSC5fS+n29@0li_ELPvQP#*~f3qZWjYrKt zO=;suji&`XpEU7y;Fv$?P9(026vKi!?%G!Qm2)&LxqJQH*Bv#Fjew}ufVL_IGx&V{ zao&hE`M|>!H!?6hdbrog6tUs!6U^NWZd?qgGL_Q~>4CSfw117{7^domQfs8mvRMBu zsXoHH)Zp#EROXNUN+tl+s(R;36zVV;*;%O+(nF3*NauC>cV#Wj{HrE}sGeNT)H z{c>=9bK!1E2A6ya_u$lG(_hTSg}R2^y}Ed#wkJfSenBzwpfz-|a_)&6>)`(1Ay*6K z9kKLFAhjWlfe!oJM9OaGo|~z6&`F^K8@D|qAk$J~e_LD+{XMBIFwVx%!}3P2T``}v zTHJyEE?$MrW^}VDt>-X<3LhT7FjfEt0;j~B4!mdi!9`6}KMsGA^XaW{{*sUvZ-)X#Hsi7sF{glmdIHTE}$0%@$8ceuUVbXp7MW z)YHL^Vfldt$`$27awIxCQTifXr&WW?k&=;xH$BQ{EdFLe67cmmIdv1pUMeDc-@V{T 
z;uT}DLu!cLVSb6#Vs;0)m-eUnm2q8&LQ(Fj(gy%R|_cLT!#9mYL! z5+w@r#gNA4dHaB{==DzMxN=F7S;m7>yb9f!NfqnTbj?7tYR}iz>*wRf?YgX{%tUE@ zZ((D)?a{xTZOKURH|Lqt8fI^-P{iF_;tXlUSc^~1Wlhqe?|nk~p%O76D+&Y~sV<2*~kFimYV|{>V|Aj&x!9|J~kE z`5v@3BBk_{NO=8j)%ymWQLUfbK0=VLzd^n7APMBX)qZQIccAl+uBl@|q;+|J`tS*$ zFvLrwDf{e*BId_pjY+y5@_upwr?&S>i=+6UW?(&+A7hYzz}N+tg=k5a3=Y>QyzWcn z!%*X(H;n0xW$2MpL8S;k8mIKUNTI+D-lfW0rxB2)6dXhTAR5GaB^&n1eAOa0Ke-7JAds<-au zR28pEu2Dy6$|=@46*U5aRCKHQJEp(N$#ApKHgo7tU;sD$NhF+m&zoy8VGIZlmpy7< zBqD+5JlMC+1(~nLziOM?^cDLM|D&@Hg7U;N*t7c$!V~!{^MNlaeK6?4tb6bgrJm<* z(l7-<4&x(S?Tp44fjs+J=ay3xl&RV{PbX#e_llTR7XP$;QqVZ z3cNPf@)3^T*die;hPU(0d`3MCQY>F&iHehSprv|YvYcY8(vFAOU`Ll>xuj_9SUi<}X{=rTzkSH} zitDaQf+T@Iz{vq3V1oVVjn`#}8ru3d^WVmxJx5$Vxf>NDjyH)xG1zDn9tAn9jgj1Y zc5sNbc})wVa96}=2vRPJ_&+dy&~ba~R9JA2HUWB_-o@l?b~7o_)= z7 z=0}>!eMks&;DPzf?(V*5?LyQVeM&QzzZT=hN{Su~Z@vuW z&a3OOr;)rGKj8V@5+Y8)so{u8NMtX>jmdBo7Mlh{W7cEUP1=HM03lNohxZ|H?^OIc zDC@7j@Pw(SPmXo~bhvDii=>SIGM} zh?OLfMj`BlP%-{;kr2tPQ}XAxoR-bjc%G+gd+bb45-1)#OE={}OeK)2qk8t<(LS1M z@)5dF$YQV|84!WTU;gdOrRGGZ*r8qnRDjek`KwD6M|%B^XI#}21h}uhtN}ImXKYKV zjL2l(6zG)c7Llq6&>-^F?RKGZ=7*URb%M%=0(BH|=6cHk+!!W|)d-v#{%w;xc*m$( zW(V6SsD1ScGD3t_m4nn$+Gta`F_WGnCr>^BdLfEiI_IoD?mqU8Dz-1QTE<|22`nH} zO0QC5^|KGI$K5r~Y*k_RB6R)g#-zMu6%yxQ0ksIet<(dd?JGnvJ4N#Su00T^5FRFAVYRkzk&jy12l654vYT?|W{vd6icr{=b4-f?n3=b-$R4FQ9 z_A=nU9Suyhpu#$om4#6&OZ1cbCrw$C6`;cK6yEO*78wDBeZjD#yX6!06+YJ1Y_wFq87As>ktKFEPT0)>QH^8bHh!C)ZFiF{8XNX(a4ozR$3 z3VhZ=yxeUyo`5Rs6vvgvP_@656<^MEyslrCVhbzq!JX2NlnZk=yn9=XsYLD{t`n3U zm3nAyYPDm7p+3rSNHi`m=gaOC@^ejH|5HCoh5$mO{`K zy9%s%{L0(j7co0Q+6fh69B8=5%PkGXPU$72Y%^JJg08k2++E){`1sjLYloB+JA0b~ zsWg@fo?}^IH(qj< zNZigG&OE zpsYQBVu;L4d!LIG4^;GW00Z90#~>U##%CreVyg`pA;27tDV8Oq8|ctlWa83E8N6_h ziU|GT;*iKw9py*P(n3bC2`~Hd#28W{h6~GvB4cG|=yokbpto0Nu+2B`DD4GkR5U14 z0Kz$N*mQ>L4?k6pTCcx}fL+P!$z(^gmS28EiO6wd zG_KE7g(#8gnOanr@RlJjHBX3wXL}4I6W=R8_FCM`JE`8L7`cY`@unbi3 z&MfI#T(kATYG?Z(k0kU06?T(pNLurzV?t zNqJSKx_xM)oru*w`tAI1QpuEl3aW*H|2tgd-~nG*;UiKUsebSNIc@u=Mtnj{4l+c# z%Nt-=6BdX1?%=X(zP4U@$sZJF2#%U~#wnyY>mspJYcC5-Rk92(n`QKkVjW}x)bJ{6 z|1Ua9%NR{fJ8Doo>(TB<8#*mJIm6C1LR=L+5adO#;D2Rp5|(1z!mMfrTj`oLOpsbL zE|Ck$N2~;=L`;&vK?q~Jv%(<&ZLmT92r+f&JV=dU*|(>=#+^cJ4TL674M_GI*gv^a z^>+P^9#tRWE8N-(({w^WkT$QQg;4`>k`R9$KM1+c{^z{doguyAvc8eoSbl)@Hb#K# z-T^ok9y~r1Y7NtMGv>^VF8Wf~6$UZ8~*Rr7?Ze-`?chDs!<97B!!k^6_>vj;^@x9WS*71yYm_?V z2O@`5+(!*H)%UzWh*O0)( ze}~%*i07Y8A>##3tu)0*QMP>0j6eW!h|F{q-$sPW^-bd<*$W^onI&yGf45U;&^LE& z^UwgUUw;;Yq%h5@3sks{n*B!4%N*d@b@7=gcSkJ>`F0z8u2%0~rjO zMVNqF7?!}mjNaV-(ix}UhEfNgq1d|m;;s(nouhGL4%tF{y@b!hLSELalp8n8tFd(( z1ldwwJXuYaT<6>L@}JE_p(4i)H7Ht-0rNi$z%wttfW{sJ2RHJ{_WeO*C|I}S28-lH z(S}snCQCR}8)$%a^{;f?vBendvsS!^S)M^Zccy{4i5L*uf}FgP@2yH?CpkxBPUVvO zE9T(KX?&Jd`c@BO!d4~owFVy|#(ouosACYtb*4&)b*Xin1`;(R%Hi^oBH6H5j)H=? 
zg@)Q2Yq=rwZ{{`!q4L@CPamA2_C)nHjOIpxz;_ZY!+}|eLuhm6Q+RnaO*#J$ z$WUuo8EjQbJ{CI%UUgXk2q4Am-oiQ;5B9}A1uB!r&&U3K!qq$JmK4B?ig8nJg}IkO1VRGt7zb6$iXy+}3B7NWsrh z#_I=n-8}L5CAts2+g!QmC+NWoc|Z#+L)^FMU&05PC<44kCG5DXpgimO>69);`Ip6B zCNWBnFV05ah+0!f^s6T38;84yLp8CMzW5iT-ar9JK)e0`2^k>FaIevf8|znWMUL=V zQ>Tvt9WD4>QikVyE=ATQ$G-#iAs(|HE<3&eTkSV2pU9p7Wtq|al7W^~fVM6U0wYv^> zAs7V7lWv;^XgmRqvOo}R+ZT}_B$+UAtXa3Qe7~+s7rDkH5gyR_1_{gC9wo-$^oACD z^@u=kW0Re~a|!{;`|80^tEU8SQnCdl{A|LYfgdF(7j}ErTLehzy|rH_g6!=KKmg-9F0fK_ z16*L-Rx(#2v_JX+yD;81Ip1u)swtrD@KrFw78$L1^==}-PxV-hsh=A$MppDw02^9u zmO*tzPac6Sj*j_`Y(qF9*SKT_Kn6h)Y8j18a6>}i{;7|?;rWC0zp zcH!ykw#zYcLnI4QEz~9On86fLZ#@o%evKl=$Wu)zXFS&V2OPW4yw%C;6vTJQPAkc| zxgML6f|(NG+{k2{U~|>51e(BA=wU)u#YEmbEtgAzZqi1tq=-#36#O#KOM0Ix!iI2W zt$IX|>tDMax_EPag=Z0aCn&rB&knzcz3<0~|EcKBiK}L~df&bAipj;pAlzz?Kqy+T zlD!KAs+=sYUd&0QzQfC!2I!4Ndb~V{T*W?;8i!la_g{ykAOa>%y#aYcA zlFW7TyTu@&jq?||qktt@BD^fyi6kelsN0JOF2HQ0=Ki-{9`pc434vDLQ3oXcF-0v* zoh4^{+|KOi&l2)(B~;q`_jV{=L*T~_>#g(Gw0UZtXFD##sGeXCt6g_LN*7T*BeV>% z_`!>i>5=sV830Gu?aG{<-u}&)<>sC()@Q}B+7W}P5DUGY1)~1D4>+9$+D2s-hP;=hLv%hgC(3k%6)uw$+pO0sjTq<+$e7&g}9htgs02KaVsTqy%=bhU188rMG=2vp6ZodhtO&_37U51 z)XEMJph%Cy8BQ(v;F2KjhF}w#YNH)bbh{W+Qjgfktz6}-QDtUCPIm-F2t=vPH*RPJ zbqjE1fzj{bB0k&cx&`3cNc6Fxe;rtky=xpOSayW@S9lI?ibEGcm+B%2F6dKJLzNo8 z9pKZ1>@@TJq=nNqU56RPxOCx8@Jp>ATu-7Duf@rbf=hn*e_-b)Uo%20EEqlN6?n1K z871f$_>~fKO|&(DwOgV%{4`iuNc_fve{kqsQwq40Oi4MwTEL`*!(w9qWM`C?n&JtZ z|EUIw?Fv-|zrnI3pkd|ygI&woYf+-qcZpu-{_>+&QMXPA)+x-gDo8{QyiiS&J0zvj zZxv0Vq0M*SeeF)sSeINC+`%TRgbl4RV`w&V6c&qeP#`K=O)yv|zLz<^PTtl3&6or= zIH!gk$DW)-02Rhs{^Iq0pr^*okj$Sv-a>DeTscVJN?RskT?m{Y7m-teZ#j_oIWF>%`}3eWcZ#V{7wy;TDo~O5ewnlWTL?|az`V@FSG6!krR8XyQ?I6Si{1p|m_-ZYiStR)*qb2W%J>Tb>Ye_A zw2{w^CS#g8((HVb>aS_cHo}z-hm@fORmlaXS3D3?%tz`J%gmG?30sPRw>5D9VHK$N zoO$wwHnG%s2TbI>1{eTr7qG|9G`466x5kw*g<{~A-Q^nf2|V#q)DaIkXxhF$7g#VrO$D5O;zH;BES21>T zpac3TMw6c}ea4P}z!pv81G?S6rM|u}7dY&pW63yes7*9IP4` zxt%YB<>;M>@pZdxwn&)ZmpSY_wwdc@g_LL#s&@JhabrJk*D51sdR^KR@>wFZS@rW| z1d`rfp&+1KKI5Fxr);Nl_Zc%bw%PPxm5#&yEcN2SQ_CTqa%;*(gTUN|t{M&!IT)_> zbO?8@Q!p#hl#^GM$Bmfkz2H~9`>3)&I2v0?B;o8OMwwveDeMQRnTDt`h-umWQOGqa zU2hL-uHHH+efubktOW~IHRJCXy`f9l*lKaVEuaeR2BTv@$_hR{dng5eK=PeLp|U=F z+7%wBGd>lOryoU`Z0u>xBd`8x!p%qFC~vaOeQUIP9s{bR+%CJ7=6)=z#lU*`U$(J3 zGs2X9$rVI;@%UZo4pG;^(E_i3ms%>pf`cQ|ClNL5rD&^}(T5ekKNDa8Mf+_8he`R? 
zYS=_HhJ=dfJ9VOL3(^+W{mq2|SXhqeilehRd8p_``#>V%7^^LCQq3hgB4YB3E}S5w_;+L<5PBAqpJ*i!*LfioSX# z9`~LyNlA_F4)aaNg3xGPTE8g$m`Prg2ZwD~)-yL65@MXg+%m&%8S@5A%;i=bO%V{Q zdC1g7Be-PI4+X96^PmqNG5vf+ss;`0ZtN{#5$9J@FE?ny74as={YYOu41uAK&^i4$ zRU`5P=!2Yn&>Ko~nwCRexJfU|2T8A7ZHg*L2#cKL@?5|RYF*c~p)hko@X;vlnG8gV zu(z{0!xBR_HaMd$SKQB%(}MNe*FbIU!X5;PpXr1QcHk{I5c`!vE8m$G8O~?PL1m_O zj)~ujlVfYteGm=^@PTK6d4PRaDU!BF@5AJF)N}ky@aSpo=gc~g$5(JXr+3N)6x&* zq`-iYi_hKSOB*m9Mu-SUE=@*pcaj@tJ1ePpOFh{92 zJJuswr?Cl=%Q_Gr?O1RehuYB$8!JJ4N$M;QEm>wk+9R_qKo5t%1^(OKrgDUz!xx-d zN#WLTnZyh>Evk6~yZ>mO`%Hv*v)3y7(N#yc*2c5UPC|_*;?d%Ymh$f@ zQ66f!2NKgaoA;VZKzN{wYn>aN*1=9VO)RY4fwoh&gIEa6jgT!rGyG&JOYMIF+jpIF ztdmgXpWR5Cdeh)m&8lTw$dj3c=#Ty-GQy*tFL+mSaV2%3o7H$P>iFwCRM)@e0S{wjtj^gnaYmDsA#e-@nl{-c2cdNrOEbJA!SlMtl+Jy2sCS+& zF9D_qJdOX?girg~KBMt|tfV+bC*}O8GDLU2{xYt)gipX1>{4{Z2~L-l-o205?C(et zw;|#*4eMwh?LN80hW2&#A8s=FqMdXIU*eY~nx^Yq#2Hdps)WhTOXgph7v)^BT|(hRJtY9jm=~gafe+O7dpbDyQb#g9M;rq|)Xc^6mev zu!2$FCk}grOI-A8&O+&FTpn6kOZGH2ChfiY+CBR=*i;>sP`)~oiOzQrzgace1ZrDf z4_wm+Fo<5uiTx)Z*bF4*?Fnuv%wyyfuLTg5Wolm2>Ro}4z;Zd^QI6x?djfF~%cg}N zP_w2taC1TIQ2bmhQ9QGxolA%|Y0V z@}o`6fW$ON4cR^AbqxYhiJ?Y{A7NECAhmU=wFROuifczRHJN!C;`r=Gh3-JVfJ06% zc!H!jTmzc-{1x2%!)0oCSKPHQw;fcj@08t)9jdotSES$cMbB6Xh|BgsR=J#RqrWN0 zR#8}8vjjF>w9i(V{a{#`T@_Z6 zaO0hbh$sG5aEY+nt7ON5Sf>@oCg(n^yMB456h9nnp)F z=^lpIC(Ysy(8y_DA0y3E2l8TCDN*}9PS&MpckM3f1K6F0SI?DD4{3NTd8e8QIgq#3 z<_*GIAg;k=Fre9VhW zJhbQkf4z;&(oJPScH0xf&h5~VB|^TQ_*q36gI~s8RMJ5O#MR3s=+IhVu)$A$Zlv_g z?fvFozTO6!K7N8&8GwE;r)FU*Pe9HE4+0{Tm>KTOOCp?f5s@TsnLG$+{iIFsLlEAl z#F$Y5IFOYF4FJ?-| z!80+djjb53Q_Oo`Jvq)0m*`PT37~WE1M~EYx{jCi9Y$m*;~e1U&_A7CclQc@9asMGJg-txOu+vwLuz5(1(2NZqJVYbeO(j<`Oh@tpuHE;NiCxhA%p`Ztl zYDv}Kmu^8raf8D%LNib%HK>!Vbbtr~C0*v+zD`>$$r+o+8iex*Y=fb zGUH+a#Liu9EWR%;?@X3U!?~`X)C<>6ctq<-aPmCn4NX^c@f;G{p^N1lkM889ge2qF z{-B^|Clu#v>>;!vr2HM4y~kD0Ylh;|X$wyX^~@+udCWIlIu5EWGf&1cL=E}_&BhWu z80bTA_De~|c5;m0o9GvOp%&Nsr|Z*mVXd{~2KnN6GgogjrKqkIqTvv?K`-=ijE{yE zpS)`|C!Zjl_Zl=#bfp<(&EqD68qxEn9 zYCN_7yXR6%;CD62Y7<2*sl1z0baYvnF2KGcv4Vcx#|-XjnU%!xg2!4uu(pvZ`H4E& zAmRb622xZfFwZ2CGIW`Y^~D{6X!+)o4M9N9lY3_lP1&*!#9upL)WPTLKf%<~-X{P@ zpvilVSr9})L~w}|C2HWL)-q|o?8owGIGxdBcpJ^+mFCl<+-S*R0H3#@7=`k12qu823 zspuQf0ZH3seI&kyTVw{#5gT4(8(!V3s|lw@R+BI>luixK(e)BEp(cZ)NC_~_*?(zB zK}AK56A+_zLS9-JzQIQhFc&5|YJ;6?ax-FdS-M=oJ8e;Xf~(Szsz7d=U(A@8;@bjP zhuqs@iCG)Ae&6Wwe2utT@piV)FRNO_&b_K3i227*y?!6n)tNPj^1X0>&%IHT;)==M zqIa!o&_S(#PLyZH{7slNt@9~R>egg(rHX8l(x}^h4bwBpOPBC@C)6>uaw;>)Wb7>t zeGC3-@H|s~41#ZfQC^%fx z#FgtU(dP})a&M?`I1n|*H&(L_I?aBI5h-JHc8iN{^Varhj5ClIhb|Gu3>cqM6HV3CJkAg$6n>RSK4yfdnuSjuYolc**&p;V_%mt49w?>uGA#W&Xhkxf z2t}Li0#YrMFFix+nW9DbCT-vIy!*nI+h8PmC=>shIKn=XR)0-AvXK}jc`B=KsT$JV zUdAHOfv~JqrCI);CILsfkoh6*jknszkZ~Oky*94h=d#UeySkT3Uga$5Psh$gxzD=| zgu?{$3l%Dsj5u)u!Pi`c6yr1k0n*VPi!=B|NCWZER$@3lvr zx0nzBi!T$2=R0)1G&_E=51?bC;!0FQ1K*T9xcU_Bdk#SeB@CG~lc;=}C#pa&hLiBo z=#xPbSu1i2#c{Cy40P=~N-=TcfO^gcqp78^{JxwDj$Z)Q+!>g+PTL2;A{fT0*Xw(C z=Zn7M7Anl5N#oQna@Hf69Y?ot3Xfy;%u>sjiJI>%`n%UVBJ@S!|jD5R543K%F?L zuYSunr>&vtkPgII|G@`Sj#RwoA9`(ZNv)N)TyZL-N5*p(j|xZL$X+huP%`+JP)9=z znk>-mT0OUWTF=94^8(6oK(!EVK6X$SnzsX9O&8WM7&dx*Y`=5*`70j*9SdHr+%mvQ zUar88{2;uwo&n}bbO4gDZ$)=1#qaJWk*UCx;gfyi20WLvWHl-0|J&Ewt{t#i@zx?h z4N)wl^c>J79#2DkBRa_R;ta*~+GwkNMNMV+KekUKM!}K1)NuYPDw0i~Vha=wedw%x zOnfD3Nrd09!oq4m^@KG0Xa3W#B(7^m6)-Z455}}p+sTz)+~aV(AW8XMkD;(|fBL(~ zpvCagh2fUkQB;nTygdt@&_z5-uM zWR?8v1Cp@(?MA;ll^W^{Xmvy{mFG!(V&hL7ilQLO`OM=6%4E=RN7`lrIU+_)4H=b* zK*oV+Xl&T$!wXf<|0`+31gn9P-b4(&){h!GU`mC(FK)jX04F!8mB={rupSt05|V-< zACMo5dmysU#jYl8sbMsXZ}gt(qqG@h6$UiUqQj8oIQiy=5-t-@>@?44_cY%xHoM2! 
z*f}fk#o)GQWl_^1^A)5Hs%pPsg0>>E*r(x4y?(lSep8T9#>_}C@Aj(}BB_bl+xTh% zX9G>HXi)51^~Bt9f}$h6A5^12y_woJdIIlZ1Mxrv>2g)~^YH0AJ(WkJLH5UG03QV6 zDoQ07NRCKjmz4T$8nXA~@G#mv^Ebe{-m}ck>Cl%H!FXenhzJO~O;V6X!n$PYW?exo zCWL$j>I%7$^nBMDI;;FTDyS1N79-0|)&vg_M9s2lh3~BCXmF#GX}9|=UpJA?@h*YH zeAC8}dQS^#F8xQ)*t$wcp2@0*RsTVh0TdBP3dk8Oo52X9EDWuWYDxV%&1W5Dc%#>E z!)6989UT)Icp>9D0iG6EcB_sdt6Uj%8HRA<)!8k4)yY>3Dn~}!BC?z6gX#gXa3w7g zvTS_>-Tr6glt(Z>ok!Sr^3b*pHP86k^)mhS%Rp|?c-^-QEXz{s%~(-h+1=jb1!LvQ zE6S!;UjXfud!jZ)rLOe#?oHmM{lb?FV7&~Y4<(d8;Wx&p_h2JOw92zD;VC+u+Y}tQ z76M*|id^LTvh5jn2uNfJKQv6P1|Y~1{ESx%&9F>dQgmR@Ydi?VRuu;SWC}%vSjE^! zgy*_?@RJOu{V$E2$h^?~f*Hd6YPp+%zHox6A&+-ts0+f7F(Qkm6eB7)MwhWx39=H%$iOua)1RCmOk62_E@8op)NTG_b0(%Z1*G8+e zIJFZ)0-3K1UNE0)uXG_t?T2@A`>9|kGdW0lv<{3_&9`3bhfLnWU9dPBy8Uuuv$B=1 zDTLeUwl}#N=y?o7>yBA7Xc{YB6L-D!2;=%4- zAULU&43dOnD`%O1-@bDz6Hxc}JT{~V&`Ub+K*y~h=WS9s+DIxJ`gKoGGJeY@q~dSu z-uF5Uw?^mr0OFOgDZgYKf{daX1zH0`!F>G@ZJx0lSE0=3%NThWiv($l(Cf08BT9PN zF%~9#L*!T_7@#wJeUHE%GoDdQ0Wx@2xp-I*WGBWx(s9%Tc`WsP!Upb~_}lhg7ZnS& zLMekY8S(__I^KJUx(=i@TyE+Ez9~2eVrrSv0?2mvlMH*J{*; z^$mc=puxuX3vR)70S=CE`E<>6D6m{gI#u@_#HezQAQp!i!AdYG=T|{ckM%!d3M@M0 z%^UTr<$?5(-S)&FV@W< zB57{$V%z5q^hG`ObCv~MoI_VviZFE_(tTcd-k_ZaFf*Q%iB^}2PY+O!eY^vm;L@`$ zJlY|w)rbi3!QR~Fgy{&;HtGxA8Y_k2Dk%+ah2=D? z$o03@-KGGoC)f%x#QT+nTxgMMyxyk?j-?2z(b0AIub&r1hg;qx@l~nhKba9isT_Ny z``cK+nk3SHlYP({#bl+kaS1C6C`jPJsw-*$mb^>)>$AY~7;eU=)Jqy;*lvNFkZgp` z!F_i%Bu!*-z2Vm_^cdxEKjfN^w(RWNSkpzueaPdW75e>${lANndXlR~wPEDzfew1} z$L`kikPL@R3-#CUwJDwBvu2nrCZh3bY91XB7mg9pzRR{udz6N+eX7{IeXm?*Lekt2 z1`>tt4k#lYeVw`2Cn(IIDJ(RiZsm>5+9#i!@a5Sr(fI(6?9$A{O9;M1yw0z`2qKMC zcXk0>U;fyV2%(Pld)Qe|B7q#WB5p0lz!07Io#uRPE`jK+RfXx9nn3|zr-qO;2vu|T zbG9nb)1LnInlNZCXuBt11W2{NWQf+HD6(t1RXqBquN|8vSHP*bIKb<$TW)DFaGx#} z9AqJc#bAU%Sf5Zdz3qD%G*}>@Pw`@xE-aVLEo6TxsCoMIoTi&SUNT6~h^|f?)txFd3)VHm!Y>FOLs5W4KhH?jKus$Dw#`?=!%7ch zivBDC1+aBTwERLt=^1BGN)H$9h^_oVf?@4GO0zAOt6vtW3roxh!WNSbSY3t&*0-7T zhUUz!6$^tZso%(iXc92TsI}ZgaQouJvDtYgX~!DXK}HKLGGoI8kM@n^ClnFro#8=xw=dE1ZXBs`K*{_Bg3i;a!c0}XUr|&5IB!mUziSoWuiBW{ z_|g3d%#NjS^@)$B^Q%iJcD80I^rf(2Mr}Ix(k4iv~Zk$#1!_pXn-x(?qj09+8nF1uLaNv8t%*^;T zv8RQL)d;YCPRSJC_=wI%{Y4n6Y08-(Hcqn9<4(F2S1Y!pZ+!~>=&qwjEm6f?)rD`p z%k+qo$^qmH_OB{4A?f$Oy(3PInK7MIt!Xpcjaz-J^NZo*-m}7^67~tXrkjRqoN?Xp z{4}i6Sx5nh8~M+Bm^ra)xb5yg8W=b~s)6k2b(4mBB!>4yShTzEtj&Toa<7W{o-oO% zzwd3JO?gVYzOo6h_?r2-GSSW!Plh~W(p7qGn&SYO3GOt2&EqkG{Q7;AKeD5)6>k5L zzRnUp>k0=s`qn ziYsk)ur^}@CW~Ph@19iro^;*MbD;Ip;vk6&WCa@Obh_WVPR9&cuUyo5@rNQv$J;e6 zKtW_xCWlp}QNb!V`b(;u$)o3oi^0BaT)(1^k|#JZuPbfC=3oh}NgZi3ZsXgUGmri$ zg~LQt4km?sWqhTG9x*sx;#O+M1!MsB%9Zc;7X{b%-Zy#f!AZXX_t*C!)m||FOBImH z8JRT4aTaq@#+azhgWKAQ^uNOonsc9>!+0@1nUdZ#P(pRIwxZ?HZMQz@GmElqu3EUCQq z25_zDGyME1_zhwZu+rxpnh0ej1jdhYo~Gz`DDim~ zbaPsV*XOGLhpsySnvWKn;BPxGu*M3mlvDQ_=(w5m(>ygd8u#+Zo#wfev(~QcOzBNk z#%q_Xi4153gu=4b1>VA4&?0X`(en}F;pyPN?-Xg!Nx*Xc?_T30D;(f@w?9JGSYHs( zRg-5^c4B)PNp5w4kHQPh4#9+U0nD&MVXolFrAM4DMX!AJ7!bpUfvl7Du6;H;1%p+& zv!6)=rGDemnnSv(5B1sP%MM%>bwXA?eBD0w4WnegU#-t6^PgqUHR zWv`p;{ZzoY^Ej-}opKgX7;}mVmq3>B))@~Ok^~{vwfA)N)IpO~&%;JQyM;~=anp7i zD~TR6s!Q&BH#+NHUCN&!g!`!11_dFAKyq)Wi{+eCP<*A;)b{i$w0{a#RoqjW?s(3o z1r=vW2ooILnds0WpPEYN`hTXPg&L7se?QIqNgv(fx`}gSoOa9!jiYQ`v03nOtLbHb zGI4$i_IHi0e`*QEeeb+HzuCCnH=e?zA+4*4y+wx<6Zh#HGc zO_{FAoFphvzP>C&q<0edCN6)l3bJyZf+O%as{Ww2{yxavc34A>(R~QqJ<>!by4`u9 zLj(Zo-zA_GlujI=#kbew>nEIHZrBVJ6E_wpk<~F7ZcO+xY7>vhI>mfC(DJk`wAUB2 z62q&^Qo{9G5eD^7`A`5kl;uxi>T9&ukO}T`4sjXG?noC_J$JP!qw)! 
zBdq%~E7_a1+!wRE&1#!}Eg0ZyO9&7|va#E#ZF=zZ_In9tGQs`!H&WC$zL54#1~f^A z9Y(*DtfUMbua^)KYW>aXdGtk|!7<2f24|3ea2^g1D3x-cP@QuL`;Io?)1Zpe#!miY zw==;|N~}-kr_LM#R6&0eLUNK5zQgb5G+wwUycUKuR0gag)6U!$I$Uo-`Ie3N5yNBE zDyK*C$fb~*n@HJ=2pD|S$!JL3VKpXL*|)Buba#ppAa>TDwmWi^Ji$BUuz zKu+qX@rfJgi4AdW=7)sl0d;;hb;*-0U(cB&0>ub&RodpWcOe(D`pbJP5-APOs+Z6( zRHSpt6ULJYACKXb%r3>)e10wm-3arU5|WoHuOG&m*VHrN^xjto~X ziISEqwyW6Xk92l-o6xVnQgt`=4QzL~d;lDuT*S_@ZN)kC>3B8hZkyUqi~UVxvcRZ( zPW8s)g8v__AI2tcj~DMu`J02oJNo4^{+^g(2AB_D@|Km6`} z)48J7BSt_ZMMnzKZ7Iu>KK8^Qat=>n(}p|Wde~s1A=B*Mj0b#6b0TzGuSZPH=rmcv zd+ddzR9~Q8)PsbYj(^+UIXfnvad8*ggA5F$|Me@XqOu&uRrd87SGF=u2#Cjc?^Zk0 zTg%D|Zc1ULm?Z;Qqp5IUQKK|$MCY8VD<)&=K8LVKhWPIdudi%5)R>8WTs-bfEQ)XH%`*xJSMoCJSq z0D`}cfglOTGT+FD?E(*zl&KCgs4)7_nMu=E0YaBCJhua)3DDFsV?L-;slHKtQ$!f1 zV(H_Xc(aNfoc_;Qu4TFxHY_`+QKqBbTu)9V9~BZ zsuo{zEJn$pWKrsQps75O=1+zjU`IiHiT3#>mD#8fe72yxu;&#_5){%p@PH*_)}DvB zjPS@fQ8#}tOY{fUrVrrHG)`7bc@4V7myy=Ae*%NOy``?I8bHu&s@ntq|M{P8c1HQp zVjqUm5)NB^%)v<62wq-HALGBQ3(Ym^6W#0#BCa=B6C7%4fu0cR-}6&C!(Am~d3GgG z!Am&-ta1$XD^Xb06&yfHckSbkUFaL3lBN6ORZ1VsV@**FC{@=Rm4wvRyQM~o`_Tr~ z3EGF@PlK;ox$JM}?tt+9g$LpRi`Am_@9G=gEJ)~1oKMUjMN7>j!Gk^E4FIlW{3h8E ztH#R(XC}?TobkLMN$z1kS%yTodB2J(%n2(ig$m&TJn;a`_%yp$6!q^k9PyGJk#lp| zzT(2?O_tLt8>b>J5da$S8aDp94q67j7)-L0@>6wL5Ge*&@VfGe>#B9wr01T3Rk2OP zgj5`mD$kKQvTWg%6uU2Adg^uK*vjR?I=}*>ul_CSnisj4$S9195glx-%~z2Zw&Q|_ zsSV$2U4*}UqDtX6F*`0Dz9fLs*`#q)Cw#WjJZl>0WsC$APfON^PqF<(1goi+Q};AT zAdv8akK~-i0+v~R={5F&@?0r~+$ic@|Ew5_ut{9lMn6atNz;?CWrk5xVR9dY^CRfAV6`_mAo>@?m zU+%;z1=v!0hMqo#zZROI+?nXCUUS+ zphLS!ALvsi$)GVJUN4!y`0nH^xcdsr@3k&*;hX5V*|3xF!qb|#P+&VZ5>6v;Vx-7vFyc*j+|2qLnIrcr1#d4p^^_8%4>@{iQ44>I zhP>Pk{3nC+$gmy#=&??>>rHbe!D>9o4P^c&FC3xa4`9M+9@)(73kI-{&dbYvMt*PX z9uE9Bq67{7A&eN;Wq)06m~IcxtoxRdVpn|cs{;IANTx`?tc#0eW0^UXKnXZ@wdG3ftu0kR26OJVT`8_+;JtmpWgMz~; zGTG6o{^OA|K%A(>#+VRS`jV)3%%ql^+ z`38yYaN#q)I`r4Y2*sDZww;=6LIP9Z)_>P3wcy9 zzt^z2DGx!Z1}X$CXp!fMZ6Vso0x>@k1)AK-3*u@{ z)1I56v|Mg9ZY%`$B)TWhjEy8w}8*6S~*`dw#N{7=O6+XYpI^rl*8HpnI3HP z&V=EO>)bvm+9_XJJFOOZ&!Dr>BNi2!KA>@KZoWg*%>XlUprHT_Rju}71vU>OUY51) zFfYFx#V(gSVa>l1wyZrcanj%^-TWu_0|lQ;Xj8ig`!m=oPH<{RCS#AZQ$-br6&h{y zCGq{avc*s3f&<3PHX>ZqLlAWy#=RxmqWdpaI6T}KASpgWb#Y`KU8c7EMIPHtaXOK% z+%*cuH_sT8w-2yWEg?u5Z`cDq&x|w%fr8K#CjT^p6L^c+riPES56%Uu5z|BIhOW!q zaIzHngh-4ioJk0jx9H;BTcKf-=+DP1uwxc6$ddg>%|Gy#z@rC5#zN%AD?P`?M$Fy> zVVyyW%+vQ5KgzwqQ2zUu7EmC%{zGXjOQlPUTW79KuE!Aoq@Kqh=Ug|r9IZ?QxOAyy zEMPo~zS9V(b(2hdOh7*aD@%%mb2p79j}Q29hh*;dO`8`?F*@2ybT4oFDK`fd#7&=5 z(Xj2_Z>qWS=a8{M)oqzcWc*B_pO;@jq-Uo)BJN)dsTY5Mth;qz@lgYY;mh*g!oe0n z{h6fXC_)_ZplT#;$GBlMzD3GGFcJ2bDuae_0X_%DDUOo1NW+}8?4OEgZ6ozDKDl&s ze2wN*-R}-B3>RsdbC#O^y0+6JFZ0eDLOZ)o|AKGn-*7?ulNW1kJW^F(b)<(B;0KiE zX*{2A%I%)(@&1AX>j`nr#nQCR1_9>5M^aBcTtPi-)KvC`;Y8YnZfRu&nYi3l63xxa z5R0^SFkNSXhiR^jE;*jvvocPFTs#iPc<(Lp37vDmfiNo4t+TnlgjFSe|7A65(PPQ1 z{#yFX?ArYE`MYbl*(7S7E%<{av2YBSTiyFYVrB;V&pNse5HKbwayF}-0G*IsWz&HF zP$q>2ECIRjENa#$U@#^6zix$rf*x#MI| z`tJigJFA^B?*D0B(*oJ9(AOXlukvQG!S$AGW73R^{u)-pW=#6Z{Moa~hpXep#e|3A zD6+MBjN*Z#5<-Ctx_jdNRsxI5p>%|*4SC_Rj}Z0)zjbb9zRqA@J$}%_n-z@?^Kt5H z9E>cq0L5=wgAbrI7f8^!b?4n_XO+tDkJ^aI-4lLqMfha$hdHZD8<8U$Eec!u-xl{c zr#i@)hV1k3XJmK_+AM~EZlvVCit=l&3$R_$5MS>O&hV?<@E-OnJyBhH%rI{_L16pr zfx4J9YJdOIpeYecChXH*L;XJgA9$f8e zQw=;ywr(${0t=NvW++D!#6TRHKM;8wv6z$PZ@1(4)eeE$_*0eIzBAZ(bC z<}%ag-OhiDMFe-5FJG2l&)${xj!e^?j1PnDV^#g$(0byhFY@D>i&me;%{>Zw5%dS%JtcWf`@I?a#ZgUywWy;-nC+Yfm#(Gl}#+%;(+Nhyha&+rYbM7 z3$D?>Gk9{MqZurQU|(PJ*wScU`WTKuYG^S4%@im578ANnKq-i_>zB~E3aUf$30$+= zFgcWNqGJ}H1NZg2KXDY>Ck0MNH_hpjt!C>ukmPz#7Lk1_K;w{qEL+pr(zDAJS#f90 zEYr5F3)V`2=%%TyOxNj08+*_49ZBe*IRgW>-}x2V0i*0DBwH#j#OP;Sd%9qCb_l(T 
zkfSNqsq!4+mtaaabP!95TFQshtEo5)NfDbPA*GxUp)wh9VUftXesa}kc>|&LyqaU~ z8v&3#&xUvA;;P9A2?pFR&s3cFXg|&H|W9RzY?`D{CRp0BOS51D#2iWCHZW? zlm8BbnHI&fm$gi*XT0wU{=>Wjg~(h6@TA#GUFJAZ(hWhxAwJ&VRtJ=++-VXcbXQH{sIaN#&wLNIO&3fyQS+o`y{kAsQ z?}*j%V7Wa)Ir{#F=8*s9dcUfjZg{@M@IP9!N)jn%AHss#KTN~>IW5;X~RIK$>itqiM5#CnKr-v=B(^P#KF6V8XxVwAEjOWM(ta!RrE{rU=Z7bc={K9wG>t zO8N4IVzQ1*Isi5#)H-cXeZA$RYn>-EEvEkyV43Q@${R1zl2XK9nyaOe)l&gmKX$Hw z$5u&ykplWE<9?TQjm5+tu7vR!8PyqoCmQ0nH^^bEL?(nQS^!s6l{WdU%p$}Utlxnv ztBO8iZ)D!cxCTeABB

JK!?E@>d?HTAN!}D?2ohQHnW~IFSP@Zn?{0J&{(^q)q}o zQx7-hniWkNnw{Us&bpW-OYL7je#PpiG}WJrKcHbuIt?%~VIQGJZ#1{gS9p|NfCfNI zfXZ3P^eKE$1xKKMfl+(e;S}Za#SfoEW&GO8@R}O``9vlZ<*i$l8VY-F61^n}{7RAR zMzD=t-LaG*-)dX<-gl#3a5Pq7!(vtI+6wd(j9hlK>2@f*cqA#>Unyauf1mFiu>w4LIkM?EA`;iA}m{%<5KoN0PB2+mS_ z8{Mmf4GnlO34GOnrPK?;vxpZ$VuXq}uw zH5SsiAr}gk$^an-l{jWTc&xjs>6=6t@k>E=w%F}LEBe7F{^|IVKjem1iBqp4yxrh3Rn5V?+=VR&8TDWQN5(sZEL z=>$g`?W*I=vA43UaHEKj66VwC?^H!L5EJP~1dR)x{USeDm(DihyZZ-^FbkjB z(f29VgNsbvK$V&P9)#5JDmn-}?d}J5d{C$D7?PH94()wJoxs@lgC1=ig>*CR;z@xyMhwZrh24ykLxMxicGX+I^b zgf}8O4Mq;a1^P68&1f4CxItPbo`+M};C;Jhhee^;JwMSmpr49J(<_Xqp=;oxPR&8Q z9yW@f5M5$KW3jz;vK(ouR}ZWX{h=5BE#_>vgqB0MT#xen`uBm7jK_z45uDnP8yMaM z#YM-M-azWQx@!LHbhMPh3NuGO3xW*qP`H7$W!@Gq-X3vCf$tCDNy|mJHT0KfKgk_$ zL%tq%n0|zaPwTf&eA+Jmht!6woVB&-J6PpnH~f=H;5KzbN{?}sBDl8me>n}d^m5Sb literal 0 HcmV?d00001 diff --git a/samples/test_tiny_gzip.bin b/samples/test_tiny_gzip.bin new file mode 100644 index 0000000000000000000000000000000000000000..d3f8415a8bc16059d6e320557958ca5121527074 GIT binary patch literal 26 fcmb2|=3oE;rvIx~NU$pNGB5<5+r;&Sk%0jKOHBqO literal 0 HcmV?d00001 diff --git a/samples/test_two_gzip_streams.bin b/samples/test_two_gzip_streams.bin new file mode 100644 index 0000000000000000000000000000000000000000..aafddc4e64bd4ce718185d20f3a7742e7a0ceaca GIT binary patch literal 116 zcmb2|=3oE;rvGmbGV%f$hZf|#k3V&jSpzJLN#r#Pwt63kWnhrU)S35Q9-q!8FHQ!K FP5^GR6~q7l literal 0 HcmV?d00001 diff --git a/samples/test_two_zlib_streams.bin b/samples/test_two_zlib_streams.bin new file mode 100644 index 0000000000000000000000000000000000000000..ffd8032934994fac7986e79dd90c449c3baf130f GIT binary patch literal 558 zcmcb&_8!e)1>@TbjRFh|49tqVzK7pByI4SrN5UZCKm#K)8*Ld6n3;v2yIJgePkV^u U$=%=G>FmifK8t^x`bT>R0L4%|A^-pY literal 0 HcmV?d00001 diff --git a/samples/test_zip_3entries.zip b/samples/test_zip_3entries.zip new file mode 100644 index 0000000000000000000000000000000000000000..e5965a28848bac5e48f53ad52117e2a991d59eaa GIT binary patch literal 628 zcmWIWW@Zs#U|`^2c$^#=^QH1zbrz7P;K#tg1*B8+N{T8C^-3yA-d;EezG_?K;nT0MrJl134;V`GWeO915jMFM>;SH!$n3YF53H9ot`fGShm^_!$rm@F5355 zoYpR4WD;S(9dDqh0)Ylb5Q&Jz0B=-%=%EWz4FU~}lYk^xA2_Vh^`VCWSQA*E3eZNd WK5&Qxc(byBbkj(S5X-B<4x>R2q1s}0tg_000Iag zfB*srAbg_(Kpv80H4?Ly-GDZHFZQf I@AX{d0bn^bM*si- literal 0 HcmV?d00001 diff --git a/tests/CLAUDE.md b/tests/CLAUDE.md new file mode 100644 index 0000000..e105e7f --- /dev/null +++ b/tests/CLAUDE.md @@ -0,0 +1,37 @@ +# tests (integration tests) + +End-to-end round-trip tests against real sample files in `samples/`. + +## What Is Tested + +- **Core round-trip**: decompress a `.deflate` file with `preflate_whole_deflate_stream`, + recompress with `recreate_whole_deflate_stream`, assert bitwise identical output. +- **Container round-trip**: compress a ZIP / PNG / DOCX / PDF through + `PreflateContainerProcessor`, decompress with `RecreateContainerProcessor`, + assert the output matches the original file byte-for-byte. + +## Sample Files (`samples/`) + +The `samples/` directory contains real-world compressed files used as test fixtures: +deflate streams, zlib streams, PNGs, ZIPs, PDFs, DOCX, JPEG, WebP, and binary blobs +from various compressors (zlib, zlib-ng, libdeflate, miniz, Windows zlib). + +These files are checked into the repository. Do not remove or alter them without +updating the corresponding tests. 
+ +## Running + +```bash +cargo test --all # all integration tests +cargo test --package preflate-rs # core tests only +cargo test --package preflate-container # container tests only +cargo test # single test by name +cargo test -- --nocapture # show println! output +``` + +## Notes + +- Tests live in `tests/end_to_end.rs` at the workspace root. +- Some tests use `libdeflate-sys` to generate reference compressions on the fly. +- Test failures often mean a regression in the estimator or token predictor; check + those modules first. diff --git a/util/CLAUDE.md b/util/CLAUDE.md new file mode 100644 index 0000000..bfa5ba6 --- /dev/null +++ b/util/CLAUDE.md @@ -0,0 +1,38 @@ +# util (preflate_util CLI) + +CLI tool for manually testing preflate compression on files and directories. + +## Usage + +``` +preflate_util [OPTIONS] + +Options: + --max-chain Hash chain depth limit (default: 4096) + -c, --level Zstd compression level 0-14 (default: 9) + --loglevel Log level (default: Error) + --verify Round-trip verify after compress (default: true) + --baseline Measure baseline Zstd-only size (default: false) +``` + +`` may be a single file or a directory (scanned recursively). + +## What It Does + +1. For each file, calls `PreflateContainerProcessor` to compress. +2. Optionally calls `RecreateContainerProcessor` to decompress and byte-compares the result. +3. Prints per-file and aggregate statistics: compressed size, baseline size, CPU time. + +## Source + +Single file: `src/main.rs` (~193 lines). + +Helper `assert_eq_array()` provides detailed positional diff output for debugging +mismatches during verification. + +## Dependencies + +- `clap` (4, derive) — argument parsing +- `cpu-time` (1) — CPU time measurement +- `preflate-rs` and `preflate-container` — core logic +- `env_logger` / `log` — logging From e2332ad3fbc85e2edb98b8fd9fb281112c9e1d8d Mon Sep 17 00:00:00 2001 From: Kristof Date: Wed, 25 Feb 2026 16:39:46 +0100 Subject: [PATCH 4/8] remove pdf parse --- container/src/pdf_parse.rs | 473 ------------------------------------- 1 file changed, 473 deletions(-) delete mode 100644 container/src/pdf_parse.rs diff --git a/container/src/pdf_parse.rs b/container/src/pdf_parse.rs deleted file mode 100644 index be3a83a..0000000 --- a/container/src/pdf_parse.rs +++ /dev/null @@ -1,473 +0,0 @@ -use preflate_rs::{PreflateError, Result, err_exit_code}; - -pub fn pdf_to_utf8(input: &[u8]) -> String { - input - .iter() - .map(|&b| { - match b { - 0x00..=0x7F => b as char, // ASCII range same - 0x80 => '\u{20AC}', // EURO SIGN - 0x81..=0x8C | 0x8E..=0x9F => '\u{FFFD}', // Undefined mappings → replacement char - 0xA0..=0xFF => { - // Map selectively or fallback to Latin1 - match b { - 0xA9 => '\u{00A9}', // © - 0xAD => '\u{2013}', // en dash - 0xAF => '\u{2014}', // em dash - 0xD0 => '\u{2020}', // dagger - 0xD1 => '\u{2021}', // double dagger - 0xD2 => '\u{2022}', // bullet - 0xD3 => '\u{2026}', // ellipsis - 0xFE => '\u{00A0}', // non-breaking space - 0xFF => '\u{2028}', // line separator - _ => b as char, // Latin-1 fallback for others - } - } - _ => '\u{FFFD}', // replacement char for unmapped - } - }) - .collect() -} - -fn decode_pdf_string(data: &[u8]) -> Result { - if data.len() >= 2 && data[0] == 0xFE && data[1] == 0xFF { - // UTF-16BE with BOM - if (data.len() - 2) % 2 != 0 { - return err_exit_code( - preflate_rs::ExitCode::InvalidIDat, - "Invalid UTF-16BE string length", - ); - } - let utf16_data: Vec = data[2..] 
- .chunks(2) - .map(|chunk| (chunk[0] as u16) << 8 | (chunk[1] as u16)) - .collect(); - - String::from_utf16(&utf16_data).map_err(|e| { - PreflateError::new( - preflate_rs::ExitCode::InvalidIDat, - format!("UTF-16 decode error: {}", e), - ) - }) - } else { - // PDFDocEncoding fallback - Ok(pdf_to_utf8(data)) - } -} - -#[derive(Debug, Clone, PartialEq)] -pub enum PdfValue { - Name(String), - String(String), - Number(f64), - Boolean(bool), - Null, - // Optional: add later - // Array(Vec), - // Dictionary(HashMap), -} - -use std::collections::HashMap; - -pub fn parse_pdf_dictionary(input: &[u8]) -> Result> { - let mut result = HashMap::new(); - let mut pos = 0; - - // Skip leading '<<' - if input.starts_with(b"<<") { - pos += 2; - } - - while pos < input.len() { - skip_whitespace(input, &mut pos); - - if pos >= input.len() || input[pos] != b'/' { - break; - } - - // Parse key - let key_start = pos + 1; - let mut key_end = key_start; - while key_end < input.len() - && !is_whitespace(input[key_end]) - && !is_delimiter(input[key_end]) - { - key_end += 1; - } - - let key_bytes = &input[key_start..key_end]; - let key = pdf_to_utf8(key_bytes); - pos = key_end; - - skip_whitespace(input, &mut pos); - - // Parse value - let value = match input.get(pos) { - Some(b'/') => { - let (name, consumed) = parse_name(&input[pos..]); - pos += consumed; - PdfValue::Name(name) - } - Some(b'(') => { - let (bytes, consumed) = parse_literal_string(&input[pos..])?; - pos += consumed; - PdfValue::String(decode_pdf_string(&bytes)?) - } - Some(b'-') | Some(b'+') | Some(b'0'..=b'9') => { - let (number, consumed) = parse_number(&input[pos..])?; - pos += consumed; - PdfValue::Number(number) - } - Some(b't') if input.len() >= pos + 4 && &input[pos..pos + 4] == b"true" => { - pos += 4; - PdfValue::Boolean(true) - } - Some(b'f') if input.len() >= pos + 5 && &input[pos..pos + 5] == b"false" => { - pos += 5; - PdfValue::Boolean(false) - } - Some(b'n') if input.len() >= pos + 4 && &input[pos..pos + 4] == b"null" => { - pos += 4; - PdfValue::Null - } - _ => { - // Unknown or unsupported type — skip - break; - } - }; - - result.insert(key, value); - } - - Ok(result) -} - -fn skip_whitespace(input: &[u8], pos: &mut usize) { - while *pos < input.len() && is_whitespace(input[*pos]) { - *pos += 1; - } -} - -fn parse_literal_string(input: &[u8]) -> Result<(Vec, usize)> { - let mut output = Vec::new(); - let mut pos = 1; // skip '(' - let mut depth = 1; - let mut escape = false; - - while pos < input.len() { - let byte = input[pos]; - pos += 1; - - if escape { - match byte { - b'n' => output.push(b'\n'), - b'r' => output.push(b'\r'), - b't' => output.push(b'\t'), - b'b' => output.push(0x08), - b'f' => output.push(0x0C), - b'(' => output.push(b'('), - b')' => output.push(b')'), - b'\\' => output.push(b'\\'), - b'0'..=b'7' => { - let mut octal = vec![byte]; - for _ in 0..2 { - if let Some(&next) = input.get(pos) { - if next >= b'0' && next <= b'7' { - octal.push(next); - pos += 1; - } else { - break; - } - } - } - if let Ok(val) = - u8::from_str_radix(std::str::from_utf8(&octal).unwrap_or("0"), 8) - { - output.push(val); - } - } - other => output.push(other), - } - escape = false; - } else if byte == b'\\' { - escape = true; - } else if byte == b'(' { - depth += 1; - output.push(b'('); - } else if byte == b')' { - depth -= 1; - if depth == 0 { - break; - } - output.push(b')'); - } else { - output.push(byte); - } - } - - Ok((output, pos)) -} - -fn parse_name(input: &[u8]) -> (String, usize) { - let mut end = 1; // skip '/' - while end < 
input.len() && !is_whitespace(input[end]) && !is_delimiter(input[end]) { - end += 1; - } - let name_bytes = &input[1..end]; - (pdf_to_utf8(name_bytes), end) -} - -fn parse_number(input: &[u8]) -> Result<(f64, usize)> { - let mut end = 0; - while end < input.len() - && (input[end] == b'.' - || input[end] == b'-' - || input[end] == b'+' - || input[end].is_ascii_digit()) - { - end += 1; - } - let number_str = std::str::from_utf8(&input[..end])?; - let number = number_str.parse()?; - Ok((number, end)) -} - -fn is_whitespace(b: u8) -> bool { - matches!(b, b'\x00' | b'\x09' | b'\x0A' | b'\x0C' | b'\x0D' | b' ') -} - -fn is_delimiter(b: u8) -> bool { - matches!(b, b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'/' | b'%') -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_empty_dictionary() { - let input = b"<< >>"; - let result = parse_pdf_dictionary(input).unwrap(); - assert!(result.is_empty()); - } - - #[test] - fn test_string_value() { - let input = b"<< /Title (RustLang) >>"; - let result = parse_pdf_dictionary(input).unwrap(); - assert_eq!( - result.get("Title"), - Some(&PdfValue::String("RustLang".to_string())) - ); - } - - #[test] - fn test_string_with_escape_sequences() { - let input = b"<< /Note (Line\\nBreak\\tTabbed\\rReturn) >>"; - let result = parse_pdf_dictionary(input).unwrap(); - assert_eq!( - result.get("Note"), - Some(&PdfValue::String("Line\nBreak\tTabbed\rReturn".to_string())) - ); - } - - #[test] - fn test_string_with_octal_escape() { - let input = b"<< /Data (Hello\\040World) >>"; - let result = parse_pdf_dictionary(input).unwrap(); - assert_eq!( - result.get("Data"), - Some(&PdfValue::String("Hello World".to_string())) - ); - } - - #[test] - fn test_nested_parentheses() { - let input = b"<< /Comment (This (is) nested) >>"; - let result = parse_pdf_dictionary(input).unwrap(); - assert_eq!( - result.get("Comment"), - Some(&PdfValue::String("This (is) nested".to_string())) - ); - } - - #[test] - fn test_name_value() { - let input = b"<< /Author /Alice >>"; - let result = parse_pdf_dictionary(input).unwrap(); - assert_eq!( - result.get("Author"), - Some(&PdfValue::Name("Alice".to_string())) - ); - } - - #[test] - fn test_utf16_value() - { - let input = b"<< /Title (\xFE\xFF\x00R\x00u\x00s\x00t) >>"; // "Rust" in UTF-16BE with BOM - let result = parse_pdf_dictionary(input).unwrap(); - assert_eq!( - result.get("Title"), - Some(&PdfValue::String("Rust".to_string())) - ); - } - - #[test] - fn test_boolean_values() { - let input = b"<< /Enabled true /Visible false >>"; - let result = parse_pdf_dictionary(input).unwrap(); - assert_eq!(result.get("Enabled"), Some(&PdfValue::Boolean(true))); - assert_eq!(result.get("Visible"), Some(&PdfValue::Boolean(false))); - } - - #[test] - fn test_null_value() { - let input = b"<< /Deleted null >>"; - let result = parse_pdf_dictionary(input).unwrap(); - assert_eq!(result.get("Deleted"), Some(&PdfValue::Null)); - } - - #[test] - fn test_number_values() { - let input = b"<< /Count 42 /Negative -7 /Float 3.14 >>"; - let result = parse_pdf_dictionary(input).unwrap(); - assert_eq!(result.get("Count"), Some(&PdfValue::Number(42.0))); - assert_eq!(result.get("Negative"), Some(&PdfValue::Number(-7.0))); - assert_eq!(result.get("Float"), Some(&PdfValue::Number(3.14))); - } - - #[test] - fn test_multiple_mixed_values() { - let input = b"<< /Title (Rust) /Author /Bob /Pages 100 /Active true /Removed null >>"; - let result = parse_pdf_dictionary(input).unwrap(); - - assert_eq!( - result.get("Title"), - 
Some(&PdfValue::String("Rust".to_string())) - ); - assert_eq!( - result.get("Author"), - Some(&PdfValue::Name("Bob".to_string())) - ); - assert_eq!(result.get("Pages"), Some(&PdfValue::Number(100.0))); - assert_eq!(result.get("Active"), Some(&PdfValue::Boolean(true))); - assert_eq!(result.get("Removed"), Some(&PdfValue::Null)); - } - - #[test] - fn test_invalid_key_skips_parsing() { - let input = b"<< Title (MissingSlash) >>"; // missing '/' - let result = parse_pdf_dictionary(input).unwrap(); - assert!(result.is_empty()); - } - - #[test] - fn test_incomplete_string_does_not_panic() { - let input = b"<< /Broken (This is incomplete >>"; - let _ = parse_pdf_dictionary(input).unwrap(); // shouldn't panic - } - - #[test] - fn test_ascii_identity() { - let input = b"Hello, World!"; - let expected = "Hello, World!"; - assert_eq!(pdf_to_utf8(input), expected); - } - - #[test] - fn test_copyright_symbol() { - let input = &[0xA9]; // © - let expected = "\u{00A9}"; - assert_eq!(pdf_to_utf8(input), expected); - } - - #[test] - fn test_en_and_em_dash() { - let input = &[0xAD, 0xAF]; // en dash, em dash - let expected = "\u{2013}\u{2014}"; // –— - assert_eq!(pdf_to_utf8(input), expected); - } - - #[test] - fn test_typographic_characters() { - let input = &[0xD0, 0xD1, 0xD2, 0xD3]; // †‡•… - let expected = "\u{2020}\u{2021}\u{2022}\u{2026}"; - assert_eq!(pdf_to_utf8(input), expected); - } - - #[test] - fn test_euro_sign() { - let input = &[0x80]; // € - let expected = "\u{20AC}"; - assert_eq!(pdf_to_utf8(input), expected); - } - - #[test] - fn test_nonbreaking_space() { - let input = &[0xFE]; - let expected = "\u{00A0}"; - assert_eq!(pdf_to_utf8(input), expected); - } - - #[test] - fn test_line_separator() { - let input = &[0xFF]; - let expected = "\u{2028}"; - assert_eq!(pdf_to_utf8(input), expected); - } - - #[test] - fn test_unknown_byte_gives_replacement_char() { - let input = &[0x90]; // Undefined in PDFDocEncoding - let expected = "\u{FFFD}"; // Replacement character - assert_eq!(pdf_to_utf8(input), expected); - } - - #[test] - fn test_mixed_ascii_and_pdfdoc_chars() { - let input = &[b'H', b'i', b' ', 0xA9, b' ', 0x80]; // "Hi © €" - let expected = "Hi \u{00A9} \u{20AC}"; - assert_eq!(pdf_to_utf8(input), expected); - } - - #[test] - fn test_parse_dictionary_with_pdfdoc_encoding_characters() { - let input = b"<< - /Title (Rust Programming \xA9 2025) - /Note (\x80 price - valid until \xD3) - /Dash (\xAD\xAF) - /Fancy (\xD0\xD1\xD2) - /SpaceTest (\xFE\xFF) >>"; - - let result = parse_pdf_dictionary(input).unwrap(); - - assert_eq!( - result.get("Title"), - Some(&PdfValue::String( - "Rust Programming \u{00A9} 2025".to_string() - )) - ); - - assert_eq!( - result.get("Note"), - Some(&PdfValue::String( - "\u{20AC} price - valid until \u{2026}".to_string() - )) - ); - - assert_eq!( - result.get("Dash"), - Some(&PdfValue::String("\u{2013}\u{2014}".to_string())) - ); - - assert_eq!( - result.get("Fancy"), - Some(&PdfValue::String("\u{2020}\u{2021}\u{2022}".to_string())) - ); - - assert_eq!( - result.get("SpaceTest"), - Some(&PdfValue::String("\u{00A0}\u{2028}".to_string())) - ); - } -} From 18701bc9d0d187d23fe3b769102e107d224b2b27 Mon Sep 17 00:00:00 2001 From: Kristof Date: Mon, 2 Mar 2026 20:00:22 +0100 Subject: [PATCH 5/8] fix formatting --- container/src/container_processor.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/container/src/container_processor.rs b/container/src/container_processor.rs index 4246931..086bb36 100644 --- a/container/src/container_processor.rs +++ 
b/container/src/container_processor.rs @@ -581,9 +581,10 @@ impl ProcessBuffer for PreflateContainerProcessor { ChunkParseState::DeflateContinue(state) => { // here we have a deflate stream that we need to continue match state.decompress(&self.content) { - Err(ref e) if e.exit_code() == ExitCode::ShortRead - && !input_complete - && self.content.len() <= self.config.max_chunk_size => + Err(ref e) + if e.exit_code() == ExitCode::ShortRead + && !input_complete + && self.content.len() <= self.config.max_chunk_size => { // Not enough data to complete the next block yet; wait for more. break; @@ -1620,9 +1621,7 @@ pub(crate) mod test { "PDF with embedded JPEGs should produce at least one JPEG_LEPTON block" ); - let has_deflate = blocks - .iter() - .any(|&(_, t)| t == BLOCK_TYPE_DEFLATE); + let has_deflate = blocks.iter().any(|&(_, t)| t == BLOCK_TYPE_DEFLATE); assert!( has_deflate, "PDF with embedded JPEGs should also produce DEFLATE blocks for compressed objects" From 7800a3f1e882db3b68de025ce39819ed1a22971d Mon Sep 17 00:00:00 2001 From: Kristof Date: Mon, 2 Mar 2026 20:58:44 +0100 Subject: [PATCH 6/8] update claude.md --- CLAUDE.md | 12 +++++----- container/CLAUDE.md | 57 ++++++++++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 89acbcb..87e889d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -38,7 +38,7 @@ The release build uses Spectre mitigations (`/Qspectre /sdl`) and produces `pref | Crate | Output | Role | |---|---|---| | `preflate/` | library | Core DEFLATE analysis and reconstruction | -| `container/` | library | Scans binary files (ZIP, PNG, PDF) for DEFLATE streams | +| `container/` | library | Scans binary files (ZIP, PNG, JPEG) for DEFLATE streams | | `util/` | `preflate_util.exe` | CLI for testing on files/directories | | `dll/` | `preflate_rs_0_7.dll` | C FFI wrapper for .NET interop | | `fuzz/` | fuzz harnesses | libfuzzer targets | @@ -58,13 +58,13 @@ Parameters are serialized via `bitcode`; corrections via CABAC. The format is ch ### container crate -- **`scan_deflate.rs`** — Scans raw bytes to locate DEFLATE stream boundaries, identifying stream type (raw deflate, zlib-wrapped, PNG IDAT, etc.). +- **`scan_deflate.rs`** — Scans raw bytes to locate DEFLATE stream boundaries, identifying stream type (raw deflate, zlib-wrapped, PNG IDAT, ZIP, JPEG, etc.). - **`idat_parse.rs`** — Extracts and reassembles PNG IDAT chunks. -- **`pdf_parse.rs`** — Detects PDF compressed object streams. -- **`zstd_compression.rs`** — Pipelines preflate output through Zstd for final storage. -- **`container_processor.rs`** — Orchestrates scanning → preflate → Zstd (compress) and Zstd → recreate → reassembly (decompress). +- **`container_processor.rs`** — Orchestrates scanning → preflate → Zstd (compress) and Zstd → recreate → reassembly (decompress). Zstd encode/decode is handled inline using a single persistent encoder. +- **`utils.rs`** — `process_limited_buffer()` and test helpers. +- **`scoped_read.rs`** — Bounded reader adapter. -The optional `webp` feature (enabled by default) allows PNG images to be stored as WEBP instead of losslessly. +The optional `webp` feature (enabled by default) allows PNG images to be stored as WebP instead of losslessly. PDF streams are not scanned (pdf_parse was removed). 
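As a sketch of how a caller drives this streaming pipeline — feeding input in slices and finishing with a single `input_complete = true` call, the way the crate's own incremental tests do — assuming the types below are re-exported at the crate root (chunk size and compression level are illustrative):

```rust
use preflate_container::{PreflateContainerConfig, PreflateContainerProcessor, ProcessBuffer};

// Sketch: compress `input` by handing it to the processor in fixed-size slices.
fn compress_in_chunks(input: &[u8], chunk_size: usize) -> Vec<u8> {
    let mut processor =
        PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 9, false);
    let mut container = Vec::new();

    for piece in input.chunks(chunk_size) {
        // More input may still follow, so input_complete stays false.
        processor.process_buffer(piece, false, &mut container).unwrap();
    }
    // A final call with an empty slice and input_complete = true lets the
    // processor flush any buffered data and close out the container.
    processor.process_buffer(&[], true, &mut container).unwrap();

    container
}
```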
### Code constraints diff --git a/container/CLAUDE.md b/container/CLAUDE.md index 5544e3e..9676860 100644 --- a/container/CLAUDE.md +++ b/container/CLAUDE.md @@ -1,6 +1,6 @@ # container (preflate-container) -Scans binary files (ZIP, PNG, PDF, JPEG) for DEFLATE streams, orchestrates the +Scans binary files (ZIP, PNG, JPEG) for DEFLATE streams, orchestrates the preflate + Zstd pipeline, and reassembles the output. Only format version 2 exists (v1 was removed). @@ -8,25 +8,33 @@ preflate + Zstd pipeline, and reassembles the output. Only format version 2 exis ```rust // Compress a file/buffer containing embedded DEFLATE streams -PreflateContainerProcessor::new(config, level, test_baseline) -> Self -impl ProcessBuffer for PreflateContainerProcessor { - fn process_buffer(&mut self, input: &[u8], input_complete: bool) -> Result>; -} +PreflateContainerProcessor::new(config: &PreflateContainerConfig, level: i32, test_baseline: bool) -> Self +impl ProcessBuffer for PreflateContainerProcessor { ... } // Decompress a preflate container back to the original file -RecreateContainerProcessor::new(capacity) -> Self -impl ProcessBuffer for RecreateContainerProcessor { - fn process_buffer(&mut self, input: &[u8], input_complete: bool) -> Result>; +RecreateContainerProcessor::new(capacity: usize) -> Self +impl ProcessBuffer for RecreateContainerProcessor { ... } + +// Core trait — both processors implement this +pub trait ProcessBuffer { + fn process_buffer(&mut self, input: &[u8], input_complete: bool, writer: &mut impl Write) -> Result<()>; + fn stats(&self) -> PreflateStats { PreflateStats::default() } // default no-op; overridden by Compress + fn copy_to_end(&mut self, input: &mut impl BufRead, output: &mut impl Write) -> Result<()>; + fn copy_to_end_size(&mut self, input: &mut impl BufRead, output: &mut impl Write, chunk: usize) -> Result<()>; } -// DLL helper: wraps process_buffer to respect a max output size per call -fn process_limited_buffer(processor, input, input_complete, max_output) -> Result<(Vec, bool)>; - -// Stats after compression -PreflateContainerProcessor::get_stats() -> &PreflateStats +// DLL helper: writes to a fixed output buffer, spills overflow into a VecDeque +fn process_limited_buffer( + process: &mut impl ProcessBuffer, + input: &[u8], + input_complete: bool, + output_buffer: &mut [u8], + output_extra: &mut VecDeque, +) -> Result<(bool, usize)>; // (all_output_drained, bytes_written_to_output_buffer) ``` -`PreflateContainerConfig` holds knobs like `max_chain_length`, `verify_compression`, etc. +`PreflateContainerConfig` holds knobs: `min_chunk_size`, `max_chunk_size`, +`total_plain_text_limit`, `chunk_plain_text_limit`, `validate_compression`, `max_chain_length`. 
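The drain loop below is a sketch of how a fixed-output-buffer caller (such as the DLL layer) might use `process_limited_buffer`, assuming it is exported at the crate root; the retry-with-empty-input protocol is an assumption inferred from the `(all_output_drained, bytes_written)` return value, not something this patch states:

```rust
use std::collections::VecDeque;

use preflate_container::{ProcessBuffer, process_limited_buffer};

// Sketch only: push one input slice through `processor`, emitting output in
// pieces no larger than `out_buf`. Assumes that when `all_output_drained` is
// false, calling again with an empty input slice hands back the bytes that
// were spilled into `overflow` on the previous call.
fn pump(
    processor: &mut impl ProcessBuffer,
    input: &[u8],
    input_complete: bool,
    mut emit: impl FnMut(&[u8]),
) {
    let mut out_buf = [0u8; 4096];
    let mut overflow: VecDeque<u8> = VecDeque::new();

    let (mut drained, mut written) =
        process_limited_buffer(processor, input, input_complete, &mut out_buf, &mut overflow)
            .unwrap();
    emit(&out_buf[..written]);

    while !drained {
        (drained, written) =
            process_limited_buffer(processor, &[], input_complete, &mut out_buf, &mut overflow)
                .unwrap();
        emit(&out_buf[..written]);
    }
}
```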
## Wire Format (v2 only) @@ -161,23 +169,23 @@ if input_complete && !self.input_complete { // NOT just `if input_complete` src/ lib.rs ← public types and re-exports container_processor.rs ← PreflateContainerProcessor, RecreateContainerProcessor, - V2BlockInfo enum, process_v2_compressed_block() + ProcessBuffer trait, MeasureWriteSink, + block-type constants, emit_compressed_block(), + write_chunk_block_v2(), write_varint(), read_varint() scan_deflate.rs ← locates DEFLATE stream boundaries in raw bytes - identifies: raw deflate, zlib-wrapped, PNG IDAT, ZIP, JPEG, PDF + identifies: raw deflate, zlib-wrapped, PNG IDAT, ZIP, JPEG idat_parse.rs ← extracts / reassembles PNG IDAT chunks; parses IHDR - pdf_parse.rs ← detects PDF FlateDecode compressed object streams - zstd_compression.rs ← ZstdCompressContext / ZstdDecompressContext (internal) scoped_read.rs ← bounded reader adapter - utils.rs ← process_limited_buffer() and other helpers + utils.rs ← process_limited_buffer(), TakeReader, test helpers ``` ## Key Internal Types | Type | Purpose | |---|---| -| `V2BlockInfo` | `Compressed(u8)` or `Jpeg(Vec)` — one entry per scanned block | -| `MeasureWriteSink` | `pub(crate)` sink that counts bytes; used for baseline measurement | -| `PreflateStats` | `deflate_compressed_size`, `zstd_compressed_size`, `zstd_baseline_size`, … | +| `MeasureWriteSink` | `pub(crate)` sink that counts bytes; used for baseline Zstd measurement | +| `PreflateStats` | pub struct: `deflate_compressed_size`, `zstd_compressed_size`, `uncompressed_size`, `overhead_bytes`, `hash_algorithm`, `zstd_baseline_size` | +| `TakeReader` | `pub` BufRead wrapper that reads at most N bytes (used in utils.rs) | ## Features @@ -186,9 +194,10 @@ src/ ## Dependencies of Note -- `lepton_jpeg` (0.5.1) — JPEG blocks are recompressed with Lepton, not Zstd. -- `zstd` (0.13) — single persistent encoder across all non-JPEG blocks. +- `lepton_jpeg` (0.5.1) — JPEG blocks are recompressed with Lepton, bypassing Zstd entirely. +- `zstd` (0.13) — single persistent encoder across all non-JPEG/WebP blocks. - `preflate-rs` — core analysis/reconstruction (path dependency). +- `webp` (0.3, optional, default-enabled) — PNG images can be stored as WebP lossless. 
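For reference, a short sketch of reading `PreflateStats` after a compression pass; the field names come from the table above, and the assumption is that `zstd_baseline_size` is only populated when the processor was constructed with `test_baseline = true` (mirroring the CLI's `--baseline` option):

```rust
use preflate_container::{PreflateContainerProcessor, ProcessBuffer};

// Sketch: after a compression pass has finished, report the same figures
// preflate_util prints per file.
fn print_stats(processor: &PreflateContainerProcessor) {
    let stats = processor.stats();
    println!(
        "deflate in: {}  plain text: {}  zstd out: {}  zstd baseline: {}",
        stats.deflate_compressed_size,
        stats.uncompressed_size,
        stats.zstd_compressed_size,
        stats.zstd_baseline_size,
    );
}
```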
## Constraints From cbdd8b6a503adabce8a37c0a5f00493e85a1f2a9 Mon Sep 17 00:00:00 2001 From: Kristof Date: Wed, 4 Mar 2026 15:26:14 +0100 Subject: [PATCH 7/8] refactored out container reader / writer --- Cargo.lock | 10 +- Cargo.toml | 16 +- container/CLAUDE.md | 4 +- container/Cargo.toml | 14 +- container/src/container_common.rs | 1085 ++++++++++++++ container/src/container_processor.rs | 2081 -------------------------- container/src/container_read.rs | 535 +++++++ container/src/container_write.rs | 534 +++++++ container/src/idat_parse.rs | 2 +- container/src/lib.rs | 11 +- container/src/utils.rs | 40 +- dll/Cargo.toml | 10 +- preflate/Cargo.toml | 14 +- util/Cargo.toml | 9 +- 14 files changed, 2205 insertions(+), 2160 deletions(-) create mode 100644 container/src/container_common.rs delete mode 100644 container/src/container_processor.rs create mode 100644 container/src/container_read.rs create mode 100644 container/src/container_write.rs diff --git a/Cargo.lock b/Cargo.lock index 269ac3b..900124c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -551,7 +551,7 @@ dependencies = [ [[package]] name = "preflate-container" -version = "0.7.5" +version = "0.7.6" dependencies = [ "adler32", "byteorder", @@ -567,7 +567,7 @@ dependencies = [ [[package]] name = "preflate-rs" -version = "0.7.5" +version = "0.7.6" dependencies = [ "bitcode", "byteorder", @@ -589,7 +589,7 @@ dependencies = [ [[package]] name = "preflate-rs-root" -version = "0.0.0" +version = "0.7.6" dependencies = [ "libdeflate-sys", "libz-ng-sys", @@ -601,7 +601,7 @@ dependencies = [ [[package]] name = "preflate_rs_0_7" -version = "0.7.5" +version = "0.7.6" dependencies = [ "preflate-container", "preflate-rs", @@ -609,7 +609,7 @@ dependencies = [ [[package]] name = "preflate_util" -version = "0.0.0" +version = "0.7.6" dependencies = [ "clap", "cpu-time", diff --git a/Cargo.toml b/Cargo.toml index 0196a28..8b4cacc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,11 @@ # root project only exists to refer to the packages -# and run the end-to-end tests in the tests directory +# and run the end-to-end tests in the tests directory [package] name = "preflate-rs-root" -version = "0.0.0" -edition = "2024" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +rust-version.workspace = true [profile.release] debug = true @@ -14,6 +14,14 @@ debug = true members = ["preflate", "container", "dll", "util", "fuzz"] resolver = "2" +[workspace.package] +version = "0.7.6" +edition = "2024" +authors = ["Kristof Roomp "] +license = "Apache-2.0" +rust-version = "1.85" +repository = "https://github.com/microsoft/preflate-rs" + [dev-dependencies] preflate-rs = { path = "preflate" } preflate-container = { path = "container" } diff --git a/container/CLAUDE.md b/container/CLAUDE.md index 9676860..a752e85 100644 --- a/container/CLAUDE.md +++ b/container/CLAUDE.md @@ -80,7 +80,6 @@ Mask constants (defined in `container_processor.rs`): | `BLOCK_TYPE_DEFLATE_CONTINUE` | `0x03` | `0x43` | Continuation of a DEFLATE stream that spanned a chunk boundary | | `BLOCK_TYPE_JPEG_LEPTON` | `0x04` | `0x04` | JPEG re-compressed with Lepton; bypasses Zstd entirely | | `BLOCK_TYPE_WEBP` | `0x05` | `0x05` | PNG image stored as WebP lossless; bypasses Zstd entirely | -| `BLOCK_TYPE_EOS` | `0x3F` | `0x7F` | The `encoder.finish()` bytes that close the Zstd stream | ### Zstd encoder/decoder lifecycle @@ -92,8 +91,7 @@ Mask constants (defined in `container_processor.rs`): - Each flush segment is decodable in sequence: the decoder is a persistent 
`zstd::stream::raw::Decoder` that maintains cross-block history, so compression quality benefits from all previously seen blocks. -- The `BLOCK_TYPE_EOS` (`0x7F`) end-of-stream block carries the `encoder.finish()` output - that closes the Zstd frame cleanly. No decompressed bytes are expected from it. +- The stream is terminated by EOF — there is no explicit end-of-stream block. ### Inner payload layout (inside Zstd, after decompression) diff --git a/container/Cargo.toml b/container/Cargo.toml index ef285df..11826a7 100644 --- a/container/Cargo.toml +++ b/container/Cargo.toml @@ -1,15 +1,15 @@ [package] name = "preflate-container" -version = "0.7.5" -edition = "2024" -authors = ["Kristof Roomp "] -license = "Apache-2.0" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +rust-version.workspace = true +repository.workspace = true description = """ Scans binary files for zStd streams and uses Preflate-rs to decompress the stream and repack with -zStd compression. For PNG files, we use WEBP compression for RGB and RGBA to get better results. +zStd compression. For PNG files, we use WEBP compression for RGB and RGBA to get better results. """ -repository = "https://github.com/microsoft/preflate-rs" categories = ["compression"] keywords = ["gzip", "deflate", "zlib", "zip"] diff --git a/container/src/container_common.rs b/container/src/container_common.rs new file mode 100644 index 0000000..b41e771 --- /dev/null +++ b/container/src/container_common.rs @@ -0,0 +1,1085 @@ +use std::io::{BufRead, Read, Write}; + +use preflate_rs::{AddContext, HashAlgorithm, PreflateConfig, Result}; + +/// Configuration for the deflate process +#[derive(Debug, Clone)] +pub struct PreflateContainerConfig { + /// As we scan for deflate streams, we need to have a minimum memory + /// chunk to process. We scan this chunk for deflate streams and at least + /// deflate one block has to fit into a chunk for us to recognize it. + pub min_chunk_size: usize, + + /// The maximum size of a deflate or PNG compressed block we will consider. If + /// a deflate stream is larger than this, we will not decompress it and + /// just write it out as a literal block. + pub max_chunk_size: usize, + + /// The maximum overall size of plain text that we will compress. This is + /// global to the entire container and limits the amount of processing that + /// we will do to avoid running out of CPU time on a single file. Once we + /// hit this limit, we will stop looking for deflate streams and just write + /// out the rest of the data as literal blocks. + pub total_plain_text_limit: u64, + + /// The maximum size of a plain text chunk that we will decompress at a time. This limits + /// the memory usage of the decompression process. + pub chunk_plain_text_limit: usize, + + /// true if we should verify that the decompressed data can be recompressed to the same bytes. + /// This is important since there may be corner cases where the data may not yield the same bytes. + /// + /// If this is false, we will not verify the decompressed data and just write it out as is and it is + /// up to the caller to make sure the data is valid. In no case should you just assume that you + /// can get the same data back without verifying it. + pub validate_compression: bool, + + /// Maximum number of lookups we will do in the hash chain. 
This will limit the CPU time we spend + /// on deflate stream processing but also means that we won't be able to recompress deflate streams + /// that were compressed with a larger chain length (eg level 9 has 4096). + pub max_chain_length: u32, +} + +impl Default for PreflateContainerConfig { + fn default() -> Self { + PreflateContainerConfig { + min_chunk_size: 1024 * 1024, + max_chunk_size: 64 * 1024 * 1024, + total_plain_text_limit: 512 * 1024 * 1024, + chunk_plain_text_limit: 128 * 1024 * 1024, + max_chain_length: 4096, + validate_compression: true, + } + } +} + +impl PreflateContainerConfig { + pub fn preflate_config(&self) -> PreflateConfig { + PreflateConfig { + max_chain_length: self.max_chain_length, + plain_text_limit: self.chunk_plain_text_limit, + verify_compression: self.validate_compression, + } + } +} + +pub(crate) const COMPRESSED_WRAPPER_VERSION_2: u8 = 2; + +// Bit-field masks for the block type byte +// Bits 7-6: compression algorithm Bits 5-0: block content kind +pub(crate) const BLOCK_COMPRESSION_MASK: u8 = 0xC0; +pub(crate) const BLOCK_TYPE_MASK: u8 = 0x3F; + +// Compression algorithms (top 2 bits) +pub(crate) const BLOCK_COMPRESSION_NONE: u8 = 0x00; +pub(crate) const BLOCK_COMPRESSION_ZSTD: u8 = 0x40; + +// Block content kinds (bottom 6 bits) +pub(crate) const BLOCK_TYPE_LITERAL: u8 = 0x00; +pub(crate) const BLOCK_TYPE_DEFLATE: u8 = 0x01; +pub(crate) const BLOCK_TYPE_PNG: u8 = 0x02; +pub(crate) const BLOCK_TYPE_DEFLATE_CONTINUE: u8 = 0x03; +pub(crate) const BLOCK_TYPE_JPEG_LEPTON: u8 = 0x04; +pub(crate) const BLOCK_TYPE_WEBP: u8 = 0x05; + +pub(crate) fn write_varint(destination: &mut impl Write, value: u32) -> std::io::Result<()> { + let mut value = value; + loop { + let mut byte = (value & 0x7F) as u8; + value >>= 7; + if value != 0 { + byte |= 0x80; + } + destination.write_all(&[byte])?; + if value == 0 { + break; + } + } + + Ok(()) +} + +pub(crate) fn read_varint(source: &mut impl Read) -> std::io::Result { + let mut result = 0; + let mut shift = 0; + loop { + let mut byte = [0u8; 1]; + source.read_exact(&mut byte)?; + let byte = byte[0]; + result |= ((byte & 0x7F) as u32) << shift; + shift += 7; + if byte & 0x80 == 0 { + break; + } + } + Ok(result) +} + +#[test] +fn test_variant_roundtrip() { + let values = [ + 0, 1, 127, 128, 255, 256, 16383, 16384, 2097151, 2097152, 268435455, 268435456, 4294967295, + ]; + + let mut buffer = Vec::new(); + for &v in values.iter() { + write_varint(&mut buffer, v).unwrap(); + } + + let mut buffer = &buffer[..]; + + for &v in values.iter() { + assert_eq!(v, read_varint(&mut buffer).unwrap()); + } +} + +/// Statistics about the preflate process +#[derive(Debug, Copy, Clone, Default)] +pub struct PreflateStats { + pub deflate_compressed_size: u64, + pub zstd_compressed_size: u64, + pub uncompressed_size: u64, + pub overhead_bytes: u64, + pub hash_algorithm: HashAlgorithm, + pub zstd_baseline_size: u64, +} + +/// Processes an input buffer and writes the output to a writer +pub trait ProcessBuffer { + fn process_buffer( + &mut self, + input: &[u8], + input_complete: bool, + writer: &mut impl Write, + ) -> Result<()>; + + #[cfg(test)] + fn process_vec(&mut self, input: &[u8]) -> Result> { + let mut writer = Vec::new(); + + self.copy_to_end(&mut std::io::Cursor::new(&input), &mut writer) + .context()?; + + Ok(writer) + } + + #[cfg(test)] + fn process_vec_size(&mut self, input: &[u8], read_chunk_size: usize) -> Result> { + let mut writer = Vec::new(); + + self.copy_to_end_size( + &mut std::io::Cursor::new(&input), + &mut writer, + 
read_chunk_size, + ) + .context()?; + + Ok(writer) + } + + /// Reads everything from input and writes it to the output. + /// Wraps calls to process buffer + fn copy_to_end(&mut self, input: &mut impl BufRead, output: &mut impl Write) -> Result<()> { + self.copy_to_end_size(input, output, 1024 * 1024) + } + + /// Reads everything from input and writes it to the output. + /// Wraps calls to process buffer + fn copy_to_end_size( + &mut self, + input: &mut impl BufRead, + output: &mut impl Write, + read_chunk_size: usize, + ) -> Result<()> { + let mut input_complete = false; + loop { + let buffer: &[u8]; + if input_complete { + buffer = &[]; + } else { + buffer = input.fill_buf().context()?; + if buffer.len() == 0 { + input_complete = true + } + }; + + if input_complete { + self.process_buffer(&[], true, output).context()?; + break; + } else { + // process buffer a piece at a time to avoid overflowing memory + let mut amount_read = 0; + while amount_read < buffer.len() { + let chunk_size = (buffer.len() - amount_read).min(read_chunk_size); + + self.process_buffer( + &buffer[amount_read..amount_read + chunk_size], + false, + output, + ) + .context()?; + + amount_read += chunk_size; + } + + let buflen = buffer.len(); + input.consume(buflen); + } + } + + Ok(()) + } + + fn stats(&self) -> PreflateStats { + PreflateStats::default() + } +} + +#[cfg(test)] +pub(crate) mod test { + use std::io::Write; + + use preflate_rs::{AddContext, Result}; + + use crate::container_common::{ + BLOCK_COMPRESSION_MASK, BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_DEFLATE, + BLOCK_TYPE_DEFLATE_CONTINUE, BLOCK_TYPE_JPEG_LEPTON, BLOCK_TYPE_LITERAL, BLOCK_TYPE_MASK, + BLOCK_TYPE_PNG, BLOCK_TYPE_WEBP, COMPRESSED_WRAPPER_VERSION_2, PreflateContainerConfig, + ProcessBuffer, read_varint, + }; + use crate::container_read::RecreateContainerProcessor; + use crate::container_write::PreflateContainerProcessor; + + pub struct NopProcessBuffer {} + + impl ProcessBuffer for NopProcessBuffer { + fn process_buffer( + &mut self, + input: &[u8], + _input_complete: bool, + writer: &mut impl Write, + ) -> Result<()> { + writer.write_all(input).context()?; + + Ok(()) + } + } + + fn roundtrip_deflate_chunks(filename: &str) { + use crate::utils::assert_eq_array; + + let f = crate::utils::read_file(filename); + + println!("Processing file: {}", filename); + + let mut expanded = Vec::new(); + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + ctx.copy_to_end(&mut std::io::Cursor::new(&f), &mut expanded) + .unwrap(); + + println!("Recreating file: {}", filename); + + let mut destination = Vec::new(); + let mut ctx = RecreateContainerProcessor::new(usize::MAX); + ctx.copy_to_end(&mut std::io::Cursor::new(expanded), &mut destination) + .unwrap(); + + assert_eq_array(&destination, &f); + } + + #[test] + fn roundtrip_skip_length_crash() { + roundtrip_deflate_chunks("skiplengthcrash.bin"); + } + + #[test] + fn roundtrip_png_chunks() { + roundtrip_deflate_chunks("treegdi.png"); + } + + #[test] + fn roundtrip_zip_chunks() { + roundtrip_deflate_chunks("samplezip.zip"); + } + + #[test] + fn roundtrip_gz_chunks() { + roundtrip_deflate_chunks("sample1.bin.gz"); + } + + #[test] + fn roundtrip_png_chunks2() { + roundtrip_deflate_chunks("starcontrol.samplesave"); + } + + #[test] + fn roundtrip_small_chunk() { + use crate::utils::{assert_eq_array, read_file}; + + let original = read_file("pptxplaintext.zip"); + + let mut context = PreflateContainerProcessor::new( + &PreflateContainerConfig { + 
min_chunk_size: 100000, + max_chunk_size: 100000, + total_plain_text_limit: u64::MAX, + ..Default::default() + }, + 1, + false, + ); + + let compressed = context.process_vec_size(&original, 20001).unwrap(); + + let mut context = RecreateContainerProcessor::new(usize::MAX); + let recreated = context.process_vec_size(&compressed, 20001).unwrap(); + + assert_eq_array(&original, &recreated); + } + + #[test] + fn roundtrip_small_plain_text() { + use crate::utils::{assert_eq_array, read_file}; + + let original = read_file("pptxplaintext.zip"); + + let mut context = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 100000, + max_chunk_size: 100000, + total_plain_text_limit: u64::MAX, + ..Default::default() + }, + 1, + false, + ); + + let compressed = context.process_vec_size(&original, 2001).unwrap(); + + let mut context = RecreateContainerProcessor::new(usize::MAX); + let recreated = context.process_vec_size(&compressed, 2001).unwrap(); + + assert_eq_array(&original, &recreated); + } + + #[test] + fn roundtrip_zstd_per_block() { + use crate::utils::{assert_eq_array, read_file}; + + let original = read_file("samplezip.zip"); + + let mut context = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + + let compressed = context.process_vec(&original).unwrap(); + + let mut context = RecreateContainerProcessor::new(usize::MAX); + let recreated = context.process_vec(&compressed).unwrap(); + + assert_eq_array(&original, &recreated); + } + + // ── Block type bit-field tests ─────────────────────────────────────────────── + + /// Parse the outer framing of a v2 container and return each block's + /// (compression_bits, block_type_bits) in order, stopping at the 0xFF CRC end block. + fn parse_wire_block_types(data: &[u8]) -> Vec<(u8, u8)> { + use byteorder::ReadBytesExt; + let mut cursor = std::io::Cursor::new(data); + let version = cursor.read_u8().unwrap(); + assert_eq!(version, COMPRESSED_WRAPPER_VERSION_2); + let mut blocks = Vec::new(); + while (cursor.position() as usize) < data.len() { + let type_byte = cursor.read_u8().unwrap(); + if type_byte == 0xFF { + break; // CRC end block; 4 raw bytes follow but we stop here + } + let compression = type_byte & BLOCK_COMPRESSION_MASK; + let block_type = type_byte & BLOCK_TYPE_MASK; + blocks.push((compression, block_type)); + let size = read_varint(&mut cursor).unwrap() as u64; + cursor.set_position(cursor.position() + size); + } + blocks + } + + /// Feed `stream` to the decoder with input_complete=true and assert the + /// error exit code matches `expected`. + fn assert_decoder_fails(stream: &[u8], expected: preflate_rs::ExitCode) { + let mut ctx = RecreateContainerProcessor::new(usize::MAX); + let mut out = Vec::new(); + let err = ctx + .process_buffer(stream, true, &mut out) + .expect_err("expected an error, but decoder returned Ok"); + assert_eq!( + err.exit_code(), + expected, + "wrong exit code for stream {stream:02X?}" + ); + } + + /// The two masks must partition the byte: non-overlapping and together covering all 8 bits. + /// Every content-kind constant must sit entirely within BLOCK_TYPE_MASK, and every + /// compression constant within BLOCK_COMPRESSION_MASK. 
+ #[test] + fn test_bit_field_masks_partition_byte() { + assert_eq!( + BLOCK_COMPRESSION_MASK | BLOCK_TYPE_MASK, + 0xFF, + "masks do not cover all bits" + ); + assert_eq!( + BLOCK_COMPRESSION_MASK & BLOCK_TYPE_MASK, + 0x00, + "masks overlap" + ); + for kind in [ + BLOCK_TYPE_LITERAL, + BLOCK_TYPE_DEFLATE, + BLOCK_TYPE_PNG, + BLOCK_TYPE_DEFLATE_CONTINUE, + BLOCK_TYPE_JPEG_LEPTON, + BLOCK_TYPE_WEBP, + ] { + assert_eq!( + kind & BLOCK_COMPRESSION_MASK, + 0, + "BLOCK_TYPE 0x{kind:02X} bleeds into compression bits" + ); + } + for comp in [BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD] { + assert_eq!( + comp & BLOCK_TYPE_MASK, + 0, + "BLOCK_COMPRESSION 0x{comp:02X} bleeds into type bits" + ); + } + } + + /// The combined (compression | kind) wire bytes must match the expected values + /// documented in CLAUDE.md. This catches accidental constant drift. + #[test] + fn test_combined_wire_values() { + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, 0x40); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE, 0x41); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_PNG, 0x42); + assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE_CONTINUE, 0x43); + assert_eq!(BLOCK_COMPRESSION_NONE | BLOCK_TYPE_JPEG_LEPTON, 0x04); + assert_eq!(BLOCK_COMPRESSION_NONE | BLOCK_TYPE_WEBP, 0x05); + } + + /// Reserved compression bits 0x80 (10xx_xxxx) must be rejected by the decoder. + #[test] + fn test_decoder_rejects_reserved_compression_bits_10() { + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, 0x80], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// Reserved compression bits 0xC0 (11xx_xxxx) must be rejected by the decoder. + #[test] + fn test_decoder_rejects_reserved_compression_bits_11() { + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, 0xC0], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// BLOCK_COMPRESSION_NONE | BLOCK_TYPE_LITERAL (0x00) must be rejected: + /// literal blocks are Zstd-only; there is no raw literal block type. + #[test] + fn test_decoder_rejects_raw_literal_block_type() { + let byte = BLOCK_COMPRESSION_NONE | BLOCK_TYPE_LITERAL; // == 0x00 + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, byte], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// Any BLOCK_COMPRESSION_NONE byte that is not JPEG_LEPTON or WEBP must be rejected. + #[test] + fn test_decoder_rejects_undefined_raw_block_types() { + // 0x10 is arbitrary: not 0x04 (JPEG) or 0x05 (WEBP) + let byte = BLOCK_COMPRESSION_NONE | 0x10; + assert_decoder_fails( + &[COMPRESSED_WRAPPER_VERSION_2, byte], + preflate_rs::ExitCode::InvalidCompressedWrapper, + ); + } + + /// Compressing plain bytes (no embedded DEFLATE streams) must produce a stream + /// whose first block carries BLOCK_COMPRESSION_ZSTD and BLOCK_TYPE_LITERAL. + #[test] + fn test_encoder_literal_block_carries_zstd_compression_bit() { + let input = vec![0xABu8; 512]; + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = ctx.process_vec(&input).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + assert!( + !blocks.is_empty(), + "expected at least one block in the output" + ); + assert_eq!( + blocks[0], + (BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_LITERAL), + "first block should be a Zstd literal block" + ); + } + + /// Every block type byte in a real compressed output must have compression bits + /// of either BLOCK_COMPRESSION_NONE or BLOCK_COMPRESSION_ZSTD — never the + /// reserved patterns 0x80 or 0xC0. 
+ #[test] + fn test_encoder_never_emits_reserved_compression_bits() { + let input = crate::utils::read_file("samplezip.zip"); + let mut ctx = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = ctx.process_vec(&input).unwrap(); + + for &(compression, _) in &parse_wire_block_types(&compressed) { + assert!( + compression == BLOCK_COMPRESSION_NONE || compression == BLOCK_COMPRESSION_ZSTD, + "found reserved compression bits 0x{compression:02X} in output" + ); + } + } + + /// Verify that the decoder extracts the lower 6 bits as block_type rather + /// than passing the full byte to process_compressed_block. If it passed the + /// full byte (0x41) instead of the kind bits (0x01), the match would fall + /// through to the error arm and the round-trip would fail. + #[test] + fn test_decoder_strips_compression_bits_before_dispatch() { + use crate::utils::{assert_eq_array, read_file}; + // A zip file exercises DEFLATE blocks (wire type 0x41 = ZSTD|DEFLATE). + // A successful round-trip proves the decoder is matching on 0x01, not 0x41. + let original = read_file("samplezip.zip"); + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + // Confirm the stream actually contains DEFLATE blocks (type 0x41), + // so the test is meaningful and not trivially passing. + let has_deflate = parse_wire_block_types(&compressed) + .iter() + .any(|&(c, t)| c == BLOCK_COMPRESSION_ZSTD && t == BLOCK_TYPE_DEFLATE); + assert!( + has_deflate, + "test file produced no DEFLATE blocks — test is vacuous" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// A PNG file must produce at least one PNG or WebP IDAT block (not merely a DEFLATE + /// block), and must round-trip to the original bytes. The PNG code path in the encoder + /// is distinct from the plain DEFLATE path: it reconstructs IDAT framing and, when the + /// `webp` feature is enabled, may store pixels as WebP lossless instead of raw. + #[test] + fn test_png_produces_idat_block_and_roundtrips() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("treegdi.png"); + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + let has_png_block = blocks + .iter() + .any(|&(_, t)| t == BLOCK_TYPE_PNG || t == BLOCK_TYPE_WEBP); + assert!( + has_png_block, + "PNG input should produce at least one PNG (0x02) or WebP (0x05) block, \ + got: {blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// A PDF containing embedded JPEG images must produce JPEG_LEPTON blocks (raw, + /// outside Zstd) as well as DEFLATE blocks for the PDF's own compressed object + /// streams. Both must survive a full round-trip. 
+ #[test] + fn test_pdf_with_jpegs_produces_lepton_and_deflate_blocks_and_roundtrips() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("embedded-images.pdf"); + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + + let has_lepton = blocks + .iter() + .any(|&(c, t)| c == BLOCK_COMPRESSION_NONE && t == BLOCK_TYPE_JPEG_LEPTON); + assert!( + has_lepton, + "PDF with embedded JPEGs should produce at least one JPEG_LEPTON block" + ); + + let has_deflate = blocks.iter().any(|&(_, t)| t == BLOCK_TYPE_DEFLATE); + assert!( + has_deflate, + "PDF with embedded JPEGs should also produce DEFLATE blocks for compressed objects" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// DEFLATE_CONTINUE blocks are produced when the compressed-data buffer is + /// truncated mid-stream: `DeflateParser::parse` reads to EOF and returns + /// `Ok` with `is_done()=false`, the encoder emits a DEFLATE block for the + /// plaintext decoded so far, saves the mid-stream state, and resumes on + /// subsequent calls via DEFLATE_CONTINUE blocks. + /// + /// `sample1.bin.gz` is a single gzip stream with ~418 KiB of uncompressed + /// content. Feeding it in 10 KiB slices (with `min_chunk_size=5000` so the + /// processor starts immediately) means the scanner always sees only a + /// partial window of the compressed stream, forcing many DEFLATE_CONTINUE + /// blocks that must all round-trip correctly. + #[test] + fn test_deflate_continue_blocks_appear_and_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("sample1.bin.gz"); + // min_chunk_size: 0 so the loop processes data immediately after Start, + // letting Searching run with the first truncated chunk rather than waiting + // for an additional min_chunk_size bytes before beginning. + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + // Feed the 263 KiB file in two pieces. The first piece (200 KiB) truncates + // the DEFLATE stream mid-way; decompress() hits EOF with at least one + // complete block already parsed, so it returns Ok(partial) / is_done()=false, + // causing the encoder to emit a DEFLATE block and enter DeflateContinue. + // The second piece completes the stream → DEFLATE_CONTINUE block. 
+ let mut compressed = Vec::new(); + { + let chunk1 = &original[..200_000.min(original.len())]; + enc.process_buffer(chunk1, false, &mut compressed).unwrap(); + if original.len() > 200_000 { + let chunk2 = &original[200_000..]; + enc.process_buffer(chunk2, false, &mut compressed).unwrap(); + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + } + + let blocks = parse_wire_block_types(&compressed); + let n_continue = blocks + .iter() + .filter(|&&(_, t)| t == BLOCK_TYPE_DEFLATE_CONTINUE) + .count(); + assert!( + n_continue > 0, + "200 KiB chunks on a ~263 KiB gzip should force at least one DEFLATE_CONTINUE block; \ + blocks seen: {blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec_size(&compressed, 10_000).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// When `total_plain_text_limit` is exceeded the encoder stops analysing + /// deflate streams and writes the remaining bytes as LITERAL blocks. The + /// decoder must still reproduce the original bytes exactly, including the + /// unprocessed portion. + #[test] + fn test_total_plain_text_limit_forces_literal_fallback_and_roundtrips() { + use crate::utils::{assert_eq_array, read_file}; + // samplezip.zip has several DEFLATE entries; setting the limit to 1 byte + // ensures that after the first DEFLATE entry's plaintext is accumulated, + // every subsequent scan sees total_plain_text_seen > limit and falls back + // to writing remaining content as a single LITERAL block. + let original = read_file("samplezip.zip"); + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + total_plain_text_limit: 1, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let compressed = enc.process_vec(&original).unwrap(); + + let blocks = parse_wire_block_types(&compressed); + + // At least one LITERAL block must appear (the fallback content). + let has_literal = blocks.iter().any(|&(_, t)| t == BLOCK_TYPE_LITERAL); + assert!( + has_literal, + "after total_plain_text_limit is exceeded, remaining content must be LITERAL" + ); + + // The stream must still decode back to the original bytes. + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + // ── Multi-scheme fixture tests ─────────────────────────────────────────────── + + /// Helper: compress `data` in one shot and return `(compressed, blocks)`. + fn compress_default(data: &[u8]) -> (Vec, Vec<(u8, u8)>) { + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(data).unwrap(); + let blocks = parse_wire_block_types(&compressed); + (compressed, blocks) + } + + /// Helper: full roundtrip assertion — compress then decompress, check byte equality. + fn assert_roundtrip(original: &[u8]) { + let (compressed, _) = compress_default(original); + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(original, &recreated); + } + + /// Count how many blocks have a given block-type kind. + fn count_block_type(blocks: &[(u8, u8)], kind: u8) -> usize { + blocks.iter().filter(|&&(_, t)| t == kind).count() + } + + /// Two concatenated gzip streams — each contains plaintext well above + /// MIN_BLOCKSIZE=1024, so the scanner must emit exactly two DEFLATE blocks. 
+ /// + /// Fixture: `test_two_gzip_streams.bin` + /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_two_gzip_streams_produce_two_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_two_gzip_streams.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 2, + "two consecutive gzip streams should each produce one DEFLATE block; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A gzip stream whose plaintext is below MIN_BLOCKSIZE (500 < 1024) must NOT + /// be promoted to a DEFLATE block — the whole file becomes a single literal chunk. + /// + /// Fixture: `test_tiny_gzip.bin` + /// Expected wire sequence: literal, EOS (no DEFLATE blocks) + #[test] + fn test_tiny_gzip_below_min_blocksize_becomes_literal_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_tiny_gzip.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 0, + "gzip with 500-byte plaintext ( MIN_BLOCKSIZE) immediately followed by + /// a tiny gzip (plaintext < MIN_BLOCKSIZE). Only the large stream must become a + /// DEFLATE block; the small one stays literal. + /// + /// Fixture: `test_big_then_small_gzip.bin` + /// Expected wire sequence: literal, deflate, literal, EOS (exactly 1 DEFLATE block) + #[test] + fn test_big_gzip_deflate_small_gzip_literal_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_big_then_small_gzip.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 1, + "only the large gzip stream should become a DEFLATE block; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A file with a valid gzip header but a deliberately corrupted DEFLATE body + /// (0xFF leading byte) must not crash. The scanner must gracefully abandon the + /// stream and encode the entire file as a literal block. + /// + /// Fixture: `test_corrupted_deflate.bin` + /// Expected wire sequence: literal, EOS (0 DEFLATE blocks) + #[test] + fn test_corrupted_deflate_body_falls_back_to_literal_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_corrupted_deflate.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 0, + "corrupted DEFLATE body must not produce a DEFLATE block; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A file containing padding bytes, then two zlib streams (each with plaintext + /// > MIN_BLOCKSIZE), then more padding. The scanner must find both zlib headers + /// and emit exactly two DEFLATE blocks. 
+ /// + /// Fixture: `test_two_zlib_streams.bin` + /// layout: 100 × `\xDE\xAD` | zlib(EEEE×6000) | 100 × `\xDE\xAD` | zlib(FFFF×6000) | 100 × `\xDE\xAD` + /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_two_zlib_streams_produce_two_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_two_zlib_streams.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 2, + "two zlib streams surrounded by literal bytes should each produce a DEFLATE block; \ + blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A ZIP file containing three DEFLATE-compressed entries must produce exactly three + /// DEFLATE blocks — one per entry — and round-trip correctly. + /// + /// Fixture: `test_zip_3entries.zip` (entries G×20000, H×20000, I×20000 bytes) + /// Expected wire sequence: literal, deflate, literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_zip_three_deflated_entries_produce_three_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_zip_3entries.zip"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 3, + "ZIP with 3 DEFLATED entries should produce 3 DEFLATE blocks; blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A ZIP file with a STORED entry (method=0) followed by a DEFLATED entry (method=8). + /// `parse_zip_stream` returns `Err` for STORED entries so they become literal blocks; + /// only the DEFLATED entry is analysed and emitted as a DEFLATE block. + /// + /// Fixture: `test_zip_stored_then_deflated.zip` (J×8000 STORED, K×20000 DEFLATED) + /// Expected wire sequence: literal, deflate, literal, EOS (exactly 1 DEFLATE block) + #[test] + fn test_zip_stored_entry_stays_literal_deflated_entry_becomes_deflate_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_zip_stored_then_deflated.zip"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 1, + "only the DEFLATED entry should become a DEFLATE block; STORED stays literal; \ + blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// A buffer filled with pseudo-random bytes contains no recognisable DEFLATE/zlib/gzip + /// signatures. The entire file must be emitted as a single literal block with no + /// DEFLATE analysis. + /// + /// Fixture: `test_random_bytes.bin` (32 KiB pseudo-random) + /// Expected wire sequence: literal, EOS + #[test] + fn test_random_bytes_produce_no_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_random_bytes.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 0, + "random bytes contain no DEFLATE streams; blocks={blocks:?}" + ); + + // The literal block must survive the round-trip. 
+ let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// Two gzip streams separated by a 1000-byte null gap. Both streams have + /// plaintext > MIN_BLOCKSIZE, so both must produce DEFLATE blocks, and the gap + /// must appear as a literal block between them. + /// + /// Fixture: `test_gzip_with_gap.bin` + /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS + #[test] + fn test_two_gzip_streams_with_null_gap_produce_two_deflate_blocks_and_roundtrip() { + use crate::utils::read_file; + let original = read_file("test_gzip_with_gap.bin"); + let (compressed, blocks) = compress_default(&original); + + assert_eq!( + count_block_type(&blocks, BLOCK_TYPE_DEFLATE), + 2, + "both gzip streams should become DEFLATE blocks; null gap stays literal; \ + blocks={blocks:?}" + ); + // There should be at least one literal block (the gap between the two streams). + assert!( + count_block_type(&blocks, BLOCK_TYPE_LITERAL) >= 1, + "null gap between gzip streams should produce at least one literal block; \ + blocks={blocks:?}" + ); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + crate::utils::assert_eq_array(&original, &recreated); + } + + /// Feed a fixture containing two gzip streams in very small chunks (64 bytes at a + /// time) via the incremental `process_buffer` API to exercise boundary handling. + /// The round-trip result must be byte-exact regardless of where chunk boundaries fall. + #[test] + fn test_two_gzip_streams_incremental_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_two_gzip_streams.bin"); + + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let mut compressed = Vec::new(); + let chunk_size = 64; + let mut pos = 0; + while pos < original.len() { + let end = (pos + chunk_size).min(original.len()); + enc.process_buffer(&original[pos..end], false, &mut compressed) + .unwrap(); + pos = end; + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// Feed `test_two_zlib_streams.bin` in small chunks (128 bytes) to confirm that + /// the incremental path handles mixed literal padding + zlib streams correctly. 
+ #[test] + fn test_two_zlib_streams_incremental_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_two_zlib_streams.bin"); + + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let mut compressed = Vec::new(); + let chunk_size = 128; + let mut pos = 0; + while pos < original.len() { + let end = (pos + chunk_size).min(original.len()); + enc.process_buffer(&original[pos..end], false, &mut compressed) + .unwrap(); + pos = end; + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// Feed a ZIP fixture in small chunks (256 bytes) to check that chunk boundaries + /// inside the ZIP local-file headers and DEFLATE bodies are handled gracefully. + #[test] + fn test_zip_three_entries_incremental_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_zip_3entries.zip"); + + let mut enc = PreflateContainerProcessor::new( + &PreflateContainerConfig { + min_chunk_size: 0, + ..PreflateContainerConfig::default() + }, + 1, + false, + ); + let mut compressed = Vec::new(); + let chunk_size = 256; + let mut pos = 0; + while pos < original.len() { + let end = (pos + chunk_size).min(original.len()); + enc.process_buffer(&original[pos..end], false, &mut compressed) + .unwrap(); + pos = end; + } + enc.process_buffer(&[], true, &mut compressed).unwrap(); + + let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec(&compressed).unwrap(); + assert_eq_array(&original, &recreated); + } + + /// Verify that the decoder also handles the recreated stream correctly when fed in + /// small chunks, not just when given the entire buffer at once. + /// Uses `test_zip_stored_then_deflated.zip` (mixed STORED + DEFLATED entries). + #[test] + fn test_zip_stored_then_deflated_decoder_small_chunks_roundtrip() { + use crate::utils::{assert_eq_array, read_file}; + let original = read_file("test_zip_stored_then_deflated.zip"); + + let mut enc = + PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); + let compressed = enc.process_vec(&original).unwrap(); + + // Decompress in 512-byte chunks to exercise the incremental decoder. 
+ let mut dec = RecreateContainerProcessor::new(usize::MAX); + let recreated = dec.process_vec_size(&compressed, 512).unwrap(); + assert_eq_array(&original, &recreated); + } +} diff --git a/container/src/container_processor.rs b/container/src/container_processor.rs deleted file mode 100644 index 086bb36..0000000 --- a/container/src/container_processor.rs +++ /dev/null @@ -1,2081 +0,0 @@ -use byteorder::ReadBytesExt; -use lepton_jpeg::{DEFAULT_THREAD_POOL, EnabledFeatures}; - -use std::{ - collections::VecDeque, - io::{BufRead, Cursor, Read, Write}, - usize, -}; - -use crate::{ - idat_parse::{IdatContents, PngHeader, recreate_idat}, - scan_deflate::{FindStreamResult, FoundStream, FoundStreamType, find_compressable_stream}, - scoped_read::ScopedRead, -}; - -use preflate_rs::{ - AddContext, ExitCode, HashAlgorithm, PreflateConfig, PreflateError, PreflateStreamProcessor, - RecreateStreamProcessor, Result, err_exit_code, recreate_whole_deflate_stream, -}; - -/// Configuration for the deflate process -#[derive(Debug, Clone)] -pub struct PreflateContainerConfig { - /// As we scan for deflate streams, we need to have a minimum memory - /// chunk to process. We scan this chunk for deflate streams and at least - /// deflate one block has to fit into a chunk for us to recognize it. - pub min_chunk_size: usize, - - /// The maximum size of a deflate or PNG compressed block we will consider. If - /// a deflate stream is larger than this, we will not decompress it and - /// just write it out as a literal block. - pub max_chunk_size: usize, - - /// The maximum overall size of plain text that we will compress. This is - /// global to the entire container and limits the amount of processing that - /// we will do to avoid running out of CPU time on a single file. Once we - /// hit this limit, we will stop looking for deflate streams and just write - /// out the rest of the data as literal blocks. - pub total_plain_text_limit: u64, - - /// The maximum size of a plain text chunk that we will decompress at a time. This limits - /// the memory usage of the decompression process. - pub chunk_plain_text_limit: usize, - - /// true if we should verify that the decompressed data can be recompressed to the same bytes. - /// This is important since there may be corner cases where the data may not yield the same bytes. - /// - /// If this is false, we will not verify the decompressed data and just write it out as is and it is - /// up to the caller to make sure the data is valid. In no case should you just assume that you - /// can get the same data back without verifying it. - pub validate_compression: bool, - - /// Maximum number of lookups we will do in the hash chain. This will limit the CPU time we spend - /// on deflate stream processing but also means that we won't be able to recompress deflate streams - /// that were compressed with a larger chain length (eg level 9 has 4096). 
- pub max_chain_length: u32, -} - -impl Default for PreflateContainerConfig { - fn default() -> Self { - PreflateContainerConfig { - min_chunk_size: 1024 * 1024, - max_chunk_size: 64 * 1024 * 1024, - total_plain_text_limit: 512 * 1024 * 1024, - chunk_plain_text_limit: 128 * 1024 * 1024, - max_chain_length: 4096, - validate_compression: true, - } - } -} - -impl PreflateContainerConfig { - pub fn preflate_config(&self) -> PreflateConfig { - PreflateConfig { - max_chain_length: self.max_chain_length, - plain_text_limit: self.chunk_plain_text_limit, - verify_compression: self.validate_compression, - } - } -} - -const COMPRESSED_WRAPPER_VERSION_2: u8 = 2; - -// Bit-field masks for the block type byte -// Bits 7-6: compression algorithm Bits 5-0: block content kind -const BLOCK_COMPRESSION_MASK: u8 = 0xC0; -const BLOCK_TYPE_MASK: u8 = 0x3F; - -// Compression algorithms (top 2 bits) -const BLOCK_COMPRESSION_NONE: u8 = 0x00; -const BLOCK_COMPRESSION_ZSTD: u8 = 0x40; - -// Block content kinds (bottom 6 bits) -const BLOCK_TYPE_LITERAL: u8 = 0x00; -const BLOCK_TYPE_DEFLATE: u8 = 0x01; -const BLOCK_TYPE_PNG: u8 = 0x02; -const BLOCK_TYPE_DEFLATE_CONTINUE: u8 = 0x03; -const BLOCK_TYPE_JPEG_LEPTON: u8 = 0x04; -const BLOCK_TYPE_WEBP: u8 = 0x05; -const BLOCK_TYPE_EOS: u8 = 0x3F; // end-of-stream - -pub(crate) fn write_varint(destination: &mut impl Write, value: u32) -> std::io::Result<()> { - let mut value = value; - loop { - let mut byte = (value & 0x7F) as u8; - value >>= 7; - if value != 0 { - byte |= 0x80; - } - destination.write_all(&[byte])?; - if value == 0 { - break; - } - } - - Ok(()) -} - -pub(crate) fn read_varint(source: &mut impl Read) -> std::io::Result { - let mut result = 0; - let mut shift = 0; - loop { - let mut byte = [0u8; 1]; - source.read_exact(&mut byte)?; - let byte = byte[0]; - result |= ((byte & 0x7F) as u32) << shift; - shift += 7; - if byte & 0x80 == 0 { - break; - } - } - Ok(result) -} - -#[test] -fn test_variant_roundtrip() { - let values = [ - 0, 1, 127, 128, 255, 256, 16383, 16384, 2097151, 2097152, 268435455, 268435456, 4294967295, - ]; - - let mut buffer = Vec::new(); - for &v in values.iter() { - write_varint(&mut buffer, v).unwrap(); - } - - let mut buffer = &buffer[..]; - - for &v in values.iter() { - assert_eq!(v, read_varint(&mut buffer).unwrap()); - } -} - -/// Flushes the encoder, writes [block_type][varint(compressed_size)][compressed_bytes] to -/// destination, clears the encoder's inner buffer, and returns the compressed byte count. -fn emit_compressed_block( - block_type: u8, - encoder: &mut zstd::stream::write::Encoder<'static, Vec>, - destination: &mut impl Write, -) -> Result { - encoder.flush().context()?; - let compressed = encoder.get_mut(); - let len = compressed.len(); - destination.write_all(&[block_type])?; - write_varint(destination, len as u32)?; - destination.write_all(compressed)?; - compressed.clear(); - Ok(len) -} - -/// V2 variant of write_chunk_block: block content goes through the persistent Zstd encoder. -/// JPEG blocks are written raw to writer (bypass encoder). -/// Returns (total compressed bytes written, optional continue state). 
-fn write_chunk_block_v2( - encoder: &mut zstd::stream::write::Encoder<'static, Vec>, - writer: &mut impl Write, - chunk: FoundStream, - stats: &mut PreflateStats, -) -> Result<(usize, Option)> { - match chunk.chunk_type { - FoundStreamType::DeflateStream(parameters, state) => { - write_varint(encoder, chunk.corrections.len() as u32)?; - write_varint(encoder, state.plain_text().text().len() as u32)?; - encoder.write_all(&chunk.corrections)?; - encoder.write_all(&state.plain_text().text())?; - - let compressed_size = emit_compressed_block( - BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE, - encoder, - writer, - )?; - - stats.overhead_bytes += chunk.corrections.len() as u64; - stats.uncompressed_size += state.plain_text().len() as u64; - stats.hash_algorithm = parameters.hash_algorithm; - - if !state.is_done() { - return Ok((compressed_size, Some(state))); - } - Ok((compressed_size, None)) - } - - FoundStreamType::IDATDeflate(parameters, mut idat, plain_text) => { - log::debug!( - "IDATDeflate param {:?} corrections {}", - parameters, - chunk.corrections.len() - ); - - let mut temp_vec = Vec::new(); - - if webp_compress(&mut temp_vec, plain_text.text(), &chunk.corrections, &idat).is_ok() { - // WebP is already compressed — write raw, bypassing the Zstd encoder. - // temp_vec[0] is the BLOCK_TYPE_PNG placeholder byte; temp_vec[1..] is the payload. - let payload = &temp_vec[1..]; - writer.write_all(&[BLOCK_COMPRESSION_NONE | BLOCK_TYPE_WEBP])?; - write_varint(writer, payload.len() as u32)?; - writer.write_all(payload)?; - - stats.uncompressed_size += plain_text.len() as u64; - stats.hash_algorithm = parameters.hash_algorithm; - stats.overhead_bytes += chunk.corrections.len() as u64; - - Ok((payload.len(), None)) - } else { - // Non-WebP PNG: corrections + plaintext are compressible, send through Zstd. 
- log::debug!("non-Webp compressed {}", idat.total_chunk_length); - write_varint(encoder, chunk.corrections.len() as u32)?; - write_varint(encoder, plain_text.text().len() as u32)?; - idat.png_header = None; - idat.write_to_bytestream(encoder)?; - encoder.write_all(&chunk.corrections)?; - encoder.write_all(plain_text.text())?; - - let compressed_size = emit_compressed_block( - BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_PNG, - encoder, - writer, - )?; - - stats.uncompressed_size += plain_text.len() as u64; - stats.hash_algorithm = parameters.hash_algorithm; - stats.overhead_bytes += chunk.corrections.len() as u64; - - Ok((compressed_size, None)) - } - } - - FoundStreamType::JPEGLepton(data) => { - // JPEG is written raw (bypasses the encoder entirely) - writer.write_all(&[BLOCK_COMPRESSION_NONE | BLOCK_TYPE_JPEG_LEPTON])?; - write_varint(writer, data.len() as u32)?; - writer.write_all(&data)?; - - stats.uncompressed_size += data.len() as u64; - Ok((0, None)) - } - } -} - -/// used to measure the length of the output without storing it -struct MeasureWriteSink { - pub length: usize, -} - -impl Write for MeasureWriteSink { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.length += buf.len(); - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } -} - -/// Statistics about the preflate process -#[derive(Debug, Copy, Clone, Default)] -pub struct PreflateStats { - pub deflate_compressed_size: u64, - pub zstd_compressed_size: u64, - pub uncompressed_size: u64, - pub overhead_bytes: u64, - pub hash_algorithm: HashAlgorithm, - pub zstd_baseline_size: u64, -} - -/// Processes an input buffer and writes the output to a writer -pub trait ProcessBuffer { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()>; - - #[cfg(test)] - fn process_vec(&mut self, input: &[u8]) -> Result> { - let mut writer = Vec::new(); - - self.copy_to_end(&mut std::io::Cursor::new(&input), &mut writer) - .context()?; - - Ok(writer) - } - - #[cfg(test)] - fn process_vec_size(&mut self, input: &[u8], read_chunk_size: usize) -> Result> { - let mut writer = Vec::new(); - - self.copy_to_end_size( - &mut std::io::Cursor::new(&input), - &mut writer, - read_chunk_size, - ) - .context()?; - - Ok(writer) - } - - /// Reads everything from input and writes it to the output. - /// Wraps calls to process buffer - fn copy_to_end(&mut self, input: &mut impl BufRead, output: &mut impl Write) -> Result<()> { - self.copy_to_end_size(input, output, 1024 * 1024) - } - - /// Reads everything from input and writes it to the output. 
- /// Wraps calls to process buffer - fn copy_to_end_size( - &mut self, - input: &mut impl BufRead, - output: &mut impl Write, - read_chunk_size: usize, - ) -> Result<()> { - let mut input_complete = false; - loop { - let buffer: &[u8]; - if input_complete { - buffer = &[]; - } else { - buffer = input.fill_buf().context()?; - if buffer.len() == 0 { - input_complete = true - } - }; - - if input_complete { - self.process_buffer(&[], true, output).context()?; - break; - } else { - // process buffer a piece at a time to avoid overflowing memory - let mut amount_read = 0; - while amount_read < buffer.len() { - let chunk_size = (buffer.len() - amount_read).min(read_chunk_size); - - self.process_buffer( - &buffer[amount_read..amount_read + chunk_size], - false, - output, - ) - .context()?; - - amount_read += chunk_size; - } - - let buflen = buffer.len(); - input.consume(buflen); - } - } - - Ok(()) - } - - fn stats(&self) -> PreflateStats { - PreflateStats::default() - } -} - -#[derive(Debug)] -enum ChunkParseState { - Start, - /// we are looking for a deflate stream or PNG chunk. The data of the PNG file - /// is stored later than the IHDR chunk that will tell us the dimensions of the image, - /// so we need to keep track of the IHDR chunk so we can use it later to properly - /// compress the PNG data. - Searching(Option), - DeflateContinue(PreflateStreamProcessor), -} - -/// Takes a sequence of bytes that may contain deflate streams, find -/// the streams, and emits a new stream that containus the decompressed -/// streams along with the corrections needed to recreate the original. -/// -/// This output can then be compressed with a better algorithm, like Zstandard -/// and achieve much better compression than if we tried to compress the -/// deflate stream directlyh. -pub struct PreflateContainerProcessor { - content: Vec, - compression_stats: PreflateStats, - input_complete: bool, - total_plain_text_seen: u64, - - /// used to track the last attempted chunk size, in case we - /// need more input to continue, we will collect at least min_chunk_size - /// more input before trying to process again until we reach max_chunk_size - last_attempt_chunk_size: usize, - - state: ChunkParseState, - config: PreflateContainerConfig, - - /// each block is individually compressed with this encoder (v2 format) - encoder: Option>>, - - /// when present, all raw input is also fed to this encoder so we can measure - /// baseline Zstd compression (without preflate processing) - baseline_encoder: Option>, -} - -impl PreflateContainerProcessor { - /// Creates a processor that uses v2 format with a persistent Zstd encoder shared - /// across all non-JPEG blocks. JPEG blocks bypass the encoder entirely. 
- pub fn new(config: &PreflateContainerConfig, level: i32, test_baseline: bool) -> Self { - PreflateContainerProcessor { - content: Vec::new(), - compression_stats: PreflateStats::default(), - input_complete: false, - state: ChunkParseState::Start, - total_plain_text_seen: 0, - last_attempt_chunk_size: 0, - config: config.clone(), - encoder: Some(zstd::stream::write::Encoder::new(Vec::new(), level).unwrap()), - baseline_encoder: if test_baseline { - Some( - zstd::stream::write::Encoder::new(MeasureWriteSink { length: 0 }, level) - .unwrap(), - ) - } else { - None - }, - } - } -} - -impl ProcessBuffer for PreflateContainerProcessor { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - if self.input_complete && (input.len() > 0 || !input_complete) { - return Err(PreflateError::new( - ExitCode::InvalidParameter, - "more data provided after input_complete signaled", - )); - } - - if input.len() > 0 { - self.compression_stats.deflate_compressed_size += input.len() as u64; - self.content.extend_from_slice(input); - - if let Some(encoder) = &mut self.baseline_encoder { - encoder.write_all(input).context()?; - } - } - - loop { - // wait until we have at least min_chunk_size before we start processing - if self.content.is_empty() - || (!input_complete - && (self.content.len() - self.last_attempt_chunk_size) - < self.config.min_chunk_size - && self.content.len() <= self.config.max_chunk_size) - { - break; - } - - self.last_attempt_chunk_size = self.content.len(); - - match &mut self.state { - ChunkParseState::Start => { - writer.write_all(&[COMPRESSED_WRAPPER_VERSION_2])?; - self.state = ChunkParseState::Searching(None); - } - ChunkParseState::Searching(prev_ihdr) => { - if self.total_plain_text_seen > self.config.total_plain_text_limit { - // once we've exceeded our limit, we don't do any more compression - let encoder = self.encoder.as_mut().unwrap(); - write_varint(encoder, self.content.len() as u32)?; - encoder.write_all(&self.content)?; - let sz = emit_compressed_block( - BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, - encoder, - writer, - )?; - self.compression_stats.zstd_compressed_size += sz as u64; - - self.last_attempt_chunk_size = 0; - self.content.clear(); - break; - } - - // here we are looking for a deflate stream or PNG chunk - match find_compressable_stream( - &self.content, - prev_ihdr, - input_complete, - &self.config, - ) { - FindStreamResult::Found(next, chunk) => { - // the gap between the start and the beginning of the deflate stream - // is written out as a literal block - if next.start != 0 { - let encoder = self.encoder.as_mut().unwrap(); - write_varint(encoder, next.start as u32)?; - encoder.write_all(&self.content[..next.start])?; - let sz = emit_compressed_block( - BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, - encoder, - writer, - )?; - self.compression_stats.zstd_compressed_size += sz as u64; - } - - let (compressed_size, next_state) = write_chunk_block_v2( - self.encoder.as_mut().unwrap(), - writer, - chunk, - &mut self.compression_stats, - ) - .context()?; - self.compression_stats.zstd_compressed_size += compressed_size as u64; - - if let Some(mut state) = next_state { - self.total_plain_text_seen += state.plain_text().len() as u64; - state.shrink_to_dictionary(); - self.state = ChunkParseState::DeflateContinue(state); - } - - self.content.drain(0..next.end); - self.last_attempt_chunk_size = self.content.len(); - } - FindStreamResult::ShortRead => { - if input_complete || self.content.len() > 
self.config.max_chunk_size { - // if we have too much data or have no more data, - // we just write it out as a literal block with everything we have - let encoder = self.encoder.as_mut().unwrap(); - write_varint(encoder, self.content.len() as u32)?; - encoder.write_all(&self.content)?; - let sz = emit_compressed_block( - BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, - encoder, - writer, - )?; - self.compression_stats.zstd_compressed_size += sz as u64; - - self.content.clear(); - self.last_attempt_chunk_size = 0; - } else { - // we don't have enough data to process the stream, so we just - // wait for more data - break; - } - } - FindStreamResult::None => { - // couldn't find anything, just write the rest as a literal block - let encoder = self.encoder.as_mut().unwrap(); - write_varint(encoder, self.content.len() as u32)?; - encoder.write_all(&self.content)?; - let sz = emit_compressed_block( - BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, - encoder, - writer, - )?; - self.compression_stats.zstd_compressed_size += sz as u64; - - self.content.clear(); - self.last_attempt_chunk_size = 0; - } - } - } - ChunkParseState::DeflateContinue(state) => { - // here we have a deflate stream that we need to continue - match state.decompress(&self.content) { - Err(ref e) - if e.exit_code() == ExitCode::ShortRead - && !input_complete - && self.content.len() <= self.config.max_chunk_size => - { - // Not enough data to complete the next block yet; wait for more. - break; - } - Err(_e) => { - // Stream analysis diverged or no more data is coming; give up on - // continuation and fall back to treating the remaining bytes as raw. - self.state = ChunkParseState::Searching(None); - - log::debug!("Error while trying to continue compression {:?}", _e); - } - Ok(res) => { - log::debug!( - "Deflate continue: {} -> {}", - state.plain_text().len(), - res.compressed_size - ); - - let encoder = self.encoder.as_mut().unwrap(); - write_varint(encoder, res.corrections.len() as u32)?; - write_varint(encoder, state.plain_text().len() as u32)?; - encoder.write_all(&res.corrections)?; - encoder.write_all(&state.plain_text().text())?; - let sz = emit_compressed_block( - BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE_CONTINUE, - encoder, - writer, - )?; - self.compression_stats.zstd_compressed_size += sz as u64; - - self.total_plain_text_seen += state.plain_text().len() as u64; - self.compression_stats.overhead_bytes += res.corrections.len() as u64; - self.compression_stats.uncompressed_size += - state.plain_text().len() as u64; - - self.content.drain(0..res.compressed_size); - self.last_attempt_chunk_size = self.content.len(); - - if state.is_done() { - self.state = ChunkParseState::Searching(None); - } else { - state.shrink_to_dictionary(); - } - } - } - } - } - } - - if input_complete && !self.input_complete { - self.input_complete = true; - - if self.content.len() > 0 { - let encoder = self.encoder.as_mut().unwrap(); - write_varint(encoder, self.content.len() as u32)?; - encoder.write_all(&self.content)?; - let sz = emit_compressed_block( - BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, - encoder, - writer, - )?; - self.compression_stats.zstd_compressed_size += sz as u64; - } - self.content.clear(); - - // Finalize the Zstd encoder and write the end-of-stream marker - let encoder = self.encoder.take().unwrap(); - let finish_bytes = encoder.finish().context()?; - writer.write_all(&[BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_EOS])?; - write_varint(writer, finish_bytes.len() as u32)?; - writer.write_all(&finish_bytes)?; - 
self.compression_stats.zstd_compressed_size += finish_bytes.len() as u64; - - // Finalize baseline encoder for stats - if let Some(mut encoder) = self.baseline_encoder.take() { - encoder.flush().context()?; - encoder.do_finish().context()?; - self.compression_stats.zstd_baseline_size = encoder.get_mut().length as u64; - } - } - - Ok(()) - } - - fn stats(&self) -> PreflateStats { - self.compression_stats - } -} - -enum DecompressionState { - Start, - StartSegment, - /// accumulate compressed_size bytes then decode and process the block immediately. - AccumulateBlock { - block_type: u8, - compressed_size: usize, - }, - /// accumulate lepton bytes then decode the JPEG block immediately. - JpegAccumulate { - lepton_length: usize, - }, - /// accumulate raw WebP-compressed PNG bytes then process the block immediately. - WebpAccumulate { - total_len: usize, - }, - /// accumulate the final Zstd finish bytes to close the frame cleanly. - ZstdEndOfStream { - final_size: usize, - }, -} - -/// recreates the orignal content from the chunked data -pub struct RecreateContainerProcessor { - capacity: usize, - input: VecDeque, - input_complete: bool, - state: DecompressionState, - - /// state of the predictor and plain text if we need to contiune a deflate stream - /// if it was too big to complete in a single chunk - deflate_continue_state: Option, - - /// persistent Zstd decoder — maintains the streaming context across blocks - zstd_decoder: zstd::stream::raw::Decoder<'static>, -} - -impl RecreateContainerProcessor { - pub fn new(capacity: usize) -> Self { - RecreateContainerProcessor { - input: VecDeque::new(), - capacity, - input_complete: false, - state: DecompressionState::Start, - deflate_continue_state: None, - zstd_decoder: zstd::stream::raw::Decoder::new().expect("failed to create zstd decoder"), - } - } -} - -impl ProcessBuffer for RecreateContainerProcessor { - fn process_buffer( - &mut self, - input: &[u8], - input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - if self.input_complete && (input.len() > 0 || !input_complete) { - return Err(PreflateError::new( - ExitCode::InvalidParameter, - "more data provided after input_complete signaled", - )); - } - - // we could have been passed a big buffer, so we need to process it in chunks - let mut amount_read = 0; - loop { - let amount_to_read = (input.len() - amount_read).min(self.capacity); - - // when we get to the end and we've read everything, we can signal that we are done - if amount_read + amount_to_read == input.len() && input_complete { - self.input_complete = true; - } - - self.input - .extend(&input[amount_read..amount_read + amount_to_read]); - - amount_read += amount_to_read; - - self.process_buffer_internal(writer)?; - - if amount_read == input.len() { - break; - } - } - - Ok(()) - } -} - -impl RecreateContainerProcessor { - fn process_buffer_internal(&mut self, writer: &mut impl Write) -> Result<()> { - loop { - match &mut self.state { - DecompressionState::Start => { - if !self.input_complete && self.input.len() == 0 { - break; - } - - let version = self.input.read_u8()?; - - match version { - COMPRESSED_WRAPPER_VERSION_2 => { - self.state = DecompressionState::StartSegment; - } - _ => { - return err_exit_code( - ExitCode::InvalidCompressedWrapper, - format!("Invalid version {version}"), - ); - } - } - } - DecompressionState::StartSegment => { - // here's a good place to stop if we run out of input - if self.input.len() == 0 { - break; - } - - // read type byte, then dispatch - self.state = match 
self.input.scoped_read(|r| { - let type_byte = r.read_u8()?; - let compression = type_byte & BLOCK_COMPRESSION_MASK; - let block_type = type_byte & BLOCK_TYPE_MASK; - match compression { - BLOCK_COMPRESSION_NONE => match block_type { - BLOCK_TYPE_JPEG_LEPTON => { - let lepton_length = read_varint(r)? as usize; - Ok(DecompressionState::JpegAccumulate { lepton_length }) - } - BLOCK_TYPE_WEBP => { - let total_len = read_varint(r)? as usize; - Ok(DecompressionState::WebpAccumulate { total_len }) - } - _ => err_exit_code( - ExitCode::InvalidCompressedWrapper, - "unknown raw block type", - ), - }, - BLOCK_COMPRESSION_ZSTD => match block_type { - BLOCK_TYPE_EOS => { - let final_size = read_varint(r)? as usize; - Ok(DecompressionState::ZstdEndOfStream { final_size }) - } - other => { - let compressed_size = read_varint(r)? as usize; - Ok(DecompressionState::AccumulateBlock { - block_type: other, - compressed_size, - }) - } - }, - _ => err_exit_code( - ExitCode::InvalidCompressedWrapper, - "unknown compression algorithm", - ), - } - }) { - Ok(s) => s, - Err(e) => { - if !self.input_complete && e.exit_code() == ExitCode::ShortRead { - break; - } else { - return Err(e); - } - } - }; - } - - DecompressionState::AccumulateBlock { - block_type, - compressed_size, - } => { - if self.input.len() < *compressed_size { - if self.input_complete { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "unexpected end of input in block", - )); - } - break; - } - - let block_type = *block_type; - let compressed_bytes: Vec = self.input.drain(0..*compressed_size).collect(); - let decoded = drain_zstd_block(&mut self.zstd_decoder, &compressed_bytes)?; - process_compressed_block( - block_type, - &mut Cursor::new(decoded), - &mut self.deflate_continue_state, - writer, - )?; - self.state = DecompressionState::StartSegment; - } - - DecompressionState::JpegAccumulate { lepton_length } => { - if self.input.len() < *lepton_length { - if self.input_complete { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "unexpected end of input in jpeg block", - )); - } - break; - } - - let lepton_bytes: Vec = self.input.drain(0..*lepton_length).collect(); - match lepton_jpeg::decode_lepton( - &mut Cursor::new(&lepton_bytes), - writer, - &EnabledFeatures::compat_lepton_vector_read(), - &DEFAULT_THREAD_POOL, - ) { - Err(e) => { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - format!("JPEG Lepton decode failed: {}", e), - )); - } - Ok(_) => {} - } - self.state = DecompressionState::StartSegment; - } - - DecompressionState::WebpAccumulate { total_len } => { - if self.input.len() < *total_len { - if self.input_complete { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "unexpected end of input in webp block", - )); - } - break; - } - - let webp_bytes: Vec = self.input.drain(0..*total_len).collect(); - // Payload is what webp_compress wrote after the BLOCK_TYPE_PNG type byte, - // so process_compressed_block can parse it directly. - process_compressed_block( - BLOCK_TYPE_PNG, - &mut Cursor::new(webp_bytes), - &mut self.deflate_continue_state, - writer, - )?; - self.state = DecompressionState::StartSegment; - } - - DecompressionState::ZstdEndOfStream { final_size } => { - if self.input.len() < *final_size { - if self.input_complete { - return Err(PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "unexpected end of input in end-of-stream", - )); - } - break; - } - - // Feed the finish bytes to cleanly close the Zstd frame. 
- // No decompressed output is expected since the encoder flushes after each block. - let finish_bytes: Vec = self.input.drain(0..*final_size).collect(); - drain_zstd_block(&mut self.zstd_decoder, &finish_bytes)?; - - self.state = DecompressionState::StartSegment; - } - } - } - - Ok(()) - } -} - -/// Feeds `compressed` bytes into the persistent `decoder` and returns all decompressed output. -/// -/// Each call corresponds to one Zstd flush frame (written by the encoder via `flush()`). -/// After consuming all input bytes the decoder is drained until it produces no more output, -/// which is guaranteed because `ZSTD_e_flush` ensures all data is available to the decoder -/// before the next block starts. -fn drain_zstd_block( - decoder: &mut zstd::stream::raw::Decoder<'static>, - compressed: &[u8], -) -> Result> { - use zstd::stream::raw::{InBuffer, Operation, OutBuffer}; - - let mut output = Vec::new(); - let mut scratch = vec![0u8; 65536]; - let mut in_buf = InBuffer::around(compressed); - - loop { - let mut out_buf = OutBuffer::around(scratch.as_mut_slice()); - decoder.run(&mut in_buf, &mut out_buf).map_err(|e| { - PreflateError::new( - ExitCode::InvalidCompressedWrapper, - format!("zstd decode failed: {e}"), - ) - })?; - let produced = out_buf.pos(); - output.extend_from_slice(&scratch[..produced]); - - // Stop when all input has been consumed and the decoder produced no more output. - // zstd guarantees progress (either bytes_read > 0 or bytes_written > 0) so this - // loop always terminates. - if in_buf.pos() >= compressed.len() && produced == 0 { - break; - } - } - - Ok(output) -} - -/// Parses and processes a single non-JPEG/non-WebP block. -/// -/// `cursor` wraps the output of `drain_zstd_block` for compressed blocks, -/// or the raw WebP payload for `BLOCK_TYPE_PNG` blocks stored outside Zstd. -/// -/// Layout written by the encoder for each block type (block_type = lower 6 bits): -/// BLOCK_TYPE_LITERAL: varint(len) + data -/// BLOCK_TYPE_DEFLATE: varint(corrections_len) + varint(plaintext_len) + corrections + plaintext -/// BLOCK_TYPE_DEFLATE_CONTINUE: same as BLOCK_TYPE_DEFLATE -/// BLOCK_TYPE_PNG: varint(correction_length) + varint(uncompressed_length) + -/// IdatContents + [filters if png_header present] + -/// corrections + (webp_data or raw_plaintext) -fn process_compressed_block( - block_type: u8, - cursor: &mut Cursor>, - deflate_continue_state: &mut Option, - writer: &mut impl Write, -) -> Result<()> { - match block_type { - BLOCK_TYPE_LITERAL => { - let length = read_varint(cursor)? as usize; - let mut data = vec![0u8; length]; - cursor.read_exact(&mut data).map_err(|e| { - PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) - })?; - writer.write_all(&data)?; - } - BLOCK_TYPE_DEFLATE => { - *deflate_continue_state = None; - - let correction_length = read_varint(cursor)? as usize; - let uncompressed_length = read_varint(cursor)? 
as usize; - - let mut corrections = vec![0u8; correction_length]; - cursor.read_exact(&mut corrections).map_err(|e| { - PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) - })?; - - let mut plain_text_buf = vec![0u8; uncompressed_length]; - cursor.read_exact(&mut plain_text_buf).map_err(|e| { - PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) - })?; - - let mut reconstruct = RecreateStreamProcessor::new(); - let (comp, _) = reconstruct - .recompress(&mut Cursor::new(&plain_text_buf), &corrections) - .context()?; - - writer.write_all(&comp)?; - *deflate_continue_state = Some(reconstruct); - } - BLOCK_TYPE_DEFLATE_CONTINUE => { - let correction_length = read_varint(cursor)? as usize; - let uncompressed_length = read_varint(cursor)? as usize; - - let mut corrections = vec![0u8; correction_length]; - cursor.read_exact(&mut corrections).map_err(|e| { - PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) - })?; - - let mut plain_text_buf = vec![0u8; uncompressed_length]; - cursor.read_exact(&mut plain_text_buf).map_err(|e| { - PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) - })?; - - let reconstruct = deflate_continue_state.as_mut().ok_or_else(|| { - PreflateError::new( - ExitCode::InvalidCompressedWrapper, - "no deflate state to continue", - ) - })?; - - let (comp, _) = reconstruct - .recompress(&mut Cursor::new(&plain_text_buf), &corrections) - .context()?; - - writer.write_all(&comp)?; - } - BLOCK_TYPE_PNG => { - let correction_length = read_varint(cursor)? as usize; - let uncompressed_length = read_varint(cursor)? as usize; - let idat = IdatContents::read_from_bytestream(cursor)?; - - let mut filters = Vec::new(); - if let Some(png_header) = &idat.png_header { - filters.resize(png_header.height as usize, 0); - cursor.read_exact(&mut filters[..]).map_err(|e| { - PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) - })?; - } - - let mut corrections = vec![0u8; correction_length]; - cursor.read_exact(&mut corrections).map_err(|e| { - PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) - })?; - - let plain_text; - if let Some(header) = &idat.png_header { - let mut webp = vec![0u8; uncompressed_length]; - cursor.read_exact(&mut webp).map_err(|e| { - PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) - })?; - plain_text = webp_decompress(&filters, webp, header).context()?; - } else { - let mut raw = vec![0u8; uncompressed_length]; - cursor.read_exact(&mut raw).map_err(|e| { - PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) - })?; - plain_text = raw; - } - - let recompressed = - recreate_whole_deflate_stream(&plain_text, &corrections).context()?; - - recreate_idat(&idat, &recompressed[..], writer).context()?; - } - _ => { - return err_exit_code( - ExitCode::InvalidCompressedWrapper, - format!("Unknown block type {block_type}"), - ); - } - } - Ok(()) -} - -fn webp_compress( - result: &mut impl Write, - plain_text: &[u8], - corrections: &[u8], - idat: &IdatContents, -) -> Result<()> { - log::debug!("{:?}", idat); - - #[cfg(feature = "webp")] - if let Some(png_header) = idat.png_header { - use crate::idat_parse::{PngColorType, undo_png_filters}; - use std::ops::Deref; - - let bbp = png_header.color_type.bytes_per_pixel(); - let w = png_header.width as usize; - let h = png_header.height as usize; - - log::debug!( - "plain text compressing {} bytes ({}x{}x{})", - plain_text.len(), - w, - h, - bbp - ); - - // see if the bitmap looks like the way 
with think it should (bits per pixel map + 1 height worth of filter bytes) - if (bbp * w * h) + h == plain_text.len() { - let (bitmap, filters) = undo_png_filters(plain_text, w, h, bbp); - - let enc = webp::Encoder::new( - &bitmap, - match png_header.color_type { - PngColorType::RGB => webp::PixelLayout::Rgb, - PngColorType::RGBA => webp::PixelLayout::Rgba, - }, - png_header.width, - png_header.height, - ); - - let mut webpconfig = webp::WebPConfig::new().unwrap(); - webpconfig.lossless = 1; - webpconfig.alpha_compression = 0; - webpconfig.exact = 1; // undocumented option, but required to not throw away color if alpha channel is zero - - // this is the default quality setting for webp lossless, we could dial it up - // but the quality gains are marginal for the CPU cost, although the - // CPU decompression cost is the same. - webpconfig.quality = 75.0; // 0..100 higher is slower but better compression - webpconfig.method = 4; // 0..6 higher is slower but better compression - - let comp = match enc.encode_advanced(&webpconfig) { - Ok(c) => c, - Err(e) => { - return err_exit_code( - ExitCode::WebPDecodeError, - format!("Webp encode failed: {:?}", e), - ); - } - }; - - result.write_all(&[BLOCK_TYPE_PNG])?; // placeholder — caller skips this byte - - write_varint(result, corrections.len() as u32)?; - write_varint(result, comp.deref().len() as u32)?; - - log::debug!( - "Webp compressed {} bytes (vs {})", - comp.deref().len(), - idat.total_chunk_length - ); - - idat.write_to_bytestream(result)?; - result.write_all(&filters)?; - - result.write_all(&corrections)?; - result.write_all(comp.deref())?; - - return Ok(()); - } - } - - return err_exit_code( - ExitCode::InvalidCompressedWrapper, - "Webp compression not supported", - ); -} - -fn webp_decompress( - filters: &[u8], - webp: Vec, - header: &crate::idat_parse::PngHeader, -) -> Result> { - #[cfg(feature = "webp")] - match webp::Decoder::new(webp.as_slice()).decode() { - Some(result) => { - use crate::idat_parse::apply_png_filters_with_types; - use std::ops::Deref; - - let m = result.deref(); - - return Ok(apply_png_filters_with_types( - m, - header.width as usize, - header.height as usize, - if result.is_alpha() { 4 } else { 3 }, - header.color_type.bytes_per_pixel(), - &filters, - )); - } - _ => {} - } - return err_exit_code(ExitCode::InvalidCompressedWrapper, "Webp decode failed"); -} - -#[cfg(test)] -pub(crate) mod test { - use super::*; - - pub struct NopProcessBuffer {} - - impl ProcessBuffer for NopProcessBuffer { - fn process_buffer( - &mut self, - input: &[u8], - _input_complete: bool, - writer: &mut impl Write, - ) -> Result<()> { - writer.write_all(input).context()?; - - Ok(()) - } - } - - fn roundtrip_deflate_chunks(filename: &str) { - use crate::utils::assert_eq_array; - - let f = crate::utils::read_file(filename); - - println!("Processing file: {}", filename); - - let mut expanded = Vec::new(); - let mut ctx = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - ctx.copy_to_end(&mut std::io::Cursor::new(&f), &mut expanded) - .unwrap(); - - println!("Recreating file: {}", filename); - - let mut destination = Vec::new(); - let mut ctx = RecreateContainerProcessor::new(usize::MAX); - ctx.copy_to_end(&mut std::io::Cursor::new(expanded), &mut destination) - .unwrap(); - - assert_eq_array(&destination, &f); - } - - #[test] - fn roundtrip_skip_length_crash() { - roundtrip_deflate_chunks("skiplengthcrash.bin"); - } - - #[test] - fn roundtrip_png_chunks() { - roundtrip_deflate_chunks("treegdi.png"); - } - - 
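-    // Editor's note: the following test is an illustrative sketch added during review,
-    // not part of the original file. It walks the outer v2 framing by hand
-    // ([version byte], then repeated [type byte][varint length][payload] until the
-    // ZSTD|EOS block), mirroring what parse_wire_block_types below does, to make the
-    // wire layout concrete. The test name is invented; everything else it calls
-    // (constants, read_varint, the processors) is already defined in this module.
-    #[test]
-    fn sketch_walk_v2_outer_framing() {
-        // 512 plain bytes contain no DEFLATE streams, so the container should be
-        // [version][ZSTD literal block][ZSTD EOS block].
-        let input = vec![0xABu8; 512];
-        let mut enc =
-            PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false);
-        let compressed = enc.process_vec(&input).unwrap();
-
-        let mut cursor = std::io::Cursor::new(&compressed[..]);
-        assert_eq!(cursor.read_u8().unwrap(), COMPRESSED_WRAPPER_VERSION_2);
-        loop {
-            let type_byte = cursor.read_u8().unwrap();
-            let payload_len = read_varint(&mut cursor).unwrap() as u64;
-            // Every block, raw or Zstd-compressed, is framed the same way, so skipping
-            // the payload is enough to land on the next type byte.
-            cursor.set_position(cursor.position() + payload_len);
-            if type_byte == (BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_EOS) {
-                break;
-            }
-        }
-        // Nothing may follow the EOS block's finish bytes.
-        assert_eq!(cursor.position() as usize, compressed.len());
-    }
-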
#[test] - fn roundtrip_zip_chunks() { - roundtrip_deflate_chunks("samplezip.zip"); - } - - #[test] - fn roundtrip_gz_chunks() { - roundtrip_deflate_chunks("sample1.bin.gz"); - } - - #[test] - fn roundtrip_png_chunks2() { - roundtrip_deflate_chunks("starcontrol.samplesave"); - } - - #[test] - fn roundtrip_small_chunk() { - use crate::utils::{assert_eq_array, read_file}; - - let original = read_file("pptxplaintext.zip"); - - let mut context = PreflateContainerProcessor::new( - &PreflateContainerConfig { - min_chunk_size: 100000, - max_chunk_size: 100000, - total_plain_text_limit: u64::MAX, - ..Default::default() - }, - 1, - false, - ); - - let compressed = context.process_vec_size(&original, 20001).unwrap(); - - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, 20001).unwrap(); - - assert_eq_array(&original, &recreated); - } - - #[test] - fn roundtrip_small_plain_text() { - use crate::utils::{assert_eq_array, read_file}; - - let original = read_file("pptxplaintext.zip"); - - let mut context = PreflateContainerProcessor::new( - &PreflateContainerConfig { - min_chunk_size: 100000, - max_chunk_size: 100000, - total_plain_text_limit: u64::MAX, - ..Default::default() - }, - 1, - false, - ); - - let compressed = context.process_vec_size(&original, 2001).unwrap(); - - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec_size(&compressed, 2001).unwrap(); - - assert_eq_array(&original, &recreated); - } - - #[test] - fn roundtrip_zstd_per_block() { - use crate::utils::{assert_eq_array, read_file}; - - let original = read_file("samplezip.zip"); - - let mut context = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - - let compressed = context.process_vec(&original).unwrap(); - - let mut context = RecreateContainerProcessor::new(usize::MAX); - let recreated = context.process_vec(&compressed).unwrap(); - - assert_eq_array(&original, &recreated); - } - - // ── Block type bit-field tests ─────────────────────────────────────────────── - - /// Parse the outer framing of a v2 container and return each block's - /// (compression_bits, block_type_bits) in order, stopping after EOS. - fn parse_wire_block_types(data: &[u8]) -> Vec<(u8, u8)> { - let mut cursor = std::io::Cursor::new(data); - let version = cursor.read_u8().unwrap(); - assert_eq!(version, COMPRESSED_WRAPPER_VERSION_2); - let mut blocks = Vec::new(); - while (cursor.position() as usize) < data.len() { - let type_byte = cursor.read_u8().unwrap(); - let compression = type_byte & BLOCK_COMPRESSION_MASK; - let block_type = type_byte & BLOCK_TYPE_MASK; - blocks.push((compression, block_type)); - let size = read_varint(&mut cursor).unwrap() as u64; - cursor.set_position(cursor.position() + size); - if compression == BLOCK_COMPRESSION_ZSTD && block_type == BLOCK_TYPE_EOS { - break; - } - } - blocks - } - - /// Feed `stream` to the decoder with input_complete=true and assert the - /// error exit code matches `expected`. - fn assert_decoder_fails(stream: &[u8], expected: preflate_rs::ExitCode) { - let mut ctx = RecreateContainerProcessor::new(usize::MAX); - let mut out = Vec::new(); - let err = ctx - .process_buffer(stream, true, &mut out) - .expect_err("expected an error, but decoder returned Ok"); - assert_eq!( - err.exit_code(), - expected, - "wrong exit code for stream {stream:02X?}" - ); - } - - /// The two masks must partition the byte: non-overlapping and together covering all 8 bits. 
- /// Every content-kind constant must sit entirely within BLOCK_TYPE_MASK, and every - /// compression constant within BLOCK_COMPRESSION_MASK. - #[test] - fn test_bit_field_masks_partition_byte() { - assert_eq!( - BLOCK_COMPRESSION_MASK | BLOCK_TYPE_MASK, - 0xFF, - "masks do not cover all bits" - ); - assert_eq!( - BLOCK_COMPRESSION_MASK & BLOCK_TYPE_MASK, - 0x00, - "masks overlap" - ); - for kind in [ - BLOCK_TYPE_LITERAL, - BLOCK_TYPE_DEFLATE, - BLOCK_TYPE_PNG, - BLOCK_TYPE_DEFLATE_CONTINUE, - BLOCK_TYPE_JPEG_LEPTON, - BLOCK_TYPE_WEBP, - BLOCK_TYPE_EOS, - ] { - assert_eq!( - kind & BLOCK_COMPRESSION_MASK, - 0, - "BLOCK_TYPE 0x{kind:02X} bleeds into compression bits" - ); - } - for comp in [BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD] { - assert_eq!( - comp & BLOCK_TYPE_MASK, - 0, - "BLOCK_COMPRESSION 0x{comp:02X} bleeds into type bits" - ); - } - } - - /// The combined (compression | kind) wire bytes must match the expected values - /// documented in CLAUDE.md. This catches accidental constant drift. - #[test] - fn test_combined_wire_values() { - assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, 0x40); - assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE, 0x41); - assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_PNG, 0x42); - assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE_CONTINUE, 0x43); - assert_eq!(BLOCK_COMPRESSION_NONE | BLOCK_TYPE_JPEG_LEPTON, 0x04); - assert_eq!(BLOCK_COMPRESSION_NONE | BLOCK_TYPE_WEBP, 0x05); - assert_eq!(BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_EOS, 0x7F); - } - - /// Reserved compression bits 0x80 (10xx_xxxx) must be rejected by the decoder. - #[test] - fn test_decoder_rejects_reserved_compression_bits_10() { - assert_decoder_fails( - &[COMPRESSED_WRAPPER_VERSION_2, 0x80], - preflate_rs::ExitCode::InvalidCompressedWrapper, - ); - } - - /// Reserved compression bits 0xC0 (11xx_xxxx) must be rejected by the decoder. - #[test] - fn test_decoder_rejects_reserved_compression_bits_11() { - assert_decoder_fails( - &[COMPRESSED_WRAPPER_VERSION_2, 0xC0], - preflate_rs::ExitCode::InvalidCompressedWrapper, - ); - } - - /// BLOCK_COMPRESSION_NONE | BLOCK_TYPE_LITERAL (0x00) must be rejected: - /// literal blocks are Zstd-only; there is no raw literal block type. - #[test] - fn test_decoder_rejects_raw_literal_block_type() { - let byte = BLOCK_COMPRESSION_NONE | BLOCK_TYPE_LITERAL; // == 0x00 - assert_decoder_fails( - &[COMPRESSED_WRAPPER_VERSION_2, byte], - preflate_rs::ExitCode::InvalidCompressedWrapper, - ); - } - - /// Any BLOCK_COMPRESSION_NONE byte that is not JPEG_LEPTON or WEBP must be rejected. - #[test] - fn test_decoder_rejects_undefined_raw_block_types() { - // 0x10 is arbitrary: not 0x04 (JPEG) or 0x05 (WEBP) - let byte = BLOCK_COMPRESSION_NONE | 0x10; - assert_decoder_fails( - &[COMPRESSED_WRAPPER_VERSION_2, byte], - preflate_rs::ExitCode::InvalidCompressedWrapper, - ); - } - - /// Compressing plain bytes (no embedded DEFLATE streams) must produce a stream - /// whose first block carries BLOCK_COMPRESSION_ZSTD and BLOCK_TYPE_LITERAL. 
- #[test] - fn test_encoder_literal_block_carries_zstd_compression_bit() { - let input = vec![0xABu8; 512]; - let mut ctx = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - let compressed = ctx.process_vec(&input).unwrap(); - - let blocks = parse_wire_block_types(&compressed); - assert!( - !blocks.is_empty(), - "expected at least one block in the output" - ); - assert_eq!( - blocks[0], - (BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_LITERAL), - "first block should be a Zstd literal block" - ); - } - - /// The EOS block that closes the Zstd frame must always use BLOCK_COMPRESSION_ZSTD. - #[test] - fn test_encoder_eos_uses_zstd_compression_bit() { - // Plain bytes with no DEFLATE streams → [version][literal][EOS]. - let input = vec![0xABu8; 64]; - let mut ctx = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - let compressed = ctx.process_vec(&input).unwrap(); - - let blocks = parse_wire_block_types(&compressed); - assert_eq!( - blocks.last(), - Some(&(BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_EOS)), - "last block must be the Zstd EOS marker" - ); - } - - /// Every block type byte in a real compressed output must have compression bits - /// of either BLOCK_COMPRESSION_NONE or BLOCK_COMPRESSION_ZSTD — never the - /// reserved patterns 0x80 or 0xC0. - #[test] - fn test_encoder_never_emits_reserved_compression_bits() { - let input = crate::utils::read_file("samplezip.zip"); - let mut ctx = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - let compressed = ctx.process_vec(&input).unwrap(); - - for &(compression, _) in &parse_wire_block_types(&compressed) { - assert!( - compression == BLOCK_COMPRESSION_NONE || compression == BLOCK_COMPRESSION_ZSTD, - "found reserved compression bits 0x{compression:02X} in output" - ); - } - } - - /// Verify that the decoder extracts the lower 6 bits as block_type rather - /// than passing the full byte to process_compressed_block. If it passed the - /// full byte (0x41) instead of the kind bits (0x01), the match would fall - /// through to the error arm and the round-trip would fail. - #[test] - fn test_decoder_strips_compression_bits_before_dispatch() { - use crate::utils::{assert_eq_array, read_file}; - // A zip file exercises DEFLATE blocks (wire type 0x41 = ZSTD|DEFLATE). - // A successful round-trip proves the decoder is matching on 0x01, not 0x41. - let original = read_file("samplezip.zip"); - let mut enc = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - let compressed = enc.process_vec(&original).unwrap(); - - // Confirm the stream actually contains DEFLATE blocks (type 0x41), - // so the test is meaningful and not trivially passing. - let has_deflate = parse_wire_block_types(&compressed) - .iter() - .any(|&(c, t)| c == BLOCK_COMPRESSION_ZSTD && t == BLOCK_TYPE_DEFLATE); - assert!( - has_deflate, - "test file produced no DEFLATE blocks — test is vacuous" - ); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - assert_eq_array(&original, &recreated); - } - - /// A PNG file must produce at least one PNG or WebP IDAT block (not merely a DEFLATE - /// block), and must round-trip to the original bytes. The PNG code path in the encoder - /// is distinct from the plain DEFLATE path: it reconstructs IDAT framing and, when the - /// `webp` feature is enabled, may store pixels as WebP lossless instead of raw. 
- #[test] - fn test_png_produces_idat_block_and_roundtrips() { - use crate::utils::{assert_eq_array, read_file}; - let original = read_file("treegdi.png"); - let mut enc = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - let compressed = enc.process_vec(&original).unwrap(); - - let blocks = parse_wire_block_types(&compressed); - let has_png_block = blocks - .iter() - .any(|&(_, t)| t == BLOCK_TYPE_PNG || t == BLOCK_TYPE_WEBP); - assert!( - has_png_block, - "PNG input should produce at least one PNG (0x02) or WebP (0x05) block, \ - got: {blocks:?}" - ); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - assert_eq_array(&original, &recreated); - } - - /// A PDF containing embedded JPEG images must produce JPEG_LEPTON blocks (raw, - /// outside Zstd) as well as DEFLATE blocks for the PDF's own compressed object - /// streams. Both must survive a full round-trip. - #[test] - fn test_pdf_with_jpegs_produces_lepton_and_deflate_blocks_and_roundtrips() { - use crate::utils::{assert_eq_array, read_file}; - let original = read_file("embedded-images.pdf"); - let mut enc = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - let compressed = enc.process_vec(&original).unwrap(); - - let blocks = parse_wire_block_types(&compressed); - - let has_lepton = blocks - .iter() - .any(|&(c, t)| c == BLOCK_COMPRESSION_NONE && t == BLOCK_TYPE_JPEG_LEPTON); - assert!( - has_lepton, - "PDF with embedded JPEGs should produce at least one JPEG_LEPTON block" - ); - - let has_deflate = blocks.iter().any(|&(_, t)| t == BLOCK_TYPE_DEFLATE); - assert!( - has_deflate, - "PDF with embedded JPEGs should also produce DEFLATE blocks for compressed objects" - ); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - assert_eq_array(&original, &recreated); - } - - /// DEFLATE_CONTINUE blocks are produced when the compressed-data buffer is - /// truncated mid-stream: `DeflateParser::parse` reads to EOF and returns - /// `Ok` with `is_done()=false`, the encoder emits a DEFLATE block for the - /// plaintext decoded so far, saves the mid-stream state, and resumes on - /// subsequent calls via DEFLATE_CONTINUE blocks. - /// - /// `sample1.bin.gz` is a single gzip stream with ~418 KiB of uncompressed - /// content. Feeding it in 10 KiB slices (with `min_chunk_size=5000` so the - /// processor starts immediately) means the scanner always sees only a - /// partial window of the compressed stream, forcing many DEFLATE_CONTINUE - /// blocks that must all round-trip correctly. - #[test] - fn test_deflate_continue_blocks_appear_and_roundtrip() { - use crate::utils::{assert_eq_array, read_file}; - let original = read_file("sample1.bin.gz"); - // min_chunk_size: 0 so the loop processes data immediately after Start, - // letting Searching run with the first truncated chunk rather than waiting - // for an additional min_chunk_size bytes before beginning. - let mut enc = PreflateContainerProcessor::new( - &PreflateContainerConfig { - min_chunk_size: 0, - ..PreflateContainerConfig::default() - }, - 1, - false, - ); - // Feed the 263 KiB file in two pieces. The first piece (200 KiB) truncates - // the DEFLATE stream mid-way; decompress() hits EOF with at least one - // complete block already parsed, so it returns Ok(partial) / is_done()=false, - // causing the encoder to emit a DEFLATE block and enter DeflateContinue. 
- // The second piece completes the stream → DEFLATE_CONTINUE block. - let mut compressed = Vec::new(); - { - let chunk1 = &original[..200_000.min(original.len())]; - enc.process_buffer(chunk1, false, &mut compressed).unwrap(); - if original.len() > 200_000 { - let chunk2 = &original[200_000..]; - enc.process_buffer(chunk2, false, &mut compressed).unwrap(); - } - enc.process_buffer(&[], true, &mut compressed).unwrap(); - } - - let blocks = parse_wire_block_types(&compressed); - let n_continue = blocks - .iter() - .filter(|&&(_, t)| t == BLOCK_TYPE_DEFLATE_CONTINUE) - .count(); - assert!( - n_continue > 0, - "200 KiB chunks on a ~263 KiB gzip should force at least one DEFLATE_CONTINUE block; \ - blocks seen: {blocks:?}" - ); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec_size(&compressed, 10_000).unwrap(); - assert_eq_array(&original, &recreated); - } - - /// When `total_plain_text_limit` is exceeded the encoder stops analysing - /// deflate streams and writes the remaining bytes as LITERAL blocks. The - /// decoder must still reproduce the original bytes exactly, including the - /// unprocessed portion. - #[test] - fn test_total_plain_text_limit_forces_literal_fallback_and_roundtrips() { - use crate::utils::{assert_eq_array, read_file}; - // samplezip.zip has several DEFLATE entries; setting the limit to 1 byte - // ensures that after the first DEFLATE entry's plaintext is accumulated, - // every subsequent scan sees total_plain_text_seen > limit and falls back - // to writing remaining content as a single LITERAL block. - let original = read_file("samplezip.zip"); - let mut enc = PreflateContainerProcessor::new( - &PreflateContainerConfig { - total_plain_text_limit: 1, - ..PreflateContainerConfig::default() - }, - 1, - false, - ); - let compressed = enc.process_vec(&original).unwrap(); - - let blocks = parse_wire_block_types(&compressed); - - // At least one LITERAL block must appear (the fallback content). - let has_literal = blocks.iter().any(|&(_, t)| t == BLOCK_TYPE_LITERAL); - assert!( - has_literal, - "after total_plain_text_limit is exceeded, remaining content must be LITERAL" - ); - - // The stream must still decode back to the original bytes. - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - assert_eq_array(&original, &recreated); - } - - // ── Multi-scheme fixture tests ─────────────────────────────────────────────── - - /// Helper: compress `data` in one shot and return `(compressed, blocks)`. - fn compress_default(data: &[u8]) -> (Vec, Vec<(u8, u8)>) { - let mut enc = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - let compressed = enc.process_vec(data).unwrap(); - let blocks = parse_wire_block_types(&compressed); - (compressed, blocks) - } - - /// Helper: full roundtrip assertion — compress then decompress, check byte equality. - fn assert_roundtrip(original: &[u8]) { - let (compressed, _) = compress_default(original); - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - crate::utils::assert_eq_array(original, &recreated); - } - - /// Count how many blocks have a given block-type kind. - fn count_block_type(blocks: &[(u8, u8)], kind: u8) -> usize { - blocks.iter().filter(|&&(_, t)| t == kind).count() - } - - /// Two concatenated gzip streams — each contains plaintext well above - /// MIN_BLOCKSIZE=1024, so the scanner must emit exactly two DEFLATE blocks. 
-    ///
-    /// Fixture: `test_two_gzip_streams.bin`
-    /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS
-    #[test]
-    fn test_two_gzip_streams_produce_two_deflate_blocks_and_roundtrip() {
-        use crate::utils::read_file;
-        let original = read_file("test_two_gzip_streams.bin");
-        let (compressed, blocks) = compress_default(&original);
-
-        assert_eq!(
-            count_block_type(&blocks, BLOCK_TYPE_DEFLATE),
-            2,
-            "two consecutive gzip streams should each produce one DEFLATE block; blocks={blocks:?}"
-        );
-
-        let mut dec = RecreateContainerProcessor::new(usize::MAX);
-        let recreated = dec.process_vec(&compressed).unwrap();
-        crate::utils::assert_eq_array(&original, &recreated);
-    }
-
-    /// A gzip stream whose plaintext is below MIN_BLOCKSIZE (500 < 1024) must NOT
-    /// be promoted to a DEFLATE block — the whole file becomes a single literal chunk.
-    ///
-    /// Fixture: `test_tiny_gzip.bin`
-    /// Expected wire sequence: literal, EOS (no DEFLATE blocks)
-    #[test]
-    fn test_tiny_gzip_below_min_blocksize_becomes_literal_and_roundtrip() {
-        use crate::utils::read_file;
-        let original = read_file("test_tiny_gzip.bin");
-        let (compressed, blocks) = compress_default(&original);
-
-        assert_eq!(
-            count_block_type(&blocks, BLOCK_TYPE_DEFLATE),
-            0,
-            "gzip with 500-byte plaintext (<MIN_BLOCKSIZE) must stay literal; blocks={blocks:?}"
-        );
-
-        let mut dec = RecreateContainerProcessor::new(usize::MAX);
-        let recreated = dec.process_vec(&compressed).unwrap();
-        crate::utils::assert_eq_array(&original, &recreated);
-    }
-
-    /// A large gzip stream (plaintext > MIN_BLOCKSIZE) immediately followed by
-    /// a tiny gzip (plaintext < MIN_BLOCKSIZE). Only the large stream must become a
-    /// DEFLATE block; the small one stays literal.
-    ///
-    /// Fixture: `test_big_then_small_gzip.bin`
-    /// Expected wire sequence: literal, deflate, literal, EOS (exactly 1 DEFLATE block)
-    #[test]
-    fn test_big_gzip_deflate_small_gzip_literal_and_roundtrip() {
-        use crate::utils::read_file;
-        let original = read_file("test_big_then_small_gzip.bin");
-        let (compressed, blocks) = compress_default(&original);
-
-        assert_eq!(
-            count_block_type(&blocks, BLOCK_TYPE_DEFLATE),
-            1,
-            "only the large gzip stream should become a DEFLATE block; blocks={blocks:?}"
-        );
-
-        let mut dec = RecreateContainerProcessor::new(usize::MAX);
-        let recreated = dec.process_vec(&compressed).unwrap();
-        crate::utils::assert_eq_array(&original, &recreated);
-    }
-
-    /// A file with a valid gzip header but a deliberately corrupted DEFLATE body
-    /// (0xFF leading byte) must not crash. The scanner must gracefully abandon the
-    /// stream and encode the entire file as a literal block.
-    ///
-    /// Fixture: `test_corrupted_deflate.bin`
-    /// Expected wire sequence: literal, EOS (0 DEFLATE blocks)
-    #[test]
-    fn test_corrupted_deflate_body_falls_back_to_literal_and_roundtrip() {
-        use crate::utils::read_file;
-        let original = read_file("test_corrupted_deflate.bin");
-        let (compressed, blocks) = compress_default(&original);
-
-        assert_eq!(
-            count_block_type(&blocks, BLOCK_TYPE_DEFLATE),
-            0,
-            "corrupted DEFLATE body must not produce a DEFLATE block; blocks={blocks:?}"
-        );
-
-        let mut dec = RecreateContainerProcessor::new(usize::MAX);
-        let recreated = dec.process_vec(&compressed).unwrap();
-        crate::utils::assert_eq_array(&original, &recreated);
-    }
-
-    /// A file containing padding bytes, then two zlib streams (each with plaintext
-    /// > MIN_BLOCKSIZE), then more padding. The scanner must find both zlib headers
-    /// and emit exactly two DEFLATE blocks.
- /// - /// Fixture: `test_two_zlib_streams.bin` - /// layout: 100 × `\xDE\xAD` | zlib(EEEE×6000) | 100 × `\xDE\xAD` | zlib(FFFF×6000) | 100 × `\xDE\xAD` - /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS - #[test] - fn test_two_zlib_streams_produce_two_deflate_blocks_and_roundtrip() { - use crate::utils::read_file; - let original = read_file("test_two_zlib_streams.bin"); - let (compressed, blocks) = compress_default(&original); - - assert_eq!( - count_block_type(&blocks, BLOCK_TYPE_DEFLATE), - 2, - "two zlib streams surrounded by literal bytes should each produce a DEFLATE block; \ - blocks={blocks:?}" - ); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - crate::utils::assert_eq_array(&original, &recreated); - } - - /// A ZIP file containing three DEFLATE-compressed entries must produce exactly three - /// DEFLATE blocks — one per entry — and round-trip correctly. - /// - /// Fixture: `test_zip_3entries.zip` (entries G×20000, H×20000, I×20000 bytes) - /// Expected wire sequence: literal, deflate, literal, deflate, literal, deflate, literal, EOS - #[test] - fn test_zip_three_deflated_entries_produce_three_deflate_blocks_and_roundtrip() { - use crate::utils::read_file; - let original = read_file("test_zip_3entries.zip"); - let (compressed, blocks) = compress_default(&original); - - assert_eq!( - count_block_type(&blocks, BLOCK_TYPE_DEFLATE), - 3, - "ZIP with 3 DEFLATED entries should produce 3 DEFLATE blocks; blocks={blocks:?}" - ); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - crate::utils::assert_eq_array(&original, &recreated); - } - - /// A ZIP file with a STORED entry (method=0) followed by a DEFLATED entry (method=8). - /// `parse_zip_stream` returns `Err` for STORED entries so they become literal blocks; - /// only the DEFLATED entry is analysed and emitted as a DEFLATE block. - /// - /// Fixture: `test_zip_stored_then_deflated.zip` (J×8000 STORED, K×20000 DEFLATED) - /// Expected wire sequence: literal, deflate, literal, EOS (exactly 1 DEFLATE block) - #[test] - fn test_zip_stored_entry_stays_literal_deflated_entry_becomes_deflate_and_roundtrip() { - use crate::utils::read_file; - let original = read_file("test_zip_stored_then_deflated.zip"); - let (compressed, blocks) = compress_default(&original); - - assert_eq!( - count_block_type(&blocks, BLOCK_TYPE_DEFLATE), - 1, - "only the DEFLATED entry should become a DEFLATE block; STORED stays literal; \ - blocks={blocks:?}" - ); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - crate::utils::assert_eq_array(&original, &recreated); - } - - /// A buffer filled with pseudo-random bytes contains no recognisable DEFLATE/zlib/gzip - /// signatures. The entire file must be emitted as a single literal block with no - /// DEFLATE analysis. - /// - /// Fixture: `test_random_bytes.bin` (32 KiB pseudo-random) - /// Expected wire sequence: literal, EOS - #[test] - fn test_random_bytes_produce_no_deflate_blocks_and_roundtrip() { - use crate::utils::read_file; - let original = read_file("test_random_bytes.bin"); - let (compressed, blocks) = compress_default(&original); - - assert_eq!( - count_block_type(&blocks, BLOCK_TYPE_DEFLATE), - 0, - "random bytes contain no DEFLATE streams; blocks={blocks:?}" - ); - - // The literal block must survive the round-trip. 
- let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - crate::utils::assert_eq_array(&original, &recreated); - } - - /// Two gzip streams separated by a 1000-byte null gap. Both streams have - /// plaintext > MIN_BLOCKSIZE, so both must produce DEFLATE blocks, and the gap - /// must appear as a literal block between them. - /// - /// Fixture: `test_gzip_with_gap.bin` - /// Expected wire sequence: literal, deflate, literal, deflate, literal, EOS - #[test] - fn test_two_gzip_streams_with_null_gap_produce_two_deflate_blocks_and_roundtrip() { - use crate::utils::read_file; - let original = read_file("test_gzip_with_gap.bin"); - let (compressed, blocks) = compress_default(&original); - - assert_eq!( - count_block_type(&blocks, BLOCK_TYPE_DEFLATE), - 2, - "both gzip streams should become DEFLATE blocks; null gap stays literal; \ - blocks={blocks:?}" - ); - // There should be at least one literal block (the gap between the two streams). - assert!( - count_block_type(&blocks, BLOCK_TYPE_LITERAL) >= 1, - "null gap between gzip streams should produce at least one literal block; \ - blocks={blocks:?}" - ); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - crate::utils::assert_eq_array(&original, &recreated); - } - - /// Feed a fixture containing two gzip streams in very small chunks (64 bytes at a - /// time) via the incremental `process_buffer` API to exercise boundary handling. - /// The round-trip result must be byte-exact regardless of where chunk boundaries fall. - #[test] - fn test_two_gzip_streams_incremental_small_chunks_roundtrip() { - use crate::utils::{assert_eq_array, read_file}; - let original = read_file("test_two_gzip_streams.bin"); - - let mut enc = PreflateContainerProcessor::new( - &PreflateContainerConfig { - min_chunk_size: 0, - ..PreflateContainerConfig::default() - }, - 1, - false, - ); - let mut compressed = Vec::new(); - let chunk_size = 64; - let mut pos = 0; - while pos < original.len() { - let end = (pos + chunk_size).min(original.len()); - enc.process_buffer(&original[pos..end], false, &mut compressed) - .unwrap(); - pos = end; - } - enc.process_buffer(&[], true, &mut compressed).unwrap(); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - assert_eq_array(&original, &recreated); - } - - /// Feed `test_two_zlib_streams.bin` in small chunks (128 bytes) to confirm that - /// the incremental path handles mixed literal padding + zlib streams correctly. 
- #[test] - fn test_two_zlib_streams_incremental_small_chunks_roundtrip() { - use crate::utils::{assert_eq_array, read_file}; - let original = read_file("test_two_zlib_streams.bin"); - - let mut enc = PreflateContainerProcessor::new( - &PreflateContainerConfig { - min_chunk_size: 0, - ..PreflateContainerConfig::default() - }, - 1, - false, - ); - let mut compressed = Vec::new(); - let chunk_size = 128; - let mut pos = 0; - while pos < original.len() { - let end = (pos + chunk_size).min(original.len()); - enc.process_buffer(&original[pos..end], false, &mut compressed) - .unwrap(); - pos = end; - } - enc.process_buffer(&[], true, &mut compressed).unwrap(); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - assert_eq_array(&original, &recreated); - } - - /// Feed a ZIP fixture in small chunks (256 bytes) to check that chunk boundaries - /// inside the ZIP local-file headers and DEFLATE bodies are handled gracefully. - #[test] - fn test_zip_three_entries_incremental_small_chunks_roundtrip() { - use crate::utils::{assert_eq_array, read_file}; - let original = read_file("test_zip_3entries.zip"); - - let mut enc = PreflateContainerProcessor::new( - &PreflateContainerConfig { - min_chunk_size: 0, - ..PreflateContainerConfig::default() - }, - 1, - false, - ); - let mut compressed = Vec::new(); - let chunk_size = 256; - let mut pos = 0; - while pos < original.len() { - let end = (pos + chunk_size).min(original.len()); - enc.process_buffer(&original[pos..end], false, &mut compressed) - .unwrap(); - pos = end; - } - enc.process_buffer(&[], true, &mut compressed).unwrap(); - - let mut dec = RecreateContainerProcessor::new(usize::MAX); - let recreated = dec.process_vec(&compressed).unwrap(); - assert_eq_array(&original, &recreated); - } - - /// Verify that the decoder also handles the recreated stream correctly when fed in - /// small chunks, not just when given the entire buffer at once. - /// Uses `test_zip_stored_then_deflated.zip` (mixed STORED + DEFLATED entries). - #[test] - fn test_zip_stored_then_deflated_decoder_small_chunks_roundtrip() { - use crate::utils::{assert_eq_array, read_file}; - let original = read_file("test_zip_stored_then_deflated.zip"); - - let mut enc = - PreflateContainerProcessor::new(&PreflateContainerConfig::default(), 1, false); - let compressed = enc.process_vec(&original).unwrap(); - - // Decompress in 512-byte chunks to exercise the incremental decoder. 
-        let mut dec = RecreateContainerProcessor::new(usize::MAX);
-        let recreated = dec.process_vec_size(&compressed, 512).unwrap();
-        assert_eq_array(&original, &recreated);
-    }
-}
diff --git a/container/src/container_read.rs b/container/src/container_read.rs
new file mode 100644
index 0000000..795382a
--- /dev/null
+++ b/container/src/container_read.rs
@@ -0,0 +1,535 @@
+use byteorder::ReadBytesExt;
+use crc32fast::Hasher as CrcHasher;
+use lepton_jpeg::{DEFAULT_THREAD_POOL, EnabledFeatures};
+
+use std::{
+    collections::VecDeque,
+    io::{Cursor, Read, Write},
+};
+
+use crate::{
+    container_common::{
+        BLOCK_COMPRESSION_MASK, BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_DEFLATE,
+        BLOCK_TYPE_DEFLATE_CONTINUE, BLOCK_TYPE_JPEG_LEPTON, BLOCK_TYPE_LITERAL, BLOCK_TYPE_MASK,
+        BLOCK_TYPE_PNG, BLOCK_TYPE_WEBP, COMPRESSED_WRAPPER_VERSION_2, ProcessBuffer, read_varint,
+    },
+    idat_parse::{IdatContents, recreate_idat},
+    scoped_read::ScopedRead,
+};
+
+use preflate_rs::{
+    AddContext, ExitCode, PreflateError, RecreateStreamProcessor, Result, err_exit_code,
+    recreate_whole_deflate_stream,
+};
+
+/// Write wrapper that computes a running CRC-32 of every byte written.
+struct CrcWriter<'a, W: Write> {
+    inner: &'a mut W,
+    hasher: &'a mut CrcHasher,
+}
+
+impl<W: Write> Write for CrcWriter<'_, W> {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        let n = self.inner.write(buf)?;
+        self.hasher.update(&buf[..n]);
+        Ok(n)
+    }
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.inner.flush()
+    }
+}
+
+enum DecompressionState {
+    Start,
+    StartSegment,
+    /// accumulate compressed_size bytes then decode and process the block immediately.
+    AccumulateBlock {
+        block_type: u8,
+        compressed_size: usize,
+    },
+    /// accumulate lepton bytes then decode the JPEG block immediately.
+    JpegAccumulate {
+        lepton_length: usize,
+    },
+    /// accumulate raw WebP-compressed PNG bytes then process the block immediately.
+    WebpAccumulate {
+        total_len: usize,
+    },
+    /// 0xFF end block was parsed; CRC check deferred to process_buffer.
+    CrcCheck {
+        expected: u32,
+    },
+}
+
+/// recreates the original content from the chunked data
+pub struct RecreateContainerProcessor {
+    capacity: usize,
+    input: VecDeque<u8>,
+    input_complete: bool,
+    state: DecompressionState,
+
+    /// state of the predictor and plain text if we need to continue a deflate stream
+    /// if it was too big to complete in a single chunk
+    deflate_continue_state: Option<RecreateStreamProcessor>,
+
+    /// persistent Zstd decoder — maintains the streaming context across blocks
+    zstd_decoder: zstd::stream::raw::Decoder<'static>,
+
+    /// running CRC-32 of all output bytes, verified against the 0xFF end block
+    output_crc: CrcHasher,
+}
+
+impl RecreateContainerProcessor {
+    pub fn new(capacity: usize) -> Self {
+        RecreateContainerProcessor {
+            input: VecDeque::new(),
+            capacity,
+            input_complete: false,
+            state: DecompressionState::Start,
+            deflate_continue_state: None,
+            zstd_decoder: zstd::stream::raw::Decoder::new().expect("failed to create zstd decoder"),
+            output_crc: CrcHasher::new(),
+        }
+    }
+}
+
+impl ProcessBuffer for RecreateContainerProcessor {
+    fn process_buffer(
+        &mut self,
+        input: &[u8],
+        input_complete: bool,
+        writer: &mut impl Write,
+    ) -> Result<()> {
+        if self.input_complete && (input.len() > 0 || !input_complete) {
+            return Err(PreflateError::new(
+                ExitCode::InvalidParameter,
+                "more data provided after input_complete signaled",
+            ));
+        }
+
+        // we could have been passed a big buffer, so we need to process it in chunks
+        let mut amount_read = 0;
+        loop {
+            let amount_to_read = (input.len() - amount_read).min(self.capacity);
+
+            // when we get to the end and we've read everything, we can signal that we are done
+            if amount_read + amount_to_read == input.len() && input_complete {
+                self.input_complete = true;
+            }
+
+            self.input
+                .extend(&input[amount_read..amount_read + amount_to_read]);
+
+            amount_read += amount_to_read;
+
+            self.with_crc_writer(writer, |this, crc_writer| {
+                this.process_buffer_internal(crc_writer)
+            })?;
+
+            // If process_buffer_internal parsed the 0xFF end block, verify the CRC now
+            // that output_crc has been restored with all written bytes.
+            if let DecompressionState::CrcCheck { expected } = self.state {
+                let actual = self.output_crc.clone().finalize();
+                if actual != expected {
+                    return err_exit_code(
+                        ExitCode::InvalidCompressedWrapper,
+                        format!("CRC-32 mismatch: expected {expected:#010x}, got {actual:#010x}"),
+                    );
+                }
+                self.state = DecompressionState::StartSegment;
+            }
+
+            if amount_read == input.len() {
+                break;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+impl RecreateContainerProcessor {
+    /// Runs `f` with a `CrcWriter` wrapping `writer`, then restores `self.output_crc`.
+    ///
+    /// `self.output_crc` must be borrowed mutably through the `CrcWriter`, but
+    /// `f` also needs `&mut self` to drive the state machine — a direct borrow
+    /// conflict. This helper resolves it by temporarily swapping `output_crc`
+    /// out of `self` with `mem::replace`, so the field is no longer part of the
+    /// active `&mut self` borrow while `f` runs.
+ fn with_crc_writer(&mut self, writer: &mut W, f: F) -> Result<()> + where + F: FnOnce(&mut Self, &mut CrcWriter<'_, W>) -> Result<()>, + { + let mut hasher = std::mem::replace(&mut self.output_crc, CrcHasher::new()); + let result = { + let mut crc_writer = CrcWriter { + inner: writer, + hasher: &mut hasher, + }; + f(self, &mut crc_writer) + }; + self.output_crc = hasher; + result + } + + fn process_buffer_internal(&mut self, writer: &mut impl Write) -> Result<()> { + loop { + match &mut self.state { + DecompressionState::Start => { + if !self.input_complete && self.input.len() == 0 { + break; + } + + let version = self.input.read_u8()?; + + match version { + COMPRESSED_WRAPPER_VERSION_2 => { + self.state = DecompressionState::StartSegment; + } + _ => { + return err_exit_code( + ExitCode::InvalidCompressedWrapper, + format!("Invalid version {version}"), + ); + } + } + } + DecompressionState::StartSegment => { + // here's a good place to stop if we run out of input + if self.input.len() == 0 { + break; + } + + // read type byte, then dispatch + self.state = match self.input.scoped_read(|r| { + let type_byte = r.read_u8()?; + + // 0xFF is the CRC end block: 4 raw bytes, no varint. + if type_byte == 0xFF { + let mut buf = [0u8; 4]; + r.read_exact(&mut buf)?; + return Ok(DecompressionState::CrcCheck { + expected: u32::from_le_bytes(buf), + }); + } + + let compression = type_byte & BLOCK_COMPRESSION_MASK; + let block_type = type_byte & BLOCK_TYPE_MASK; + match compression { + BLOCK_COMPRESSION_NONE => match block_type { + BLOCK_TYPE_JPEG_LEPTON => { + let lepton_length = read_varint(r)? as usize; + Ok(DecompressionState::JpegAccumulate { lepton_length }) + } + BLOCK_TYPE_WEBP => { + let total_len = read_varint(r)? as usize; + Ok(DecompressionState::WebpAccumulate { total_len }) + } + _ => err_exit_code( + ExitCode::InvalidCompressedWrapper, + "unknown raw block type", + ), + }, + BLOCK_COMPRESSION_ZSTD => { + let compressed_size = read_varint(r)? 
as usize; + Ok(DecompressionState::AccumulateBlock { + block_type, + compressed_size, + }) + } + _ => err_exit_code( + ExitCode::InvalidCompressedWrapper, + "unknown compression algorithm", + ), + } + }) { + Ok(s) => s, + Err(e) => { + if !self.input_complete && e.exit_code() == ExitCode::ShortRead { + break; + } else { + return Err(e); + } + } + }; + } + + DecompressionState::AccumulateBlock { + block_type, + compressed_size, + } => { + if self.input.len() < *compressed_size { + if self.input_complete { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + "unexpected end of input in block", + )); + } + break; + } + + let block_type = *block_type; + let compressed_bytes: Vec = self.input.drain(0..*compressed_size).collect(); + let decoded = drain_zstd_block(&mut self.zstd_decoder, &compressed_bytes)?; + process_compressed_block( + block_type, + &mut Cursor::new(decoded), + &mut self.deflate_continue_state, + writer, + )?; + self.state = DecompressionState::StartSegment; + } + + DecompressionState::JpegAccumulate { lepton_length } => { + if self.input.len() < *lepton_length { + if self.input_complete { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + "unexpected end of input in jpeg block", + )); + } + break; + } + + let lepton_bytes: Vec = self.input.drain(0..*lepton_length).collect(); + match lepton_jpeg::decode_lepton( + &mut Cursor::new(&lepton_bytes), + writer, + &EnabledFeatures::compat_lepton_vector_read(), + &DEFAULT_THREAD_POOL, + ) { + Err(e) => { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + format!("JPEG Lepton decode failed: {}", e), + )); + } + Ok(_) => {} + } + self.state = DecompressionState::StartSegment; + } + + DecompressionState::WebpAccumulate { total_len } => { + if self.input.len() < *total_len { + if self.input_complete { + return Err(PreflateError::new( + ExitCode::InvalidCompressedWrapper, + "unexpected end of input in webp block", + )); + } + break; + } + + let webp_bytes: Vec = self.input.drain(0..*total_len).collect(); + // Payload is what webp_compress wrote after the BLOCK_TYPE_PNG type byte, + // so process_compressed_block can parse it directly. + process_compressed_block( + BLOCK_TYPE_PNG, + &mut Cursor::new(webp_bytes), + &mut self.deflate_continue_state, + writer, + )?; + self.state = DecompressionState::StartSegment; + } + + DecompressionState::CrcCheck { .. } => { + // CRC verification is handled in process_buffer after this returns. + break; + } + } + } + + Ok(()) + } +} + +/// Feeds `compressed` bytes into the persistent `decoder` and returns all decompressed output. +/// +/// Each call corresponds to one Zstd flush frame (written by the encoder via `flush()`). +/// After consuming all input bytes the decoder is drained until it produces no more output, +/// which is guaranteed because `ZSTD_e_flush` ensures all data is available to the decoder +/// before the next block starts. 
+fn drain_zstd_block( + decoder: &mut zstd::stream::raw::Decoder<'static>, + compressed: &[u8], +) -> Result> { + use zstd::stream::raw::{InBuffer, Operation, OutBuffer}; + + let mut output = Vec::new(); + let mut scratch = vec![0u8; 65536]; + let mut in_buf = InBuffer::around(compressed); + + loop { + let mut out_buf = OutBuffer::around(scratch.as_mut_slice()); + decoder.run(&mut in_buf, &mut out_buf).map_err(|e| { + PreflateError::new( + ExitCode::InvalidCompressedWrapper, + format!("zstd decode failed: {e}"), + ) + })?; + let produced = out_buf.pos(); + output.extend_from_slice(&scratch[..produced]); + + // Stop when all input has been consumed and the decoder produced no more output. + // zstd guarantees progress (either bytes_read > 0 or bytes_written > 0) so this + // loop always terminates. + if in_buf.pos() >= compressed.len() && produced == 0 { + break; + } + } + + Ok(output) +} + +/// Parses and processes a single non-JPEG/non-WebP block. +/// +/// `cursor` wraps the output of `drain_zstd_block` for compressed blocks, +/// or the raw WebP payload for `BLOCK_TYPE_PNG` blocks stored outside Zstd. +/// +/// Layout written by the encoder for each block type (block_type = lower 6 bits): +/// BLOCK_TYPE_LITERAL: varint(len) + data +/// BLOCK_TYPE_DEFLATE: varint(corrections_len) + varint(plaintext_len) + corrections + plaintext +/// BLOCK_TYPE_DEFLATE_CONTINUE: same as BLOCK_TYPE_DEFLATE +/// BLOCK_TYPE_PNG: varint(correction_length) + varint(uncompressed_length) + +/// IdatContents + [filters if png_header present] + +/// corrections + (webp_data or raw_plaintext) +fn process_compressed_block( + block_type: u8, + cursor: &mut Cursor>, + deflate_continue_state: &mut Option, + writer: &mut impl Write, +) -> Result<()> { + match block_type { + BLOCK_TYPE_LITERAL => { + let length = read_varint(cursor)? as usize; + let mut data = vec![0u8; length]; + cursor.read_exact(&mut data).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + writer.write_all(&data)?; + } + BLOCK_TYPE_DEFLATE => { + *deflate_continue_state = None; + + let correction_length = read_varint(cursor)? as usize; + let uncompressed_length = read_varint(cursor)? as usize; + + let mut corrections = vec![0u8; correction_length]; + cursor.read_exact(&mut corrections).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let mut plain_text_buf = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut plain_text_buf).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let mut reconstruct = RecreateStreamProcessor::new(); + let (comp, _) = reconstruct + .recompress(&mut Cursor::new(&plain_text_buf), &corrections) + .context()?; + + writer.write_all(&comp)?; + *deflate_continue_state = Some(reconstruct); + } + BLOCK_TYPE_DEFLATE_CONTINUE => { + let correction_length = read_varint(cursor)? as usize; + let uncompressed_length = read_varint(cursor)? 
as usize; + + let mut corrections = vec![0u8; correction_length]; + cursor.read_exact(&mut corrections).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let mut plain_text_buf = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut plain_text_buf).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let reconstruct = deflate_continue_state.as_mut().ok_or_else(|| { + PreflateError::new( + ExitCode::InvalidCompressedWrapper, + "no deflate state to continue", + ) + })?; + + let (comp, _) = reconstruct + .recompress(&mut Cursor::new(&plain_text_buf), &corrections) + .context()?; + + writer.write_all(&comp)?; + } + BLOCK_TYPE_PNG => { + let correction_length = read_varint(cursor)? as usize; + let uncompressed_length = read_varint(cursor)? as usize; + let idat = IdatContents::read_from_bytestream(cursor)?; + + let mut filters = Vec::new(); + if let Some(png_header) = &idat.png_header { + filters.resize(png_header.height as usize, 0); + cursor.read_exact(&mut filters[..]).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + } + + let mut corrections = vec![0u8; correction_length]; + cursor.read_exact(&mut corrections).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + + let plain_text; + if let Some(header) = &idat.png_header { + let mut webp = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut webp).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + plain_text = webp_decompress(&filters, webp, header).context()?; + } else { + let mut raw = vec![0u8; uncompressed_length]; + cursor.read_exact(&mut raw).map_err(|e| { + PreflateError::new(ExitCode::InvalidCompressedWrapper, e.to_string()) + })?; + plain_text = raw; + } + + let recompressed = + recreate_whole_deflate_stream(&plain_text, &corrections).context()?; + + recreate_idat(&idat, &recompressed[..], writer).context()?; + } + _ => { + return err_exit_code( + ExitCode::InvalidCompressedWrapper, + format!("Unknown block type {block_type}"), + ); + } + } + Ok(()) +} + +fn webp_decompress( + filters: &[u8], + webp: Vec, + header: &crate::idat_parse::PngHeader, +) -> Result> { + #[cfg(feature = "webp")] + match webp::Decoder::new(webp.as_slice()).decode() { + Some(result) => { + use crate::idat_parse::apply_png_filters_with_types; + use std::ops::Deref; + + let m = result.deref(); + + return Ok(apply_png_filters_with_types( + m, + header.width as usize, + header.height as usize, + if result.is_alpha() { 4 } else { 3 }, + header.color_type.bytes_per_pixel(), + &filters, + )); + } + _ => {} + } + return err_exit_code(ExitCode::InvalidCompressedWrapper, "Webp decode failed"); +} diff --git a/container/src/container_write.rs b/container/src/container_write.rs new file mode 100644 index 0000000..2e9c5e5 --- /dev/null +++ b/container/src/container_write.rs @@ -0,0 +1,534 @@ +use crc32fast::Hasher as CrcHasher; + +use std::io::Write; + +use crate::{ + container_common::{ + BLOCK_COMPRESSION_NONE, BLOCK_COMPRESSION_ZSTD, BLOCK_TYPE_DEFLATE, + BLOCK_TYPE_DEFLATE_CONTINUE, BLOCK_TYPE_JPEG_LEPTON, BLOCK_TYPE_LITERAL, BLOCK_TYPE_PNG, + BLOCK_TYPE_WEBP, PreflateContainerConfig, PreflateStats, ProcessBuffer, write_varint, + }, + idat_parse::{IdatContents, PngHeader}, + scan_deflate::{FindStreamResult, FoundStream, FoundStreamType, find_compressable_stream}, +}; + +use preflate_rs::{AddContext, ExitCode, PreflateError, 
PreflateStreamProcessor, Result}; + +/// used to measure the length of the output without storing it +pub(crate) struct MeasureWriteSink { + pub length: usize, +} + +impl Write for MeasureWriteSink { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.length += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +#[derive(Debug)] +pub(crate) enum ChunkParseState { + Start, + /// we are looking for a deflate stream or PNG chunk. The data of the PNG file + /// is stored later than the IHDR chunk that will tell us the dimensions of the image, + /// so we need to keep track of the IHDR chunk so we can use it later to properly + /// compress the PNG data. + Searching(Option), + DeflateContinue(PreflateStreamProcessor), +} + +/// V2 variant of write_chunk_block: block content goes through the persistent Zstd encoder. +/// JPEG blocks are written raw to writer (bypass encoder). +/// Returns (total compressed bytes written, optional continue state). +pub(crate) fn write_chunk_block_v2( + encoder: &mut zstd::stream::write::Encoder<'static, Vec>, + writer: &mut impl Write, + chunk: FoundStream, + stats: &mut PreflateStats, +) -> Result<(usize, Option)> { + match chunk.chunk_type { + FoundStreamType::DeflateStream(parameters, state) => { + write_varint(encoder, chunk.corrections.len() as u32)?; + write_varint(encoder, state.plain_text().text().len() as u32)?; + encoder.write_all(&chunk.corrections)?; + encoder.write_all(&state.plain_text().text())?; + + let compressed_size = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE, + encoder, + writer, + )?; + + stats.overhead_bytes += chunk.corrections.len() as u64; + stats.uncompressed_size += state.plain_text().len() as u64; + stats.hash_algorithm = parameters.hash_algorithm; + + if !state.is_done() { + return Ok((compressed_size, Some(state))); + } + Ok((compressed_size, None)) + } + + FoundStreamType::IDATDeflate(parameters, mut idat, plain_text) => { + log::debug!( + "IDATDeflate param {:?} corrections {}", + parameters, + chunk.corrections.len() + ); + + let mut temp_vec = Vec::new(); + + if webp_compress(&mut temp_vec, plain_text.text(), &chunk.corrections, &idat).is_ok() { + // WebP is already compressed — write raw, bypassing the Zstd encoder. + // temp_vec[0] is the BLOCK_TYPE_PNG placeholder byte; temp_vec[1..] is the payload. + let payload = &temp_vec[1..]; + writer.write_all(&[BLOCK_COMPRESSION_NONE | BLOCK_TYPE_WEBP])?; + write_varint(writer, payload.len() as u32)?; + writer.write_all(payload)?; + + stats.uncompressed_size += plain_text.len() as u64; + stats.hash_algorithm = parameters.hash_algorithm; + stats.overhead_bytes += chunk.corrections.len() as u64; + + Ok((payload.len(), None)) + } else { + // Non-WebP PNG: corrections + plaintext are compressible, send through Zstd. 
+                log::debug!("non-Webp compressed {}", idat.total_chunk_length);
+                write_varint(encoder, chunk.corrections.len() as u32)?;
+                write_varint(encoder, plain_text.text().len() as u32)?;
+                idat.png_header = None;
+                idat.write_to_bytestream(encoder)?;
+                encoder.write_all(&chunk.corrections)?;
+                encoder.write_all(plain_text.text())?;
+
+                let compressed_size = emit_compressed_block(
+                    BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_PNG,
+                    encoder,
+                    writer,
+                )?;
+
+                stats.uncompressed_size += plain_text.len() as u64;
+                stats.hash_algorithm = parameters.hash_algorithm;
+                stats.overhead_bytes += chunk.corrections.len() as u64;
+
+                Ok((compressed_size, None))
+            }
+        }
+
+        FoundStreamType::JPEGLepton(data) => {
+            // JPEG is written raw (bypasses the encoder entirely)
+            writer.write_all(&[BLOCK_COMPRESSION_NONE | BLOCK_TYPE_JPEG_LEPTON])?;
+            write_varint(writer, data.len() as u32)?;
+            writer.write_all(&data)?;
+
+            stats.uncompressed_size += data.len() as u64;
+            Ok((0, None))
+        }
+    }
+}
+
+/// Takes a sequence of bytes that may contain deflate streams, finds
+/// the streams, and emits a new stream that contains the decompressed
+/// streams along with the corrections needed to recreate the original.
+///
+/// This output can then be compressed with a better algorithm, like Zstandard,
+/// and achieve much better compression than if we tried to compress the
+/// deflate stream directly.
+pub struct PreflateContainerProcessor {
+    content: Vec<u8>,
+    compression_stats: PreflateStats,
+    input_complete: bool,
+    total_plain_text_seen: u64,
+
+    /// used to track the last attempted chunk size, in case we
+    /// need more input to continue, we will collect at least min_chunk_size
+    /// more input before trying to process again until we reach max_chunk_size
+    last_attempt_chunk_size: usize,
+
+    state: ChunkParseState,
+    config: PreflateContainerConfig,
+
+    /// running CRC-32 of all input bytes, written as the final block
+    input_crc: CrcHasher,
+
+    /// each block is individually compressed with this encoder (v2 format)
+    encoder: Option<zstd::stream::write::Encoder<'static, Vec<u8>>>,
+
+    /// when present, all raw input is also fed to this encoder so we can measure
+    /// baseline Zstd compression (without preflate processing)
+    baseline_encoder: Option<zstd::stream::write::Encoder<'static, MeasureWriteSink>>,
+}
+
+impl PreflateContainerProcessor {
+    /// Creates a processor that uses v2 format with a persistent Zstd encoder shared
+    /// across all non-JPEG blocks. JPEG blocks bypass the encoder entirely.
+ pub fn new(config: &PreflateContainerConfig, level: i32, test_baseline: bool) -> Self { + PreflateContainerProcessor { + content: Vec::new(), + compression_stats: PreflateStats::default(), + input_complete: false, + state: ChunkParseState::Start, + total_plain_text_seen: 0, + last_attempt_chunk_size: 0, + config: config.clone(), + input_crc: CrcHasher::new(), + encoder: Some(zstd::stream::write::Encoder::new(Vec::new(), level).unwrap()), + baseline_encoder: if test_baseline { + Some( + zstd::stream::write::Encoder::new(MeasureWriteSink { length: 0 }, level) + .unwrap(), + ) + } else { + None + }, + } + } +} + +impl ProcessBuffer for PreflateContainerProcessor { + fn process_buffer( + &mut self, + input: &[u8], + input_complete: bool, + writer: &mut impl Write, + ) -> Result<()> { + use crate::container_common::COMPRESSED_WRAPPER_VERSION_2; + + if self.input_complete && (input.len() > 0 || !input_complete) { + return Err(PreflateError::new( + ExitCode::InvalidParameter, + "more data provided after input_complete signaled", + )); + } + + if input.len() > 0 { + self.compression_stats.deflate_compressed_size += input.len() as u64; + self.input_crc.update(input); + self.content.extend_from_slice(input); + + if let Some(encoder) = &mut self.baseline_encoder { + encoder.write_all(input).context()?; + } + } + + loop { + // wait until we have at least min_chunk_size before we start processing + if self.content.is_empty() + || (!input_complete + && (self.content.len() - self.last_attempt_chunk_size) + < self.config.min_chunk_size + && self.content.len() <= self.config.max_chunk_size) + { + break; + } + + self.last_attempt_chunk_size = self.content.len(); + + match &mut self.state { + ChunkParseState::Start => { + writer.write_all(&[COMPRESSED_WRAPPER_VERSION_2])?; + self.state = ChunkParseState::Searching(None); + } + ChunkParseState::Searching(prev_ihdr) => { + if self.total_plain_text_seen > self.config.total_plain_text_limit { + // once we've exceeded our limit, we don't do any more compression + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + + self.last_attempt_chunk_size = 0; + self.content.clear(); + break; + } + + // here we are looking for a deflate stream or PNG chunk + match find_compressable_stream( + &self.content, + prev_ihdr, + input_complete, + &self.config, + ) { + FindStreamResult::Found(next, chunk) => { + // the gap between the start and the beginning of the deflate stream + // is written out as a literal block + if next.start != 0 { + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, next.start as u32)?; + encoder.write_all(&self.content[..next.start])?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + } + + let (compressed_size, next_state) = write_chunk_block_v2( + self.encoder.as_mut().unwrap(), + writer, + chunk, + &mut self.compression_stats, + ) + .context()?; + self.compression_stats.zstd_compressed_size += compressed_size as u64; + + if let Some(mut state) = next_state { + self.total_plain_text_seen += state.plain_text().len() as u64; + state.shrink_to_dictionary(); + self.state = ChunkParseState::DeflateContinue(state); + } + + self.content.drain(0..next.end); + 
self.last_attempt_chunk_size = self.content.len(); + } + FindStreamResult::ShortRead => { + if input_complete || self.content.len() > self.config.max_chunk_size { + // if we have too much data or have no more data, + // we just write it out as a literal block with everything we have + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + + self.content.clear(); + self.last_attempt_chunk_size = 0; + } else { + // we don't have enough data to process the stream, so we just + // wait for more data + break; + } + } + FindStreamResult::None => { + // couldn't find anything, just write the rest as a literal block + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + + self.content.clear(); + self.last_attempt_chunk_size = 0; + } + } + } + ChunkParseState::DeflateContinue(state) => { + // here we have a deflate stream that we need to continue + match state.decompress(&self.content) { + Err(ref e) + if e.exit_code() == ExitCode::ShortRead + && !input_complete + && self.content.len() <= self.config.max_chunk_size => + { + // Not enough data to complete the next block yet; wait for more. + break; + } + Err(_e) => { + // Stream analysis diverged or no more data is coming; give up on + // continuation and fall back to treating the remaining bytes as raw. + self.state = ChunkParseState::Searching(None); + + log::debug!("Error while trying to continue compression {:?}", _e); + } + Ok(res) => { + log::debug!( + "Deflate continue: {} -> {}", + state.plain_text().len(), + res.compressed_size + ); + + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, res.corrections.len() as u32)?; + write_varint(encoder, state.plain_text().len() as u32)?; + encoder.write_all(&res.corrections)?; + encoder.write_all(&state.plain_text().text())?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_DEFLATE_CONTINUE, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + + self.total_plain_text_seen += state.plain_text().len() as u64; + self.compression_stats.overhead_bytes += res.corrections.len() as u64; + self.compression_stats.uncompressed_size += + state.plain_text().len() as u64; + + self.content.drain(0..res.compressed_size); + self.last_attempt_chunk_size = self.content.len(); + + if state.is_done() { + self.state = ChunkParseState::Searching(None); + } else { + state.shrink_to_dictionary(); + } + } + } + } + } + } + + if input_complete && !self.input_complete { + self.input_complete = true; + + if self.content.len() > 0 { + let encoder = self.encoder.as_mut().unwrap(); + write_varint(encoder, self.content.len() as u32)?; + encoder.write_all(&self.content)?; + let sz = emit_compressed_block( + BLOCK_COMPRESSION_ZSTD | BLOCK_TYPE_LITERAL, + encoder, + writer, + )?; + self.compression_stats.zstd_compressed_size += sz as u64; + } + self.content.clear(); + + // Finalize the Zstd encoder; finish bytes are discarded since each block + // was already flushed and the decoder relies on EOF as the stream terminator. 
+            let encoder = self.encoder.take().unwrap();
+            let _ = encoder.finish();
+
+            // Write the CRC-32 end block: 0xFF sentinel + 4-byte LE CRC of original input.
+            let crc = self.input_crc.clone().finalize();
+            writer.write_all(&[0xFF])?;
+            writer.write_all(&crc.to_le_bytes())?;
+
+            // Finalize baseline encoder for stats
+            if let Some(mut encoder) = self.baseline_encoder.take() {
+                encoder.flush().context()?;
+                encoder.do_finish().context()?;
+                self.compression_stats.zstd_baseline_size = encoder.get_mut().length as u64;
+            }
+        }
+
+        Ok(())
+    }
+
+    fn stats(&self) -> PreflateStats {
+        self.compression_stats
+    }
+}
+
+/// Flushes the encoder, writes [block_type][varint(compressed_size)][compressed_bytes] to
+/// destination, clears the encoder's inner buffer, and returns the compressed byte count.
+fn emit_compressed_block(
+    block_type: u8,
+    encoder: &mut zstd::stream::write::Encoder<'static, Vec<u8>>,
+    destination: &mut impl Write,
+) -> Result<usize> {
+    encoder.flush().context()?;
+    let compressed = encoder.get_mut();
+    let len = compressed.len();
+    destination.write_all(&[block_type])?;
+    write_varint(destination, len as u32)?;
+    destination.write_all(compressed)?;
+    compressed.clear();
+    Ok(len)
+}
+
+fn webp_compress(
+    result: &mut impl Write,
+    plain_text: &[u8],
+    corrections: &[u8],
+    idat: &IdatContents,
+) -> Result<()> {
+    use crate::container_common::BLOCK_TYPE_PNG;
+    log::debug!("{:?}", idat);
+
+    #[cfg(feature = "webp")]
+    if let Some(png_header) = idat.png_header {
+        use crate::idat_parse::{PngColorType, undo_png_filters};
+        use std::ops::Deref;
+
+        let bbp = png_header.color_type.bytes_per_pixel();
+        let w = png_header.width as usize;
+        let h = png_header.height as usize;
+
+        log::debug!(
+            "plain text compressing {} bytes ({}x{}x{})",
+            plain_text.len(),
+            w,
+            h,
+            bbp
+        );
+
+        // see if the bitmap looks the way we think it should (bytes-per-pixel bitmap plus one filter byte per row)
+        if (bbp * w * h) + h == plain_text.len() {
+            let (bitmap, filters) = undo_png_filters(plain_text, w, h, bbp);
+
+            let enc = webp::Encoder::new(
+                &bitmap,
+                match png_header.color_type {
+                    PngColorType::RGB => webp::PixelLayout::Rgb,
+                    PngColorType::RGBA => webp::PixelLayout::Rgba,
+                },
+                png_header.width,
+                png_header.height,
+            );
+
+            let mut webpconfig = webp::WebPConfig::new().unwrap();
+            webpconfig.lossless = 1;
+            webpconfig.alpha_compression = 0;
+            webpconfig.exact = 1; // undocumented option, but required to not throw away color if alpha channel is zero
+
+            // this is the default quality setting for webp lossless, we could dial it up
+            // but the quality gains are marginal for the CPU cost, although the
+            // CPU decompression cost is the same.
+ webpconfig.quality = 75.0; // 0..100 higher is slower but better compression + webpconfig.method = 4; // 0..6 higher is slower but better compression + + let comp = match enc.encode_advanced(&webpconfig) { + Ok(c) => c, + Err(e) => { + return preflate_rs::err_exit_code( + ExitCode::WebPDecodeError, + format!("Webp encode failed: {:?}", e), + ); + } + }; + + result.write_all(&[BLOCK_TYPE_PNG])?; // placeholder — caller skips this byte + + write_varint(result, corrections.len() as u32)?; + write_varint(result, comp.deref().len() as u32)?; + + log::debug!( + "Webp compressed {} bytes (vs {})", + comp.deref().len(), + idat.total_chunk_length + ); + + idat.write_to_bytestream(result)?; + result.write_all(&filters)?; + + result.write_all(&corrections)?; + result.write_all(comp.deref())?; + + return Ok(()); + } + } + + return preflate_rs::err_exit_code( + ExitCode::InvalidCompressedWrapper, + "Webp compression not supported", + ); +} diff --git a/container/src/idat_parse.rs b/container/src/idat_parse.rs index cc56363..ed225b0 100644 --- a/container/src/idat_parse.rs +++ b/container/src/idat_parse.rs @@ -4,7 +4,7 @@ use byteorder::{ReadBytesExt, WriteBytesExt}; use preflate_rs::{ExitCode, Result, err_exit_code}; -use crate::container_processor::{read_varint, write_varint}; +use crate::container_common::{read_varint, write_varint}; /// The contents of a PNG IDat stream. These are treated specially since they /// contain a Zlib stream that is split into multiple chunks and would be diff --git a/container/src/lib.rs b/container/src/lib.rs index c368df2..9b0593e 100644 --- a/container/src/lib.rs +++ b/container/src/lib.rs @@ -20,16 +20,17 @@ #![forbid(macro_use_extern_crate)] #![forbid(missing_unsafe_on_extern)] -mod container_processor; +mod container_common; +mod container_read; +mod container_write; mod idat_parse; mod scan_deflate; mod scoped_read; mod utils; -pub use container_processor::{PreflateContainerConfig, PreflateStats}; -pub use container_processor::{ - PreflateContainerProcessor, ProcessBuffer, RecreateContainerProcessor, -}; +pub use container_common::{PreflateContainerConfig, PreflateStats, ProcessBuffer}; +pub use container_read::RecreateContainerProcessor; +pub use container_write::PreflateContainerProcessor; pub use utils::process_limited_buffer; diff --git a/container/src/utils.rs b/container/src/utils.rs index 763b415..8d345f5 100644 --- a/container/src/utils.rs +++ b/container/src/utils.rs @@ -1,48 +1,12 @@ use std::{ collections::VecDeque, - io::{BufRead, Read, Write}, + io::{Read, Write}, }; use preflate_rs::Result; use crate::ProcessBuffer; -/// A BufRead implementation that reads at most `limit` bytes from the underlying reader. 
-pub struct TakeReader { - inner: T, - amount_left: usize, -} - -impl TakeReader { - pub fn new(inner: T, limit: usize) -> Self { - TakeReader { - inner, - amount_left: limit, - } - } -} - -impl BufRead for TakeReader { - fn fill_buf(&mut self) -> std::io::Result<&[u8]> { - let buf = self.inner.fill_buf()?; - Ok(&buf[..buf.len().min(self.amount_left)]) - } - - fn consume(&mut self, amt: usize) { - self.amount_left -= amt; - self.inner.consume(amt); - } -} - -impl Read for TakeReader { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - let len = buf.len().min(self.amount_left); - let read = self.inner.read(&mut buf[..len])?; - self.amount_left -= read; - Ok(read) - } -} - #[allow(dead_code)] #[cfg(test)] pub fn write_file(filename: &str, data: &[u8]) { @@ -174,7 +138,7 @@ pub fn process_limited_buffer( #[test] fn test_process_limited_buffer() { - let mut p = crate::container_processor::test::NopProcessBuffer {}; + let mut p = crate::container_common::test::NopProcessBuffer {}; let input = b"Hello, world!"; let mut output = [0u8; 5]; diff --git a/dll/Cargo.toml b/dll/Cargo.toml index e4f06a0..a4ef4ad 100644 --- a/dll/Cargo.toml +++ b/dll/Cargo.toml @@ -3,12 +3,12 @@ # this makes sure that we can keep old versions around to decode old encodings since the format # is complicated enough that maintaining backwards compat is hard (even minor changes in the # predictor will break the format) -version = "0.7.5" name = "preflate_rs_0_7" -edition = "2024" -authors = ["Kristof Roomp "] -license = "Apache-2.0" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +rust-version.workspace = true [dependencies] preflate-rs = { path = "../preflate" } diff --git a/preflate/Cargo.toml b/preflate/Cargo.toml index cf89cf9..6310898 100644 --- a/preflate/Cargo.toml +++ b/preflate/Cargo.toml @@ -1,17 +1,17 @@ [package] name = "preflate-rs" -version = "0.7.5" -edition = "2024" -authors = ["Kristof Roomp "] -license = "Apache-2.0" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +rust-version.workspace = true +repository.workspace = true description = """ Decompresses existing DEFLATE and PNG streams to allow for better with a more state-of-the-art compression -(eg with ZStandard, Brotli) while allowing the exact original binary DEFLATE stream to be recreated +(eg with ZStandard, Brotli) while allowing the exact original binary DEFLATE stream to be recreated by detecting the parameters used during compression. 
""" readme = "../README.md" -repository = "https://github.com/microsoft/preflate-rs" categories = ["compression"] keywords = ["gzip", "deflate", "zlib", "zip", "png"] diff --git a/util/Cargo.toml b/util/Cargo.toml index 5c5b026..a8f4a4a 100644 --- a/util/Cargo.toml +++ b/util/Cargo.toml @@ -1,9 +1,10 @@ [package] name = "preflate_util" -edition = "2024" -authors = ["Kristof Roomp "] -license = "Apache-2.0" -rust-version = "1.85" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +rust-version.workspace = true [dependencies] preflate-rs = { path = "../preflate" } From 0804476eaa0bfb8c60afbe810e538c0b6d6722fc Mon Sep 17 00:00:00 2001 From: Kristof Roomp Date: Wed, 4 Mar 2026 16:42:10 +0100 Subject: [PATCH 8/8] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- preflate/CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preflate/CLAUDE.md b/preflate/CLAUDE.md index c39e5ea..e02dddd 100644 --- a/preflate/CLAUDE.md +++ b/preflate/CLAUDE.md @@ -12,7 +12,7 @@ PreflateStreamProcessor::new(config) -> Self PreflateStreamProcessor::decompress(input: &[u8]) -> Result // Recreate: given plaintext + correction data, reproduce the original DEFLATE stream -RecreateStreamProcessor::new(capacity) -> Self +RecreateStreamProcessor::new() -> Self RecreateStreamProcessor::recreate(chunk: PreflateStreamChunkResult) -> Result> // One-shot helpers