From 3f5cee3548ba34d149942d6084702231b10e97ac Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Fri, 26 Jun 2026 16:37:10 +0200 Subject: [PATCH 1/2] test: add golden-corpus integration tests Drive zstd's own vendored corpus (third_party/zstd/tests/) from the integration-tests module: round-trip golden-compression inputs across the FFM/JNI boundary both directions, cross-check golden-decompression frames against the zstd-jni reference, and assert golden-decompression-errors frames are rejected. Submodule-gated so shallow clones still build. Fix CLAUDE.md: the integration reference is zstd-jni plus the golden corpus, not "the Rust reference". Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 7 +- .../github/dfa1/zstd/it/GoldenCorpusTest.java | 173 ++++++++++++++++++ 2 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 integration-tests/src/test/java/io/github/dfa1/zstd/it/GoldenCorpusTest.java diff --git a/CLAUDE.md b/CLAUDE.md index 62ae631..c9e02bf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -49,8 +49,11 @@ Built `.dylib/.so/.dll` are git-ignored; they are regenerated from the submodule - Cover happy path, negative cases (invalid input / errors), and corners (empty, zero, max, boundaries). Unit tests must be fast — no file I/O, network, or sleep; mock or use in-memory data. -- **Integration tests are ground truth** (no formal spec): interop with the Rust reference. Write - one for every encoding round-trip and file-format boundary. +- **Integration tests are ground truth** (no formal spec): interop with the `zstd-jni` reference + binding (luben, the zstd C library via JNI) and the vendored golden corpus under + `third_party/zstd/tests/` (`golden-compression`, `golden-decompression`, + `golden-decompression-errors`, `golden-dictionaries`). Write one for every encoding round-trip + and file-format boundary. - JUnit 5 + Mockito (BDDMockito) + AssertJ. Class under test named `sut`. Every test has `// Given` / `// When` / `// Then`. 
BDDMockito only: `given(mock.m()).willReturn(v)` / `then(...)` (static-import only `given`/`then`, never `willReturn`/`willThrow`). diff --git a/integration-tests/src/test/java/io/github/dfa1/zstd/it/GoldenCorpusTest.java b/integration-tests/src/test/java/io/github/dfa1/zstd/it/GoldenCorpusTest.java new file mode 100644 index 0000000..1c92393 --- /dev/null +++ b/integration-tests/src/test/java/io/github/dfa1/zstd/it/GoldenCorpusTest.java @@ -0,0 +1,173 @@ +package io.github.dfa1.zstd.it; + +import io.github.dfa1.zstd.Zstd; +import io.github.dfa1.zstd.ZstdException; +import io.github.dfa1.zstd.ZstdInputStream; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.Stream; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/// Tests against zstd's own vendored golden corpus under +/// `third_party/zstd/tests/` — the canonical, version-matched fixtures the C +/// project uses for its own regression suite. These exercise encoder corners no +/// synthetic payload reaches (block-128k boundaries, RLE/empty blocks, huffman, +/// the PR-3517 block-splitter case) and adversarial frames that must fail. +/// +/// The corpus ships via the `third_party/zstd` git submodule. When it is not +/// checked out the cases are skipped, so a shallow clone still builds. +class GoldenCorpusTest { + + private static final Path TESTS = locateCorpus(); + + /// Walks up from the working directory to find `third_party/zstd/tests`, + /// or returns `null` if the submodule is absent. 
+    private static Path locateCorpus() {
+        Path dir = Path.of("").toAbsolutePath();
+        for (; dir != null; dir = dir.getParent()) {
+            Path candidate = dir.resolve("third_party/zstd/tests");
+            if (Files.isDirectory(candidate)) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /// Lists the regular files directly under `subdir` of the corpus whose
+    /// names end with `suffix`, sorted by path, as `(file name, path)` pairs.
+    ///
+    /// When the corpus or `subdir` is missing, the calling test is aborted via
+    /// a JUnit assumption rather than handed an empty stream: with default
+    /// settings a `@ParameterizedTest` that receives no arguments fails with a
+    /// configuration error, which would break builds without the submodule.
+    private static Stream<Arguments> filesIn(String subdir, String suffix) {
+        org.junit.jupiter.api.Assumptions.assumeTrue(
+                TESTS != null, "third_party/zstd submodule is not checked out");
+        Path dir = TESTS.resolve(subdir);
+        org.junit.jupiter.api.Assumptions.assumeTrue(
+                Files.isDirectory(dir), () -> "corpus directory is missing: " + dir);
+        // Collect eagerly so the directory stream is closed before returning.
+        try (Stream<Path> entries = Files.list(dir)) {
+            List<Path> files = entries
+                    .filter(Files::isRegularFile)
+                    .filter(p -> p.getFileName().toString().endsWith(suffix))
+                    .sorted()
+                    .toList();
+            return files.stream().map(p -> Arguments.of(p.getFileName().toString(), p));
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /// Reads the whole file, rethrowing any `IOException` unchecked.
+    private static byte[] read(Path file) {
+        try {
+            return Files.readAllBytes(file);
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /// Streaming decode (handles frames that do not store content size).
+    private static byte[] javaStreamDecode(byte[] frame) {
+        try (ZstdInputStream in = new ZstdInputStream(new ByteArrayInputStream(frame))) {
+            return in.readAllBytes();
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /// Streaming decode through the zstd-jni reference binding.
+    private static byte[] jniStreamDecode(byte[] frame) {
+        try (var in = new com.github.luben.zstd.ZstdInputStream(new ByteArrayInputStream(frame))) {
+            return in.readAllBytes();
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /// Valid frames the C project guarantees decode. Decode with both zstd-java
+    /// and the zstd-jni reference; both must succeed and agree byte-for-byte.
+    @Nested
+    class GoldenDecompression {
+
+        /// One `(file name, path)` pair per `.zst` file in `golden-decompression`.
+        static Stream<Arguments> frames() {
+            return filesIn("golden-decompression", ".zst");
+        }
+
+        @ParameterizedTest(name = "{0}")
+        @MethodSource("frames")
+        void javaDecodeMatchesJni(String name, Path file) {
+            // Given
+            byte[] frame = read(file);
+
+            // When
+            byte[] javaOut = javaStreamDecode(frame);
+
+            // Then
+            assertThat(javaOut).isEqualTo(jniStreamDecode(frame));
+        }
+    }
+
+    /// Raw inputs the C project compresses in its suite. Round-trip them across
+    /// the JNI/FFM boundary in both directions.
+    @Nested
+    class GoldenCompression {
+
+        /// One `(file name, path)` pair per file in `golden-compression`; the
+        /// empty suffix matches every file name.
+        static Stream<Arguments> inputs() {
+            return filesIn("golden-compression", "");
+        }
+
+        @ParameterizedTest(name = "{0}")
+        @MethodSource("inputs")
+        void javaCompressJniDecompress(String name, Path file) {
+            // Given
+            byte[] data = read(file);
+
+            // When
+            byte[] frame = Zstd.compress(data, Zstd.defaultCompressionLevel());
+
+            // Then
+            assertThat(com.github.luben.zstd.Zstd.decompress(frame, data.length)).isEqualTo(data);
+        }
+
+        @ParameterizedTest(name = "{0}")
+        @MethodSource("inputs")
+        void jniCompressJavaDecompress(String name, Path file) {
+            // Given
+            byte[] data = read(file);
+
+            // When
+            byte[] frame = com.github.luben.zstd.Zstd.compress(data, Zstd.maxCompressionLevel());
+
+            // Then
+            assertThat(Zstd.decompress(frame, data.length)).isEqualTo(data);
+        }
+    }
+
+    /// Malformed frames the C project guarantees fail to decode. zstd-java must
+    /// reject every one rather than return garbage.
+    @Nested
+    class GoldenDecompressionErrors {
+
+        /// One `(file name, path)` pair per `.zst` file in
+        /// `golden-decompression-errors`.
+        static Stream<Arguments> frames() {
+            return filesIn("golden-decompression-errors", ".zst");
+        }
+
+        @ParameterizedTest(name = "{0}")
+        @MethodSource("frames")
+        void javaDecodeThrows(String name, Path file) {
+            // Given
+            byte[] frame = read(file);
+
+            // When
+            org.assertj.core.api.ThrowableAssert.ThrowingCallable result = () -> javaStreamDecode(frame);
+
+            // Then
+            // javaStreamDecode wraps IOException in UncheckedIOException, so either type counts as a rejection.
+            assertThatThrownBy(result).isInstanceOfAny(ZstdException.class, UncheckedIOException.class);
+        }
+    }
+}
From 6f25dc520d0d1e468f856abc1dd26552c04f7d6b Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Fri, 26 Jun 2026 16:39:40 +0200 Subject: [PATCH 2/2] test: cover remaining zstd-jni interop gaps Add ZstdInteropExtrasTest covering interop paths the round-trip suite missed: checksum trailers (write/verify both directions + corruption rejection), skippable frames (jni stream skips a java-written one), multi-frame concatenation (streaming decode of joined frames both ways), foreign frame-header parsing (content size and dict id from jni frames), and streaming driven one tiny chunk at a time across a payload spread.
Co-Authored-By: Claude Opus 4.8 --- .../dfa1/zstd/it/ZstdInteropExtrasTest.java | 302 ++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 integration-tests/src/test/java/io/github/dfa1/zstd/it/ZstdInteropExtrasTest.java diff --git a/integration-tests/src/test/java/io/github/dfa1/zstd/it/ZstdInteropExtrasTest.java b/integration-tests/src/test/java/io/github/dfa1/zstd/it/ZstdInteropExtrasTest.java new file mode 100644 index 0000000..e1140ce --- /dev/null +++ b/integration-tests/src/test/java/io/github/dfa1/zstd/it/ZstdInteropExtrasTest.java @@ -0,0 +1,302 @@ +package io.github.dfa1.zstd.it; + +import com.github.luben.zstd.ZstdCompressCtx; +import io.github.dfa1.zstd.Zstd; +import io.github.dfa1.zstd.ZstdDictionary; +import io.github.dfa1.zstd.ZstdException; +import io.github.dfa1.zstd.ZstdFrame; +import io.github.dfa1.zstd.ZstdFrameHeader; +import io.github.dfa1.zstd.ZstdFrameType; +import io.github.dfa1.zstd.ZstdInputStream; +import io.github.dfa1.zstd.ZstdOutputStream; +import org.assertj.core.api.ThrowableAssert.ThrowingCallable; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.stream.Stream; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/// Interop gaps the round-trip suite does not cover: checksum trailers, skippable +/// frames, multi-frame concatenation, foreign frame-header parsing, and streaming +/// across small write/read chunks. Reference binding is zstd-jni (luben). 
+class ZstdInteropExtrasTest {
+
+    /// Decodes `frame` to the end through this library's streaming decoder,
+    /// rethrowing any `IOException` unchecked.
+    private static byte[] javaStreamDecode(byte[] frame) {
+        ByteArrayInputStream source = new ByteArrayInputStream(frame);
+        try (ZstdInputStream decoder = new ZstdInputStream(source)) {
+            return decoder.readAllBytes();
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /// Decodes `frame` to the end through the zstd-jni streaming decoder,
+    /// rethrowing any `IOException` unchecked.
+    private static byte[] jniStreamDecode(byte[] frame) {
+        ByteArrayInputStream source = new ByteArrayInputStream(frame);
+        try (var decoder = new com.github.luben.zstd.ZstdInputStream(source)) {
+            return decoder.readAllBytes();
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /// Returns a new array holding the bytes of `a` followed by the bytes of `b`.
+    private static byte[] concat(byte[] a, byte[] b) {
+        byte[] joined = java.util.Arrays.copyOf(a, a.length + b.length);
+        System.arraycopy(b, 0, joined, a.length, b.length);
+        return joined;
+    }
+
+    /// Content-checksum interop: a frame written with the checksum flag by
+    /// either binding must decode through the other, and a checksummed frame
+    /// with a flipped byte must fail to decode.
+    @Nested
+    class Checksum {
+
+        private final byte[] data = "checksum payload ".repeat(4096).getBytes(StandardCharsets.UTF_8);
+
+        @Test
+        void javaChecksumDecodedByJni() {
+            // Given
+            byte[] frame = javaFrameWithChecksum();
+
+            // When
+            byte[] restored = com.github.luben.zstd.Zstd.decompress(frame, data.length);
+
+            // Then
+            assertThat(ZstdFrame.header(frame).hasChecksum()).isTrue();
+            assertThat(restored).isEqualTo(data);
+        }
+
+        @Test
+        void jniChecksumDecodedByJava() {
+            // Given
+            byte[] frame;
+            try (ZstdCompressCtx jniCtx = new ZstdCompressCtx()) {
+                frame = jniCtx.setChecksum(true).compress(data);
+            }
+
+            // When
+            byte[] restored = javaStreamDecode(frame);
+
+            // Then
+            assertThat(ZstdFrame.header(frame).hasChecksum()).isTrue();
+            assertThat(restored).isEqualTo(data);
+        }
+
+        @Test
+        void javaRejectsCorruptedChecksum() {
+            // Given
+            byte[] frame = javaFrameWithChecksum();
+            int middle = frame.length / 2;
+            frame[middle] ^= 0x7F;
+
+            // When
+            ThrowingCallable result = () -> javaStreamDecode(frame);
+
+            // Then
+            assertThatThrownBy(result).isInstanceOfAny(ZstdException.class, UncheckedIOException.class);
+        }
+
+        /// Compresses `data` with this library's context, checksum flag enabled.
+        private byte[] javaFrameWithChecksum() {
+            try (io.github.dfa1.zstd.ZstdCompressCtx ctx = new io.github.dfa1.zstd.ZstdCompressCtx()) {
+                return ctx.checksum(true).compress(data);
+            }
+        }
+    }
+
+    /// Skippable-frame interop: a skippable frame written by this library,
+    /// placed ahead of a real frame, must not disturb zstd-jni's streaming
+    /// decode of that real frame, and this library must read back the header
+    /// and content of a skippable frame it wrote.
+    @Nested
+    class Skippable {
+
+        @Test
+        void jniStreamSkipsJavaSkippableFrame() {
+            // Given
+            byte[] payload = "after the skippable frame ".repeat(1000).getBytes(StandardCharsets.UTF_8);
+            byte[] meta = "sidecar-metadata".getBytes(StandardCharsets.UTF_8);
+            byte[] skippable = ZstdFrame.writeSkippableFrame(meta, 0);
+            byte[] real = Zstd.compress(payload, Zstd.defaultCompressionLevel());
+            byte[] stream = concat(skippable, real);
+
+            // When
+            byte[] restored = jniStreamDecode(stream);
+
+            // Then
+            assertThat(restored).isEqualTo(payload);
+        }
+
+        @Test
+        void javaParsesItsOwnSkippableFrameHeader() {
+            // Given
+            byte[] meta = "sidecar".getBytes(StandardCharsets.UTF_8);
+            byte[] skippable = ZstdFrame.writeSkippableFrame(meta, 5);
+
+            // When
+            ZstdFrameHeader header = ZstdFrame.header(skippable);
+
+            // Then
+            assertThat(ZstdFrame.isSkippableFrame(skippable)).isTrue();
+            assertThat(header.frameType()).isEqualTo(ZstdFrameType.SKIPPABLE);
+            assertThat(ZstdFrame.readSkippableFrame(skippable).content()).isEqualTo(meta);
+        }
+    }
+
+    /// zstd streaming concatenates adjacent frames transparently. Two frames from
+    /// one binding must decode as the joined payload through the other's streamer.
+    @Nested
+    class MultiFrame {
+
+        private final byte[] first = "first frame body ".repeat(2000).getBytes(StandardCharsets.UTF_8);
+        private final byte[] second = "second frame body ".repeat(2000).getBytes(StandardCharsets.UTF_8);
+
+        @Test
+        void javaFramesConcatReadByJniStream() {
+            // Given
+            byte[] firstFrame = Zstd.compress(first, Zstd.defaultCompressionLevel());
+            byte[] secondFrame = Zstd.compress(second, Zstd.defaultCompressionLevel());
+            byte[] joined = concat(firstFrame, secondFrame);
+
+            // When
+            byte[] restored = jniStreamDecode(joined);
+
+            // Then
+            byte[] expected = concat(first, second);
+            assertThat(restored).isEqualTo(expected);
+        }
+
+        @Test
+        void jniFramesConcatReadByJavaStream() {
+            // Given
+            byte[] firstFrame = com.github.luben.zstd.Zstd.compress(first, Zstd.defaultCompressionLevel());
+            byte[] secondFrame = com.github.luben.zstd.Zstd.compress(second, Zstd.defaultCompressionLevel());
+            byte[] joined = concat(firstFrame, secondFrame);
+
+            // When
+            byte[] restored = javaStreamDecode(joined);
+
+            // Then
+            byte[] expected = concat(first, second);
+            assertThat(restored).isEqualTo(expected);
+        }
+    }
+
+    /// This library must read the header of a frame produced elsewhere: content
+    /// size when pledged, and the dictionary id when one was used.
+    @Nested
+    class FrameHeader {
+
+        private final byte[] data = "header introspection ".repeat(3000).getBytes(StandardCharsets.UTF_8);
+
+        @Test
+        void javaReadsContentSizeFromJniFrame() {
+            // Given
+            byte[] frame;
+            try (ZstdCompressCtx ctx = new ZstdCompressCtx()) {
+                frame = ctx.setContentSize(true).compress(data);
+            }
+
+            // When
+            ZstdFrameHeader header = ZstdFrame.header(frame);
+
+            // Then
+            assertThat(header.frameType()).isEqualTo(ZstdFrameType.STANDARD);
+            assertThat(header.contentSize()).hasValue(data.length);
+        }
+
+        @Test
+        void javaReadsDictIdFromJniDictFrame() {
+            // Given
+            ZstdDictionary dict = trainDict();
+            byte[] frame;
+            // zstd-jni's ZstdDictCompress is Closeable; release it as soon as the frame exists.
+            try (var jniDict = new com.github.luben.zstd.ZstdDictCompress(
+                    dict.toByteArray(), Zstd.defaultCompressionLevel())) {
+                frame = com.github.luben.zstd.Zstd.compress(record(7), jniDict);
+            }
+
+            // When
+            int dictId = ZstdFrame.dictId(frame);
+
+            // Then
+            assertThat(dictId).isEqualTo(dict.id());
+            assertThat(dictId).isNotZero();
+        }
+
+        /// Trains a dictionary with this library from 3000 generated records,
+        /// passing `8 * 1024` as the second argument to `ZstdDictionary.train`.
+        private ZstdDictionary trainDict() {
+            List<byte[]> samples = new ArrayList<>();
+            for (int i = 0; i < 3000; i++) {
+                samples.add(record(i));
+            }
+            return ZstdDictionary.train(samples, 8 * 1024);
+        }
+
+        /// Builds a small JSON-like record whose `user` field cycles through 30 values.
+        private byte[] record(int i) {
+            return ("{\"id\":" + i + ",\"user\":\"u" + (i % 30) + "\",\"event\":\"click\"}")
+                    .getBytes(StandardCharsets.UTF_8);
+        }
+    }
+
+    /// Streaming must survive being driven one tiny chunk at a time — exercising
+    /// internal buffer refills and flush boundaries — across a spread of payloads.
+    @Nested
+    class ChunkedStreaming {
+
+        /// Bytes handed to the stream per `write` call in `writeInChunks`.
+        private static final int CHUNK = 7;
+
+        /// Named payloads: empty, a single byte, repetitive text, and 64 KiB of
+        /// pseudo-random bytes from a fixed seed so every run sees the same data.
+        static Stream<Arguments> payloads() {
+            Random r = new Random(0x5EED);
+            return Stream.of(
+                    Arguments.of("empty", new byte[0]),
+                    Arguments.of("one-byte", new byte[]{42}),
+                    Arguments.of("text", "stream me ".repeat(5000).getBytes(StandardCharsets.UTF_8)),
+                    Arguments.of("random-64k", random(r, 64 * 1024)));
+        }
+
+        @ParameterizedTest(name = "{0}")
+        @MethodSource("payloads")
+        void javaChunkedWriteJniRead(String name, byte[] data) throws IOException {
+            // Given
+            ByteArrayOutputStream sink = new ByteArrayOutputStream();
+
+            // When
+            // NOTE(review): the second constructor argument is the literal 7, not CHUNK —
+            // presumably a compression level; confirm against ZstdOutputStream.
+            try (ZstdOutputStream zout = new ZstdOutputStream(sink, 7)) {
+                writeInChunks(zout, data);
+            }
+
+            // Then
+            assertThat(jniStreamDecode(sink.toByteArray())).isEqualTo(data);
+        }
+
+        @ParameterizedTest(name = "{0}")
+        @MethodSource("payloads")
+        void jniChunkedWriteJavaRead(String name, byte[] data) throws IOException {
+            // Given
+            ByteArrayOutputStream sink = new ByteArrayOutputStream();
+
+            // When
+            try (var zout = new com.github.luben.zstd.ZstdOutputStream(sink)) {
+                writeInChunks(zout, data);
+            }
+
+            // Then
+            assertThat(javaStreamDecode(sink.toByteArray())).isEqualTo(data);
+        }
+
+        /// Writes `data` to `out` in slices of at most `CHUNK` bytes, flushing
+        /// after every slice. Empty input performs no write and no flush.
+        private static void writeInChunks(java.io.OutputStream out, byte[] data) throws IOException {
+            for (int off = 0; off < data.length; off += CHUNK) {
+                out.write(data, off, Math.min(CHUNK, data.length - off));
+                out.flush();
+            }
+        }
+
+        /// Returns `size` bytes drawn from `r`.
+        private static byte[] random(Random r, int size) {
+            byte[] b = new byte[size];
+            r.nextBytes(b);
+            return b;
+        }
+    }
+}