From 6915a8373f935b66126782b69c50b09449d834d8 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Fri, 26 Jun 2026 23:14:09 +0200 Subject: [PATCH] chore: dedup the ZDICT sample-flattening to clear new-code duplication train / optimize / finalizeFrom each repeated the same block that packs the samples into one native buffer plus a parallel size_t[] of lengths, and the same produced-size error-check/copy-out tail. The recent null-validation edits marked these methods as "new code", so Sonar's quality gate failed on new_duplicated_lines_density (9.5%). Extract flatten() -> FlatSamples, toDictionary(), and requireNonEmpty(); the three trainers now share them. Behaviour and messages unchanged. Co-Authored-By: Claude Opus 4.8 --- .../io/github/dfa1/zstd/ZstdDictionary.java | 119 ++++++++---------- 1 file changed, 49 insertions(+), 70 deletions(-) diff --git a/zstd/src/main/java/io/github/dfa1/zstd/ZstdDictionary.java b/zstd/src/main/java/io/github/dfa1/zstd/ZstdDictionary.java index 42cbc72..b9704e5 100644 --- a/zstd/src/main/java/io/github/dfa1/zstd/ZstdDictionary.java +++ b/zstd/src/main/java/io/github/dfa1/zstd/ZstdDictionary.java @@ -96,38 +96,18 @@ public static ZstdDictionary of(byte[] raw) { /// @throws ZstdException if training fails (commonly: not enough sample data) public static ZstdDictionary train(List samples, int maxDictBytes) { Objects.requireNonNull(samples, SAMPLES); - if (samples.isEmpty()) { - throw new ZstdException("cannot train a dictionary from zero samples"); - } - long total = 0; - for (byte[] s : samples) { - total += s.length; - } + requireNonEmpty(samples, "train"); try (Arena arena = Arena.ofConfined()) { - // flatten all samples into one buffer + a parallel size_t[] of lengths - MemorySegment flat = arena.allocate(Math.max(total, 1)); - MemorySegment sizes = arena.allocate(JAVA_LONG, samples.size()); - long offset = 0; - for (int i = 0; i < samples.size(); i++) { - byte[] s = samples.get(i); - MemorySegment.copy(s, 0, flat, JAVA_BYTE, offset, s.length); - sizes.setAtIndex(JAVA_LONG, i, s.length); - offset += s.length; - } + FlatSamples in = flatten(arena, samples); MemorySegment dictBuf = arena.allocate(maxDictBytes); long produced; try { produced = (long) Bindings.ZDICT_TRAIN.invokeExact( - dictBuf, (long) maxDictBytes, flat, sizes, samples.size()); + dictBuf, (long) maxDictBytes, in.data(), in.sizes(), in.count()); } catch (Throwable t) { throw NativeCall.rethrow(t); } - if (zdictIsError(produced)) { - throw new ZstdException("dictionary training failed: " + zdictErrorName(produced)); - } - byte[] out = new byte[Math.toIntExact(produced)]; - MemorySegment.copy(dictBuf, JAVA_BYTE, 0, out, 0, out.length); - return new ZstdDictionary(out); + return toDictionary(dictBuf, produced, "dictionary training"); } } @@ -180,23 +160,9 @@ public static ZstdDictionary trainFastCover(List samples, int maxDictByt private static ZstdDictionary optimize(List samples, int maxDictBytes, int compressionLevel, boolean fast) { Objects.requireNonNull(samples, SAMPLES); - if (samples.isEmpty()) { - throw new ZstdException("cannot train a dictionary from zero samples"); - } + requireNonEmpty(samples, "train"); try (Arena arena = Arena.ofConfined()) { - long total = 0; - for (byte[] s : samples) { - total += s.length; - } - MemorySegment flat = arena.allocate(Math.max(total, 1)); - MemorySegment sizes = arena.allocate(JAVA_LONG, samples.size()); - long offset = 0; - for (int i = 0; i < samples.size(); i++) { - byte[] s = samples.get(i); - MemorySegment.copy(s, 0, flat, JAVA_BYTE, offset, s.length); - sizes.setAtIndex(JAVA_LONG, i, s.length); - offset += s.length; - } + FlatSamples in = flatten(arena, samples); // zeroed params (auto-tune k/d/steps); set single-threaded + target level. MemoryLayout layout = fast ? FASTCOVER_PARAMS : COVER_PARAMS; MemorySegment params = arena.allocate(layout); @@ -207,16 +173,11 @@ private static ZstdDictionary optimize(List samples, int maxDictBytes, long produced; try { produced = (long) handle.invokeExact( - dictBuf, (long) maxDictBytes, flat, sizes, samples.size(), params); + dictBuf, (long) maxDictBytes, in.data(), in.sizes(), in.count(), params); } catch (Throwable t) { throw NativeCall.rethrow(t); } - if (zdictIsError(produced)) { - throw new ZstdException("dictionary training failed: " + zdictErrorName(produced)); - } - byte[] out = new byte[Math.toIntExact(produced)]; - MemorySegment.copy(dictBuf, JAVA_BYTE, 0, out, 0, out.length); - return new ZstdDictionary(out); + return toDictionary(dictBuf, produced, "dictionary training"); } } @@ -235,23 +196,9 @@ public static ZstdDictionary finalizeFrom(byte[] content, List samples, int maxDictBytes, int compressionLevel) { Objects.requireNonNull(content, "content"); Objects.requireNonNull(samples, SAMPLES); - if (samples.isEmpty()) { - throw new ZstdException("cannot finalise a dictionary from zero samples"); - } + requireNonEmpty(samples, "finalise"); try (Arena arena = Arena.ofConfined()) { - long total = 0; - for (byte[] s : samples) { - total += s.length; - } - MemorySegment flat = arena.allocate(Math.max(total, 1)); - MemorySegment sizes = arena.allocate(JAVA_LONG, samples.size()); - long offset = 0; - for (int i = 0; i < samples.size(); i++) { - byte[] s = samples.get(i); - MemorySegment.copy(s, 0, flat, JAVA_BYTE, offset, s.length); - sizes.setAtIndex(JAVA_LONG, i, s.length); - offset += s.length; - } + FlatSamples in = flatten(arena, samples); MemorySegment contentSeg = Zstd.copyIn(arena, content); MemorySegment params = arena.allocate(Bindings.ZDICT_PARAMS_LAYOUT); params.set(JAVA_INT, 0, compressionLevel); // compressionLevel; notificationLevel/dictID = 0 @@ -260,17 +207,49 @@ public static ZstdDictionary finalizeFrom(byte[] content, List samples, try { produced = (long) Bindings.ZDICT_FINALIZE_DICTIONARY.invokeExact( dictBuf, (long) maxDictBytes, contentSeg, (long) content.length, - flat, sizes, samples.size(), params); + in.data(), in.sizes(), in.count(), params); } catch (Throwable t) { throw NativeCall.rethrow(t); } - if (zdictIsError(produced)) { - throw new ZstdException("dictionary finalisation failed: " + zdictErrorName(produced)); - } - byte[] out = new byte[Math.toIntExact(produced)]; - MemorySegment.copy(dictBuf, JAVA_BYTE, 0, out, 0, out.length); - return new ZstdDictionary(out); + return toDictionary(dictBuf, produced, "dictionary finalisation"); + } + } + + /// One native buffer holding all samples back to back, plus a parallel + /// `size_t[]` of their lengths — the shape the ZDICT trainers consume. + private record FlatSamples(MemorySegment data, MemorySegment sizes, int count) { + } + + private static FlatSamples flatten(Arena arena, List samples) { + long total = 0; + for (byte[] s : samples) { + total += s.length; + } + MemorySegment data = arena.allocate(Math.max(total, 1)); + MemorySegment sizes = arena.allocate(JAVA_LONG, samples.size()); + long offset = 0; + for (int i = 0; i < samples.size(); i++) { + byte[] s = samples.get(i); + MemorySegment.copy(s, 0, data, JAVA_BYTE, offset, s.length); + sizes.setAtIndex(JAVA_LONG, i, s.length); + offset += s.length; + } + return new FlatSamples(data, sizes, samples.size()); + } + + private static void requireNonEmpty(List samples, String verb) { + if (samples.isEmpty()) { + throw new ZstdException("cannot " + verb + " a dictionary from zero samples"); + } + } + + private static ZstdDictionary toDictionary(MemorySegment dictBuf, long produced, String what) { + if (zdictIsError(produced)) { + throw new ZstdException(what + " failed: " + zdictErrorName(produced)); } + byte[] out = new byte[Math.toIntExact(produced)]; + MemorySegment.copy(dictBuf, JAVA_BYTE, 0, out, 0, out.length); + return new ZstdDictionary(out); } /// The dictionary id zstd stamps into frames compressed with this dictionary,