From cff5eef7c3227368e99573642932a8b9d840d46d Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Mon, 22 Jun 2026 22:53:23 +0200 Subject: [PATCH 1/2] refactor(encoding): extract shared PrimitiveArrays.toLongs/fromLongs to core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit toLongs (integer array -> long[] widen) was copy-pasted in 4 writer encoders (Delta, Bitpacked, FrameOfReference, Patched), and fromLongs (long[] -> off-heap segment) in the Delta encoder and decoder — the toLongs copies behaviorally identical but for the EncodingId in the error throw (the FrameOfReference copy merged each signed/unsigned pair into a single case), the fromLongs copies identical outright. Since fromLongs is needed by both reader and writer, the shared home is core (io.github.dfa1.vortex.encoding), the only module both depend on. New PrimitiveArrays holds the inverse pair: toLongs(data, ptype, encoding) and fromLongs(longs, ptype, arena). The EncodingId is now a parameter so each caller keeps its own error attribution. Deliberately NOT folded in: RleEncodingEncoder.toLongs (a superset that also raw-bit-packs F32/F64/F16), Rle.fromLongs (takes a count prefix), and Patched.fromLongs (returns a boxed Java array, opposite direction). Those are genuinely different methods, left alone. Ground truth green: JavaWritesRustReads (213) + JavaRoundTrip round-trips pass, so the extracted widen/narrow produces byte-identical output. 
Co-Authored-By: Claude Opus 4.8 --- .../dfa1/vortex/encoding/PrimitiveArrays.java | 109 ++++++++++++++++++ .../reader/decode/DeltaEncodingDecoder.java | 19 +-- .../encode/BitpackedEncodingEncoder.java | 57 +-------- .../writer/encode/DeltaEncodingEncoder.java | 80 +------------ .../FrameOfReferenceEncodingEncoder.java | 35 +----- .../writer/encode/PatchedEncodingEncoder.java | 59 +--------- 6 files changed, 123 insertions(+), 236 deletions(-) create mode 100644 core/src/main/java/io/github/dfa1/vortex/encoding/PrimitiveArrays.java diff --git a/core/src/main/java/io/github/dfa1/vortex/encoding/PrimitiveArrays.java b/core/src/main/java/io/github/dfa1/vortex/encoding/PrimitiveArrays.java new file mode 100644 index 000000000..ec3ada599 --- /dev/null +++ b/core/src/main/java/io/github/dfa1/vortex/encoding/PrimitiveArrays.java @@ -0,0 +1,109 @@ +package io.github.dfa1.vortex.encoding; + +import io.github.dfa1.vortex.core.PType; +import io.github.dfa1.vortex.core.VortexException; + +import java.lang.foreign.MemorySegment; +import java.lang.foreign.SegmentAllocator; +import java.lang.foreign.ValueLayout; + +/// Conversions between a boxed Java primitive value array and its wide / off-heap forms, +/// shared by the integer encodings on both the read and write sides. +/// +/// [#toLongs(Object, PType, EncodingId)] and [#fromLongs(long[], PType, SegmentAllocator)] are +/// inverses: the first widens any 8–64 bit integer array to a `long[]`, the second writes a +/// `long[]` back to a little-endian off-heap segment of the target width. Floating-point ptypes +/// are not handled here — they reinterpret to raw bits or take type-specific encode paths instead. +public final class PrimitiveArrays { + + private PrimitiveArrays() { + } + + /// Widens a boxed primitive integer array to `long[]`, zero-extending the unsigned ptypes and + /// sign-extending the signed ones. The I64/U64 case returns the input array directly (no copy). 
+ /// + /// @param data the value array; its runtime type must match `ptype` + /// (`byte[]` for I8/U8, `short[]` for I16/U16, `int[]` for I32/U32, `long[]` for I64/U64) + /// @param ptype the logical primitive type of `data` + /// @param encoding the encoding requesting the widening, used for error attribution + /// @return a `long[]` holding every element of `data` widened to 64 bits + /// @throws VortexException if `ptype` is not an integer ptype + public static long[] toLongs(Object data, PType ptype, EncodingId encoding) { + return switch (ptype) { + case I8 -> { + byte[] arr = (byte[]) data; + long[] r = new long[arr.length]; + for (int i = 0; i < arr.length; i++) { + r[i] = arr[i]; + } + yield r; + } + case U8 -> { + byte[] arr = (byte[]) data; + long[] r = new long[arr.length]; + for (int i = 0; i < arr.length; i++) { + r[i] = Byte.toUnsignedLong(arr[i]); + } + yield r; + } + case I16 -> { + short[] arr = (short[]) data; + long[] r = new long[arr.length]; + for (int i = 0; i < arr.length; i++) { + r[i] = arr[i]; + } + yield r; + } + case U16 -> { + short[] arr = (short[]) data; + long[] r = new long[arr.length]; + for (int i = 0; i < arr.length; i++) { + r[i] = Short.toUnsignedLong(arr[i]); + } + yield r; + } + case I32 -> { + int[] arr = (int[]) data; + long[] r = new long[arr.length]; + for (int i = 0; i < arr.length; i++) { + r[i] = arr[i]; + } + yield r; + } + case U32 -> { + int[] arr = (int[]) data; + long[] r = new long[arr.length]; + for (int i = 0; i < arr.length; i++) { + r[i] = Integer.toUnsignedLong(arr[i]); + } + yield r; + } + case I64, U64 -> (long[]) data; + default -> throw new VortexException(encoding, "unsupported ptype: " + ptype); + }; + } + + /// Writes a `long[]` to a freshly allocated little-endian off-heap segment whose element width + /// is that of `ptype`, narrowing each element to the low bytes. Inverse of + /// [#toLongs(Object, PType, EncodingId)]. 
The I64/U64 case bulk-copies; narrower widths write + /// element by element through [PTypeIO#set(MemorySegment, long, PType, long)]. + /// + /// @param longs the wide values to write + /// @param ptype the target primitive width + /// @param arena allocator for the output segment + /// @return a little-endian segment of `longs.length` elements at `ptype`'s width + public static MemorySegment fromLongs(long[] longs, PType ptype, SegmentAllocator arena) { + if (ptype == PType.I64 || ptype == PType.U64) { + MemorySegment dst = arena.allocate((long) longs.length * 8); + MemorySegment.copy(MemorySegment.ofArray(longs), ValueLayout.JAVA_LONG, 0L, dst, PTypeIO.LE_LONG, 0L, longs.length); + return dst; + } + int n = longs.length; + long elemSize = ptype.byteSize(); + MemorySegment seg = arena.allocate(n * elemSize); + for (int i = 0; i < n; i++) { + PTypeIO.set(seg, i * elemSize, ptype, longs[i]); + } + return seg; + } +} diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java index 3c1d9d7f6..028cb999b 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java @@ -4,6 +4,7 @@ import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.encoding.PTypeIO; import io.github.dfa1.vortex.proto.DeltaMetadata; import io.github.dfa1.vortex.reader.array.Array; @@ -14,7 +15,6 @@ import java.io.IOException; import java.lang.foreign.MemorySegment; -import java.lang.foreign.SegmentAllocator; import java.lang.foreign.ValueLayout; import java.nio.ByteBuffer; @@ -107,7 +107,7 @@ public Array decode(DecodeContext ctx) { long[] result = new long[(int) rowCount]; 
System.arraycopy(decoded, offset, result, 0, (int) rowCount); - MemorySegment seg = fromLongs(result, ptype, ctx.arena()); + MemorySegment seg = PrimitiveArrays.fromLongs(result, ptype, ctx.arena()); return switch (ptype) { case I64, U64 -> new MaterializedLongArray(ctx.dtype(), rowCount, seg); case I32, U32 -> new MaterializedIntArray(ctx.dtype(), rowCount, seg); @@ -179,19 +179,4 @@ private static long typeMask(PType ptype) { return bits == 64 ? -1L : (1L << bits) - 1; } - private static MemorySegment fromLongs(long[] longs, PType ptype, SegmentAllocator arena) { - if (ptype == PType.I64 || ptype == PType.U64) { - MemorySegment dst = arena.allocate((long) longs.length * 8); - MemorySegment.copy(MemorySegment.ofArray(longs), ValueLayout.JAVA_LONG, 0L, dst, PTypeIO.LE_LONG, 0L, longs.length); - return dst; - } - int n = longs.length; - long elemSize = ptype.byteSize(); - MemorySegment seg = arena.allocate(n * elemSize); - for (int i = 0; i < n; i++) { - PTypeIO.set(seg, i * elemSize, ptype, longs[i]); - } - return seg; - } - } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java index 5e6a0ab7b..b0e74bbf6 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java @@ -4,6 +4,7 @@ import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.encoding.PTypeIO; import io.github.dfa1.vortex.proto.BitPackedMetadata; import io.github.dfa1.vortex.proto.PatchesMetadata; @@ -43,7 +44,7 @@ public boolean accepts(DType dtype) { @Override public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { PType ptype = ((DType.Primitive) 
dtype).ptype(); - long[] longs = toLongs(data, ptype); + long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_BITPACKED); int n = longs.length; int typeBits = ptype.byteSize() * 8; long typeMask = typeMask(typeBits); @@ -237,60 +238,6 @@ private static MemorySegment packFastLanes(long[] values, int n, int bitWidth, i return seg; } - private static long[] toLongs(Object data, PType ptype) { - return switch (ptype) { - case I8 -> { - byte[] arr = (byte[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = arr[i]; - } - yield r; - } - case U8 -> { - byte[] arr = (byte[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = Byte.toUnsignedLong(arr[i]); - } - yield r; - } - case I16 -> { - short[] arr = (short[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = arr[i]; - } - yield r; - } - case U16 -> { - short[] arr = (short[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = Short.toUnsignedLong(arr[i]); - } - yield r; - } - case I32 -> { - int[] arr = (int[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = arr[i]; - } - yield r; - } - case U32 -> { - int[] arr = (int[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = Integer.toUnsignedLong(arr[i]); - } - yield r; - } - case I64, U64 -> (long[]) data; - default -> throw new VortexException(EncodingId.FASTLANES_BITPACKED, "unsupported ptype: " + ptype); - }; - } private static long typeMask(int typeBits) { return typeBits == 64 ? 
-1L : (1L << typeBits) - 1L; diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java index 88482a063..083502ff8 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java @@ -2,15 +2,12 @@ import io.github.dfa1.vortex.core.DType; import io.github.dfa1.vortex.core.PType; -import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; -import io.github.dfa1.vortex.encoding.PTypeIO; +import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.proto.DeltaMetadata; import io.github.dfa1.vortex.proto.ScalarValue; import java.lang.foreign.MemorySegment; -import java.lang.foreign.SegmentAllocator; -import java.lang.foreign.ValueLayout; import java.nio.ByteBuffer; import java.util.List; @@ -40,7 +37,7 @@ public boolean accepts(DType dtype) { @Override public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { PType ptype = ((DType.Primitive) dtype).ptype(); - long[] longs = toLongs(data, ptype); + long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_DELTA); int n = longs.length; int typeBits = typeBits(ptype); int lanes = lanes(ptype); @@ -93,8 +90,8 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { System.arraycopy(chunkDelta, 0, deltasAll, chunk * FL_CHUNK_SIZE, FL_CHUNK_SIZE); } - MemorySegment basesSeg = fromLongs(basesAll, ptype, ctx.arena()); - MemorySegment deltasSeg = fromLongs(deltasAll, ptype, ctx.arena()); + MemorySegment basesSeg = PrimitiveArrays.fromLongs(basesAll, ptype, ctx.arena()); + MemorySegment deltasSeg = PrimitiveArrays.fromLongs(deltasAll, ptype, ctx.arena()); byte[] metaBytes = new DeltaMetadata(paddedLen, 0).encode(); @@ -120,61 +117,6 @@ private static void deltaChunk(long[] 
transposed, long[] bases, int lanes, int t } } - private static long[] toLongs(Object data, PType ptype) { - return switch (ptype) { - case I8 -> { - byte[] arr = (byte[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = arr[i]; - } - yield r; - } - case U8 -> { - byte[] arr = (byte[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = Byte.toUnsignedLong(arr[i]); - } - yield r; - } - case I16 -> { - short[] arr = (short[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = arr[i]; - } - yield r; - } - case U16 -> { - short[] arr = (short[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = Short.toUnsignedLong(arr[i]); - } - yield r; - } - case I32 -> { - int[] arr = (int[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = arr[i]; - } - yield r; - } - case U32 -> { - int[] arr = (int[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = Integer.toUnsignedLong(arr[i]); - } - yield r; - } - case I64, U64 -> (long[]) data; - default -> throw new VortexException(EncodingId.FASTLANES_DELTA, "unsupported ptype: " + ptype); - }; - } - private static boolean isUnsigned(PType ptype) { return switch (ptype) { case U8, U16, U32, U64 -> true; @@ -219,19 +161,5 @@ private static long typeMask(PType ptype) { return bits == 64 ? 
-1L : (1L << bits) - 1; } - private static MemorySegment fromLongs(long[] longs, PType ptype, SegmentAllocator arena) { - if (ptype == PType.I64 || ptype == PType.U64) { - MemorySegment dst = arena.allocate((long) longs.length * 8); - MemorySegment.copy(MemorySegment.ofArray(longs), ValueLayout.JAVA_LONG, 0L, dst, PTypeIO.LE_LONG, 0L, longs.length); - return dst; - } - int n = longs.length; - long elemSize = ptype.byteSize(); - MemorySegment seg = arena.allocate(n * elemSize); - for (int i = 0; i < n; i++) { - PTypeIO.set(seg, i * elemSize, ptype, longs[i]); - } - return seg; - } } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/FrameOfReferenceEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/FrameOfReferenceEncodingEncoder.java index aaa8f15bb..4ec66d5c6 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/FrameOfReferenceEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/FrameOfReferenceEncodingEncoder.java @@ -4,6 +4,7 @@ import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.encoding.PTypeIO; import io.github.dfa1.vortex.proto.ScalarValue; @@ -34,7 +35,7 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { throw new VortexException(EncodingId.FASTLANES_FOR, "expected primitive dtype, got " + dtype); } PType ptype = p.ptype(); - long[] longs = toLongs(data, ptype); + long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_FOR); int n = longs.length; long ref = computeRef(longs, n); @@ -52,7 +53,7 @@ public CascadeStep encodeCascade(DType dtype, Object data, EncodeContext encodeC throw new VortexException(EncodingId.FASTLANES_FOR, "expected primitive dtype, got " + dtype); } PType ptype = p.ptype(); - long[] longs = toLongs(data, ptype); + long[] longs 
= PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_FOR); int n = longs.length; long ref = computeRef(longs, n); @@ -130,36 +131,6 @@ private static Object residualsAsNativeArray(long[] longs, long ref, PType ptype }; } - private static long[] toLongs(Object data, PType ptype) { - return switch (ptype) { - case I8, U8 -> { - byte[] arr = (byte[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = ptype == PType.U8 ? Byte.toUnsignedLong(arr[i]) : arr[i]; - } - yield r; - } - case I16, U16 -> { - short[] arr = (short[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = ptype == PType.U16 ? Short.toUnsignedLong(arr[i]) : arr[i]; - } - yield r; - } - case I32, U32 -> { - int[] arr = (int[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = ptype == PType.U32 ? Integer.toUnsignedLong(arr[i]) : arr[i]; - } - yield r; - } - case I64, U64 -> (long[]) data; - default -> throw new VortexException(EncodingId.FASTLANES_FOR, "unsupported ptype: " + ptype); - }; - } private static MemorySegment toResidualBuffer(long[] longs, long ref, PType ptype, EncodeContext ctx) { int n = longs.length; diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java index ad9dd2b0b..532daffe3 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java @@ -4,6 +4,7 @@ import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.encoding.PTypeIO; import io.github.dfa1.vortex.proto.PatchedMetadata; @@ -62,7 +63,7 @@ static CascadeStep encodeCascade(DType 
dtype, Object data) { return CascadeStep.notApplicable(); } PType ptype = p.ptype(); - long[] longs = toLongs(data, ptype); + long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.VORTEX_PATCHED); int n = longs.length; if (n == 0) { return CascadeStep.notApplicable(); @@ -96,7 +97,7 @@ static EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { "expected primitive dtype, got " + dtype); } PType ptype = p.ptype(); - long[] longs = toLongs(data, ptype); + long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.VORTEX_PATCHED); int n = longs.length; PatchedData pd = computePatchedData(longs, ptype, n); @@ -269,60 +270,6 @@ private static Object fromLongs(long[] values, PType ptype) { }; } - private static long[] toLongs(Object data, PType ptype) { - return switch (ptype) { - case I8 -> { - byte[] a = (byte[]) data; - long[] r = new long[a.length]; - for (int i = 0; i < a.length; i++) { - r[i] = a[i]; - } - yield r; - } - case U8 -> { - byte[] a = (byte[]) data; - long[] r = new long[a.length]; - for (int i = 0; i < a.length; i++) { - r[i] = Byte.toUnsignedLong(a[i]); - } - yield r; - } - case I16 -> { - short[] a = (short[]) data; - long[] r = new long[a.length]; - for (int i = 0; i < a.length; i++) { - r[i] = a[i]; - } - yield r; - } - case U16 -> { - short[] a = (short[]) data; - long[] r = new long[a.length]; - for (int i = 0; i < a.length; i++) { - r[i] = Short.toUnsignedLong(a[i]); - } - yield r; - } - case I32 -> { - int[] a = (int[]) data; - long[] r = new long[a.length]; - for (int i = 0; i < a.length; i++) { - r[i] = a[i]; - } - yield r; - } - case U32 -> { - int[] a = (int[]) data; - long[] r = new long[a.length]; - for (int i = 0; i < a.length; i++) { - r[i] = Integer.toUnsignedLong(a[i]); - } - yield r; - } - case I64, U64 -> (long[]) data; - default -> throw new VortexException(EncodingId.VORTEX_PATCHED, "unsupported ptype: " + ptype); - }; - } } @SuppressWarnings("java:S6218") // internal data carrier; array fields are not 
compared for equality From 561ae8135ccefb8e0cf85be867f194b90c293ba0 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Mon, 22 Jun 2026 22:59:10 +0200 Subject: [PATCH 2/2] refactor(core): add PType.isUnsigned, drop 3 private copies; test new helpers isUnsigned(ptype) was copy-pasted in the Delta, Bitpacked and FrameOfReference encoders. It is exactly the complement of the existing PType.isSigned() (every non-unsigned ptype is a signed integer or floating-point), so add PType.isUnsigned() returning !isSigned() and route the call sites through it. Tests: - PTypeTest: isUnsigned true/false partitions + an exact-complement-of-isSigned property over every enum constant. - PrimitiveArraysTest (new, for the prior extraction): toLongs sign/zero-extension per width, I64 passthrough identity, floating-ptype throw carrying the caller's EncodingId; fromLongs round-trip through toLongs, little-endian I64 layout, and low-byte-only narrowing. Ground truth green: JavaWritesRustReads (213) + JavaRoundTrip. 
Co-Authored-By: Claude Opus 4.8 --- .../io/github/dfa1/vortex/core/PType.java | 8 + .../io/github/dfa1/vortex/core/PTypeTest.java | 21 +++ .../vortex/encoding/PrimitiveArraysTest.java | 170 ++++++++++++++++++ .../encode/BitpackedEncodingEncoder.java | 11 +- .../writer/encode/DeltaEncodingEncoder.java | 11 +- .../FrameOfReferenceEncodingEncoder.java | 8 +- 6 files changed, 204 insertions(+), 25 deletions(-) create mode 100644 core/src/test/java/io/github/dfa1/vortex/encoding/PrimitiveArraysTest.java diff --git a/core/src/main/java/io/github/dfa1/vortex/core/PType.java b/core/src/main/java/io/github/dfa1/vortex/core/PType.java index a49174020..410527419 100644 --- a/core/src/main/java/io/github/dfa1/vortex/core/PType.java +++ b/core/src/main/java/io/github/dfa1/vortex/core/PType.java @@ -56,6 +56,14 @@ public boolean isSigned() { || this == F16 || this == F32 || this == F64; } + /// Returns `true` for the unsigned integer types (`U8`–`U64`) — the complement of + /// [#isSigned()], since every non-unsigned ptype is either a signed integer or floating-point. + /// + /// @return `true` if this ptype is an unsigned integer + public boolean isUnsigned() { + return !isSigned(); + } + /// Returns the [PType] for the given enum ordinal — the integer value the wire format /// uses to identify a physical type. 
/// diff --git a/core/src/test/java/io/github/dfa1/vortex/core/PTypeTest.java b/core/src/test/java/io/github/dfa1/vortex/core/PTypeTest.java index 74ce819e0..8ed1338ce 100644 --- a/core/src/test/java/io/github/dfa1/vortex/core/PTypeTest.java +++ b/core/src/test/java/io/github/dfa1/vortex/core/PTypeTest.java @@ -51,6 +51,27 @@ void isSigned_falseForUnsigned(PType ptype) { assertThat(ptype.isSigned()).isFalse(); } + @ParameterizedTest + @EnumSource(value = PType.class, names = {"U8", "U16", "U32", "U64"}) + void isUnsigned_trueForUnsigned(PType ptype) { + // Given / When / Then + assertThat(ptype.isUnsigned()).isTrue(); + } + + @ParameterizedTest + @EnumSource(value = PType.class, names = {"I8", "I16", "I32", "I64", "F16", "F32", "F64"}) + void isUnsigned_falseForSignedAndFloats(PType ptype) { + // Given / When / Then + assertThat(ptype.isUnsigned()).isFalse(); + } + + @ParameterizedTest + @EnumSource(PType.class) + void isUnsigned_isExactComplementOfIsSigned(PType ptype) { + // Given / When / Then — the two must partition every ptype; isUnsigned is defined as !isSigned + assertThat(ptype.isUnsigned()).isNotEqualTo(ptype.isSigned()); + } + @ParameterizedTest @EnumSource(PType.class) void fromOrdinal_roundTrips(PType ptype) { diff --git a/core/src/test/java/io/github/dfa1/vortex/encoding/PrimitiveArraysTest.java b/core/src/test/java/io/github/dfa1/vortex/encoding/PrimitiveArraysTest.java new file mode 100644 index 000000000..c2f3cceaf --- /dev/null +++ b/core/src/test/java/io/github/dfa1/vortex/encoding/PrimitiveArraysTest.java @@ -0,0 +1,170 @@ +package io.github.dfa1.vortex.encoding; + +import io.github.dfa1.vortex.core.PType; +import io.github.dfa1.vortex.core.VortexException; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.ValueLayout; + +import static 
org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class PrimitiveArraysTest { + + @Test + void toLongs_i8_signExtends() { + // Given a byte array with a negative value + byte[] data = {0, 1, -1, Byte.MIN_VALUE, Byte.MAX_VALUE}; + + // When + long[] result = PrimitiveArrays.toLongs(data, PType.I8, EncodingId.FASTLANES_DELTA); + + // Then negatives sign-extend to 64 bits + assertThat(result).containsExactly(0L, 1L, -1L, -128L, 127L); + } + + @Test + void toLongs_u8_zeroExtends() { + // Given a byte array whose high bit is set (would be negative if signed) + byte[] data = {0, 1, -1, Byte.MIN_VALUE}; + + // When + long[] result = PrimitiveArrays.toLongs(data, PType.U8, EncodingId.FASTLANES_DELTA); + + // Then the raw byte is zero-extended into 0..255 + assertThat(result).containsExactly(0L, 1L, 255L, 128L); + } + + @Test + void toLongs_i16_signExtends() { + // Given + short[] data = {0, -1, Short.MIN_VALUE, Short.MAX_VALUE}; + + // When + long[] result = PrimitiveArrays.toLongs(data, PType.I16, EncodingId.FASTLANES_DELTA); + + // Then + assertThat(result).containsExactly(0L, -1L, -32768L, 32767L); + } + + @Test + void toLongs_u16_zeroExtends() { + // Given a value with the high bit set + short[] data = {-1, Short.MIN_VALUE}; + + // When + long[] result = PrimitiveArrays.toLongs(data, PType.U16, EncodingId.FASTLANES_DELTA); + + // Then zero-extended into 0..65535 + assertThat(result).containsExactly(65535L, 32768L); + } + + @Test + void toLongs_i32_signExtends() { + // Given + int[] data = {0, -1, Integer.MIN_VALUE, Integer.MAX_VALUE}; + + // When + long[] result = PrimitiveArrays.toLongs(data, PType.I32, EncodingId.FASTLANES_DELTA); + + // Then + assertThat(result).containsExactly(0L, -1L, (long) Integer.MIN_VALUE, (long) Integer.MAX_VALUE); + } + + @Test + void toLongs_u32_zeroExtends() { + // Given a value with the high bit set + int[] data = {-1, Integer.MIN_VALUE}; + + // When + long[] result = 
PrimitiveArrays.toLongs(data, PType.U32, EncodingId.FASTLANES_DELTA); + + // Then zero-extended into 0..2^32-1 + assertThat(result).containsExactly(0xFFFF_FFFFL, 0x8000_0000L); + } + + @Test + void toLongs_i64_returnsSameArrayNoCopy() { + // Given a long array + long[] data = {1L, -1L, Long.MIN_VALUE, Long.MAX_VALUE}; + + // When + long[] result = PrimitiveArrays.toLongs(data, PType.I64, EncodingId.FASTLANES_DELTA); + + // Then the I64/U64 path is a passthrough — no copy + assertThat(result).isSameAs(data); + } + + @ParameterizedTest + @EnumSource(value = PType.class, names = {"F16", "F32", "F64"}) + void toLongs_floatingPtypes_throwWithSuppliedEncodingId(PType ptype) { + // Given floating ptypes are not integer-widen targets; When/Then it throws, attributed to + // the caller's encoding id (here FrameOfReference) rather than a hardcoded one + assertThatThrownBy(() -> PrimitiveArrays.toLongs(new float[1], ptype, EncodingId.FASTLANES_FOR)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("unsupported ptype: " + ptype); + } + + @ParameterizedTest + @EnumSource(value = PType.class, names = {"I8", "U8", "I16", "U16", "I32", "U32", "I64", "U64"}) + void fromLongs_roundTripsThroughToLongs(PType ptype) { + // Given values that exercise the low bytes at each width + long[] original = {0L, 1L, 2L, 7L, 42L}; + + try (Arena arena = Arena.ofConfined()) { + // When written to a segment and read back at the ptype's width + MemorySegment seg = PrimitiveArrays.fromLongs(original, ptype, arena); + + // Then the segment has one element per value at the expected width... 
+ assertThat(seg.byteSize()).isEqualTo((long) original.length * ptype.byteSize()); + // ...and each element round-trips (values are small + positive, so width-narrowing is lossless) + for (int i = 0; i < original.length; i++) { + assertThat(readElement(seg, ptype, i)).isEqualTo(original[i]); + } + } + } + + @Test + void fromLongs_i64_writesLittleEndian() { + // Given a single value with distinct bytes + long[] original = {0x0102_0304_0506_0708L}; + + try (Arena arena = Arena.ofConfined()) { + // When written via the bulk I64 path + MemorySegment seg = PrimitiveArrays.fromLongs(original, PType.I64, arena); + + // Then it is stored little-endian (lowest byte first) + assertThat(seg.get(ValueLayout.JAVA_BYTE, 0)).isEqualTo((byte) 0x08); + assertThat(seg.getAtIndex(PTypeIO.LE_LONG, 0)).isEqualTo(0x0102_0304_0506_0708L); + } + } + + @Test + void fromLongs_narrowWidth_keepsOnlyLowBytes() { + // Given a value whose high bytes exceed the target width + long[] original = {0x1234_5678L}; + + try (Arena arena = Arena.ofConfined()) { + // When narrowed to I8 (1 byte/elem) + MemorySegment seg = PrimitiveArrays.fromLongs(original, PType.I8, arena); + + // Then only the low byte survives + assertThat(seg.byteSize()).isEqualTo(1L); + assertThat(seg.get(ValueLayout.JAVA_BYTE, 0)).isEqualTo((byte) 0x78); + } + } + + private static long readElement(MemorySegment seg, PType ptype, int i) { + return switch (ptype) { + case I8, U8 -> seg.get(ValueLayout.JAVA_BYTE, i); + case I16, U16 -> seg.getAtIndex(PTypeIO.LE_SHORT, i); + case I32, U32 -> seg.getAtIndex(PTypeIO.LE_INT, i); + case I64, U64 -> seg.getAtIndex(PTypeIO.LE_LONG, i); + default -> throw new IllegalArgumentException("not an integer ptype: " + ptype); + }; + } +} diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java index b0e74bbf6..b962126eb 100644 --- 
a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java @@ -48,7 +48,7 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { int n = longs.length; int typeBits = ptype.byteSize() * 8; long typeMask = typeMask(typeBits); - boolean unsign = isUnsigned(ptype); + boolean unsign = ptype.isUnsigned(); long signedMin = 0L; long signedMax = 0L; @@ -243,15 +243,8 @@ private static long typeMask(int typeBits) { return typeBits == 64 ? -1L : (1L << typeBits) - 1L; } - private static boolean isUnsigned(PType ptype) { - return switch (ptype) { - case U8, U16, U32, U64 -> true; - default -> false; - }; - } - private static byte[] statsBytes(PType ptype, long value) { - if (isUnsigned(ptype)) { + if (ptype.isUnsigned()) { return ScalarValue.ofUint64Value(value).encode(); } return ScalarValue.ofInt64Value(value).encode(); diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java index 083502ff8..7c093b506 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java @@ -42,7 +42,7 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { int typeBits = typeBits(ptype); int lanes = lanes(ptype); long mask = typeMask(ptype); - boolean unsign = isUnsigned(ptype); + boolean unsign = ptype.isUnsigned(); long minVal = 0L; long maxVal = 0L; @@ -117,15 +117,8 @@ private static void deltaChunk(long[] transposed, long[] bases, int lanes, int t } } - private static boolean isUnsigned(PType ptype) { - return switch (ptype) { - case U8, U16, U32, U64 -> true; - default -> false; - }; - } - private static byte[] statsBytes(PType ptype, long value) { - if (isUnsigned(ptype)) { + if 
(ptype.isUnsigned()) { return ScalarValue.ofUint64Value(value).encode(); } return ScalarValue.ofInt64Value(value).encode(); diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/FrameOfReferenceEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/FrameOfReferenceEncodingEncoder.java index 4ec66d5c6..a1e895575 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/FrameOfReferenceEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/FrameOfReferenceEncodingEncoder.java @@ -60,7 +60,7 @@ public CascadeStep encodeCascade(DType dtype, Object data, EncodeContext encodeC // Skip when ref == 0 and ptype is unsigned: residuals == input, so FOR adds metadata // overhead (ref scalar + extra node) for zero compression benefit over plain bitpack. // Matches Rust IntFoRScheme's skip estimate for this case. - if (ref == 0L && isUnsigned(ptype)) { + if (ref == 0L && ptype.isUnsigned()) { return CascadeStep.notApplicable(); } ByteBuffer meta = buildForMeta(ref, ptype); @@ -70,12 +70,6 @@ public CascadeStep encodeCascade(DType dtype, Object data, EncodeContext encodeC return new CascadeStep(partialRoot, List.of(), List.of(slot), null, null, true); } - private static boolean isUnsigned(PType ptype) { - return switch (ptype) { - case U8, U16, U32, U64 -> true; - default -> false; - }; - } private static long computeRef(long[] longs, int n) { long ref = n > 0 ? longs[0] : 0L;