From 277d0313fb4370686e879628ffa99de77e4807b2 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Mon, 22 Jun 2026 23:10:47 +0200 Subject: [PATCH 1/4] docs(todo): add ticket to optimize FastLanes.transposeIndex hot-loop math Co-Authored-By: Claude Opus 4.8 --- TODO.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TODO.md b/TODO.md index 0e30d8c60..6e2622c32 100644 --- a/TODO.md +++ b/TODO.md @@ -11,6 +11,10 @@ - [ ] Performance tests must be peer reviewed - [ ] Run performance tests on other machines (I have access only to Apple M5) +- [ ] **Optimize `FastLanes.transposeIndex(int)`** — per-element `%`/`/` violate the hot-loop rule; + called once per element in the delta transpose loops (`DeltaEncodingDecoder`, `DeltaEncodingEncoder`). + Divisors are power-of-two constants (16/8/128); replace with shifts/masks or a precomputed + 1024-entry permutation table. Profile first, benchmark both. - [ ] **Vector API adoption** — deferred; see [ADR-0005](docs/adr/0005-vector-api-adoption.md) for adoption criteria and candidate loops. ## Security From dd72870f23b05dd65705ac68f52743109a385370 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Mon, 22 Jun 2026 23:11:20 +0200 Subject: [PATCH 2/4] docs(todo): include iterateIndex in FastLanes transpose optimization ticket Co-Authored-By: Claude Opus 4.8 --- TODO.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TODO.md b/TODO.md index 6e2622c32..3194d9539 100644 --- a/TODO.md +++ b/TODO.md @@ -11,10 +11,10 @@ - [ ] Performance tests must be peer reviewed - [ ] Run performance tests on other machines (I have access only to Apple M5) -- [ ] **Optimize `FastLanes.transposeIndex(int)`** — per-element `%`/`/` violate the hot-loop rule; - called once per element in the delta transpose loops (`DeltaEncodingDecoder`, `DeltaEncodingEncoder`). - Divisors are power-of-two constants (16/8/128); replace with shifts/masks or a precomputed - 1024-entry permutation table. Profile first, benchmark both. +- [ ] **Optimize `FastLanes.transposeIndex(int)` / `iterateIndex(int, int)`** — per-element `%`/`/` + violate the hot-loop rule; called once per element in the delta transpose loops + (`DeltaEncodingDecoder`, `DeltaEncodingEncoder`). Divisors are power-of-two constants (16/8/128); + replace with shifts/masks or a precomputed permutation table. Profile first, benchmark both. - [ ] **Vector API adoption** — deferred; see [ADR-0005](docs/adr/0005-vector-api-adoption.md) for adoption criteria and candidate loops. ## Security From daf0660fb138fd86fad45ac948d9ea4f2ed11efa Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Mon, 22 Jun 2026 23:14:44 +0200 Subject: [PATCH 3/4] refactor(encoding): extract shared FastLanes layout + PType.bits to core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FL_CHUNK_SIZE/FL_ORDER + transposeIndex/iterateIndex/lanes were duplicated in the Delta encoder and decoder; the low-bit typeMask and the byteSize*8 width were also copy-pasted across Delta, Bitpacked and Patched. Pull the FastLanes layout into a shared core.encoding.FastLanes (CHUNK, transposeIndex, iterateIndex, lanes, lowMask) and add PType.bits() for the width. Cross-module (reader + writer) so the home is core, mirroring PTypeIO. Hot paths deliberately untouched: Bitpacked keeps its own FL_ORDER constant and the unrolled pack/unpack kernels are byte-identical — only the cold per-call typeMask / width setup now routes through FastLanes. Delta's transposeIndex/iterateIndex were already standalone static calls, so moving them across a class boundary does not change inlining. Pco is excluded (not FastLanes-family, perf-critical). Ground truth green both directions: RustWritesJavaReads (12), JavaWritesRustReads (213), JavaRoundTrip. Co-Authored-By: Claude Opus 4.8 --- .../io/github/dfa1/vortex/core/PType.java | 7 +++ .../dfa1/vortex/encoding/FastLanes.java | 63 +++++++++++++++++++ .../decode/BitpackedEncodingDecoder.java | 2 +- .../reader/decode/DeltaEncodingDecoder.java | 58 +++++------------ .../encode/BitpackedEncodingEncoder.java | 10 ++- .../writer/encode/DeltaEncodingEncoder.java | 62 +++++------------- .../writer/encode/PatchedEncodingEncoder.java | 5 +- 7 files changed, 109 insertions(+), 98 deletions(-) create mode 100644 core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java diff --git a/core/src/main/java/io/github/dfa1/vortex/core/PType.java b/core/src/main/java/io/github/dfa1/vortex/core/PType.java index 410527419..86c2f8a75 100644 --- a/core/src/main/java/io/github/dfa1/vortex/core/PType.java +++ b/core/src/main/java/io/github/dfa1/vortex/core/PType.java @@ -41,6 +41,13 @@ public int byteSize() { }; } + /// Number of bits per element — [#byteSize()] times eight (8, 16, 32, or 64). + /// + /// @return the bit width of this physical type + public int bits() { + return byteSize() * 8; + } + /// Returns `true` for `F16`, `F32`, and `F64`. /// /// @return `true` if this ptype is a floating-point type diff --git a/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java b/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java new file mode 100644 index 000000000..2cb8080b6 --- /dev/null +++ b/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java @@ -0,0 +1,63 @@ +package io.github.dfa1.vortex.encoding; + +import io.github.dfa1.vortex.core.PType; + +/// Shared FastLanes layout constants and index math used by the bit-packing and delta encodings on +/// both the read and write sides. +/// +/// FastLanes processes values in fixed 1024-element chunks ([#CHUNK]) arranged into an interleaved +/// lane order ([#ORDER]) so that the unpack inner loop is data-parallel. [#transposeIndex(int)] and +/// [#iterateIndex(int, int)] map between the logical element order and that interleaved layout; +/// [#lanes(PType)] is the lane count for a width and [#lowMask(int)] the low-`bits` value mask. +/// +/// Mirrors the reference layout in `spiraldb/fastlanes` (`src/macros.rs`). +public final class FastLanes { + + /// Number of elements per FastLanes chunk. + public static final int CHUNK = 1024; + + /// The FastLanes transpose order — the lane permutation applied within each 8-row group. + private static final int[] ORDER = {0, 4, 2, 6, 1, 5, 3, 7}; + + private FastLanes() { + } + + /// Maps a logical element index to its position in the transposed (interleaved-lane) layout. + /// + /// @param idx logical element index within a chunk, in `[0, CHUNK)` + /// @return the corresponding index in the transposed buffer + public static int transposeIndex(int idx) { + int lane = idx % 16; + int order = (idx / 16) % 8; + int row = idx / 128; + return lane * 64 + ORDER[order] * 8 + row; + } + + /// Computes the logical element index visited at the given `row` and `lane` of the FastLanes + /// iteration order — the inverse mapping used while packing or unpacking. + /// + /// @param row the row within the chunk + /// @param lane the lane within the row + /// @return the logical element index + public static int iterateIndex(int row, int lane) { + int o = row / 8; + int s = row % 8; + return ORDER[o] * 16 + s * 128 + lane; + } + + /// Returns the FastLanes lane count for `ptype` — [#CHUNK] divided by the type's bit width. + /// + /// @param ptype the physical type being packed + /// @return the number of lanes + public static int lanes(PType ptype) { + return CHUNK / ptype.bits(); + } + + /// Returns a mask selecting the low `bits` of a `long` (all ones when `bits == 64`). + /// + /// @param bits the number of low bits to keep, in `[1, 64]` + /// @return the low-`bits` mask + public static long lowMask(int bits) { + return bits == 64 ? -1L : (1L << bits) - 1; + } +} diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java index 538b03f4c..49c0bfb1e 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java @@ -64,7 +64,7 @@ public Array decode(DecodeContext ctx) { int bitWidth = meta.bit_width(); int offset = meta.offset(); PType ptype = ((DType.Primitive) ctx.dtype()).ptype(); - int typeBits = ptype.byteSize() * 8; + int typeBits = ptype.bits(); long rowCount = ctx.rowCount(); MemorySegment packed = ctx.buffer(0); diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java index 028cb999b..491cfdca5 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java @@ -4,6 +4,7 @@ import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.encoding.FastLanes; import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.encoding.PTypeIO; import io.github.dfa1.vortex.proto.DeltaMetadata; @@ -58,9 +59,9 @@ public Array decode(DecodeContext ctx) { PType ptype = ((DType.Primitive) ctx.dtype()).ptype(); long rowCount = ctx.rowCount(); - int typeBits = typeBits(ptype); - int lanes = lanes(ptype); - long mask = typeMask(ptype); + int typeBits = ptype.bits(); + int lanes = FastLanes.lanes(ptype); + long mask = FastLanes.lowMask(ptype.bits()); long deltasLen = meta.deltas_len(); int offset = meta.offset(); @@ -76,32 +77,32 @@ public Array decode(DecodeContext ctx) { }; } - long basesLen = (deltasLen / FL_CHUNK_SIZE) * lanes; + long basesLen = (deltasLen / FastLanes.CHUNK) * lanes; DType dtype = ctx.dtype(); long[] basesAll = readLongs(ctx.decodeChildSegment(0, dtype, basesLen), (int) basesLen, ptype); long[] deltasAll = readLongs(ctx.decodeChildSegment(1, dtype, deltasLen), (int) deltasLen, ptype); - int numChunks = (int) (deltasLen / FL_CHUNK_SIZE); + int numChunks = (int) (deltasLen / FastLanes.CHUNK); long[] decoded = new long[(int) deltasLen]; - long[] untransposedChunk = new long[FL_CHUNK_SIZE]; + long[] untransposedChunk = new long[FastLanes.CHUNK]; long[] chunkBases = new long[lanes]; - long[] chunkDeltas = new long[FL_CHUNK_SIZE]; - long[] chunkUndelta = new long[FL_CHUNK_SIZE]; + long[] chunkDeltas = new long[FastLanes.CHUNK]; + long[] chunkUndelta = new long[FastLanes.CHUNK]; for (int chunk = 0; chunk < numChunks; chunk++) { int basesOff = chunk * lanes; - int deltaOff = chunk * FL_CHUNK_SIZE; + int deltaOff = chunk * FastLanes.CHUNK; System.arraycopy(basesAll, basesOff, chunkBases, 0, lanes); - System.arraycopy(deltasAll, deltaOff, chunkDeltas, 0, FL_CHUNK_SIZE); + System.arraycopy(deltasAll, deltaOff, chunkDeltas, 0, FastLanes.CHUNK); undeltaChunk(chunkDeltas, chunkBases, lanes, typeBits, mask, chunkUndelta); - for (int i = 0; i < FL_CHUNK_SIZE; i++) { - untransposedChunk[transposeIndex(i)] = chunkUndelta[i]; + for (int i = 0; i < FastLanes.CHUNK; i++) { + untransposedChunk[FastLanes.transposeIndex(i)] = chunkUndelta[i]; } - System.arraycopy(untransposedChunk, 0, decoded, deltaOff, FL_CHUNK_SIZE); + System.arraycopy(untransposedChunk, 0, decoded, deltaOff, FastLanes.CHUNK); } long[] result = new long[(int) rowCount]; @@ -121,7 +122,7 @@ private static void undeltaChunk(long[] deltas, long[] bases, int lanes, int typ for (int lane = 0; lane < lanes; lane++) { long prev = bases[lane] & mask; for (int row = 0; row < typeBits; row++) { - int idx = iterateIndex(row, lane); + int idx = FastLanes.iterateIndex(row, lane); long next = ((deltas[idx] & mask) + prev) & mask; out[idx] = next; prev = next; @@ -149,34 +150,5 @@ private static long[] readLongs(MemorySegment buf, int count, PType ptype) { return out; } - private static final int FL_CHUNK_SIZE = 1024; - - private static final int[] FL_ORDER = {0, 4, 2, 6, 1, 5, 3, 7}; - - private static int transposeIndex(int idx) { - int lane = idx % 16; - int order = (idx / 16) % 8; - int row = idx / 128; - return lane * 64 + FL_ORDER[order] * 8 + row; - } - - private static int iterateIndex(int row, int lane) { - int o = row / 8; - int s = row % 8; - return FL_ORDER[o] * 16 + s * 128 + lane; - } - - private static int lanes(PType ptype) { - return FL_CHUNK_SIZE / (ptype.byteSize() * 8); - } - - private static int typeBits(PType ptype) { - return ptype.byteSize() * 8; - } - - private static long typeMask(PType ptype) { - int bits = ptype.byteSize() * 8; - return bits == 64 ? -1L : (1L << bits) - 1; - } } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java index b962126eb..0cdb12beb 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java @@ -4,6 +4,7 @@ import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.encoding.FastLanes; import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.encoding.PTypeIO; import io.github.dfa1.vortex.proto.BitPackedMetadata; @@ -46,8 +47,8 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { PType ptype = ((DType.Primitive) dtype).ptype(); long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_BITPACKED); int n = longs.length; - int typeBits = ptype.byteSize() * 8; - long typeMask = typeMask(typeBits); + int typeBits = ptype.bits(); + long typeMask = FastLanes.lowMask(typeBits); boolean unsign = ptype.isUnsigned(); long signedMin = 0L; @@ -198,7 +199,7 @@ private static MemorySegment packFastLanes(long[] values, int n, int bitWidth, i int lanes = 1024 / typeBits; int wordBytes = typeBits / 8; int blockCount = (n + 1023) / 1024; - long typeMask = typeMask(typeBits); + long typeMask = FastLanes.lowMask(typeBits); // Mask values to the chosen bit width so over-cap entries (handled separately as // patches) don't spill into the next row's region in the packed layout. long widthMask = bitWidth >= 64 ? -1L : (1L << bitWidth) - 1L; @@ -239,9 +240,6 @@ private static MemorySegment packFastLanes(long[] values, int n, int bitWidth, i } - private static long typeMask(int typeBits) { - return typeBits == 64 ? -1L : (1L << typeBits) - 1L; - } private static byte[] statsBytes(PType ptype, long value) { if (ptype.isUnsigned()) { diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java index 7c093b506..3ea88778a 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java @@ -3,6 +3,7 @@ import io.github.dfa1.vortex.core.DType; import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.encoding.FastLanes; import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.proto.DeltaMetadata; import io.github.dfa1.vortex.proto.ScalarValue; @@ -39,9 +40,9 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { PType ptype = ((DType.Primitive) dtype).ptype(); long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_DELTA); int n = longs.length; - int typeBits = typeBits(ptype); - int lanes = lanes(ptype); - long mask = typeMask(ptype); + int typeBits = ptype.bits(); + int lanes = FastLanes.lanes(ptype); + long mask = FastLanes.lowMask(ptype.bits()); boolean unsign = ptype.isUnsigned(); long minVal = 0L; @@ -60,34 +61,34 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { } } - int numChunks = n == 0 ? 0 : (n + FL_CHUNK_SIZE - 1) / FL_CHUNK_SIZE; - long paddedLen = (long) numChunks * FL_CHUNK_SIZE; + int numChunks = n == 0 ? 0 : (n + FastLanes.CHUNK - 1) / FastLanes.CHUNK; + long paddedLen = (long) numChunks * FastLanes.CHUNK; int basesLen = numChunks * lanes; long[] basesAll = new long[basesLen]; long[] deltasAll = new long[(int) paddedLen]; - long[] chunkBuf = new long[FL_CHUNK_SIZE]; - long[] transposed = new long[FL_CHUNK_SIZE]; + long[] chunkBuf = new long[FastLanes.CHUNK]; + long[] transposed = new long[FastLanes.CHUNK]; long[] chunkBases = new long[lanes]; - long[] chunkDelta = new long[FL_CHUNK_SIZE]; + long[] chunkDelta = new long[FastLanes.CHUNK]; for (int chunk = 0; chunk < numChunks; chunk++) { - int start = chunk * FL_CHUNK_SIZE; - int end = Math.min(start + FL_CHUNK_SIZE, n); + int start = chunk * FastLanes.CHUNK; + int end = Math.min(start + FastLanes.CHUNK, n); for (int i = start; i < end; i++) { chunkBuf[i - start] = longs[i] & mask; } - for (int i = end - start; i < FL_CHUNK_SIZE; i++) { + for (int i = end - start; i < FastLanes.CHUNK; i++) { chunkBuf[i] = 0L; } - for (int i = 0; i < FL_CHUNK_SIZE; i++) { - transposed[i] = chunkBuf[transposeIndex(i)]; + for (int i = 0; i < FastLanes.CHUNK; i++) { + transposed[i] = chunkBuf[FastLanes.transposeIndex(i)]; } int basesOff = chunk * lanes; System.arraycopy(transposed, 0, basesAll, basesOff, lanes); System.arraycopy(basesAll, basesOff, chunkBases, 0, lanes); deltaChunk(transposed, chunkBases, lanes, typeBits, mask, chunkDelta); - System.arraycopy(chunkDelta, 0, deltasAll, chunk * FL_CHUNK_SIZE, FL_CHUNK_SIZE); + System.arraycopy(chunkDelta, 0, deltasAll, chunk * FastLanes.CHUNK, FastLanes.CHUNK); } MemorySegment basesSeg = PrimitiveArrays.fromLongs(basesAll, ptype, ctx.arena()); @@ -109,7 +110,7 @@ private static void deltaChunk(long[] transposed, long[] bases, int lanes, int t for (int lane = 0; lane < lanes; lane++) { long prev = bases[lane] & mask; for (int row = 0; row < typeBits; row++) { - int idx = iterateIndex(row, lane); + int idx = FastLanes.iterateIndex(row, lane); long next = transposed[idx] & mask; out[idx] = (next - prev) & mask; prev = next; @@ -124,35 +125,4 @@ private static byte[] statsBytes(PType ptype, long value) { return ScalarValue.ofInt64Value(value).encode(); } - private static final int FL_CHUNK_SIZE = 1024; - - private static final int[] FL_ORDER = {0, 4, 2, 6, 1, 5, 3, 7}; - - private static int transposeIndex(int idx) { - int lane = idx % 16; - int order = (idx / 16) % 8; - int row = idx / 128; - return lane * 64 + FL_ORDER[order] * 8 + row; - } - - private static int iterateIndex(int row, int lane) { - int o = row / 8; - int s = row % 8; - return FL_ORDER[o] * 16 + s * 128 + lane; - } - - private static int lanes(PType ptype) { - return FL_CHUNK_SIZE / (ptype.byteSize() * 8); - } - - private static int typeBits(PType ptype) { - return ptype.byteSize() * 8; - } - - private static long typeMask(PType ptype) { - int bits = ptype.byteSize() * 8; - return bits == 64 ? -1L : (1L << bits) - 1; - } - - } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java index 532daffe3..d430f4511 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java @@ -4,6 +4,7 @@ import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.encoding.FastLanes; import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.encoding.PTypeIO; import io.github.dfa1.vortex.proto.PatchedMetadata; @@ -140,8 +141,8 @@ static EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { } private static PatchedData computePatchedData(long[] longs, PType ptype, int n) { - int typeBits = ptype.byteSize() * 8; - long typeMask = typeBits == 64 ? -1L : (1L << typeBits) - 1L; + int typeBits = ptype.bits(); + long typeMask = FastLanes.lowMask(typeBits); int elemBytes = ptype.byteSize(); int[] bitWidthFreq = new int[typeBits + 1]; From 68778f09f9e5714e45d9160c5bb9f0ad13aac26f Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Mon, 22 Jun 2026 23:18:15 +0200 Subject: [PATCH 4/4] refactor(writer): delegate RleEncodingEncoder.toLongs integer arms to PrimitiveArrays Rle.toLongs is a superset of PrimitiveArrays.toLongs: identical widen for the eight integer ptypes, plus F32/F64/F16 raw-bit packing unique to RLE. The integer half was a verbatim copy. Keep the float/f16 arms local and route every other ptype through PrimitiveArrays.toLongs via the switch default (floats are matched first, so the default only ever sees integers). Drops ~48 duplicated lines. Ground truth green: JavaWritesRustReads (213). Co-Authored-By: Claude Opus 4.8 --- .../writer/encode/RleEncodingEncoder.java | 52 ++----------------- 1 file changed, 3 insertions(+), 49 deletions(-) diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/RleEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/RleEncodingEncoder.java index d808dbd24..a291dd960 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/RleEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/RleEncodingEncoder.java @@ -4,6 +4,7 @@ import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.encoding.PrimitiveArrays; import io.github.dfa1.vortex.encoding.PTypeIO; import io.github.dfa1.vortex.proto.RLEMetadata; @@ -149,55 +150,6 @@ private static EncodeResult encodeEmpty(EncodeContext ctx) { private static long[] toLongs(Object data, PType ptype) { return switch (ptype) { - case I8 -> { - byte[] arr = (byte[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = arr[i]; - } - yield r; - } - case U8 -> { - byte[] arr = (byte[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = Byte.toUnsignedLong(arr[i]); - } - yield r; - } - case I16 -> { - short[] arr = (short[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = arr[i]; - } - yield r; - } - case U16 -> { - short[] arr = (short[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = Short.toUnsignedLong(arr[i]); - } - yield r; - } - case I32 -> { - int[] arr = (int[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = arr[i]; - } - yield r; - } - case U32 -> { - int[] arr = (int[]) data; - long[] r = new long[arr.length]; - for (int i = 0; i < arr.length; i++) { - r[i] = Integer.toUnsignedLong(arr[i]); - } - yield r; - } - case I64, U64 -> (long[]) data; case F32 -> { float[] arr = (float[]) data; long[] r = new long[arr.length]; @@ -222,6 +174,8 @@ private static long[] toLongs(Object data, PType ptype) { } yield r; } + // Integer ptypes share the standard widen; floats above keep RLE's raw-bit packing. + default -> PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_RLE); }; }