dfa1 · dfa1 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/TODO.md b/TODO.md
@@ -11,6 +11,10 @@
 
 - [ ] Performance tests must be peer reviewed
 - [ ] Run performance tests on other machines (I have access only to Apple M5)
+- [ ] **Optimize `FastLanes.transposeIndex(int)` / `iterateIndex(int, int)`** — the per-element `%`/`/`
+  operations violate the hot-loop rule; they run once per element in the delta transpose and delta/undelta
+  loops (`DeltaEncodingDecoder`, `DeltaEncodingEncoder`). Divisors are power-of-two constants (16/8/128), so
+  for non-negative indices shifts/masks or a precomputed permutation table work. Profile first, benchmark both.
 - [ ] **Vector API adoption** — deferred; see [ADR-0005](docs/adr/0005-vector-api-adoption.md) for adoption criteria and candidate loops.
 
 ## Security

diff --git a/core/src/main/java/io/github/dfa1/vortex/core/PType.java b/core/src/main/java/io/github/dfa1/vortex/core/PType.java
@@ -41,6 +41,13 @@ public int byteSize() {
         };
     }
 
+    /// Returns the number of bits per element — [#byteSize()] times eight (8, 16, 32, or 64).
+    ///
+    /// @return the bit width of this physical type, i.e. `byteSize() * 8`
+    public int bits() {
+        return byteSize() * 8;
+    }
+
     /// Returns `true` for `F16`, `F32`, and `F64`.
     ///
     /// @return `true` if this ptype is a floating-point type

diff --git a/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java b/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java
@@ -0,0 +1,63 @@
+package io.github.dfa1.vortex.encoding;
+
+import io.github.dfa1.vortex.core.PType;
+
+/// Shared FastLanes layout constants and index math used by the bit-packing and delta encodings on
+/// both the read and write sides.
+///
+/// FastLanes processes values in fixed 1024-element chunks ([#CHUNK]) arranged into an interleaved
+/// lane order ([#ORDER]) so that the unpack inner loop is data-parallel. [#transposeIndex(int)] and
+/// [#iterateIndex(int, int)] map between the logical element order and that interleaved layout;
+/// [#lanes(PType)] is the lane count for a width and [#lowMask(int)] the low-`bits` value mask.
+///
+/// Mirrors the reference layout in `spiraldb/fastlanes` (`src/macros.rs`).
+public final class FastLanes {
+
+    /// Number of elements per FastLanes chunk.
+    public static final int CHUNK = 1024;
+
+    /// The FastLanes transpose order — an eight-entry permutation indexed by both index helpers below.
+    private static final int[] ORDER = {0, 4, 2, 6, 1, 5, 3, 7};
+
+    private FastLanes() { // static utility holder — never instantiated
+    }
+
+    /// Maps index `idx` of the transposed (interleaved-lane) buffer to the logical index paired with it.
+    ///
+    /// @param idx index within the transposed chunk, in `[0, CHUNK)` — the delta callers pass this side
+    /// @return the logical element index paired with `idx`, also in `[0, CHUNK)`
+    public static int transposeIndex(int idx) {
+        int lane = idx % 16;
+        int order = (idx / 16) % 8;
+        int row = idx / 128;
+        return lane * 64 + ORDER[order] * 8 + row;
+    }
+
+    /// Computes the chunk index visited at the given `row` and `lane` of the FastLanes iteration order.
+    /// The delta encoder indexes its transposed chunk with the result; the decoder its delta chunk.
+    ///
+    /// @param row  the row within the chunk, in `[0, 64)` (`ORDER` has eight entries, picked by `row / 8`)
+    /// @param lane the lane within the row; the delta callers pass values in `[0, lanes)`
+    /// @return the chunk index for that row and lane
+    public static int iterateIndex(int row, int lane) {
+        int o = row / 8;
+        int s = row % 8;
+        return ORDER[o] * 16 + s * 128 + lane;
+    }
+
+    /// Returns the FastLanes lane count for `ptype` — [#CHUNK] divided by the type's bit width.
+    ///
+    /// @param ptype the physical type being packed
+    /// @return the number of lanes
+    public static int lanes(PType ptype) {
+        return CHUNK / ptype.bits();
+    }
+
+    /// Returns a mask selecting the low `bits` of a `long`; 64 is special-cased as `1L << 64` wraps to 1.
+    ///
+    /// @param bits the number of low bits to keep, in `[1, 64]`
+    /// @return the low-`bits` mask (all ones when `bits == 64`)
+    public static long lowMask(int bits) {
+        return bits == 64 ? -1L : (1L << bits) - 1;
+    }
+}
diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java
@@ -64,7 +64,7 @@ public Array decode(DecodeContext ctx) {
         int bitWidth = meta.bit_width();
         int offset = meta.offset();
         PType ptype = ((DType.Primitive) ctx.dtype()).ptype();
-        int typeBits = ptype.byteSize() * 8;
+        int typeBits = ptype.bits();
         long rowCount = ctx.rowCount();
 
         MemorySegment packed = ctx.buffer(0);

diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DeltaEncodingDecoder.java
@@ -4,6 +4,7 @@
 import io.github.dfa1.vortex.core.PType;
 import io.github.dfa1.vortex.core.VortexException;
 import io.github.dfa1.vortex.encoding.EncodingId;
+import io.github.dfa1.vortex.encoding.FastLanes;
 import io.github.dfa1.vortex.encoding.PrimitiveArrays;
 import io.github.dfa1.vortex.encoding.PTypeIO;
 import io.github.dfa1.vortex.proto.DeltaMetadata;
@@ -58,9 +59,9 @@ public Array decode(DecodeContext ctx) {
 
         PType ptype = ((DType.Primitive) ctx.dtype()).ptype();
         long rowCount = ctx.rowCount();
-        int typeBits = typeBits(ptype);
-        int lanes = lanes(ptype);
-        long mask = typeMask(ptype);
+        int typeBits = ptype.bits();
+        int lanes = FastLanes.lanes(ptype);
+        long mask = FastLanes.lowMask(ptype.bits());
 
         long deltasLen = meta.deltas_len();
         int offset = meta.offset();
@@ -76,32 +77,32 @@ public Array decode(DecodeContext ctx) {
             };
         }
 
-        long basesLen = (deltasLen / FL_CHUNK_SIZE) * lanes;
+        long basesLen = (deltasLen / FastLanes.CHUNK) * lanes;
         DType dtype = ctx.dtype();
 
         long[] basesAll = readLongs(ctx.decodeChildSegment(0, dtype, basesLen), (int) basesLen, ptype);
         long[] deltasAll = readLongs(ctx.decodeChildSegment(1, dtype, deltasLen), (int) deltasLen, ptype);
 
-        int numChunks = (int) (deltasLen / FL_CHUNK_SIZE);
+        int numChunks = (int) (deltasLen / FastLanes.CHUNK);
         long[] decoded = new long[(int) deltasLen];
-        long[] untransposedChunk = new long[FL_CHUNK_SIZE];
+        long[] untransposedChunk = new long[FastLanes.CHUNK];
         long[] chunkBases = new long[lanes];
-        long[] chunkDeltas = new long[FL_CHUNK_SIZE];
-        long[] chunkUndelta = new long[FL_CHUNK_SIZE];
+        long[] chunkDeltas = new long[FastLanes.CHUNK];
+        long[] chunkUndelta = new long[FastLanes.CHUNK];
 
         for (int chunk = 0; chunk < numChunks; chunk++) {
             int basesOff = chunk * lanes;
-            int deltaOff = chunk * FL_CHUNK_SIZE;
+            int deltaOff = chunk * FastLanes.CHUNK;
 
             System.arraycopy(basesAll, basesOff, chunkBases, 0, lanes);
-            System.arraycopy(deltasAll, deltaOff, chunkDeltas, 0, FL_CHUNK_SIZE);
+            System.arraycopy(deltasAll, deltaOff, chunkDeltas, 0, FastLanes.CHUNK);
 
             undeltaChunk(chunkDeltas, chunkBases, lanes, typeBits, mask, chunkUndelta);
 
-            for (int i = 0; i < FL_CHUNK_SIZE; i++) {
-                untransposedChunk[transposeIndex(i)] = chunkUndelta[i];
+            for (int i = 0; i < FastLanes.CHUNK; i++) {
+                untransposedChunk[FastLanes.transposeIndex(i)] = chunkUndelta[i];
             }
-            System.arraycopy(untransposedChunk, 0, decoded, deltaOff, FL_CHUNK_SIZE);
+            System.arraycopy(untransposedChunk, 0, decoded, deltaOff, FastLanes.CHUNK);
         }
 
         long[] result = new long[(int) rowCount];
@@ -121,7 +122,7 @@ private static void undeltaChunk(long[] deltas, long[] bases, int lanes, int typ
         for (int lane = 0; lane < lanes; lane++) {
             long prev = bases[lane] & mask;
             for (int row = 0; row < typeBits; row++) {
-                int idx = iterateIndex(row, lane);
+                int idx = FastLanes.iterateIndex(row, lane);
                 long next = ((deltas[idx] & mask) + prev) & mask;
                 out[idx] = next;
                 prev = next;
@@ -149,34 +150,5 @@ private static long[] readLongs(MemorySegment buf, int count, PType ptype) {
         return out;
     }
 
-    private static final int FL_CHUNK_SIZE = 1024;
-
-    private static final int[] FL_ORDER = {0, 4, 2, 6, 1, 5, 3, 7};
-
-    private static int transposeIndex(int idx) {
-        int lane = idx % 16;
-        int order = (idx / 16) % 8;
-        int row = idx / 128;
-        return lane * 64 + FL_ORDER[order] * 8 + row;
-    }
-
-    private static int iterateIndex(int row, int lane) {
-        int o = row / 8;
-        int s = row % 8;
-        return FL_ORDER[o] * 16 + s * 128 + lane;
-    }
-
-    private static int lanes(PType ptype) {
-        return FL_CHUNK_SIZE / (ptype.byteSize() * 8);
-    }
-
-    private static int typeBits(PType ptype) {
-        return ptype.byteSize() * 8;
-    }
-
-    private static long typeMask(PType ptype) {
-        int bits = ptype.byteSize() * 8;
-        return bits == 64 ? -1L : (1L << bits) - 1;
-    }
 
 }
diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/BitpackedEncodingEncoder.java
@@ -4,6 +4,7 @@
 import io.github.dfa1.vortex.core.PType;
 import io.github.dfa1.vortex.core.VortexException;
 import io.github.dfa1.vortex.encoding.EncodingId;
+import io.github.dfa1.vortex.encoding.FastLanes;
 import io.github.dfa1.vortex.encoding.PrimitiveArrays;
 import io.github.dfa1.vortex.encoding.PTypeIO;
 import io.github.dfa1.vortex.proto.BitPackedMetadata;
@@ -46,8 +47,8 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
         PType ptype = ((DType.Primitive) dtype).ptype();
         long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_BITPACKED);
         int n = longs.length;
-        int typeBits = ptype.byteSize() * 8;
-        long typeMask = typeMask(typeBits);
+        int typeBits = ptype.bits();
+        long typeMask = FastLanes.lowMask(typeBits);
         boolean unsign = ptype.isUnsigned();
 
         long signedMin = 0L;
@@ -198,7 +199,7 @@ private static MemorySegment packFastLanes(long[] values, int n, int bitWidth, i
         int lanes = 1024 / typeBits;
         int wordBytes = typeBits / 8;
         int blockCount = (n + 1023) / 1024;
-        long typeMask = typeMask(typeBits);
+        long typeMask = FastLanes.lowMask(typeBits);
         // Mask values to the chosen bit width so over-cap entries (handled separately as
         // patches) don't spill into the next row's region in the packed layout.
         long widthMask = bitWidth >= 64 ? -1L : (1L << bitWidth) - 1L;
@@ -239,9 +240,6 @@ private static MemorySegment packFastLanes(long[] values, int n, int bitWidth, i
     }
 
 
-    private static long typeMask(int typeBits) {
-        return typeBits == 64 ? -1L : (1L << typeBits) - 1L;
-    }
 
     private static byte[] statsBytes(PType ptype, long value) {
         if (ptype.isUnsigned()) {

diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/DeltaEncodingEncoder.java
@@ -3,6 +3,7 @@
 import io.github.dfa1.vortex.core.DType;
 import io.github.dfa1.vortex.core.PType;
 import io.github.dfa1.vortex.encoding.EncodingId;
+import io.github.dfa1.vortex.encoding.FastLanes;
 import io.github.dfa1.vortex.encoding.PrimitiveArrays;
 import io.github.dfa1.vortex.proto.DeltaMetadata;
 import io.github.dfa1.vortex.proto.ScalarValue;
@@ -39,9 +40,9 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
         PType ptype = ((DType.Primitive) dtype).ptype();
         long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_DELTA);
         int n = longs.length;
-        int typeBits = typeBits(ptype);
-        int lanes = lanes(ptype);
-        long mask = typeMask(ptype);
+        int typeBits = ptype.bits();
+        int lanes = FastLanes.lanes(ptype);
+        long mask = FastLanes.lowMask(ptype.bits());
         boolean unsign = ptype.isUnsigned();
 
         long minVal = 0L;
@@ -60,34 +61,34 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
             }
         }
 
-        int numChunks = n == 0 ? 0 : (n + FL_CHUNK_SIZE - 1) / FL_CHUNK_SIZE;
-        long paddedLen = (long) numChunks * FL_CHUNK_SIZE;
+        int numChunks = n == 0 ? 0 : (n + FastLanes.CHUNK - 1) / FastLanes.CHUNK;
+        long paddedLen = (long) numChunks * FastLanes.CHUNK;
         int basesLen = numChunks * lanes;
 
         long[] basesAll = new long[basesLen];
         long[] deltasAll = new long[(int) paddedLen];
-        long[] chunkBuf = new long[FL_CHUNK_SIZE];
-        long[] transposed = new long[FL_CHUNK_SIZE];
+        long[] chunkBuf = new long[FastLanes.CHUNK];
+        long[] transposed = new long[FastLanes.CHUNK];
         long[] chunkBases = new long[lanes];
-        long[] chunkDelta = new long[FL_CHUNK_SIZE];
+        long[] chunkDelta = new long[FastLanes.CHUNK];
 
         for (int chunk = 0; chunk < numChunks; chunk++) {
-            int start = chunk * FL_CHUNK_SIZE;
-            int end = Math.min(start + FL_CHUNK_SIZE, n);
+            int start = chunk * FastLanes.CHUNK;
+            int end = Math.min(start + FastLanes.CHUNK, n);
             for (int i = start; i < end; i++) {
                 chunkBuf[i - start] = longs[i] & mask;
             }
-            for (int i = end - start; i < FL_CHUNK_SIZE; i++) {
+            for (int i = end - start; i < FastLanes.CHUNK; i++) {
                 chunkBuf[i] = 0L;
             }
-            for (int i = 0; i < FL_CHUNK_SIZE; i++) {
-                transposed[i] = chunkBuf[transposeIndex(i)];
+            for (int i = 0; i < FastLanes.CHUNK; i++) {
+                transposed[i] = chunkBuf[FastLanes.transposeIndex(i)];
             }
             int basesOff = chunk * lanes;
             System.arraycopy(transposed, 0, basesAll, basesOff, lanes);
             System.arraycopy(basesAll, basesOff, chunkBases, 0, lanes);
             deltaChunk(transposed, chunkBases, lanes, typeBits, mask, chunkDelta);
-            System.arraycopy(chunkDelta, 0, deltasAll, chunk * FL_CHUNK_SIZE, FL_CHUNK_SIZE);
+            System.arraycopy(chunkDelta, 0, deltasAll, chunk * FastLanes.CHUNK, FastLanes.CHUNK);
         }
 
         MemorySegment basesSeg = PrimitiveArrays.fromLongs(basesAll, ptype, ctx.arena());
@@ -109,7 +110,7 @@ private static void deltaChunk(long[] transposed, long[] bases, int lanes, int t
         for (int lane = 0; lane < lanes; lane++) {
             long prev = bases[lane] & mask;
             for (int row = 0; row < typeBits; row++) {
-                int idx = iterateIndex(row, lane);
+                int idx = FastLanes.iterateIndex(row, lane);
                 long next = transposed[idx] & mask;
                 out[idx] = (next - prev) & mask;
                 prev = next;
@@ -124,35 +125,4 @@ private static byte[] statsBytes(PType ptype, long value) {
         return ScalarValue.ofInt64Value(value).encode();
     }
 
-    private static final int FL_CHUNK_SIZE = 1024;
-
-    private static final int[] FL_ORDER = {0, 4, 2, 6, 1, 5, 3, 7};
-
-    private static int transposeIndex(int idx) {
-        int lane = idx % 16;
-        int order = (idx / 16) % 8;
-        int row = idx / 128;
-        return lane * 64 + FL_ORDER[order] * 8 + row;
-    }
-
-    private static int iterateIndex(int row, int lane) {
-        int o = row / 8;
-        int s = row % 8;
-        return FL_ORDER[o] * 16 + s * 128 + lane;
-    }
-
-    private static int lanes(PType ptype) {
-        return FL_CHUNK_SIZE / (ptype.byteSize() * 8);
-    }
-
-    private static int typeBits(PType ptype) {
-        return ptype.byteSize() * 8;
-    }
-
-    private static long typeMask(PType ptype) {
-        int bits = ptype.byteSize() * 8;
-        return bits == 64 ? -1L : (1L << bits) - 1;
-    }
-
-
 }
diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PatchedEncodingEncoder.java
@@ -4,6 +4,7 @@
 import io.github.dfa1.vortex.core.PType;
 import io.github.dfa1.vortex.core.VortexException;
 import io.github.dfa1.vortex.encoding.EncodingId;
+import io.github.dfa1.vortex.encoding.FastLanes;
 import io.github.dfa1.vortex.encoding.PrimitiveArrays;
 import io.github.dfa1.vortex.encoding.PTypeIO;
 import io.github.dfa1.vortex.proto.PatchedMetadata;
@@ -140,8 +141,8 @@ static EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
         }
 
         private static PatchedData computePatchedData(long[] longs, PType ptype, int n) {
-            int typeBits = ptype.byteSize() * 8;
-            long typeMask = typeBits == 64 ? -1L : (1L << typeBits) - 1L;
+            int typeBits = ptype.bits();
+            long typeMask = FastLanes.lowMask(typeBits);
             int elemBytes = ptype.byteSize();
 
             int[] bitWidthFreq = new int[typeBits + 1];