From ecd34bb117f4554f3d8c097894079cec5fea02e9 Mon Sep 17 00:00:00 2001
From: Davide Angelocola <davide.angelocola@gmail.com>
Date: Tue, 23 Jun 2026 07:30:33 +0200
Subject: [PATCH] perf(encoding): precompute FastLanes transpose/iterate index
 tables

transposeIndex and iterateIndex computed per-element `%`/`/` plus an ORDER[]
indirection. In the delta transpose and (un)delta hot loops that dependency
chain (div -> ORDER load -> mul) serializes scatter address generation,
throttling how many scatter misses stay in flight.

Replace with permutation tables built once in a static initializer:
- TRANSPOSE[CHUNK] for transposeIndex
- ITERATE_BASE[64] for iterateIndex (lane added per call)

Public API unchanged.

JMH (Apple M5, long[], FastLanesTransposeBenchmark) across L1 -> DRAM working
sets:
- transpose: 3.4x (L1) ... 1.7x (256 MB)
- undelta:   1.6x (L1) ... 1.4x (256 MB)

Win persists when memory-bound: same dst indices = same traffic, so the gain
is memory-level parallelism, not bandwidth. Shift-reduction control variants in
the benchmark show strength reduction alone recovers only part of it
(~1.5x transpose, ~1.08x undelta) - the dominant cost is the dependent ORDER[]
load, which only the table removes.

Also drops the now-completed FastLanes optimization item from TODO.md.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TODO.md                                       |   4 -
 .../dfa1/vortex/encoding/FastLanes.java       |  35 +++-
 .../FastLanesTransposeBenchmark.java          | 186 ++++++++++++++++++
 3 files changed, 214 insertions(+), 11 deletions(-)
 create mode 100644 performance/src/main/java/io/github/dfa1/vortex/performance/FastLanesTransposeBenchmark.java

diff --git a/TODO.md b/TODO.md
index 3194d9539..0e30d8c60 100644
--- a/TODO.md
+++ b/TODO.md
@@ -11,10 +11,6 @@
 
 - [ ] Performance tests must be peer reviewed
 - [ ] Run performance tests on other machines (I have access only to Apple M5)
-- [ ] **Optimize `FastLanes.transposeIndex(int)` / `iterateIndex(int, int)`** — per-element `%`/`/`
-  violate the hot-loop rule; called once per element in the delta transpose loops
-  (`DeltaEncodingDecoder`, `DeltaEncodingEncoder`). Divisors are power-of-two constants (16/8/128);
-  replace with shifts/masks or a precomputed permutation table. Profile first, benchmark both.
 - [ ] **Vector API adoption** — deferred; see [ADR-0005](docs/adr/0005-vector-api-adoption.md) for adoption criteria and candidate loops.
 
 ## Security
diff --git a/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java b/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java
index 2cb8080b6..2844d55a3 100644
--- a/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java
+++ b/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java
@@ -19,30 +19,51 @@ public final class FastLanes {
     /// The FastLanes transpose order — the lane permutation applied within each 8-row group.
     private static final int[] ORDER = {0, 4, 2, 6, 1, 5, 3, 7};
 
+    /// Precomputed logical-to-transposed permutation for one chunk (see [#transposeIndex(int)]).
+    private static final int[] TRANSPOSE = new int[CHUNK];
+
+    /// Precomputed per-row base offsets for [#iterateIndex(int, int)]; the lane is added at use.
+    /// Sized to the maximum row count (a 64-bit type has 64 rows per chunk).
+    private static final int[] ITERATE_BASE = new int[64];
+
+    static {
+        for (int idx = 0; idx < CHUNK; idx++) {
+            int lane = idx % 16;
+            int order = (idx / 16) % 8;
+            int row = idx / 128;
+            TRANSPOSE[idx] = lane * 64 + ORDER[order] * 8 + row;
+        }
+        for (int row = 0; row < ITERATE_BASE.length; row++) {
+            ITERATE_BASE[row] = ORDER[row / 8] * 16 + (row % 8) * 128;
+        }
+    }
+
     private FastLanes() {
     }
 
     /// Maps a logical element index to its position in the transposed (interleaved-lane) layout.
     ///
+    /// The mapping is precomputed into a per-chunk table; the lookup avoids the per-element
+    /// division and `ORDER` indirection that would otherwise serialize address generation in the
+    /// transpose hot loop.
+    ///
     /// @param idx logical element index within a chunk, in `[0, CHUNK)`
     /// @return the corresponding index in the transposed buffer
     public static int transposeIndex(int idx) {
-        int lane = idx % 16;
-        int order = (idx / 16) % 8;
-        int row = idx / 128;
-        return lane * 64 + ORDER[order] * 8 + row;
+        return TRANSPOSE[idx];
     }
 
     /// Computes the logical element index visited at the given `row` and `lane` of the FastLanes
     /// iteration order — the inverse mapping used while packing or unpacking.
     ///
+    /// The row-dependent part is precomputed; only the lane is added per call, keeping the
+    /// pack/unpack inner loop free of division and `ORDER` indirection.
+    ///
     /// @param row  the row within the chunk
     /// @param lane the lane within the row
     /// @return the logical element index
     public static int iterateIndex(int row, int lane) {
-        int o = row / 8;
-        int s = row % 8;
-        return ORDER[o] * 16 + s * 128 + lane;
+        return ITERATE_BASE[row] + lane;
     }
 
     /// Returns the FastLanes lane count for `ptype` — [#CHUNK] divided by the type's bit width.
diff --git a/performance/src/main/java/io/github/dfa1/vortex/performance/FastLanesTransposeBenchmark.java b/performance/src/main/java/io/github/dfa1/vortex/performance/FastLanesTransposeBenchmark.java
new file mode 100644
index 000000000..9434c76cc
--- /dev/null
+++ b/performance/src/main/java/io/github/dfa1/vortex/performance/FastLanesTransposeBenchmark.java
@@ -0,0 +1,186 @@
+package io.github.dfa1.vortex.performance;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+/// Micro-benchmark for the FastLanes index math used by the delta encoding's transpose and
+/// undelta loops. Compares the original per-element arithmetic (`%`/`/` + `ORDER[]` lookup),
+/// a shift/mask strength-reduced control variant, and the precomputed permutation tables now
+/// shipped in `FastLanes`.
+///
+/// Both kernels permute chunk-by-chunk directly into a large destination array, so the
+/// working set scales with `size`: at `size == 1024` it is a single L1-resident chunk
+/// (pure index-math cost), and at large `size` the scatter spans L2/SLC/DRAM (memory-bound,
+/// where the win shrinks but persists because faster address generation keeps more scatter
+/// misses in flight). The crossover is the whole point of the sweep.
+///
+/// Run: ./bench FastLanesTransposeBenchmark
+@State(Scope.Benchmark)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class FastLanesTransposeBenchmark {
+
+    private static final int CHUNK = 1024;
+    private static final int[] ORDER = {0, 4, 2, 6, 1, 5, 3, 7};
+
+    /// Precomputed logical-to-transposed permutation for one chunk.
+    private static final int[] TRANSPOSE = new int[CHUNK];
+    /// Precomputed `iterateIndex` base per row for the 64-bit (typeBits == 64) path.
+    private static final int[] ITERATE_BASE = new int[64];
+
+    static {
+        for (int i = 0; i < CHUNK; i++) {
+            int lane = i % 16;
+            int order = (i / 16) % 8;
+            int row = i / 128;
+            TRANSPOSE[i] = lane * 64 + ORDER[order] * 8 + row;
+        }
+        for (int row = 0; row < 64; row++) {
+            ITERATE_BASE[row] = ORDER[row / 8] * 16 + (row % 8) * 128;
+        }
+    }
+
+    /// Array sizes in `long` elements: 8 KB (L1) -> 256 MB (DRAM) each for `src` and `dst`.
+    @Param({"1024", "32768", "262144", "2097152", "8388608", "33554432"})
+    private int size;
+
+    private long[] src;
+    private long[] dst;
+    private int numChunks;
+
+    @Setup(Level.Trial)
+    public void setup() {
+        numChunks = size / CHUNK;
+        src = new long[size];
+        dst = new long[size];
+        Random random = new Random(42);
+        for (int i = 0; i < size; i++) {
+            src[i] = random.nextLong();
+        }
+    }
+
+    private static int transposeIndex(int idx) {
+        int lane = idx % 16;
+        int order = (idx / 16) % 8;
+        int row = idx / 128;
+        return lane * 64 + ORDER[order] * 8 + row;
+    }
+
+    private static int transposeIndexShift(int idx) {
+        int lane = idx & 15;
+        int order = (idx >> 4) & 7;
+        int row = idx >> 7;
+        return lane * 64 + ORDER[order] * 8 + row;
+    }
+
+    @Benchmark
+    public void transposeArithmetic(Blackhole bh) {
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int i = 0; i < CHUNK; i++) {
+                dst[base + transposeIndex(i)] = src[base + i];
+            }
+        }
+        bh.consume(dst);
+    }
+
+    @Benchmark
+    public void transposeTable(Blackhole bh) {
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int i = 0; i < CHUNK; i++) {
+                dst[base + TRANSPOSE[i]] = src[base + i];
+            }
+        }
+        bh.consume(dst);
+    }
+
+    @Benchmark
+    public void transposeShift(Blackhole bh) {
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int i = 0; i < CHUNK; i++) {
+                dst[base + transposeIndexShift(i)] = src[base + i];
+            }
+        }
+        bh.consume(dst);
+    }
+
+    @Benchmark
+    public void undeltaArithmetic(Blackhole bh) {
+        int lanes = 16;
+        int typeBits = 64;
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int lane = 0; lane < lanes; lane++) {
+                long prev = src[base + lane];
+                for (int row = 0; row < typeBits; row++) {
+                    int o = row / 8;
+                    int s = row % 8;
+                    int idx = ORDER[o] * 16 + s * 128 + lane;
+                    long next = src[base + idx] + prev;
+                    dst[base + idx] = next;
+                    prev = next;
+                }
+            }
+        }
+        bh.consume(dst);
+    }
+
+    @Benchmark
+    public void undeltaShift(Blackhole bh) {
+        int lanes = 16;
+        int typeBits = 64;
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int lane = 0; lane < lanes; lane++) {
+                long prev = src[base + lane];
+                for (int row = 0; row < typeBits; row++) {
+                    int o = row >> 3;
+                    int s = row & 7;
+                    int idx = ORDER[o] * 16 + (s << 7) + lane;
+                    long next = src[base + idx] + prev;
+                    dst[base + idx] = next;
+                    prev = next;
+                }
+            }
+        }
+        bh.consume(dst);
+    }
+
+    @Benchmark
+    public void undeltaTable(Blackhole bh) {
+        int lanes = 16;
+        int typeBits = 64;
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int lane = 0; lane < lanes; lane++) {
+                long prev = src[base + lane];
+                for (int row = 0; row < typeBits; row++) {
+                    int idx = ITERATE_BASE[row] + lane;
+                    long next = src[base + idx] + prev;
+                    dst[base + idx] = next;
+                    prev = next;
+                }
+            }
+        }
+        bh.consume(dst);
+    }
+}