From ecd34bb117f4554f3d8c097894079cec5fea02e9 Mon Sep 17 00:00:00 2001
From: Davide Angelocola
Date: Tue, 23 Jun 2026 07:30:33 +0200
Subject: [PATCH] perf(encoding): precompute FastLanes transpose/iterate index tables

transposeIndex and iterateIndex computed a per-element `%` and `/` plus
an ORDER[] indirection. In the delta transpose and (un)delta hot loops
that dependency chain (div -> ORDER load -> mul) serializes scatter
address generation, throttling how many scatter misses stay in flight.

Replace with permutation tables built once in a static initializer:
- TRANSPOSE[CHUNK] for transposeIndex
- ITERATE_BASE[64] for iterateIndex (lane added per call)

Public API unchanged for in-range arguments. An out-of-range idx (or a
negative row) now throws ArrayIndexOutOfBoundsException where the old
arithmetic could return a meaningless index.

JMH (Apple M5, long[], FastLanesTransposeBenchmark) across L1 -> DRAM
working sets:
- transpose: 3.4x (L1) ... 1.7x (256 MB)
- undelta: 1.6x (L1) ... 1.4x (256 MB)

Win persists when memory-bound: same dst indices = same traffic, so the
gain is memory-level parallelism, not bandwidth. Shift-reduction control
variants in the benchmark show strength reduction alone recovers only
part of it (~1.5x transpose, ~1.08x undelta) - the dominant cost is the
dependent ORDER[] load, which only the table removes.

Also drops the now-completed FastLanes optimization item from TODO.md.

Co-Authored-By: Claude Opus 4.8 --- TODO.md | 4 - .../dfa1/vortex/encoding/FastLanes.java | 35 +++- .../FastLanesTransposeBenchmark.java | 186 ++++++++++++++++++ 3 files changed, 214 insertions(+), 11 deletions(-) create mode 100644 performance/src/main/java/io/github/dfa1/vortex/performance/FastLanesTransposeBenchmark.java diff --git a/TODO.md b/TODO.md index 3194d9539..0e30d8c60 100644 --- a/TODO.md +++ b/TODO.md @@ -11,10 +11,6 @@ - [ ] Performance tests must be peer reviewed - [ ] Run performance tests on other machines (I have access only to Apple M5) -- [ ] **Optimize `FastLanes.transposeIndex(int)` / `iterateIndex(int, int)`** — per-element `%`/`/` - violate the hot-loop rule; called once per element in the delta transpose loops - (`DeltaEncodingDecoder`, `DeltaEncodingEncoder`). Divisors are power-of-two constants (16/8/128); - replace with shifts/masks or a precomputed permutation table. Profile first, benchmark both. - [ ] **Vector API adoption** — deferred; see [ADR-0005](docs/adr/0005-vector-api-adoption.md) for adoption criteria and candidate loops. ## Security diff --git a/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java b/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java index 2cb8080b6..2844d55a3 100644 --- a/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java +++ b/core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java @@ -19,30 +19,51 @@ public final class FastLanes { /// The FastLanes transpose order — the lane permutation applied within each 8-row group. private static final int[] ORDER = {0, 4, 2, 6, 1, 5, 3, 7}; + /// Precomputed logical-to-transposed permutation for one chunk (see [#transposeIndex(int)]). + private static final int[] TRANSPOSE = new int[CHUNK]; + + /// Precomputed per-row base offsets for [#iterateIndex(int, int)]; the lane is added at use. + /// Sized to the maximum row count (a 64-bit type has 64 rows per chunk). 
+ private static final int[] ITERATE_BASE = new int[64]; + + static { + for (int idx = 0; idx < CHUNK; idx++) { + int lane = idx % 16; + int order = (idx / 16) % 8; + int row = idx / 128; + TRANSPOSE[idx] = lane * 64 + ORDER[order] * 8 + row; + } + for (int row = 0; row < ITERATE_BASE.length; row++) { + ITERATE_BASE[row] = ORDER[row / 8] * 16 + (row % 8) * 128; + } + } + private FastLanes() { } /// Maps a logical element index to its position in the transposed (interleaved-lane) layout. /// + /// The mapping is precomputed into a per-chunk table; the lookup avoids the per-element + /// division and `ORDER` indirection that would otherwise serialize address generation in the + /// transpose hot loop. + /// /// @param idx logical element index within a chunk, in `[0, CHUNK)` /// @return the corresponding index in the transposed buffer public static int transposeIndex(int idx) { - int lane = idx % 16; - int order = (idx / 16) % 8; - int row = idx / 128; - return lane * 64 + ORDER[order] * 8 + row; + return TRANSPOSE[idx]; } /// Computes the logical element index visited at the given `row` and `lane` of the FastLanes /// iteration order — the inverse mapping used while packing or unpacking. /// + /// The row-dependent part is precomputed; only the lane is added per call, keeping the + /// pack/unpack inner loop free of division and `ORDER` indirection. + /// /// @param row the row within the chunk /// @param lane the lane within the row /// @return the logical element index public static int iterateIndex(int row, int lane) { - int o = row / 8; - int s = row % 8; - return ORDER[o] * 16 + s * 128 + lane; + return ITERATE_BASE[row] + lane; } /// Returns the FastLanes lane count for `ptype` — [#CHUNK] divided by the type's bit width. 
diff --git a/performance/src/main/java/io/github/dfa1/vortex/performance/FastLanesTransposeBenchmark.java b/performance/src/main/java/io/github/dfa1/vortex/performance/FastLanesTransposeBenchmark.java
new file mode 100644
index 000000000..9434c76cc
--- /dev/null
+++ b/performance/src/main/java/io/github/dfa1/vortex/performance/FastLanesTransposeBenchmark.java
@@ -0,0 +1,186 @@
+package io.github.dfa1.vortex.performance;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+/// Head-to-head micro-benchmark for the FastLanes index math used by the delta encoding's
+/// transpose and undelta loops: `*Arithmetic` is the original `%`/`/` + `ORDER[]` lookup,
+/// `*Shift` a shift/mask control that still loads `ORDER[]`, and `*Table` the precomputed
+/// tables. All are local copies of the math, not calls into `FastLanes`; keep them in sync.
+///
+/// Both kernels permute chunk-by-chunk directly into a large destination array, so the
+/// working set scales with `size`: at `size == 1024` it is a single L1-resident chunk
+/// (pure index-math cost), and at large `size` the scatter spans L2/SLC/DRAM (memory-bound,
+/// where the win shrinks but persists because faster address generation keeps more scatter
+/// misses in flight). The crossover is the whole point of the sweep.
+///
+/// Run: ./bench FastLanesTransposeBenchmark
+@State(Scope.Benchmark)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class FastLanesTransposeBenchmark {
+
+    private static final int CHUNK = 1024;
+    private static final int[] ORDER = {0, 4, 2, 6, 1, 5, 3, 7};
+
+    /// Logical-to-transposed permutation for one chunk; filled by the static initializer.
+    private static final int[] TRANSPOSE = new int[CHUNK];
+    /// Per-row `iterateIndex` base for the 64-row (typeBits == 64) path; lane is added at use.
+    private static final int[] ITERATE_BASE = new int[64];
+
+    static {
+        for (int i = 0; i < CHUNK; i++) {
+            int lane = i % 16;
+            int order = (i / 16) % 8;
+            int row = i / 128;
+            TRANSPOSE[i] = lane * 64 + ORDER[order] * 8 + row;
+        }
+        for (int row = 0; row < 64; row++) {
+            ITERATE_BASE[row] = ORDER[row / 8] * 16 + (row % 8) * 128;
+        }
+    }
+
+    /// Elements per array: 8 KB (L1) -> 256 MB (DRAM) of `long`s, for each of `src` and `dst`.
+    @Param({"1024", "32768", "262144", "2097152", "8388608", "33554432"})
+    private int size;
+
+    private long[] src;
+    private long[] dst;
+    private int numChunks;
+
+    @Setup(Level.Trial)
+    public void setup() {
+        numChunks = size / CHUNK; // every @Param value is a multiple of CHUNK: no tail dropped
+        src = new long[size];
+        dst = new long[size];
+        Random random = new Random(42); // fixed seed: identical input on every run
+        for (int i = 0; i < size; i++) {
+            src[i] = random.nextLong();
+        }
+    }
+
+    private static int transposeIndex(int idx) {
+        int lane = idx % 16;
+        int order = (idx / 16) % 8;
+        int row = idx / 128;
+        return lane * 64 + ORDER[order] * 8 + row;
+    }
+
+    private static int transposeIndexShift(int idx) {
+        int lane = idx & 15;
+        int order = (idx >> 4) & 7;
+        int row = idx >> 7;
+        return lane * 64 + ORDER[order] * 8 + row;
+    }
+
+    @Benchmark
+    public void transposeArithmetic(Blackhole bh) {
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int i = 0; i < CHUNK; i++) {
+                dst[base + transposeIndex(i)] = src[base + i];
+            }
+        }
+        bh.consume(dst); // hand the result to JMH so the scatter is not treated as dead code
+    }
+
+    @Benchmark
+    public void transposeTable(Blackhole bh) {
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int i = 0; i < CHUNK; i++) {
+                dst[base + TRANSPOSE[i]] = src[base + i];
+            }
+        }
+        bh.consume(dst);
+    }
+
+    @Benchmark
+    public void transposeShift(Blackhole bh) {
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int i = 0; i < CHUNK; i++) {
+                dst[base + transposeIndexShift(i)] = src[base + i];
+            }
+        }
+        bh.consume(dst);
+    }
+
+    @Benchmark
+    public void undeltaArithmetic(Blackhole bh) {
+        int lanes = 16; // CHUNK / typeBits
+        int typeBits = 64;
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int lane = 0; lane < lanes; lane++) {
+                long prev = src[base + lane]; // seed; row 0 maps back to this slot (idx == lane)
+                for (int row = 0; row < typeBits; row++) {
+                    int o = row / 8;
+                    int s = row % 8;
+                    int idx = ORDER[o] * 16 + s * 128 + lane;
+                    long next = src[base + idx] + prev;
+                    dst[base + idx] = next;
+                    prev = next;
+                }
+            }
+        }
+        bh.consume(dst);
+    }
+
+    @Benchmark
+    public void undeltaShift(Blackhole bh) {
+        int lanes = 16; // CHUNK / typeBits
+        int typeBits = 64;
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int lane = 0; lane < lanes; lane++) {
+                long prev = src[base + lane]; // seed; row 0 maps back to this slot (idx == lane)
+                for (int row = 0; row < typeBits; row++) {
+                    int o = row >> 3;
+                    int s = row & 7;
+                    int idx = ORDER[o] * 16 + (s << 7) + lane;
+                    long next = src[base + idx] + prev;
+                    dst[base + idx] = next;
+                    prev = next;
+                }
+            }
+        }
+        bh.consume(dst);
+    }
+
+    @Benchmark
+    public void undeltaTable(Blackhole bh) {
+        int lanes = 16; // CHUNK / typeBits
+        int typeBits = 64;
+        for (int chunk = 0; chunk < numChunks; chunk++) {
+            int base = chunk * CHUNK;
+            for (int lane = 0; lane < lanes; lane++) {
+                long prev = src[base + lane]; // seed; row 0 maps back to this slot (idx == lane)
+                for (int row = 0; row < typeBits; row++) {
+                    int idx = ITERATE_BASE[row] + lane;
+                    long next = src[base + idx] + prev;
+                    dst[base + idx] = next;
+                    prev = next;
+                }
+            }
+        }
+        bh.consume(dst);
+    }
+}