Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@

- [ ] Performance tests must be peer reviewed
- [ ] Run performance tests on other machines (I have access only to Apple M5)
- [ ] **Optimize `FastLanes.transposeIndex(int)` / `iterateIndex(int, int)`** — per-element `%`/`/`
violate the hot-loop rule; called once per element in the delta transpose loops
(`DeltaEncodingDecoder`, `DeltaEncodingEncoder`). Divisors are power-of-two constants (16/8/128);
replace with shifts/masks or a precomputed permutation table. Profile first, benchmark both.
- [ ] **Vector API adoption** — deferred; see [ADR-0005](docs/adr/0005-vector-api-adoption.md) for adoption criteria and candidate loops.

## Security
Expand Down
35 changes: 28 additions & 7 deletions core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,30 +19,51 @@ public final class FastLanes {
/// The FastLanes transpose order — the lane permutation applied within each 8-row group.
private static final int[] ORDER = {0, 4, 2, 6, 1, 5, 3, 7};

/// Lookup table behind [#transposeIndex(int)]: entry `i` holds the transposed position of
/// logical index `i` within one chunk.
private static final int[] TRANSPOSE = new int[CHUNK];

/// Row-only part of [#iterateIndex(int, int)]; callers add the lane to the stored base.
/// Holds one entry per row, sized for the largest row count (a 64-bit type has 64 rows
/// per chunk).
private static final int[] ITERATE_BASE = new int[64];

static {
    // A logical index splits into bit fields: the low 4 bits are the lane, the next 3 bits
    // select the ORDER entry, and the remaining high bits are the row.
    for (int logical = 0; logical < TRANSPOSE.length; logical++) {
        final int laneBits = logical & 15;
        final int orderBits = (logical >> 4) & 7;
        final int rowBits = logical >> 7;
        TRANSPOSE[logical] = (laneBits << 6) + (ORDER[orderBits] << 3) + rowBits;
    }
    // A row splits the other way round: the high bits pick the ORDER entry (scaled by 16),
    // the low 3 bits pick the 128-element stripe.
    for (int r = 0; r < ITERATE_BASE.length; r++) {
        ITERATE_BASE[r] = (ORDER[r >> 3] << 4) + ((r & 7) << 7);
    }
}

/// Private constructor: prevents instantiation from outside this class.
private FastLanes() {
}

/// Maps a logical element index to its position in the transposed (interleaved-lane) layout.
///
/// The mapping is precomputed into a per-chunk table; the lookup avoids the per-element
/// division and `ORDER` indirection that would otherwise serialize address generation in the
/// transpose hot loop.
///
/// The table has exactly `CHUNK` entries, so the index is bounds-checked by the array access
/// itself; no separate range check is performed.
///
/// @param idx logical element index within a chunk, in `[0, CHUNK)`
/// @return the corresponding index in the transposed buffer
/// @throws ArrayIndexOutOfBoundsException if `idx` is outside `[0, CHUNK)`
public static int transposeIndex(int idx) {
    return TRANSPOSE[idx];
}

/// Computes the logical element index visited at the given `row` and `lane` of the FastLanes
/// iteration order — the inverse mapping used while packing or unpacking.
///
/// The row-dependent part is precomputed; only the lane is added per call, keeping the
/// pack/unpack inner loop free of division and `ORDER` indirection.
///
/// The row table has 64 entries and is bounds-checked by the array access; `lane` is added
/// as-is and is not validated.
///
/// @param row the row within the chunk, in `[0, 64)`
/// @param lane the lane within the row
/// @return the logical element index
/// @throws ArrayIndexOutOfBoundsException if `row` is outside `[0, 64)`
public static int iterateIndex(int row, int lane) {
    return ITERATE_BASE[row] + lane;
}

/// Returns the FastLanes lane count for `ptype` — [#CHUNK] divided by the type's bit width.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package io.github.dfa1.vortex.performance;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.util.Random;
import java.util.concurrent.TimeUnit;

/// Head-to-head micro-benchmark for the FastLanes index math used by the delta encoding's
/// transpose and undelta loops. Each loop is measured in three variants that differ only in
/// how the element index is produced:
///
/// - `*Arithmetic` — the original per-element arithmetic (`%`/`/` + `ORDER[]` lookup);
/// - `*Shift` — the same formula written with shifts and masks;
/// - `*Table` — a lookup in a precomputed permutation table, the approach now shipped in
///   `FastLanes`.
///
/// The tables used here are local copies built with the same formulas; this class does not
/// call `FastLanes` itself.
///
/// All kernels permute chunk-by-chunk directly into a large destination array, so the
/// working set scales with `size`: at `size == 1024` it is a single L1-resident chunk
/// (pure index-math cost), and at large `size` the scatter spans L2/SLC/DRAM (memory-bound,
/// where the win shrinks but persists because faster address generation keeps more scatter
/// misses in flight). The crossover is the whole point of the sweep.
///
/// Run: ./bench FastLanesTransposeBenchmark
@State(Scope.Benchmark)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@Warmup(iterations = 3, time = 2)
@Measurement(iterations = 5, time = 2)
@Fork(1)
public class FastLanesTransposeBenchmark {

/// Number of elements processed as one unit by every kernel.
private static final int CHUNK = 1024;
/// Permutation of `0..7` used by both index formulas.
private static final int[] ORDER = {0, 4, 2, 6, 1, 5, 3, 7};

/// Precomputed logical-to-transposed permutation for one chunk; entry `i` equals
/// `transposeIndex(i)`.
private static final int[] TRANSPOSE = new int[CHUNK];
/// Precomputed `iterateIndex` base per row for the 64-bit (typeBits == 64) path; the lane is
/// added at the point of use.
private static final int[] ITERATE_BASE = new int[64];

static {
// Same formula as transposeIndex(int), evaluated once per chunk position.
for (int i = 0; i < CHUNK; i++) {
int lane = i % 16;
int order = (i / 16) % 8;
int row = i / 128;
TRANSPOSE[i] = lane * 64 + ORDER[order] * 8 + row;
}
// Row-only part of the undelta index; matches the inline arithmetic in undeltaArithmetic.
for (int row = 0; row < 64; row++) {
ITERATE_BASE[row] = ORDER[row / 8] * 16 + (row % 8) * 128;
}
}

/// Element count of each array, in `long`s: 1024 (8 KB) up to 33554432 (256 MB), intended to
/// sweep from L1-resident to DRAM-resident. `src` and `dst` are each this size, so the memory
/// touched per invocation is twice the per-array figure.
@Param({"1024", "32768", "262144", "2097152", "8388608", "33554432"})
private int size;

/// Input values, filled once per trial in [#setup()].
private long[] src;
/// Output array written by every kernel.
private long[] dst;
/// Number of whole chunks in `size`.
private int numChunks;

/// Per-trial setup: allocates both arrays and fills `src` with pseudo-random values from a
/// fixed seed, so the input for a given `size` is reproducible.
@Setup(Level.Trial)
public void setup() {
// Integer division; every @Param value is a multiple of CHUNK, so nothing is dropped.
numChunks = size / CHUNK;
src = new long[size];
dst = new long[size];
Random random = new Random(42);
for (int i = 0; i < size; i++) {
src[i] = random.nextLong();
}
}

/// Baseline index formula using `%` and `/`.
///
/// @param idx logical index within a chunk
/// @return the transposed index
private static int transposeIndex(int idx) {
int lane = idx % 16;
int order = (idx / 16) % 8;
int row = idx / 128;
return lane * 64 + ORDER[order] * 8 + row;
}

/// Same formula as [#transposeIndex(int)] with the divisions and remainders written as
/// shifts and masks. The two agree for non-negative `idx`, which is all the kernels pass.
///
/// @param idx logical index within a chunk
/// @return the transposed index
private static int transposeIndexShift(int idx) {
int lane = idx & 15;
int order = (idx >> 4) & 7;
int row = idx >> 7;
return lane * 64 + ORDER[order] * 8 + row;
}

/// Transpose kernel, baseline variant: scatters every chunk of `src` into `dst` using
/// [#transposeIndex(int)] for each element.
@Benchmark
public void transposeArithmetic(Blackhole bh) {
for (int chunk = 0; chunk < numChunks; chunk++) {
int base = chunk * CHUNK;
for (int i = 0; i < CHUNK; i++) {
dst[base + transposeIndex(i)] = src[base + i];
}
}
// Hand the written array to the Blackhole once the kernel is done.
bh.consume(dst);
}

/// Transpose kernel, table variant: identical scatter, with the index read from `TRANSPOSE`.
@Benchmark
public void transposeTable(Blackhole bh) {
for (int chunk = 0; chunk < numChunks; chunk++) {
int base = chunk * CHUNK;
for (int i = 0; i < CHUNK; i++) {
dst[base + TRANSPOSE[i]] = src[base + i];
}
}
bh.consume(dst);
}

/// Transpose kernel, shift variant: identical scatter, with the index computed by
/// [#transposeIndexShift(int)].
@Benchmark
public void transposeShift(Blackhole bh) {
for (int chunk = 0; chunk < numChunks; chunk++) {
int base = chunk * CHUNK;
for (int i = 0; i < CHUNK; i++) {
dst[base + transposeIndexShift(i)] = src[base + i];
}
}
bh.consume(dst);
}

/// Undelta kernel, baseline variant. For each chunk and each of the 16 lanes it walks 64 rows,
/// keeping a running sum: every output is the source value at the visited index plus the
/// previous output. Since `ORDER` is a permutation of `0..7`, the 16 x 64 (lane, row) pairs
/// visit each of the 1024 chunk positions exactly once. The index is computed inline with
/// `/` and `%`.
///
/// NOTE(review): the running sum is seeded from `src[base + lane]`, which is also the element
/// row 0 visits (`ORDER[0] == 0`), so that value is counted twice — confirm whether this is
/// meant to mirror the production decoder or is just a stand-in seed.
@Benchmark
public void undeltaArithmetic(Blackhole bh) {
int lanes = 16;
int typeBits = 64;
for (int chunk = 0; chunk < numChunks; chunk++) {
int base = chunk * CHUNK;
for (int lane = 0; lane < lanes; lane++) {
long prev = src[base + lane];
// Each row depends on the previous one through `prev`, so rows run strictly in order.
for (int row = 0; row < typeBits; row++) {
int o = row / 8;
int s = row % 8;
int idx = ORDER[o] * 16 + s * 128 + lane;
long next = src[base + idx] + prev;
dst[base + idx] = next;
prev = next;
}
}
}
bh.consume(dst);
}

/// Undelta kernel, shift variant: same traversal and running sum as [#undeltaArithmetic],
/// with the index computed using shifts and masks.
@Benchmark
public void undeltaShift(Blackhole bh) {
int lanes = 16;
int typeBits = 64;
for (int chunk = 0; chunk < numChunks; chunk++) {
int base = chunk * CHUNK;
for (int lane = 0; lane < lanes; lane++) {
long prev = src[base + lane];
for (int row = 0; row < typeBits; row++) {
int o = row >> 3;
int s = row & 7;
int idx = ORDER[o] * 16 + (s << 7) + lane;
long next = src[base + idx] + prev;
dst[base + idx] = next;
prev = next;
}
}
}
bh.consume(dst);
}

/// Undelta kernel, table variant: same traversal and running sum as [#undeltaArithmetic],
/// with the row-dependent part of the index read from `ITERATE_BASE`.
@Benchmark
public void undeltaTable(Blackhole bh) {
int lanes = 16;
int typeBits = 64;
for (int chunk = 0; chunk < numChunks; chunk++) {
int base = chunk * CHUNK;
for (int lane = 0; lane < lanes; lane++) {
long prev = src[base + lane];
for (int row = 0; row < typeBits; row++) {
int idx = ITERATE_BASE[row] + lane;
long next = src[base + idx] + prev;
dst[base + idx] = next;
prev = next;
}
}
}
bh.consume(dst);
}
}
Loading