Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@

- [ ] Performance tests must be peer reviewed
- [ ] Run performance tests on other machines (I have access only to Apple M5)
- [ ] **Optimize `FastLanes.transposeIndex(int)` / `iterateIndex(int, int)`** — their per-element `%` and `/`
operations violate the hot-loop rule; both methods are called once per element in the delta transpose loops
(`DeltaEncodingDecoder`, `DeltaEncodingEncoder`). The divisors are power-of-two constants (16/8/128), so
replace them with shifts/masks or a precomputed permutation table. Profile first, then benchmark both options.
- [ ] **Vector API adoption** — deferred; see [ADR-0005](docs/adr/0005-vector-api-adoption.md) for adoption criteria and candidate loops.

## Security
Expand Down
7 changes: 7 additions & 0 deletions core/src/main/java/io/github/dfa1/vortex/core/PType.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ public int byteSize() {
};
}

/// Reports the width of a single element in bits, i.e. eight times [#byteSize()]
/// (8, 16, 32, or 64).
///
/// @return the bit width of this physical type
public int bits() {
    final int bytesPerElement = byteSize();
    return Byte.SIZE * bytesPerElement;
}

/// Returns `true` for `F16`, `F32`, and `F64`.
///
/// @return `true` if this ptype is a floating-point type
Expand Down
63 changes: 63 additions & 0 deletions core/src/main/java/io/github/dfa1/vortex/encoding/FastLanes.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package io.github.dfa1.vortex.encoding;

import io.github.dfa1.vortex.core.PType;

/// Shared FastLanes layout constants and index math used by the bit-packing and delta encodings on
/// both the read and write sides.
///
/// FastLanes processes values in fixed 1024-element chunks ([#CHUNK]) arranged into an interleaved
/// lane order (a fixed eight-entry permutation) so that the unpack inner loop is data-parallel.
/// [#transposeIndex(int)] and [#iterateIndex(int, int)] map between the logical element order and
/// that interleaved layout; [#lanes(PType)] is the lane count for a width and [#lowMask(int)] the
/// low-`bits` value mask.
///
/// Mirrors the reference layout in `spiraldb/fastlanes` (`src/macros.rs`).
public final class FastLanes {

    /// Number of elements per FastLanes chunk.
    public static final int CHUNK = 1024;

    /// The FastLanes transpose order — the lane permutation applied within each 8-row group.
    private static final int[] ORDER = {0, 4, 2, 6, 1, 5, 3, 7};

    /// Not instantiable: this class only hosts constants and static helpers.
    private FastLanes() {
    }

    /// Maps a logical element index to its position in the transposed (interleaved-lane) layout.
    ///
    /// @param idx logical element index within a chunk, in `[0, CHUNK)`
    /// @return the corresponding index in the transposed buffer
    public static int transposeIndex(int idx) {
        // Split idx into three fields: the low 16 values select the lane, the next 8 select the
        // permutation entry, and the remaining 8 select the row.
        int lane = idx % 16;
        int order = (idx / 16) % 8;
        int row = idx / 128;
        return lane * 64 + ORDER[order] * 8 + row;
    }

    /// Computes the logical element index visited at the given `row` and `lane` of the FastLanes
    /// iteration order — the inverse mapping used while packing or unpacking.
    ///
    /// @param row the row within the chunk; `row / 8` indexes the eight-entry permutation, so
    ///            values outside `[0, 64)` fail with an array-index error
    /// @param lane the lane within the row
    /// @return the logical element index
    public static int iterateIndex(int row, int lane) {
        int o = row / 8;
        int s = row % 8;
        return ORDER[o] * 16 + s * 128 + lane;
    }

    /// Returns the FastLanes lane count for `ptype` — [#CHUNK] divided by the type's bit width.
    ///
    /// @param ptype the physical type being packed
    /// @return the number of lanes
    public static int lanes(PType ptype) {
        return CHUNK / ptype.bits();
    }

    /// Returns a mask selecting the low `bits` of a `long`.
    ///
    /// Widths of 64 or more saturate to all ones. Java only uses the low six bits of a `long`
    /// shift distance, so without the saturation a width such as 65 would silently produce the
    /// mask for a width of 1.
    ///
    /// @param bits the number of low bits to keep; `0` yields an empty mask
    /// @return the low-`bits` mask
    /// @throws IllegalArgumentException if `bits` is negative
    public static long lowMask(int bits) {
        if (bits < 0) {
            throw new IllegalArgumentException("bits must be non-negative: " + bits);
        }
        return bits >= Long.SIZE ? -1L : (1L << bits) - 1;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public Array decode(DecodeContext ctx) {
int bitWidth = meta.bit_width();
int offset = meta.offset();
PType ptype = ((DType.Primitive) ctx.dtype()).ptype();
int typeBits = ptype.byteSize() * 8;
int typeBits = ptype.bits();
long rowCount = ctx.rowCount();

MemorySegment packed = ctx.buffer(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io.github.dfa1.vortex.core.PType;
import io.github.dfa1.vortex.core.VortexException;
import io.github.dfa1.vortex.encoding.EncodingId;
import io.github.dfa1.vortex.encoding.FastLanes;
import io.github.dfa1.vortex.encoding.PrimitiveArrays;
import io.github.dfa1.vortex.encoding.PTypeIO;
import io.github.dfa1.vortex.proto.DeltaMetadata;
Expand Down Expand Up @@ -58,9 +59,9 @@ public Array decode(DecodeContext ctx) {

PType ptype = ((DType.Primitive) ctx.dtype()).ptype();
long rowCount = ctx.rowCount();
int typeBits = typeBits(ptype);
int lanes = lanes(ptype);
long mask = typeMask(ptype);
int typeBits = ptype.bits();
int lanes = FastLanes.lanes(ptype);
long mask = FastLanes.lowMask(ptype.bits());

long deltasLen = meta.deltas_len();
int offset = meta.offset();
Expand All @@ -76,32 +77,32 @@ public Array decode(DecodeContext ctx) {
};
}

long basesLen = (deltasLen / FL_CHUNK_SIZE) * lanes;
long basesLen = (deltasLen / FastLanes.CHUNK) * lanes;
DType dtype = ctx.dtype();

long[] basesAll = readLongs(ctx.decodeChildSegment(0, dtype, basesLen), (int) basesLen, ptype);
long[] deltasAll = readLongs(ctx.decodeChildSegment(1, dtype, deltasLen), (int) deltasLen, ptype);

int numChunks = (int) (deltasLen / FL_CHUNK_SIZE);
int numChunks = (int) (deltasLen / FastLanes.CHUNK);
long[] decoded = new long[(int) deltasLen];
long[] untransposedChunk = new long[FL_CHUNK_SIZE];
long[] untransposedChunk = new long[FastLanes.CHUNK];
long[] chunkBases = new long[lanes];
long[] chunkDeltas = new long[FL_CHUNK_SIZE];
long[] chunkUndelta = new long[FL_CHUNK_SIZE];
long[] chunkDeltas = new long[FastLanes.CHUNK];
long[] chunkUndelta = new long[FastLanes.CHUNK];

for (int chunk = 0; chunk < numChunks; chunk++) {
int basesOff = chunk * lanes;
int deltaOff = chunk * FL_CHUNK_SIZE;
int deltaOff = chunk * FastLanes.CHUNK;

System.arraycopy(basesAll, basesOff, chunkBases, 0, lanes);
System.arraycopy(deltasAll, deltaOff, chunkDeltas, 0, FL_CHUNK_SIZE);
System.arraycopy(deltasAll, deltaOff, chunkDeltas, 0, FastLanes.CHUNK);

undeltaChunk(chunkDeltas, chunkBases, lanes, typeBits, mask, chunkUndelta);

for (int i = 0; i < FL_CHUNK_SIZE; i++) {
untransposedChunk[transposeIndex(i)] = chunkUndelta[i];
for (int i = 0; i < FastLanes.CHUNK; i++) {
untransposedChunk[FastLanes.transposeIndex(i)] = chunkUndelta[i];
}
System.arraycopy(untransposedChunk, 0, decoded, deltaOff, FL_CHUNK_SIZE);
System.arraycopy(untransposedChunk, 0, decoded, deltaOff, FastLanes.CHUNK);
}

long[] result = new long[(int) rowCount];
Expand All @@ -121,7 +122,7 @@ private static void undeltaChunk(long[] deltas, long[] bases, int lanes, int typ
for (int lane = 0; lane < lanes; lane++) {
long prev = bases[lane] & mask;
for (int row = 0; row < typeBits; row++) {
int idx = iterateIndex(row, lane);
int idx = FastLanes.iterateIndex(row, lane);
long next = ((deltas[idx] & mask) + prev) & mask;
out[idx] = next;
prev = next;
Expand Down Expand Up @@ -149,34 +150,5 @@ private static long[] readLongs(MemorySegment buf, int count, PType ptype) {
return out;
}

/// Number of elements handled per chunk; sizes the per-chunk scratch arrays and loop bounds.
private static final int FL_CHUNK_SIZE = 1024;

/// Eight-entry permutation table consulted by `transposeIndex` and `iterateIndex`.
private static final int[] FL_ORDER = {0, 4, 2, 6, 1, 5, 3, 7};

/// Maps a logical element index to its slot in the transposed buffer.
///
/// @param idx logical element index within a chunk
/// @return the matching index in the transposed layout
private static int transposeIndex(int idx) {
    final int group = idx / 16;
    final int laneOffset = (idx % 16) * 64;
    final int orderOffset = FL_ORDER[group % 8] * 8;
    return laneOffset + orderOffset + idx / 128;
}

/// Computes the element index visited at the given `row` and `lane`.
///
/// @param row the row within the chunk; `row / 8` selects the permutation entry
/// @param lane the lane within the row
/// @return the element index for that row and lane
private static int iterateIndex(int row, int lane) {
    final int orderOffset = FL_ORDER[row / 8] * 16;
    final int rowOffset = (row % 8) * 128;
    return orderOffset + rowOffset + lane;
}

/// Returns the lane count for `ptype`: the chunk size divided by the type's width in bits.
///
/// @param ptype the physical type being processed
/// @return the number of lanes
private static int lanes(PType ptype) {
    final int bitsPerElement = ptype.byteSize() * 8;
    return FL_CHUNK_SIZE / bitsPerElement;
}

/// Returns the width of `ptype` in bits (its byte size times eight).
///
/// @param ptype the physical type
/// @return the bit width of the type
private static int typeBits(PType ptype) {
    final int bytes = ptype.byteSize();
    return Byte.SIZE * bytes;
}

/// Builds a mask covering every bit of a `ptype` value (all ones for a 64-bit type).
///
/// @param ptype the physical type
/// @return the mask selecting the type's low bits
private static long typeMask(PType ptype) {
    final int width = ptype.byteSize() * 8;
    if (width == 64) {
        // A 64-bit shift would wrap to a shift of zero, so the full mask is returned directly.
        return -1L;
    }
    return (1L << width) - 1;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io.github.dfa1.vortex.core.PType;
import io.github.dfa1.vortex.core.VortexException;
import io.github.dfa1.vortex.encoding.EncodingId;
import io.github.dfa1.vortex.encoding.FastLanes;
import io.github.dfa1.vortex.encoding.PrimitiveArrays;
import io.github.dfa1.vortex.encoding.PTypeIO;
import io.github.dfa1.vortex.proto.BitPackedMetadata;
Expand Down Expand Up @@ -46,8 +47,8 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
PType ptype = ((DType.Primitive) dtype).ptype();
long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_BITPACKED);
int n = longs.length;
int typeBits = ptype.byteSize() * 8;
long typeMask = typeMask(typeBits);
int typeBits = ptype.bits();
long typeMask = FastLanes.lowMask(typeBits);
boolean unsign = ptype.isUnsigned();

long signedMin = 0L;
Expand Down Expand Up @@ -198,7 +199,7 @@ private static MemorySegment packFastLanes(long[] values, int n, int bitWidth, i
int lanes = 1024 / typeBits;
int wordBytes = typeBits / 8;
int blockCount = (n + 1023) / 1024;
long typeMask = typeMask(typeBits);
long typeMask = FastLanes.lowMask(typeBits);
// Mask values to the chosen bit width so over-cap entries (handled separately as
// patches) don't spill into the next row's region in the packed layout.
long widthMask = bitWidth >= 64 ? -1L : (1L << bitWidth) - 1L;
Expand Down Expand Up @@ -239,9 +240,6 @@ private static MemorySegment packFastLanes(long[] values, int n, int bitWidth, i
}


/// Builds a mask selecting the low `typeBits` bits of a `long` (all ones when `typeBits` is 64).
///
/// @param typeBits the bit width of the value type
/// @return the low-`typeBits` mask
private static long typeMask(int typeBits) {
    if (typeBits == 64) {
        // A 64-bit shift would wrap to a shift of zero, so the full mask is returned directly.
        return -1L;
    }
    final long onePastTop = 1L << typeBits;
    return onePastTop - 1L;
}

private static byte[] statsBytes(PType ptype, long value) {
if (ptype.isUnsigned()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import io.github.dfa1.vortex.core.DType;
import io.github.dfa1.vortex.core.PType;
import io.github.dfa1.vortex.encoding.EncodingId;
import io.github.dfa1.vortex.encoding.FastLanes;
import io.github.dfa1.vortex.encoding.PrimitiveArrays;
import io.github.dfa1.vortex.proto.DeltaMetadata;
import io.github.dfa1.vortex.proto.ScalarValue;
Expand Down Expand Up @@ -39,9 +40,9 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
PType ptype = ((DType.Primitive) dtype).ptype();
long[] longs = PrimitiveArrays.toLongs(data, ptype, EncodingId.FASTLANES_DELTA);
int n = longs.length;
int typeBits = typeBits(ptype);
int lanes = lanes(ptype);
long mask = typeMask(ptype);
int typeBits = ptype.bits();
int lanes = FastLanes.lanes(ptype);
long mask = FastLanes.lowMask(ptype.bits());
boolean unsign = ptype.isUnsigned();

long minVal = 0L;
Expand All @@ -60,34 +61,34 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
}
}

int numChunks = n == 0 ? 0 : (n + FL_CHUNK_SIZE - 1) / FL_CHUNK_SIZE;
long paddedLen = (long) numChunks * FL_CHUNK_SIZE;
int numChunks = n == 0 ? 0 : (n + FastLanes.CHUNK - 1) / FastLanes.CHUNK;
long paddedLen = (long) numChunks * FastLanes.CHUNK;
int basesLen = numChunks * lanes;

long[] basesAll = new long[basesLen];
long[] deltasAll = new long[(int) paddedLen];
long[] chunkBuf = new long[FL_CHUNK_SIZE];
long[] transposed = new long[FL_CHUNK_SIZE];
long[] chunkBuf = new long[FastLanes.CHUNK];
long[] transposed = new long[FastLanes.CHUNK];
long[] chunkBases = new long[lanes];
long[] chunkDelta = new long[FL_CHUNK_SIZE];
long[] chunkDelta = new long[FastLanes.CHUNK];

for (int chunk = 0; chunk < numChunks; chunk++) {
int start = chunk * FL_CHUNK_SIZE;
int end = Math.min(start + FL_CHUNK_SIZE, n);
int start = chunk * FastLanes.CHUNK;
int end = Math.min(start + FastLanes.CHUNK, n);
for (int i = start; i < end; i++) {
chunkBuf[i - start] = longs[i] & mask;
}
for (int i = end - start; i < FL_CHUNK_SIZE; i++) {
for (int i = end - start; i < FastLanes.CHUNK; i++) {
chunkBuf[i] = 0L;
}
for (int i = 0; i < FL_CHUNK_SIZE; i++) {
transposed[i] = chunkBuf[transposeIndex(i)];
for (int i = 0; i < FastLanes.CHUNK; i++) {
transposed[i] = chunkBuf[FastLanes.transposeIndex(i)];
}
int basesOff = chunk * lanes;
System.arraycopy(transposed, 0, basesAll, basesOff, lanes);
System.arraycopy(basesAll, basesOff, chunkBases, 0, lanes);
deltaChunk(transposed, chunkBases, lanes, typeBits, mask, chunkDelta);
System.arraycopy(chunkDelta, 0, deltasAll, chunk * FL_CHUNK_SIZE, FL_CHUNK_SIZE);
System.arraycopy(chunkDelta, 0, deltasAll, chunk * FastLanes.CHUNK, FastLanes.CHUNK);
}

MemorySegment basesSeg = PrimitiveArrays.fromLongs(basesAll, ptype, ctx.arena());
Expand All @@ -109,7 +110,7 @@ private static void deltaChunk(long[] transposed, long[] bases, int lanes, int t
for (int lane = 0; lane < lanes; lane++) {
long prev = bases[lane] & mask;
for (int row = 0; row < typeBits; row++) {
int idx = iterateIndex(row, lane);
int idx = FastLanes.iterateIndex(row, lane);
long next = transposed[idx] & mask;
out[idx] = (next - prev) & mask;
prev = next;
Expand All @@ -124,35 +125,4 @@ private static byte[] statsBytes(PType ptype, long value) {
return ScalarValue.ofInt64Value(value).encode();
}

/// Number of elements handled per chunk; sizes the per-chunk scratch arrays and loop bounds.
private static final int FL_CHUNK_SIZE = 1024;

/// Eight-entry permutation table consulted by `transposeIndex` and `iterateIndex`.
private static final int[] FL_ORDER = {0, 4, 2, 6, 1, 5, 3, 7};

/// Maps a position in the transposed buffer to the chunk-buffer slot that supplies its value.
///
/// @param idx element index within a chunk
/// @return the permuted index for that element
private static int transposeIndex(int idx) {
    final int group = idx / 16;
    final int laneOffset = (idx % 16) * 64;
    final int orderOffset = FL_ORDER[group % 8] * 8;
    return laneOffset + orderOffset + idx / 128;
}

/// Computes the element index visited at the given `row` and `lane`.
///
/// @param row the row within the chunk; `row / 8` selects the permutation entry
/// @param lane the lane within the row
/// @return the element index for that row and lane
private static int iterateIndex(int row, int lane) {
    final int orderOffset = FL_ORDER[row / 8] * 16;
    final int rowOffset = (row % 8) * 128;
    return orderOffset + rowOffset + lane;
}

/// Returns the lane count for `ptype`: the chunk size divided by the type's width in bits.
///
/// @param ptype the physical type being processed
/// @return the number of lanes
private static int lanes(PType ptype) {
    final int bitsPerElement = ptype.byteSize() * 8;
    return FL_CHUNK_SIZE / bitsPerElement;
}

/// Returns the width of `ptype` in bits (its byte size times eight).
///
/// @param ptype the physical type
/// @return the bit width of the type
private static int typeBits(PType ptype) {
    final int bytes = ptype.byteSize();
    return Byte.SIZE * bytes;
}

/// Builds a mask covering every bit of a `ptype` value (all ones for a 64-bit type).
///
/// @param ptype the physical type
/// @return the mask selecting the type's low bits
private static long typeMask(PType ptype) {
    final int width = ptype.byteSize() * 8;
    if (width == 64) {
        // A 64-bit shift would wrap to a shift of zero, so the full mask is returned directly.
        return -1L;
    }
    return (1L << width) - 1;
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io.github.dfa1.vortex.core.PType;
import io.github.dfa1.vortex.core.VortexException;
import io.github.dfa1.vortex.encoding.EncodingId;
import io.github.dfa1.vortex.encoding.FastLanes;
import io.github.dfa1.vortex.encoding.PrimitiveArrays;
import io.github.dfa1.vortex.encoding.PTypeIO;
import io.github.dfa1.vortex.proto.PatchedMetadata;
Expand Down Expand Up @@ -140,8 +141,8 @@ static EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
}

private static PatchedData computePatchedData(long[] longs, PType ptype, int n) {
int typeBits = ptype.byteSize() * 8;
long typeMask = typeBits == 64 ? -1L : (1L << typeBits) - 1L;
int typeBits = ptype.bits();
long typeMask = FastLanes.lowMask(typeBits);
int elemBytes = ptype.byteSize();

int[] bitWidthFreq = new int[typeBits + 1];
Expand Down
Loading