diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java index 9420787c..8beb6861 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java @@ -51,6 +51,7 @@ public final long length() { /// /// @param arena unused; the existing buffer is returned as-is /// @return the backing segment + @SuppressWarnings("java:S1172") // arena is contractual: this implements Array#materialize(SegmentAllocator) for the leaf classes; unused only because the buffer is already materialized. public final MemorySegment materialize(SegmentAllocator arena) { return buffer; } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java index 49c0bfb1..563a5350 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java @@ -100,36 +100,71 @@ private static void fastlanesUnpackToSeg( } } - private static void unpackLoop8(MemorySegment buf, int bitWidth, int offset, long rowCount, MemorySegment out) { - final int lanes = 128; - long totalElems = rowCount + offset; - int blockCount = (int) ((totalElems + 1023) / 1024); - long bitMask = (1L << bitWidth) - 1L; + /// Per-row unpack schedule for one FastLanes block, precomputed once per decode call. Every + /// array is indexed by `row` in `[0, typeBits)`. This setup is identical for the 8/16/32/64-bit + /// unpackers, so it lives here; the per-element unpack loops stay specialised per width because + /// their typed `ValueLayout` access must constant-fold for the JIT to vectorise them. 
+ /// + /// @param shifts low-bit shift to apply to the current word, per row + /// @param remainingBits bits spilling into the next word (0 when the value fits in one word) + /// @param currentBits bits taken from the current word (`bitWidth - remainingBits`) + /// @param loMasks mask for the low part read from the current word (0 when the value fits in one word) + /// @param hiMasks mask for the high part read from the next word (0 when the value fits in one word) + /// @param currWordByteBase byte offset of the current word within the block + /// @param nextWordByteBase byte offset of the next word within the block (0 when the value fits in one word) + /// @param outRowByteOff byte offset of the row within the transposed output block + private record UnpackSchedule( + int[] shifts, int[] remainingBits, int[] currentBits, + long[] loMasks, long[] hiMasks, + long[] currWordByteBase, long[] nextWordByteBase, long[] outRowByteOff) { + } - int[] shifts = new int[8]; - int[] remainingBits = new int[8]; - int[] currentBits = new int[8]; - long[] loMasks = new long[8]; - long[] hiMasks = new long[8]; - long[] currWordByteBase = new long[8]; - long[] nextWordByteBase = new long[8]; - long[] outRowByteOff = new long[8]; - for (int row = 0; row < 8; row++) { - int currWord = (row * bitWidth) / 8; - int nextWord = ((row + 1) * bitWidth) / 8; - shifts[row] = (row * bitWidth) % 8; - int rem = (nextWord > currWord) ? 
((row + 1) * bitWidth) % 8 : 0; + private static UnpackSchedule schedule(int typeBits, int bitWidth) { + int lanes = 1024 / typeBits; + int elemBytes = typeBits / 8; + int[] shifts = new int[typeBits]; + int[] remainingBits = new int[typeBits]; + int[] currentBits = new int[typeBits]; + long[] loMasks = new long[typeBits]; + long[] hiMasks = new long[typeBits]; + long[] currWordByteBase = new long[typeBits]; + long[] nextWordByteBase = new long[typeBits]; + long[] outRowByteOff = new long[typeBits]; + for (int row = 0; row < typeBits; row++) { + int currWord = (row * bitWidth) / typeBits; + int nextWord = ((row + 1) * bitWidth) / typeBits; + shifts[row] = (row * bitWidth) % typeBits; + int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % typeBits : 0; remainingBits[row] = rem; int curr = bitWidth - rem; currentBits[row] = curr; loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L; hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L; - currWordByteBase[row] = (long) lanes * currWord; - nextWordByteBase[row] = rem > 0 ? (long) lanes * nextWord : 0L; + currWordByteBase[row] = (long) lanes * currWord * elemBytes; + nextWordByteBase[row] = rem > 0 ? 
(long) lanes * nextWord * elemBytes : 0L; int o = row / 8; int s = row % 8; - outRowByteOff[row] = (long) FL_ORDER[o] * 16 + (long) s * 128; + outRowByteOff[row] = (long) (FL_ORDER[o] * 16 + s * 128) * elemBytes; } + return new UnpackSchedule(shifts, remainingBits, currentBits, loMasks, hiMasks, + currWordByteBase, nextWordByteBase, outRowByteOff); + } + + private static void unpackLoop8(MemorySegment buf, int bitWidth, int offset, long rowCount, MemorySegment out) { + final int lanes = 128; + long totalElems = rowCount + offset; + int blockCount = (int) ((totalElems + 1023) / 1024); + long bitMask = (1L << bitWidth) - 1L; + + UnpackSchedule sch = schedule(8, bitWidth); + int[] shifts = sch.shifts(); + int[] remainingBits = sch.remainingBits(); + int[] currentBits = sch.currentBits(); + long[] loMasks = sch.loMasks(); + long[] hiMasks = sch.hiMasks(); + long[] currWordByteBase = sch.currWordByteBase(); + long[] nextWordByteBase = sch.nextWordByteBase(); + long[] outRowByteOff = sch.outRowByteOff(); long blockByteOff = 0L; long blockByteStride = 128L * bitWidth; @@ -199,30 +234,15 @@ private static void unpackLoop16(MemorySegment buf, int bitWidth, int offset, lo int blockCount = (int) ((totalElems + 1023) / 1024); long bitMask = (1L << bitWidth) - 1L; - int[] shifts = new int[16]; - int[] remainingBits = new int[16]; - int[] currentBits = new int[16]; - long[] loMasks = new long[16]; - long[] hiMasks = new long[16]; - long[] currWordByteBase = new long[16]; - long[] nextWordByteBase = new long[16]; - long[] outRowByteOff = new long[16]; - for (int row = 0; row < 16; row++) { - int currWord = (row * bitWidth) / 16; - int nextWord = ((row + 1) * bitWidth) / 16; - shifts[row] = (row * bitWidth) % 16; - int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 16 : 0; - remainingBits[row] = rem; - int curr = bitWidth - rem; - currentBits[row] = curr; - loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L; - hiMasks[row] = rem > 0 ? 
(1L << rem) - 1L : 0L; - currWordByteBase[row] = (long) lanes * currWord * 2L; - nextWordByteBase[row] = rem > 0 ? (long) lanes * nextWord * 2L : 0L; - int o = row / 8; - int s = row % 8; - outRowByteOff[row] = (FL_ORDER[o] * 16 + s * 128) * 2L; - } + UnpackSchedule sch = schedule(16, bitWidth); + int[] shifts = sch.shifts(); + int[] remainingBits = sch.remainingBits(); + int[] currentBits = sch.currentBits(); + long[] loMasks = sch.loMasks(); + long[] hiMasks = sch.hiMasks(); + long[] currWordByteBase = sch.currWordByteBase(); + long[] nextWordByteBase = sch.nextWordByteBase(); + long[] outRowByteOff = sch.outRowByteOff(); long blockByteOff = 0L; long blockByteStride = 128L * bitWidth; @@ -295,30 +315,15 @@ private static void unpackLoop32(MemorySegment buf, int bitWidth, int offset, lo int blockCount = (int) ((totalElems + 1023) / 1024); long bitMask = (1L << bitWidth) - 1L; - int[] shifts = new int[32]; - int[] remainingBits = new int[32]; - int[] currentBits = new int[32]; - long[] loMasks = new long[32]; - long[] hiMasks = new long[32]; - long[] currWordByteBase = new long[32]; - long[] nextWordByteBase = new long[32]; - long[] outRowByteOff = new long[32]; - for (int row = 0; row < 32; row++) { - int currWord = (row * bitWidth) / 32; - int nextWord = ((row + 1) * bitWidth) / 32; - shifts[row] = (row * bitWidth) % 32; - int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 32 : 0; - remainingBits[row] = rem; - int curr = bitWidth - rem; - currentBits[row] = curr; - loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L; - hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L; - currWordByteBase[row] = (long) lanes * currWord * 4L; - nextWordByteBase[row] = rem > 0 ? 
(long) lanes * nextWord * 4L : 0L; - int o = row / 8; - int s = row % 8; - outRowByteOff[row] = (FL_ORDER[o] * 16 + s * 128) * 4L; - } + UnpackSchedule sch = schedule(32, bitWidth); + int[] shifts = sch.shifts(); + int[] remainingBits = sch.remainingBits(); + int[] currentBits = sch.currentBits(); + long[] loMasks = sch.loMasks(); + long[] hiMasks = sch.hiMasks(); + long[] currWordByteBase = sch.currWordByteBase(); + long[] nextWordByteBase = sch.nextWordByteBase(); + long[] outRowByteOff = sch.outRowByteOff(); long blockByteOff = 0L; long blockByteStride = 128L * bitWidth; @@ -391,30 +396,15 @@ private static void unpackLoop64(MemorySegment buf, int bitWidth, int offset, lo int blockCount = (int) ((totalElems + 1023) / 1024); long bitMask = bitWidth == 64 ? -1L : (1L << bitWidth) - 1L; - int[] shifts = new int[64]; - int[] remainingBits = new int[64]; - int[] currentBits = new int[64]; - long[] loMasks = new long[64]; - long[] hiMasks = new long[64]; - long[] currWordByteBase = new long[64]; - long[] nextWordByteBase = new long[64]; - long[] outRowByteOff = new long[64]; - for (int row = 0; row < 64; row++) { - int currWord = (row * bitWidth) / 64; - int nextWord = ((row + 1) * bitWidth) / 64; - shifts[row] = (row * bitWidth) % 64; - int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 64 : 0; - remainingBits[row] = rem; - int curr = bitWidth - rem; - currentBits[row] = curr; - loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L; - hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L; - currWordByteBase[row] = (long) lanes * currWord * 8L; - nextWordByteBase[row] = rem > 0 ? 
(long) lanes * nextWord * 8L : 0L; - int o = row / 8; - int s = row % 8; - outRowByteOff[row] = (FL_ORDER[o] * 16 + s * 128) * 8L; - } + UnpackSchedule sch = schedule(64, bitWidth); + int[] shifts = sch.shifts(); + int[] remainingBits = sch.remainingBits(); + int[] currentBits = sch.currentBits(); + long[] loMasks = sch.loMasks(); + long[] hiMasks = sch.hiMasks(); + long[] currWordByteBase = sch.currWordByteBase(); + long[] nextWordByteBase = sch.nextWordByteBase(); + long[] outRowByteOff = sch.outRowByteOff(); long blockByteOff = 0L; long blockByteStride = 128L * bitWidth;