From 69c224b713adde6e269aa7c4d2c35ecc1962f20f Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Tue, 23 Jun 2026 08:12:26 +0200 Subject: [PATCH 1/2] fix(sonar): suppress S1172 on AbstractMaterializedArray.materialize The arena parameter is contractual: this base method implements Array#materialize(SegmentAllocator) for the leaf Materialized* classes, which implement the sealed Array interface directly. Sonar cannot see the override because the base class is not itself declared to implement Array, so it flagged arena as an unused parameter. The parameter must stay to satisfy the interface; suppress with an explanatory reason instead. Co-Authored-By: Claude Opus 4.8 --- .../dfa1/vortex/reader/array/AbstractMaterializedArray.java | 1 + 1 file changed, 1 insertion(+) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java index 9420787c..8beb6861 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java @@ -51,6 +51,7 @@ public final long length() { /// /// @param arena unused; the existing buffer is returned as-is /// @return the backing segment + @SuppressWarnings("java:S1172") // arena is contractual: this implements Array#materialize(SegmentAllocator) for the leaf classes; unused only because the buffer is already materialized. 
public final MemorySegment materialize(SegmentAllocator arena) { return buffer; } From c4f1f557a2a823ebae035b8fb795cd268b05c7bd Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Tue, 23 Jun 2026 08:21:24 +0200 Subject: [PATCH 2/2] refactor(decode): extract shared FastLanes unpack schedule in BitpackedEncodingDecoder The four unpackLoop8/16/32/64 methods each rebuilt an identical per-row schedule (shifts, remaining/current bits, lo/hi masks, current/next word byte bases, transposed output offsets), differing only by typeBits and element width. Sonar flagged the four ~24-line setup blocks as duplication. Hoist that cold, run-once precompute into an UnpackSchedule record built by schedule(typeBits, bitWidth), which derives lanes (1024/typeBits) and elemBytes (typeBits/8) internally. Each loop now calls schedule(N, bitWidth) and aliases the arrays. The per-element inner unpack loops stay specialised per width on purpose: a generic ValueLayout/accessor would stop C2 from constant-folding the typed access and block superword vectorisation (hot-loop rule). Only the cold setup is shared, so there is no hot-path change. Verified: 120 round-trip decode tests (all widths + patches) and 107 encoder tests pass; reactor build + build-enforced javadoc clean. 
Co-Authored-By: Claude Opus 4.8 --- .../decode/BitpackedEncodingDecoder.java | 176 +++++++++--------- 1 file changed, 83 insertions(+), 93 deletions(-) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java index 49c0bfb1..563a5350 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java @@ -100,36 +100,71 @@ private static void fastlanesUnpackToSeg( } } - private static void unpackLoop8(MemorySegment buf, int bitWidth, int offset, long rowCount, MemorySegment out) { - final int lanes = 128; - long totalElems = rowCount + offset; - int blockCount = (int) ((totalElems + 1023) / 1024); - long bitMask = (1L << bitWidth) - 1L; + /// Per-row unpack schedule for one FastLanes block, precomputed once per decode call. Every + /// array is indexed by `row` in `[0, typeBits)`. This setup is identical for the 8/16/32/64-bit + /// unpackers, so it lives here; the per-element unpack loops stay specialised per width because + /// their typed `ValueLayout` access must constant-fold for the JIT to vectorise them. 
+ /// + /// @param shifts low-bit shift to apply to the current word, per row + /// @param remainingBits bits spilling into the next word (0 when the value fits one word) + /// @param currentBits bits taken from the current word (`bitWidth - remainingBits`) + /// @param loMasks mask for the low part read from the current word (0 when the value fits one word) + /// @param hiMasks mask for the high part read from the next word (0 when the value fits one word) + /// @param currWordByteBase byte offset of the current word within the block + /// @param nextWordByteBase byte offset of the next word within the block (0 when the value fits one word) + /// @param outRowByteOff byte offset of the row within the transposed output block + private record UnpackSchedule( + int[] shifts, int[] remainingBits, int[] currentBits, + long[] loMasks, long[] hiMasks, + long[] currWordByteBase, long[] nextWordByteBase, long[] outRowByteOff) { + } - int[] shifts = new int[8]; - int[] remainingBits = new int[8]; - int[] currentBits = new int[8]; - long[] loMasks = new long[8]; - long[] hiMasks = new long[8]; - long[] currWordByteBase = new long[8]; - long[] nextWordByteBase = new long[8]; - long[] outRowByteOff = new long[8]; - for (int row = 0; row < 8; row++) { - int currWord = (row * bitWidth) / 8; - int nextWord = ((row + 1) * bitWidth) / 8; - shifts[row] = (row * bitWidth) % 8; - int rem = (nextWord > currWord) ? 
((row + 1) * bitWidth) % 8 : 0; + private static UnpackSchedule schedule(int typeBits, int bitWidth) { + int lanes = 1024 / typeBits; + int elemBytes = typeBits / 8; + int[] shifts = new int[typeBits]; + int[] remainingBits = new int[typeBits]; + int[] currentBits = new int[typeBits]; + long[] loMasks = new long[typeBits]; + long[] hiMasks = new long[typeBits]; + long[] currWordByteBase = new long[typeBits]; + long[] nextWordByteBase = new long[typeBits]; + long[] outRowByteOff = new long[typeBits]; + for (int row = 0; row < typeBits; row++) { + int currWord = (row * bitWidth) / typeBits; + int nextWord = ((row + 1) * bitWidth) / typeBits; + shifts[row] = (row * bitWidth) % typeBits; + int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % typeBits : 0; remainingBits[row] = rem; int curr = bitWidth - rem; currentBits[row] = curr; loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L; hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L; - currWordByteBase[row] = (long) lanes * currWord; - nextWordByteBase[row] = rem > 0 ? (long) lanes * nextWord : 0L; + currWordByteBase[row] = (long) lanes * currWord * elemBytes; + nextWordByteBase[row] = rem > 0 ? 
(long) lanes * nextWord * elemBytes : 0L; int o = row / 8; int s = row % 8; - outRowByteOff[row] = (long) FL_ORDER[o] * 16 + (long) s * 128; + outRowByteOff[row] = (long) (FL_ORDER[o] * 16 + s * 128) * elemBytes; } + return new UnpackSchedule(shifts, remainingBits, currentBits, loMasks, hiMasks, + currWordByteBase, nextWordByteBase, outRowByteOff); + } + + private static void unpackLoop8(MemorySegment buf, int bitWidth, int offset, long rowCount, MemorySegment out) { + final int lanes = 128; + long totalElems = rowCount + offset; + int blockCount = (int) ((totalElems + 1023) / 1024); + long bitMask = (1L << bitWidth) - 1L; + + UnpackSchedule sch = schedule(8, bitWidth); + int[] shifts = sch.shifts(); + int[] remainingBits = sch.remainingBits(); + int[] currentBits = sch.currentBits(); + long[] loMasks = sch.loMasks(); + long[] hiMasks = sch.hiMasks(); + long[] currWordByteBase = sch.currWordByteBase(); + long[] nextWordByteBase = sch.nextWordByteBase(); + long[] outRowByteOff = sch.outRowByteOff(); long blockByteOff = 0L; long blockByteStride = 128L * bitWidth; @@ -199,30 +234,15 @@ private static void unpackLoop16(MemorySegment buf, int bitWidth, int offset, lo int blockCount = (int) ((totalElems + 1023) / 1024); long bitMask = (1L << bitWidth) - 1L; - int[] shifts = new int[16]; - int[] remainingBits = new int[16]; - int[] currentBits = new int[16]; - long[] loMasks = new long[16]; - long[] hiMasks = new long[16]; - long[] currWordByteBase = new long[16]; - long[] nextWordByteBase = new long[16]; - long[] outRowByteOff = new long[16]; - for (int row = 0; row < 16; row++) { - int currWord = (row * bitWidth) / 16; - int nextWord = ((row + 1) * bitWidth) / 16; - shifts[row] = (row * bitWidth) % 16; - int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 16 : 0; - remainingBits[row] = rem; - int curr = bitWidth - rem; - currentBits[row] = curr; - loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L; - hiMasks[row] = rem > 0 ? 
(1L << rem) - 1L : 0L; - currWordByteBase[row] = (long) lanes * currWord * 2L; - nextWordByteBase[row] = rem > 0 ? (long) lanes * nextWord * 2L : 0L; - int o = row / 8; - int s = row % 8; - outRowByteOff[row] = (FL_ORDER[o] * 16 + s * 128) * 2L; - } + UnpackSchedule sch = schedule(16, bitWidth); + int[] shifts = sch.shifts(); + int[] remainingBits = sch.remainingBits(); + int[] currentBits = sch.currentBits(); + long[] loMasks = sch.loMasks(); + long[] hiMasks = sch.hiMasks(); + long[] currWordByteBase = sch.currWordByteBase(); + long[] nextWordByteBase = sch.nextWordByteBase(); + long[] outRowByteOff = sch.outRowByteOff(); long blockByteOff = 0L; long blockByteStride = 128L * bitWidth; @@ -295,30 +315,15 @@ private static void unpackLoop32(MemorySegment buf, int bitWidth, int offset, lo int blockCount = (int) ((totalElems + 1023) / 1024); long bitMask = (1L << bitWidth) - 1L; - int[] shifts = new int[32]; - int[] remainingBits = new int[32]; - int[] currentBits = new int[32]; - long[] loMasks = new long[32]; - long[] hiMasks = new long[32]; - long[] currWordByteBase = new long[32]; - long[] nextWordByteBase = new long[32]; - long[] outRowByteOff = new long[32]; - for (int row = 0; row < 32; row++) { - int currWord = (row * bitWidth) / 32; - int nextWord = ((row + 1) * bitWidth) / 32; - shifts[row] = (row * bitWidth) % 32; - int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 32 : 0; - remainingBits[row] = rem; - int curr = bitWidth - rem; - currentBits[row] = curr; - loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L; - hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L; - currWordByteBase[row] = (long) lanes * currWord * 4L; - nextWordByteBase[row] = rem > 0 ? 
(long) lanes * nextWord * 4L : 0L; - int o = row / 8; - int s = row % 8; - outRowByteOff[row] = (FL_ORDER[o] * 16 + s * 128) * 4L; - } + UnpackSchedule sch = schedule(32, bitWidth); + int[] shifts = sch.shifts(); + int[] remainingBits = sch.remainingBits(); + int[] currentBits = sch.currentBits(); + long[] loMasks = sch.loMasks(); + long[] hiMasks = sch.hiMasks(); + long[] currWordByteBase = sch.currWordByteBase(); + long[] nextWordByteBase = sch.nextWordByteBase(); + long[] outRowByteOff = sch.outRowByteOff(); long blockByteOff = 0L; long blockByteStride = 128L * bitWidth; @@ -391,30 +396,15 @@ private static void unpackLoop64(MemorySegment buf, int bitWidth, int offset, lo int blockCount = (int) ((totalElems + 1023) / 1024); long bitMask = bitWidth == 64 ? -1L : (1L << bitWidth) - 1L; - int[] shifts = new int[64]; - int[] remainingBits = new int[64]; - int[] currentBits = new int[64]; - long[] loMasks = new long[64]; - long[] hiMasks = new long[64]; - long[] currWordByteBase = new long[64]; - long[] nextWordByteBase = new long[64]; - long[] outRowByteOff = new long[64]; - for (int row = 0; row < 64; row++) { - int currWord = (row * bitWidth) / 64; - int nextWord = ((row + 1) * bitWidth) / 64; - shifts[row] = (row * bitWidth) % 64; - int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 64 : 0; - remainingBits[row] = rem; - int curr = bitWidth - rem; - currentBits[row] = curr; - loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L; - hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L; - currWordByteBase[row] = (long) lanes * currWord * 8L; - nextWordByteBase[row] = rem > 0 ? 
(long) lanes * nextWord * 8L : 0L; - int o = row / 8; - int s = row % 8; - outRowByteOff[row] = (FL_ORDER[o] * 16 + s * 128) * 8L; - } + UnpackSchedule sch = schedule(64, bitWidth); + int[] shifts = sch.shifts(); + int[] remainingBits = sch.remainingBits(); + int[] currentBits = sch.currentBits(); + long[] loMasks = sch.loMasks(); + long[] hiMasks = sch.hiMasks(); + long[] currWordByteBase = sch.currWordByteBase(); + long[] nextWordByteBase = sch.nextWordByteBase(); + long[] outRowByteOff = sch.outRowByteOff(); long blockByteOff = 0L; long blockByteStride = 128L * bitWidth;