From 69c224b713adde6e269aa7c4d2c35ecc1962f20f Mon Sep 17 00:00:00 2001
From: Davide Angelocola <davide.angelocola@gmail.com>
Date: Tue, 23 Jun 2026 08:12:26 +0200
Subject: [PATCH 1/2] fix(sonar): suppress S1172 on
 AbstractMaterializedArray.materialize

The arena parameter is contractual: this base method supplies the
implementation of Array#materialize(SegmentAllocator) for the leaf
Materialized* classes, which implement the sealed Array interface
directly. Because the base class is not itself declared to implement
Array, Sonar cannot see that the signature is dictated by the interface
and flagged arena as an unused parameter. The parameter must stay to
satisfy the interface; suppress with an explanatory reason instead.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../dfa1/vortex/reader/array/AbstractMaterializedArray.java      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java
index 9420787c..8beb6861 100644
--- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java
+++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/AbstractMaterializedArray.java
@@ -51,6 +51,7 @@ public final long length() {
     ///
     /// @param arena unused; the existing buffer is returned as-is
     /// @return the backing segment
+    @SuppressWarnings("java:S1172") // arena is contractual: this implements Array#materialize(SegmentAllocator) for the leaf classes; unused only because the buffer is already materialized.
     public final MemorySegment materialize(SegmentAllocator arena) {
         return buffer;
     }

From c4f1f557a2a823ebae035b8fb795cd268b05c7bd Mon Sep 17 00:00:00 2001
From: Davide Angelocola <davide.angelocola@gmail.com>
Date: Tue, 23 Jun 2026 08:21:24 +0200
Subject: [PATCH 2/2] refactor(decode): extract shared FastLanes unpack
 schedule in BitpackedEncodingDecoder

The four unpackLoop8/16/32/64 methods each rebuilt an identical per-row
schedule (shifts, remaining/current bits, lo/hi masks, current/next word byte
bases, transposed output offsets), differing only by typeBits and element
width. Sonar flagged the four ~24-line setup blocks as duplication.

Hoist that cold, run-once precompute into an UnpackSchedule record built by
schedule(typeBits, bitWidth), which derives lanes (1024/typeBits) and
elemBytes (typeBits/8) internally. Each loop now calls schedule(N, bitWidth)
and binds the record's arrays to locals with the original names, so the
loop bodies are unchanged.

The per-element inner unpack loops stay specialised per width on purpose: a
generic ValueLayout/accessor would stop C2 from constant-folding the typed
access and block superword vectorisation (hot-loop rule). Only the cold setup
is shared, so there is no hot-path change.

Verified: 120 round-trip decode tests (all widths + patches) and 107 encoder
tests pass; reactor build + build-enforced javadoc clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../decode/BitpackedEncodingDecoder.java      | 176 +++++++++---------
 1 file changed, 83 insertions(+), 93 deletions(-)

diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java
index 49c0bfb1..563a5350 100644
--- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java
+++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/BitpackedEncodingDecoder.java
@@ -100,36 +100,71 @@ private static void fastlanesUnpackToSeg(
         }
     }
 
-    private static void unpackLoop8(MemorySegment buf, int bitWidth, int offset, long rowCount, MemorySegment out) {
-        final int lanes = 128;
-        long totalElems = rowCount + offset;
-        int blockCount = (int) ((totalElems + 1023) / 1024);
-        long bitMask = (1L << bitWidth) - 1L;
+    /// Per-row unpack schedule for one FastLanes block, built once per unpack-loop call by
+    /// `schedule`. Every array has `typeBits` entries indexed by `row`; the arrays are stored
+    /// as-is (no copies). The setup is shared by the 8/16/32/64-bit unpackers; the per-element loops
+    /// stay specialised per width: their typed `ValueLayout` access must constant-fold to vectorise.
+    ///
+    /// @param shifts            bit offset of the row's value inside the current word
+    /// @param remainingBits     bits spilling into the next word (0 when the value fits one word)
+    /// @param currentBits       bits taken from the current word (`bitWidth - remainingBits`)
+    /// @param loMasks           mask of the low `currentBits` bits; 0 when nothing spills
+    /// @param hiMasks           mask of the low `remainingBits` bits (next word); 0 when nothing spills
+    /// @param currWordByteBase  byte offset in the packed block where the current word's lanes start
+    /// @param nextWordByteBase  same for the next word; 0 when nothing spills
+    /// @param outRowByteOff     byte offset of the row within the transposed output block
+    private record UnpackSchedule(
+            int[] shifts, int[] remainingBits, int[] currentBits,
+            long[] loMasks, long[] hiMasks,
+            long[] currWordByteBase, long[] nextWordByteBase, long[] outRowByteOff) {
+    }
 
-        int[] shifts = new int[8];
-        int[] remainingBits = new int[8];
-        int[] currentBits = new int[8];
-        long[] loMasks = new long[8];
-        long[] hiMasks = new long[8];
-        long[] currWordByteBase = new long[8];
-        long[] nextWordByteBase = new long[8];
-        long[] outRowByteOff = new long[8];
-        for (int row = 0; row < 8; row++) {
-            int currWord = (row * bitWidth) / 8;
-            int nextWord = ((row + 1) * bitWidth) / 8;
-            shifts[row] = (row * bitWidth) % 8;
-            int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 8 : 0;
+    private static UnpackSchedule schedule(int typeBits, int bitWidth) { // cold setup shared by unpackLoop8/16/32/64
+        int lanes = 1024 / typeBits; // elements per packed word: 128 for 8-bit down to 16 for 64-bit
+        int elemBytes = typeBits / 8; // size of one element (and one packed word) in bytes
+        int[] shifts = new int[typeBits];
+        int[] remainingBits = new int[typeBits];
+        int[] currentBits = new int[typeBits];
+        long[] loMasks = new long[typeBits];
+        long[] hiMasks = new long[typeBits];
+        long[] currWordByteBase = new long[typeBits];
+        long[] nextWordByteBase = new long[typeBits];
+        long[] outRowByteOff = new long[typeBits];
+        for (int row = 0; row < typeBits; row++) {
+            int currWord = (row * bitWidth) / typeBits; // packed word holding the first bit of this row's value
+            int nextWord = ((row + 1) * bitWidth) / typeBits; // packed word holding the first bit of the next row
+            shifts[row] = (row * bitWidth) % typeBits; // bit offset of the value inside currWord
+            int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % typeBits : 0; // bits that land in nextWord; 0 if none
             remainingBits[row] = rem;
             int curr = bitWidth - rem;
             currentBits[row] = curr;
             loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L;
             hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L;
-            currWordByteBase[row] = (long) lanes * currWord;
-            nextWordByteBase[row] = rem > 0 ? (long) lanes * nextWord : 0L;
+            currWordByteBase[row] = (long) lanes * currWord * elemBytes; // word index -> byte offset (lanes elements per word)
+            nextWordByteBase[row] = rem > 0 ? (long) lanes * nextWord * elemBytes : 0L; // left at 0 when nothing spills
             int o = row / 8;
             int s = row % 8;
-            outRowByteOff[row] = (long) FL_ORDER[o] * 16 + (long) s * 128;
+            outRowByteOff[row] = (long) (FL_ORDER[o] * 16 + s * 128) * elemBytes; // element offset scaled to bytes
         }
+        return new UnpackSchedule(shifts, remainingBits, currentBits, loMasks, hiMasks,
+                currWordByteBase, nextWordByteBase, outRowByteOff);
+    }
+
+    private static void unpackLoop8(MemorySegment buf, int bitWidth, int offset, long rowCount, MemorySegment out) {
+        final int lanes = 128;
+        long totalElems = rowCount + offset;
+        int blockCount = (int) ((totalElems + 1023) / 1024);
+        long bitMask = (1L << bitWidth) - 1L;
+
+        UnpackSchedule sch = schedule(8, bitWidth);
+        int[] shifts = sch.shifts();
+        int[] remainingBits = sch.remainingBits();
+        int[] currentBits = sch.currentBits();
+        long[] loMasks = sch.loMasks();
+        long[] hiMasks = sch.hiMasks();
+        long[] currWordByteBase = sch.currWordByteBase();
+        long[] nextWordByteBase = sch.nextWordByteBase();
+        long[] outRowByteOff = sch.outRowByteOff();
 
         long blockByteOff = 0L;
         long blockByteStride = 128L * bitWidth;
@@ -199,30 +234,15 @@ private static void unpackLoop16(MemorySegment buf, int bitWidth, int offset, lo
         int blockCount = (int) ((totalElems + 1023) / 1024);
         long bitMask = (1L << bitWidth) - 1L;
 
-        int[] shifts = new int[16];
-        int[] remainingBits = new int[16];
-        int[] currentBits = new int[16];
-        long[] loMasks = new long[16];
-        long[] hiMasks = new long[16];
-        long[] currWordByteBase = new long[16];
-        long[] nextWordByteBase = new long[16];
-        long[] outRowByteOff = new long[16];
-        for (int row = 0; row < 16; row++) {
-            int currWord = (row * bitWidth) / 16;
-            int nextWord = ((row + 1) * bitWidth) / 16;
-            shifts[row] = (row * bitWidth) % 16;
-            int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 16 : 0;
-            remainingBits[row] = rem;
-            int curr = bitWidth - rem;
-            currentBits[row] = curr;
-            loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L;
-            hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L;
-            currWordByteBase[row] = (long) lanes * currWord * 2L;
-            nextWordByteBase[row] = rem > 0 ? (long) lanes * nextWord * 2L : 0L;
-            int o = row / 8;
-            int s = row % 8;
-            outRowByteOff[row] = (FL_ORDER[o] * 16 + s * 128) * 2L;
-        }
+        UnpackSchedule sch = schedule(16, bitWidth);
+        int[] shifts = sch.shifts();
+        int[] remainingBits = sch.remainingBits();
+        int[] currentBits = sch.currentBits();
+        long[] loMasks = sch.loMasks();
+        long[] hiMasks = sch.hiMasks();
+        long[] currWordByteBase = sch.currWordByteBase();
+        long[] nextWordByteBase = sch.nextWordByteBase();
+        long[] outRowByteOff = sch.outRowByteOff();
 
         long blockByteOff = 0L;
         long blockByteStride = 128L * bitWidth;
@@ -295,30 +315,15 @@ private static void unpackLoop32(MemorySegment buf, int bitWidth, int offset, lo
         int blockCount = (int) ((totalElems + 1023) / 1024);
         long bitMask = (1L << bitWidth) - 1L;
 
-        int[] shifts = new int[32];
-        int[] remainingBits = new int[32];
-        int[] currentBits = new int[32];
-        long[] loMasks = new long[32];
-        long[] hiMasks = new long[32];
-        long[] currWordByteBase = new long[32];
-        long[] nextWordByteBase = new long[32];
-        long[] outRowByteOff = new long[32];
-        for (int row = 0; row < 32; row++) {
-            int currWord = (row * bitWidth) / 32;
-            int nextWord = ((row + 1) * bitWidth) / 32;
-            shifts[row] = (row * bitWidth) % 32;
-            int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 32 : 0;
-            remainingBits[row] = rem;
-            int curr = bitWidth - rem;
-            currentBits[row] = curr;
-            loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L;
-            hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L;
-            currWordByteBase[row] = (long) lanes * currWord * 4L;
-            nextWordByteBase[row] = rem > 0 ? (long) lanes * nextWord * 4L : 0L;
-            int o = row / 8;
-            int s = row % 8;
-            outRowByteOff[row] = (FL_ORDER[o] * 16 + s * 128) * 4L;
-        }
+        UnpackSchedule sch = schedule(32, bitWidth);
+        int[] shifts = sch.shifts();
+        int[] remainingBits = sch.remainingBits();
+        int[] currentBits = sch.currentBits();
+        long[] loMasks = sch.loMasks();
+        long[] hiMasks = sch.hiMasks();
+        long[] currWordByteBase = sch.currWordByteBase();
+        long[] nextWordByteBase = sch.nextWordByteBase();
+        long[] outRowByteOff = sch.outRowByteOff();
 
         long blockByteOff = 0L;
         long blockByteStride = 128L * bitWidth;
@@ -391,30 +396,15 @@ private static void unpackLoop64(MemorySegment buf, int bitWidth, int offset, lo
         int blockCount = (int) ((totalElems + 1023) / 1024);
         long bitMask = bitWidth == 64 ? -1L : (1L << bitWidth) - 1L;
 
-        int[] shifts = new int[64];
-        int[] remainingBits = new int[64];
-        int[] currentBits = new int[64];
-        long[] loMasks = new long[64];
-        long[] hiMasks = new long[64];
-        long[] currWordByteBase = new long[64];
-        long[] nextWordByteBase = new long[64];
-        long[] outRowByteOff = new long[64];
-        for (int row = 0; row < 64; row++) {
-            int currWord = (row * bitWidth) / 64;
-            int nextWord = ((row + 1) * bitWidth) / 64;
-            shifts[row] = (row * bitWidth) % 64;
-            int rem = (nextWord > currWord) ? ((row + 1) * bitWidth) % 64 : 0;
-            remainingBits[row] = rem;
-            int curr = bitWidth - rem;
-            currentBits[row] = curr;
-            loMasks[row] = rem > 0 ? (1L << curr) - 1L : 0L;
-            hiMasks[row] = rem > 0 ? (1L << rem) - 1L : 0L;
-            currWordByteBase[row] = (long) lanes * currWord * 8L;
-            nextWordByteBase[row] = rem > 0 ? (long) lanes * nextWord * 8L : 0L;
-            int o = row / 8;
-            int s = row % 8;
-            outRowByteOff[row] = (FL_ORDER[o] * 16 + s * 128) * 8L;
-        }
+        UnpackSchedule sch = schedule(64, bitWidth);
+        int[] shifts = sch.shifts();
+        int[] remainingBits = sch.remainingBits();
+        int[] currentBits = sch.currentBits();
+        long[] loMasks = sch.loMasks();
+        long[] hiMasks = sch.hiMasks();
+        long[] currWordByteBase = sch.currWordByteBase();
+        long[] nextWordByteBase = sch.nextWordByteBase();
+        long[] outRowByteOff = sch.outRowByteOff();
 
         long blockByteOff = 0L;
         long blockByteStride = 128L * bitWidth;