From 51624173587d662de9e877576195adfa7a8761c4 Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Thu, 29 Jan 2026 10:07:43 +0100 Subject: [PATCH 1/8] Add SIMD-accelerated byte-stream-split decoding - Move all byte-stream-split decoding routines to new file byte_stream_split_decode.go. - Add architecture-specific SIMD implementations: - AVX2 for amd64 (byte_stream_split_decode_avx2_amd64.go/.s) - NEON for arm64 (byte_stream_split_decode_neon_arm64.go/.s) - Add runtime dispatch for SIMD decoding based on CPU features (AVX2/NEON). - Ensure correct build tags and file suffixes for cross-platform compatibility. --- .../internal/encoding/byte_stream_split.go | 86 ------ .../encoding/byte_stream_split_decode.go | 112 ++++++++ .../byte_stream_split_decode_amd64.go | 9 + .../byte_stream_split_decode_arm64.go | 14 + .../byte_stream_split_decode_avx2_amd64.go | 42 +++ .../byte_stream_split_decode_avx2_amd64.s | 133 ++++++++++ .../byte_stream_split_decode_neon_arm64.go | 40 +++ .../byte_stream_split_decode_neon_arm64.s | 117 +++++++++ .../encoding/byte_stream_split_decode_test.go | 246 ++++++++++++++++++ 9 files changed, 713 insertions(+), 86 deletions(-) create mode 100644 parquet/internal/encoding/byte_stream_split_decode.go create mode 100644 parquet/internal/encoding/byte_stream_split_decode_amd64.go create mode 100644 parquet/internal/encoding/byte_stream_split_decode_arm64.go create mode 100644 parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.go create mode 100644 parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s create mode 100644 parquet/internal/encoding/byte_stream_split_decode_neon_arm64.go create mode 100644 parquet/internal/encoding/byte_stream_split_decode_neon_arm64.s create mode 100644 parquet/internal/encoding/byte_stream_split_decode_test.go diff --git a/parquet/internal/encoding/byte_stream_split.go b/parquet/internal/encoding/byte_stream_split.go index fab61365..a9fcd53e 100644 --- a/parquet/internal/encoding/byte_stream_split.go 
+++ b/parquet/internal/encoding/byte_stream_split.go @@ -88,92 +88,6 @@ func encodeByteStreamSplitWidth8(data []byte, in []byte) { } } -// decodeByteStreamSplitBatchWidth4 decodes the batch of nValues raw bytes representing a 4-byte datatype provided by 'data', -// into the output buffer 'out' using BYTE_STREAM_SPLIT encoding. -// 'out' must have space for at least len(data) bytes. -func decodeByteStreamSplitBatchWidth4(data []byte, nValues, stride int, out []byte) { - const width = 4 - debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - for element := 0; element < nValues; element++ { - out[width*element] = data[element] - out[width*element+1] = data[stride+element] - out[width*element+2] = data[2*stride+element] - out[width*element+3] = data[3*stride+element] - } -} - -// decodeByteStreamSplitBatchWidth8 decodes the batch of nValues raw bytes representing a 8-byte datatype provided by 'data', -// into the output buffer 'out' using BYTE_STREAM_SPLIT encoding. -// 'out' must have space for at least len(data) bytes. -func decodeByteStreamSplitBatchWidth8(data []byte, nValues, stride int, out []byte) { - const width = 8 - debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - for element := 0; element < nValues; element++ { - out[width*element] = data[element] - out[width*element+1] = data[stride+element] - out[width*element+2] = data[2*stride+element] - out[width*element+3] = data[3*stride+element] - out[width*element+4] = data[4*stride+element] - out[width*element+5] = data[5*stride+element] - out[width*element+6] = data[6*stride+element] - out[width*element+7] = data[7*stride+element] - } -} - -// decodeByteStreamSplitBatchFLBA decodes the batch of nValues FixedLenByteArrays provided by 'data', -// into the output slice 'out' using BYTE_STREAM_SPLIT encoding. 
-// 'out' must have space for at least nValues slices. -func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out []parquet.FixedLenByteArray) { - debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - for stream := 0; stream < width; stream++ { - for element := 0; element < nValues; element++ { - encLoc := stride*stream + element - out[element][stream] = data[encLoc] - } - } -} - -// decodeByteStreamSplitBatchFLBAWidth2 decodes the batch of nValues FixedLenByteArrays of length 2 provided by 'data', -// into the output slice 'out' using BYTE_STREAM_SPLIT encoding. -// 'out' must have space for at least nValues slices. -func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { - debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - for element := 0; element < nValues; element++ { - out[element][0] = data[element] - out[element][1] = data[stride+element] - } -} - -// decodeByteStreamSplitBatchFLBAWidth4 decodes the batch of nValues FixedLenByteArrays of length 4 provided by 'data', -// into the output slice 'out' using BYTE_STREAM_SPLIT encoding. -// 'out' must have space for at least nValues slices. 
-func decodeByteStreamSplitBatchFLBAWidth4(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { - debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - for element := 0; element < nValues; element++ { - out[element][0] = data[element] - out[element][1] = data[stride+element] - out[element][2] = data[stride*2+element] - out[element][3] = data[stride*3+element] - } -} - -// decodeByteStreamSplitBatchFLBAWidth8 decodes the batch of nValues FixedLenByteArrays of length 8 provided by 'data', -// into the output slice 'out' using BYTE_STREAM_SPLIT encoding. -// 'out' must have space for at least nValues slices. -func decodeByteStreamSplitBatchFLBAWidth8(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { - debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - for element := 0; element < nValues; element++ { - out[element][0] = data[element] - out[element][1] = data[stride+element] - out[element][2] = data[stride*2+element] - out[element][3] = data[stride*3+element] - out[element][4] = data[stride*4+element] - out[element][5] = data[stride*5+element] - out[element][6] = data[stride*6+element] - out[element][7] = data[stride*7+element] - } -} - func releaseBufferToPool(pooled *PooledBufferWriter) { buf := pooled.buf memory.Set(buf.Buf(), 0) diff --git a/parquet/internal/encoding/byte_stream_split_decode.go b/parquet/internal/encoding/byte_stream_split_decode.go new file mode 100644 index 00000000..c7a8c0f9 --- /dev/null +++ b/parquet/internal/encoding/byte_stream_split_decode.go @@ -0,0 +1,112 @@ +package encoding + +import ( + "fmt" + "unsafe" + + "github.com/apache/arrow-go/v18/parquet" + "github.com/apache/arrow-go/v18/parquet/internal/debug" +) + +var decodeByteStreamSplitBatchWidth4SIMD func(data []byte, nValues, stride int, out []byte) + +// 
decodeByteStreamSplitBatchWidth4 decodes the batch of nValues raw bytes representing a 4-byte datatype provided by 'data',
+// into the output buffer 'out' using BYTE_STREAM_SPLIT encoding.
+// 'out' must have space for at least len(data) bytes.
+func decodeByteStreamSplitBatchWidth4(data []byte, nValues, stride int, out []byte) {
+	const width = 4
+	debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data)))
+	for element := 0; element < nValues; element++ {
+		out[width*element] = data[element]
+		out[width*element+1] = data[stride+element]
+		out[width*element+2] = data[2*stride+element]
+		out[width*element+3] = data[3*stride+element]
+	}
+}
+
+func decodeByteStreamSplitBatchWidth4V2(data []byte, nValues, stride int, out []byte) {
+	const width = 4
+	// Narrow slices to help the compiler eliminate bounds checks.
+	s0 := data[:nValues]
+	s1 := data[stride : stride+nValues]
+	s2 := data[2*stride : 2*stride+nValues]
+	s3 := data[3*stride : 3*stride+nValues]
+
+	out = out[:width*nValues]
+	out32 := unsafe.Slice((*uint32)(unsafe.Pointer(unsafe.SliceData(out))), nValues) // SliceData is nil-safe, so nValues == 0 cannot panic (unlike &out[0])
+	for i := range nValues {
+		out32[i] = uint32(s0[i]) | uint32(s1[i])<<8 | uint32(s2[i])<<16 | uint32(s3[i])<<24
+	}
+}
+
+// decodeByteStreamSplitBatchWidth8 decodes the batch of nValues raw bytes representing a 8-byte datatype provided by 'data',
+// into the output buffer 'out' using BYTE_STREAM_SPLIT encoding.
+// 'out' must have space for at least len(data) bytes.
+func decodeByteStreamSplitBatchWidth8(data []byte, nValues, stride int, out []byte) { + const width = 8 + debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) + for element := 0; element < nValues; element++ { + out[width*element] = data[element] + out[width*element+1] = data[stride+element] + out[width*element+2] = data[2*stride+element] + out[width*element+3] = data[3*stride+element] + out[width*element+4] = data[4*stride+element] + out[width*element+5] = data[5*stride+element] + out[width*element+6] = data[6*stride+element] + out[width*element+7] = data[7*stride+element] + } +} + +// decodeByteStreamSplitBatchFLBA decodes the batch of nValues FixedLenByteArrays provided by 'data', +// into the output slice 'out' using BYTE_STREAM_SPLIT encoding. +// 'out' must have space for at least nValues slices. +func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out []parquet.FixedLenByteArray) { + debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + for stream := 0; stream < width; stream++ { + for element := 0; element < nValues; element++ { + encLoc := stride*stream + element + out[element][stream] = data[encLoc] + } + } +} + +// decodeByteStreamSplitBatchFLBAWidth2 decodes the batch of nValues FixedLenByteArrays of length 2 provided by 'data', +// into the output slice 'out' using BYTE_STREAM_SPLIT encoding. +// 'out' must have space for at least nValues slices. 
+func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { + debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + for element := 0; element < nValues; element++ { + out[element][0] = data[element] + out[element][1] = data[stride+element] + } +} + +// decodeByteStreamSplitBatchFLBAWidth4 decodes the batch of nValues FixedLenByteArrays of length 4 provided by 'data', +// into the output slice 'out' using BYTE_STREAM_SPLIT encoding. +// 'out' must have space for at least nValues slices. +func decodeByteStreamSplitBatchFLBAWidth4(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { + debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + for element := 0; element < nValues; element++ { + out[element][0] = data[element] + out[element][1] = data[stride+element] + out[element][2] = data[stride*2+element] + out[element][3] = data[stride*3+element] + } +} + +// decodeByteStreamSplitBatchFLBAWidth8 decodes the batch of nValues FixedLenByteArrays of length 8 provided by 'data', +// into the output slice 'out' using BYTE_STREAM_SPLIT encoding. +// 'out' must have space for at least nValues slices. 
+func decodeByteStreamSplitBatchFLBAWidth8(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { + debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + for element := 0; element < nValues; element++ { + out[element][0] = data[element] + out[element][1] = data[stride+element] + out[element][2] = data[stride*2+element] + out[element][3] = data[stride*3+element] + out[element][4] = data[stride*4+element] + out[element][5] = data[stride*5+element] + out[element][6] = data[stride*6+element] + out[element][7] = data[stride*7+element] + } +} diff --git a/parquet/internal/encoding/byte_stream_split_decode_amd64.go b/parquet/internal/encoding/byte_stream_split_decode_amd64.go new file mode 100644 index 00000000..7de7ceed --- /dev/null +++ b/parquet/internal/encoding/byte_stream_split_decode_amd64.go @@ -0,0 +1,9 @@ +package encoding + +import "golang.org/x/sys/cpu" + +func init() { + if cpu.X86.HasAVX2 { + decodeByteStreamSplitBatchWidth4SIMD = decodeByteStreamSplitBatchWidth4AVX2 + } +} diff --git a/parquet/internal/encoding/byte_stream_split_decode_arm64.go b/parquet/internal/encoding/byte_stream_split_decode_arm64.go new file mode 100644 index 00000000..e3159232 --- /dev/null +++ b/parquet/internal/encoding/byte_stream_split_decode_arm64.go @@ -0,0 +1,14 @@ +//go:build !noasm +// +build !noasm + +package encoding + +import ( + "golang.org/x/sys/cpu" +) + +func init() { + if cpu.ARM64.HasASIMD { + decodeByteStreamSplitBatchWidth4SIMD = decodeByteStreamSplitBatchWidth4NEON + } +} diff --git a/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.go b/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.go new file mode 100644 index 00000000..c4c3c5f2 --- /dev/null +++ b/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.go @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license 
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm +// +build !noasm + +package encoding + +import ( + "unsafe" +) + +//go:noescape +func _decodeByteStreamSplitWidth4AVX2(data, out unsafe.Pointer, nValues, stride int) + +func decodeByteStreamSplitBatchWidth4AVX2(data []byte, nValues, stride int, out []byte) { + if nValues == 0 { + return + } + // Bounds checking + if len(out) < nValues*4 { + panic("output buffer too small") + } + if len(data) < 4*stride { + panic("input buffer too small") + } + + _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) +} diff --git a/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s b/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s new file mode 100644 index 00000000..793261df --- /dev/null +++ b/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s @@ -0,0 +1,133 @@ +//+build !noasm !appengine +// AVX2 implementation with 256-bit vectors + +#include "textflag.h" + +// func _decodeByteStreamSplitWidth4AVX2(data, out unsafe.Pointer, nValues, stride int) +// +// AVX2 implementation with 256-bit vectors: +// - Processes suffix FIRST, then vectorized blocks +// - Uses 256-bit AVX2 vectors (32 bytes per register) +// - Processes 32 float32 values per block (128 bytes total) +// - Uses 
2-stage byte unpacking hierarchy +TEXT ·_decodeByteStreamSplitWidth4AVX2(SB), NOSPLIT, $0-32 + MOVQ data+0(FP), SI // SI = data pointer + MOVQ out+8(FP), DI // DI = out pointer + MOVQ nValues+16(FP), CX // CX = nValues + MOVQ stride+24(FP), DX // DX = stride + + // Setup stream pointers + MOVQ SI, R9 // stream 0 + LEAQ (SI)(DX*1), R10 // stream 1 = data + stride + LEAQ (SI)(DX*2), R11 // stream 2 = data + 2*stride + LEAQ (R10)(DX*2), R12 // stream 3 = data + 3*stride + + // Calculate num_blocks = nValues / 32 + MOVQ CX, AX + SHRQ $5, AX // AX = num_blocks (divide by 32) + + // Calculate num_processed_elements = num_blocks * 32 + MOVQ AX, R13 + SHLQ $5, R13 // R13 = num_processed_elements + + // First handle suffix (elements beyond complete blocks) + MOVQ R13, R14 // R14 = i = num_processed_elements + JMP suffix_check_avx2 + +suffix_loop_avx2: + // Gather bytes: gathered_byte_data[b] = data[b * stride + i] + MOVBQZX (R9)(R14*1), BX // byte from stream 0 + MOVBQZX (R10)(R14*1), R15 // byte from stream 1 + + // Calculate output offset: i * 4 + MOVQ R14, AX + SHLQ $2, AX // AX = i * 4 + + // Store gathered bytes + MOVB BX, (DI)(AX*1) + MOVB R15, 1(DI)(AX*1) + + MOVBQZX (R11)(R14*1), BX // byte from stream 2 + MOVBQZX (R12)(R14*1), R15 // byte from stream 3 + + MOVB BX, 2(DI)(AX*1) + MOVB R15, 3(DI)(AX*1) + + INCQ R14 + +suffix_check_avx2: + CMPQ R14, CX + JL suffix_loop_avx2 + + // Check if we have blocks to process + TESTQ R13, R13 // Check if num_processed_elements > 0 + JZ done_avx2 + + // Process blocks with AVX2 + XORQ R14, R14 // R14 = block index i = 0 + SHRQ $5, R13 // R13 = num_blocks + +block_loop_avx2: + // Calculate offset for this block: i * 32 + MOVQ R14, AX + SHLQ $5, AX // AX = i * 32 + + // Load 32 bytes from each stream + // stage[0][j] = _mm256_loadu_si256(&data[i * 32 + j * stride]) + VMOVDQU (R9)(AX*1), Y0 // stage[0][0] from stream 0 + VMOVDQU (R10)(AX*1), Y1 // stage[0][1] from stream 1 + VMOVDQU (R11)(AX*1), Y2 // stage[0][2] from stream 2 + 
VMOVDQU (R12)(AX*1), Y3 // stage[0][3] from stream 3 + + // Stage 1: First level of byte interleaving + // stage[1][0] = _mm256_unpacklo_epi8(stage[0][0], stage[0][2]) + // stage[1][1] = _mm256_unpackhi_epi8(stage[0][0], stage[0][2]) + // stage[1][2] = _mm256_unpacklo_epi8(stage[0][1], stage[0][3]) + // stage[1][3] = _mm256_unpackhi_epi8(stage[0][1], stage[0][3]) + + VPUNPCKLBW Y2, Y0, Y4 // Y4 = unpacklo_epi8(Y0, Y2) + VPUNPCKHBW Y2, Y0, Y5 // Y5 = unpackhi_epi8(Y0, Y2) + VPUNPCKLBW Y3, Y1, Y6 // Y6 = unpacklo_epi8(Y1, Y3) + VPUNPCKHBW Y3, Y1, Y7 // Y7 = unpackhi_epi8(Y1, Y3) + + // Stage 2: Second level of byte interleaving + // stage[2][0] = _mm256_unpacklo_epi8(stage[1][0], stage[1][2]) + // stage[2][1] = _mm256_unpackhi_epi8(stage[1][0], stage[1][2]) + // stage[2][2] = _mm256_unpacklo_epi8(stage[1][1], stage[1][3]) + // stage[2][3] = _mm256_unpackhi_epi8(stage[1][1], stage[1][3]) + + VPUNPCKLBW Y6, Y4, Y0 // Y0 = unpacklo_epi8(Y4, Y6) + VPUNPCKHBW Y6, Y4, Y1 // Y1 = unpackhi_epi8(Y4, Y6) + VPUNPCKLBW Y7, Y5, Y2 // Y2 = unpacklo_epi8(Y5, Y7) + VPUNPCKHBW Y7, Y5, Y3 // Y3 = unpackhi_epi8(Y5, Y7) + + // Fix lane order: AVX2 unpacking operates within each 128-bit lane + // After two levels of unpacking, we have: + // Y0 = [bytes 0-7 of values 0-7 | bytes 0-7 of values 16-23] + // Y1 = [bytes 0-7 of values 8-15 | bytes 0-7 of values 24-31] + // Y2 = [same pattern for different byte positions] + // Y3 = [same pattern for different byte positions] + // We need: [values 0-7 | values 8-15 | values 16-23 | values 24-31] + + VPERM2I128 $0x20, Y1, Y0, Y4 // Y4 = [Y0_low(0-7) | Y1_low(8-15)] + VPERM2I128 $0x31, Y1, Y0, Y5 // Y5 = [Y0_high(16-23) | Y1_high(24-31)] + VPERM2I128 $0x20, Y3, Y2, Y6 // Y6 = [Y2_low | Y3_low] + VPERM2I128 $0x31, Y3, Y2, Y7 // Y7 = [Y2_high | Y3_high] + + // Store results: out[(i * 4 + j) * 32] = stage[result][j] + // Calculate output base: i * 128 + MOVQ R14, AX + SHLQ $7, AX // AX = i * 128 + + VMOVDQU Y4, (DI)(AX*1) // Store at offset 0 + 
VMOVDQU Y6, 32(DI)(AX*1) // Store at offset 32 + VMOVDQU Y5, 64(DI)(AX*1) // Store at offset 64 + VMOVDQU Y7, 96(DI)(AX*1) // Store at offset 96 + + INCQ R14 + CMPQ R14, R13 + JL block_loop_avx2 + +done_avx2: + VZEROUPPER + RET diff --git a/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.go b/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.go new file mode 100644 index 00000000..f8f1d8c6 --- /dev/null +++ b/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.go @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build !noasm +// +build !noasm + +package encoding + +import "unsafe" + +//go:noescape +func _decodeByteStreamSplitWidth4NEON(data, out unsafe.Pointer, nValues, stride int) + +func decodeByteStreamSplitBatchWidth4NEON(data []byte, nValues, stride int, out []byte) { + if nValues == 0 { + return + } + // Bounds checking + if len(out) < nValues*4 { + panic("output buffer too small") + } + if len(data) < 4*stride { + panic("input buffer too small") + } + + _decodeByteStreamSplitWidth4NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) +} diff --git a/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.s b/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.s new file mode 100644 index 00000000..cbee47b3 --- /dev/null +++ b/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.s @@ -0,0 +1,117 @@ +//+build !noasm !appengine +// NEON implementation following AVX2 algorithm structure with 128-bit vectors + +#include "textflag.h" + +// func _decodeByteStreamSplitWidth4NEON(data, out unsafe.Pointer, nValues, stride int) +// +// NEON implementation following the AVX2 algorithm structure: +// - Processes suffix FIRST, then vectorized blocks +// - Uses 128-bit NEON vectors (16 bytes per register) +// - Processes 16 float32 values per block (64 bytes total) +// - Uses 2-stage byte unpacking hierarchy +TEXT ·_decodeByteStreamSplitWidth4NEON(SB), NOSPLIT, $0-32 + MOVD data+0(FP), R0 // R0 = data pointer + MOVD out+8(FP), R1 // R1 = out pointer + MOVD nValues+16(FP), R2 // R2 = nValues + MOVD stride+24(FP), R3 // R3 = stride + + // Setup stream pointers + MOVD R0, R4 // R4 = stream 0 + ADD R3, R0, R5 // R5 = stream 1 = data + stride + ADD R3, R5, R6 // R6 = stream 2 = data + 2*stride + ADD R3, R6, R7 // R7 = stream 3 = data + 3*stride + + // Calculate num_blocks = nValues / 16 + LSR $4, R2, R8 // R8 = num_blocks (divide by 16) + + // Calculate num_processed_elements = num_blocks * 16 + LSL $4, R8, R9 // R9 = 
num_processed_elements + + // First handle suffix (elements beyond complete blocks) + MOVD R9, R10 // R10 = i = num_processed_elements + B suffix_check_neon + +suffix_loop_neon: + // Gather bytes: gathered_byte_data[b] = data[b * stride + i] + MOVBU (R4)(R10), R11 // byte from stream 0 + MOVBU (R5)(R10), R12 // byte from stream 1 + + // Calculate output offset: i * 4 + LSL $2, R10, R13 // R13 = i * 4 + ADD R1, R13, R14 // R14 = out + (i * 4) + + // Store gathered bytes + MOVB R11, (R14) + MOVB R12, 1(R14) + + MOVBU (R6)(R10), R11 // byte from stream 2 + MOVBU (R7)(R10), R12 // byte from stream 3 + + MOVB R11, 2(R14) + MOVB R12, 3(R14) + + ADD $1, R10, R10 + +suffix_check_neon: + CMP R2, R10 + BLT suffix_loop_neon + + // Check if we have blocks to process + CBZ R9, done_neon // if num_processed_elements == 0, done + + // Process blocks with NEON + MOVD $0, R10 // R10 = block index i = 0 + LSR $4, R9, R9 // R9 = num_blocks + +block_loop_neon: + // Calculate offset for this block: i * 16 + LSL $4, R10, R11 // R11 = i * 16 + + // Load 16 bytes from each stream - REVERSED for little-endian! 
+ ADD R7, R11, R12 + VLD1 (R12), [V0.B16] // V0 = stream 3 (MSB in little-endian) + + ADD R6, R11, R12 + VLD1 (R12), [V1.B16] // V1 = stream 2 + + ADD R5, R11, R12 + VLD1 (R12), [V2.B16] // V2 = stream 1 + + ADD R4, R11, R12 + VLD1 (R12), [V3.B16] // V3 = stream 0 (LSB in little-endian) + + // Stage 1: Interleave like AVX2 VPUNPCKLBW/VPUNPCKHBW + VZIP1 V0.B16, V2.B16, V4.B16 // Interleave streams 0,2 low bytes + VZIP2 V0.B16, V2.B16, V5.B16 // Interleave streams 0,2 high bytes + VZIP1 V1.B16, V3.B16, V6.B16 // Interleave streams 1,3 low bytes + VZIP2 V1.B16, V3.B16, V7.B16 // Interleave streams 1,3 high bytes + + // Stage 2: Second level + VZIP1 V4.B16, V6.B16, V0.B16 + VZIP2 V4.B16, V6.B16, V1.B16 + VZIP1 V5.B16, V7.B16, V2.B16 + VZIP2 V5.B16, V7.B16, V3.B16 + + // Store results in sequential order + // Calculate output base: i * 64 + LSL $6, R10, R11 // R11 = i * 64 + + ADD R1, R11, R12 + VST1 [V0.B16], (R12) // Store at offset 0 + + ADD $16, R12 + VST1 [V1.B16], (R12) // Store at offset 16 + + ADD $16, R12 + VST1 [V2.B16], (R12) // Store at offset 32 + + ADD $16, R12 + VST1 [V3.B16], (R12) // Store at offset 48 + + ADD $1, R10, R10 + CMP R9, R10 + BLT block_loop_neon + +done_neon: + RET diff --git a/parquet/internal/encoding/byte_stream_split_decode_test.go b/parquet/internal/encoding/byte_stream_split_decode_test.go new file mode 100644 index 00000000..e28156ca --- /dev/null +++ b/parquet/internal/encoding/byte_stream_split_decode_test.go @@ -0,0 +1,246 @@ +package encoding + +import ( + "bytes" + "fmt" + "testing" + + "golang.org/x/sys/cpu" +) + +// TestDecodeByteStreamSplitWidth4 validates correctness of all width-4 decoding implementations +func TestDecodeByteStreamSplitWidth4(t *testing.T) { + const width = 4 + + // Test various sizes including edge cases + sizes := []int{1, 2, 7, 8, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 512, 1024} + + for _, nValues := range sizes { + t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { + // Setup 
encoded data (byte stream split format) + stride := nValues + data := make([]byte, width*nValues) + + // Initialize with predictable pattern + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) // stream 0 + data[stride+i] = byte((i + 1) % 256) // stream 1 + data[2*stride+i] = byte((i + 2) % 256) // stream 2 + data[3*stride+i] = byte((i + 3) % 256) // stream 3 + } + + // Expected output: interleaved bytes + expected := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + expected[i*4] = byte(i % 256) + expected[i*4+1] = byte((i + 1) % 256) + expected[i*4+2] = byte((i + 2) % 256) + expected[i*4+3] = byte((i + 3) % 256) + } + + // Test reference implementation + t.Run("Reference", func(t *testing.T) { + out := make([]byte, width*nValues) + decodeByteStreamSplitBatchWidth4(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("Reference implementation produced incorrect output") + for i := 0; i < len(expected) && i < 64; i++ { + if out[i] != expected[i] { + t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) + break + } + } + } + }) + + // Test V2 implementation + t.Run("V2", func(t *testing.T) { + out := make([]byte, width*nValues) + decodeByteStreamSplitBatchWidth4V2(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("V2 implementation produced incorrect output") + for i := 0; i < len(expected) && i < 64; i++ { + if out[i] != expected[i] { + t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) + break + } + } + } + }) + + if cpu.X86.HasAVX2 { + // Test AVX2 assembly implementation + t.Run("AVX2", func(t *testing.T) { + out := make([]byte, width*nValues) + // _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("AVX2 implementation produced incorrect output") + for i := 0; i < len(expected) && i < 
64; i++ { + if out[i] != expected[i] { + t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) + break + } + } + } + }) + } + + if cpu.ARM64.HasASIMD { + // Test NEON assembly implementation + t.Run("NEON", func(t *testing.T) { + out := make([]byte, width*nValues) + // _decodeByteStreamSplitWidth4NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("NEON implementation produced incorrect output") + for i := 0; i < len(expected) && i < 64; i++ { + if out[i] != expected[i] { + t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) + break + } + } + } + }) + } + }) + } +} + +// TestDecodeByteStreamSplitWidth4EdgeCases tests edge cases and boundary conditions +func TestDecodeByteStreamSplitWidth4EdgeCases(t *testing.T) { + const width = 4 + + t.Run("ExactBlockBoundary32", func(t *testing.T) { + // Test exact block boundary (32 values = 1 complete AVX2 block) + nValues := 32 + stride := nValues + data := make([]byte, width*nValues) + + for i := 0; i < nValues; i++ { + data[i] = byte(i) + data[stride+i] = byte(i + 64) + data[2*stride+i] = byte(i + 128) + data[3*stride+i] = byte(i + 192) + } + + expected := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + expected[i*4] = byte(i) + expected[i*4+1] = byte(i + 64) + expected[i*4+2] = byte(i + 128) + expected[i*4+3] = byte(i + 192) + } + + out := make([]byte, width*nValues) + // _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("Failed on exact 32-value block boundary") + } + }) + + t.Run("SingleValue", func(t *testing.T) { + // Test single value (all suffix, no vectorization) + nValues := 1 + stride := nValues + data := []byte{0xAA, 0xBB, 0xCC, 0xDD} + expected := 
[]byte{0xAA, 0xBB, 0xCC, 0xDD} + + out := make([]byte, width*nValues) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("Failed on single value: got %v, want %v", out, expected) + } + }) + + t.Run("AllZeros", func(t *testing.T) { + // Test all zeros + nValues := 100 + stride := nValues + data := make([]byte, width*nValues) + expected := make([]byte, width*nValues) + + out := make([]byte, width*nValues) + // _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("Failed on all-zero data") + } + }) + + t.Run("AllOnes", func(t *testing.T) { + // Test all 0xFF + nValues := 100 + stride := nValues + data := make([]byte, width*nValues) + for i := range data { + data[i] = 0xFF + } + expected := make([]byte, width*nValues) + for i := range expected { + expected[i] = 0xFF + } + + out := make([]byte, width*nValues) + // _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("Failed on all-0xFF data") + } + }) +} + +// BenchmarkDecodeByteStreamSplitBatchWidth4 benchmarks all width-4 decoding implementations with sub-benchmarks. 
+func BenchmarkDecodeByteStreamSplitBatchWidth4(b *testing.B) { + const width = 4 + sizes := []int{8, 64, 512, 4096, 32768, 262144} + + for _, nValues := range sizes { + b.Run(fmt.Sprintf("nValues=%d", nValues), func(b *testing.B) { + stride := nValues + data := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) + data[stride+i] = byte((i + 1) % 256) + data[2*stride+i] = byte((i + 2) % 256) + data[3*stride+i] = byte((i + 3) % 256) + } + out := make([]byte, width*nValues) + + b.SetBytes(int64(width * nValues)) + + b.Run("Reference", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchWidth4(data, nValues, stride, out) + } + }) + + b.Run("V2", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchWidth4V2(data, nValues, stride, out) + } + }) + + if cpu.X86.HasAVX2 { + b.Run("AVX2", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + } + }) + } + + if cpu.ARM64.HasASIMD { + b.Run("NEON", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + } + }) + } + }) + } +} From 15b9ecf199382431806aa7e59e07056165068333 Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Mon, 2 Feb 2026 17:18:12 +0100 Subject: [PATCH 2/8] fixup! 
Add SIMD-accelerated byte-stream-split decoding --- .../encoding/byte_stream_split_decode.go | 16 ++ .../byte_stream_split_decode_amd64.go | 19 ++ .../byte_stream_split_decode_arm64.go | 16 ++ .../byte_stream_split_decode_asm_test.go | 177 ++++++++++++++++++ .../encoding/byte_stream_split_decode_test.go | 168 ++++------------- 5 files changed, 261 insertions(+), 135 deletions(-) create mode 100644 parquet/internal/encoding/byte_stream_split_decode_asm_test.go diff --git a/parquet/internal/encoding/byte_stream_split_decode.go b/parquet/internal/encoding/byte_stream_split_decode.go index c7a8c0f9..bb0e7aa2 100644 --- a/parquet/internal/encoding/byte_stream_split_decode.go +++ b/parquet/internal/encoding/byte_stream_split_decode.go @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package encoding import ( diff --git a/parquet/internal/encoding/byte_stream_split_decode_amd64.go b/parquet/internal/encoding/byte_stream_split_decode_amd64.go index 7de7ceed..7572d1dc 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_amd64.go +++ b/parquet/internal/encoding/byte_stream_split_decode_amd64.go @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm +// +build !noasm + package encoding import "golang.org/x/sys/cpu" diff --git a/parquet/internal/encoding/byte_stream_split_decode_arm64.go b/parquet/internal/encoding/byte_stream_split_decode_arm64.go index e3159232..628c3d90 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_arm64.go +++ b/parquet/internal/encoding/byte_stream_split_decode_arm64.go @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + //go:build !noasm // +build !noasm diff --git a/parquet/internal/encoding/byte_stream_split_decode_asm_test.go b/parquet/internal/encoding/byte_stream_split_decode_asm_test.go new file mode 100644 index 00000000..6f6d1329 --- /dev/null +++ b/parquet/internal/encoding/byte_stream_split_decode_asm_test.go @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build !noasm +// +build !noasm + +package encoding + +import ( + "bytes" + "fmt" + "testing" + + "golang.org/x/sys/cpu" +) + +// TestDecodeByteStreamSplitWidth4 validates correctness of all width-4 decoding implementations +func TestDecodeByteStreamSplitWidth4SIMD(t *testing.T) { + const width = 4 + + // Test various sizes including edge cases + sizes := []int{1, 2, 7, 8, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 512, 1024} + + for _, nValues := range sizes { + t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { + // Setup encoded data (byte stream split format) + stride := nValues + data := make([]byte, width*nValues) + + // Initialize with predictable pattern + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) // stream 0 + data[stride+i] = byte((i + 1) % 256) // stream 1 + data[2*stride+i] = byte((i + 2) % 256) // stream 2 + data[3*stride+i] = byte((i + 3) % 256) // stream 3 + } + + // Expected output: interleaved bytes + expected := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + expected[i*4] = byte(i % 256) + expected[i*4+1] = byte((i + 1) % 256) + expected[i*4+2] = byte((i + 2) % 256) + expected[i*4+3] = byte((i + 3) % 256) + } + + if cpu.X86.HasAVX2 { + // Test AVX2 assembly implementation + t.Run("AVX2", func(t *testing.T) { + out := make([]byte, width*nValues) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("AVX2 implementation produced incorrect output") + for i := 0; i < len(expected) && i < 64; i++ { + if out[i] != expected[i] { + t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) + break + } + } + } + }) + } + + if cpu.ARM64.HasASIMD { + // Test NEON assembly implementation + t.Run("NEON", func(t *testing.T) { + out := make([]byte, width*nValues) + // _decodeByteStreamSplitWidth4NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, 
out) + if !bytes.Equal(out, expected) { + t.Errorf("NEON implementation produced incorrect output") + for i := 0; i < len(expected) && i < 64; i++ { + if out[i] != expected[i] { + t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) + break + } + } + } + }) + } + }) + } +} + +// TestDecodeByteStreamSplitWidth4EdgeCases tests edge cases and boundary conditions +func TestDecodeByteStreamSplitWidth4EdgeCasesSIMD(t *testing.T) { + const width = 4 + + t.Run("ExactBlockBoundary32", func(t *testing.T) { + // Test exact block boundary (32 values = 1 complete AVX2 block) + nValues := 32 + stride := nValues + data := make([]byte, width*nValues) + + for i := 0; i < nValues; i++ { + data[i] = byte(i) + data[stride+i] = byte(i + 64) + data[2*stride+i] = byte(i + 128) + data[3*stride+i] = byte(i + 192) + } + + expected := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + expected[i*4] = byte(i) + expected[i*4+1] = byte(i + 64) + expected[i*4+2] = byte(i + 128) + expected[i*4+3] = byte(i + 192) + } + + out := make([]byte, width*nValues) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("Failed on exact 32-value block boundary") + } + }) + + t.Run("SingleValue", func(t *testing.T) { + // Test single value (all suffix, no vectorization) + nValues := 1 + stride := nValues + data := []byte{0xAA, 0xBB, 0xCC, 0xDD} + expected := []byte{0xAA, 0xBB, 0xCC, 0xDD} + + out := make([]byte, width*nValues) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("Failed on single value: got %v, want %v", out, expected) + } + }) + + t.Run("AllZeros", func(t *testing.T) { + // Test all zeros + nValues := 100 + stride := nValues + data := make([]byte, width*nValues) + expected := make([]byte, width*nValues) + + out := make([]byte, width*nValues) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, 
expected) { + t.Errorf("Failed on all-zero data") + } + }) + + t.Run("AllOnes", func(t *testing.T) { + // Test all 0xFF + nValues := 100 + stride := nValues + data := make([]byte, width*nValues) + for i := range data { + data[i] = 0xFF + } + expected := make([]byte, width*nValues) + for i := range expected { + expected[i] = 0xFF + } + + out := make([]byte, width*nValues) + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("Failed on all-0xFF data") + } + }) +} diff --git a/parquet/internal/encoding/byte_stream_split_decode_test.go b/parquet/internal/encoding/byte_stream_split_decode_test.go index e28156ca..9428168a 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_test.go +++ b/parquet/internal/encoding/byte_stream_split_decode_test.go @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package encoding import ( @@ -67,130 +83,10 @@ func TestDecodeByteStreamSplitWidth4(t *testing.T) { } } }) - - if cpu.X86.HasAVX2 { - // Test AVX2 assembly implementation - t.Run("AVX2", func(t *testing.T) { - out := make([]byte, width*nValues) - // _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("AVX2 implementation produced incorrect output") - for i := 0; i < len(expected) && i < 64; i++ { - if out[i] != expected[i] { - t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) - break - } - } - } - }) - } - - if cpu.ARM64.HasASIMD { - // Test NEON assembly implementation - t.Run("NEON", func(t *testing.T) { - out := make([]byte, width*nValues) - // _decodeByteStreamSplitWidth4NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("NEON implementation produced incorrect output") - for i := 0; i < len(expected) && i < 64; i++ { - if out[i] != expected[i] { - t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) - break - } - } - } - }) - } }) } } -// TestDecodeByteStreamSplitWidth4EdgeCases tests edge cases and boundary conditions -func TestDecodeByteStreamSplitWidth4EdgeCases(t *testing.T) { - const width = 4 - - t.Run("ExactBlockBoundary32", func(t *testing.T) { - // Test exact block boundary (32 values = 1 complete AVX2 block) - nValues := 32 - stride := nValues - data := make([]byte, width*nValues) - - for i := 0; i < nValues; i++ { - data[i] = byte(i) - data[stride+i] = byte(i + 64) - data[2*stride+i] = byte(i + 128) - data[3*stride+i] = byte(i + 192) - } - - expected := make([]byte, width*nValues) - for i := 0; i < nValues; i++ { - expected[i*4] = byte(i) - expected[i*4+1] = byte(i + 64) - expected[i*4+2] = byte(i + 
128) - expected[i*4+3] = byte(i + 192) - } - - out := make([]byte, width*nValues) - // _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("Failed on exact 32-value block boundary") - } - }) - - t.Run("SingleValue", func(t *testing.T) { - // Test single value (all suffix, no vectorization) - nValues := 1 - stride := nValues - data := []byte{0xAA, 0xBB, 0xCC, 0xDD} - expected := []byte{0xAA, 0xBB, 0xCC, 0xDD} - - out := make([]byte, width*nValues) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("Failed on single value: got %v, want %v", out, expected) - } - }) - - t.Run("AllZeros", func(t *testing.T) { - // Test all zeros - nValues := 100 - stride := nValues - data := make([]byte, width*nValues) - expected := make([]byte, width*nValues) - - out := make([]byte, width*nValues) - // _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("Failed on all-zero data") - } - }) - - t.Run("AllOnes", func(t *testing.T) { - // Test all 0xFF - nValues := 100 - stride := nValues - data := make([]byte, width*nValues) - for i := range data { - data[i] = 0xFF - } - expected := make([]byte, width*nValues) - for i := range expected { - expected[i] = 0xFF - } - - out := make([]byte, width*nValues) - // _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("Failed on all-0xFF data") - } - }) -} - // BenchmarkDecodeByteStreamSplitBatchWidth4 benchmarks all width-4 decoding implementations with sub-benchmarks. 
func BenchmarkDecodeByteStreamSplitBatchWidth4(b *testing.B) { const width = 4 @@ -224,22 +120,24 @@ func BenchmarkDecodeByteStreamSplitBatchWidth4(b *testing.B) { } }) - if cpu.X86.HasAVX2 { - b.Run("AVX2", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - } - }) - } + if decodeByteStreamSplitBatchWidth4SIMD != nil { + if cpu.X86.HasAVX2 { + b.Run("AVX2", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + } + }) + } - if cpu.ARM64.HasASIMD { - b.Run("NEON", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - } - }) + if cpu.ARM64.HasASIMD { + b.Run("NEON", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) + } + }) + } } }) } From 07eb37a168a50da0a4b8d34cd60196b24735be86 Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Wed, 4 Feb 2026 14:29:27 +0100 Subject: [PATCH 3/8] perf(parquet/encoding): add optimized FLBA width-2 byte stream split decoder Add decodeByteStreamSplitBatchFLBAWidth2V2 as an optimized implementation for decoding FixedLenByteArray width-2 byte stream split encoding. The V2 implementation uses slice narrowing to eliminate bounds checks and unsafe pointer operations to write directly as uint16, providing significant performance improvements over the reference implementation: Includes comprehensive unit tests and benchmarks to verify correctness across various batch sizes and measure performance characteristics. 
--- .../encoding/byte_stream_split_decode.go | 12 +++ .../encoding/byte_stream_split_decode_test.go | 98 +++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/parquet/internal/encoding/byte_stream_split_decode.go b/parquet/internal/encoding/byte_stream_split_decode.go index bb0e7aa2..138a34d7 100644 --- a/parquet/internal/encoding/byte_stream_split_decode.go +++ b/parquet/internal/encoding/byte_stream_split_decode.go @@ -97,6 +97,18 @@ func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out } } +func decodeByteStreamSplitBatchFLBAWidth2V2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { + const width = 2 + // Narrow slices to help the compiler eliminate bounds checks. + s0 := data[:nValues] + s1 := data[stride : stride+nValues] + + for i := range nValues { + out16 := (*uint16)(unsafe.Pointer(&out[i][0])) + *out16 = uint16(s0[i]) | uint16(s1[i])<<8 + } +} + // decodeByteStreamSplitBatchFLBAWidth4 decodes the batch of nValues FixedLenByteArrays of length 4 provided by 'data', // into the output slice 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least nValues slices. 
diff --git a/parquet/internal/encoding/byte_stream_split_decode_test.go b/parquet/internal/encoding/byte_stream_split_decode_test.go index 9428168a..43c43261 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_test.go +++ b/parquet/internal/encoding/byte_stream_split_decode_test.go @@ -21,6 +21,7 @@ import ( "fmt" "testing" + "github.com/apache/arrow-go/v18/parquet" "golang.org/x/sys/cpu" ) @@ -142,3 +143,100 @@ func BenchmarkDecodeByteStreamSplitBatchWidth4(b *testing.B) { }) } } + +// TestDecodeByteStreamSplitFLBAWidth2 validates correctness of all FLBA width-2 decoding implementations +func TestDecodeByteStreamSplitFLBAWidth2(t *testing.T) { + const width = 2 + + // Test various sizes including edge cases + sizes := []int{1, 2, 7, 8, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 512, 1024} + + for _, nValues := range sizes { + t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { + // Setup encoded data (byte stream split format) + stride := nValues + data := make([]byte, width*nValues) + + // Initialize with predictable pattern + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) // stream 0 + data[stride+i] = byte((i + 1) % 256) // stream 1 + } + + // Expected output: FixedLenByteArray slices with interleaved bytes + expected := make([]parquet.FixedLenByteArray, nValues) + for i := 0; i < nValues; i++ { + expected[i] = make(parquet.FixedLenByteArray, width) + expected[i][0] = byte(i % 256) + expected[i][1] = byte((i + 1) % 256) + } + + // Test reference implementation + t.Run("Reference", func(t *testing.T) { + out := make([]parquet.FixedLenByteArray, nValues) + for i := range out { + out[i] = make(parquet.FixedLenByteArray, width) + } + decodeByteStreamSplitBatchFLBAWidth2(data, nValues, stride, out) + for i := 0; i < nValues; i++ { + if !bytes.Equal(out[i], expected[i]) { + t.Errorf("Reference implementation mismatch at index %d: got %v, want %v", i, out[i], expected[i]) + break + } + } + }) + + // Test V2 implementation + 
t.Run("V2", func(t *testing.T) { + out := make([]parquet.FixedLenByteArray, nValues) + for i := range out { + out[i] = make(parquet.FixedLenByteArray, width) + } + decodeByteStreamSplitBatchFLBAWidth2V2(data, nValues, stride, out) + for i := 0; i < nValues; i++ { + if !bytes.Equal(out[i], expected[i]) { + t.Errorf("V2 implementation mismatch at index %d: got %v, want %v", i, out[i], expected[i]) + break + } + } + }) + }) + } +} + +// BenchmarkDecodeByteStreamSplitBatchFLBAWidth2 benchmarks all FLBA width-2 decoding implementations. +func BenchmarkDecodeByteStreamSplitBatchFLBAWidth2(b *testing.B) { + const width = 2 + sizes := []int{8, 64, 512, 4096, 32768, 262144} + + for _, nValues := range sizes { + b.Run(fmt.Sprintf("nValues=%d", nValues), func(b *testing.B) { + stride := nValues + data := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) + data[stride+i] = byte((i + 1) % 256) + } + out := make([]parquet.FixedLenByteArray, nValues) + for i := range out { + out[i] = make(parquet.FixedLenByteArray, width) + } + + b.SetBytes(int64(width * nValues)) + + b.Run("Reference", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchFLBAWidth2(data, nValues, stride, out) + } + }) + + b.Run("V2", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchFLBAWidth2V2(data, nValues, stride, out) + } + }) + }) + } +} From 660e7b214f37868b566984f4b3138ab133ed0e0e Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Mon, 9 Feb 2026 18:22:02 +0100 Subject: [PATCH 4/8] Update byte-stream split decoding tests --- ...x2_amd64.go => byte_stream_split_amd64.go} | 34 +- ...on_arm64.go => byte_stream_split_arm64.go} | 38 +- .../encoding/byte_stream_split_decode.go | 92 +++- .../byte_stream_split_decode_amd64.go | 28 -- .../byte_stream_split_decode_arm64.go | 30 -- .../byte_stream_split_decode_asm_test.go | 177 ------- .../byte_stream_split_decode_avx2_amd64.s | 147 ++++++ 
.../byte_stream_split_decode_neon_arm64.s | 172 +++++++ .../encoding/byte_stream_split_decode_test.go | 454 ++++++++++++------ 9 files changed, 759 insertions(+), 413 deletions(-) rename parquet/internal/encoding/{byte_stream_split_decode_avx2_amd64.go => byte_stream_split_amd64.go} (51%) rename parquet/internal/encoding/{byte_stream_split_decode_neon_arm64.go => byte_stream_split_arm64.go} (50%) delete mode 100644 parquet/internal/encoding/byte_stream_split_decode_amd64.go delete mode 100644 parquet/internal/encoding/byte_stream_split_decode_arm64.go delete mode 100644 parquet/internal/encoding/byte_stream_split_decode_asm_test.go diff --git a/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.go b/parquet/internal/encoding/byte_stream_split_amd64.go similarity index 51% rename from parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.go rename to parquet/internal/encoding/byte_stream_split_amd64.go index c4c3c5f2..9e0276cb 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.go +++ b/parquet/internal/encoding/byte_stream_split_amd64.go @@ -15,28 +15,44 @@ // limitations under the License. 
//go:build !noasm -// +build !noasm package encoding import ( + "fmt" "unsafe" + + "github.com/apache/arrow-go/v18/parquet/internal/debug" + "golang.org/x/sys/cpu" ) +func init() { + if cpu.X86.HasAVX2 { + } +} + //go:noescape func _decodeByteStreamSplitWidth4AVX2(data, out unsafe.Pointer, nValues, stride int) +//go:noescape +func _decodeByteStreamSplitWidth8AVX2(data, out unsafe.Pointer, nValues, stride int) + func decodeByteStreamSplitBatchWidth4AVX2(data []byte, nValues, stride int, out []byte) { if nValues == 0 { return } - // Bounds checking - if len(out) < nValues*4 { - panic("output buffer too small") - } - if len(data) < 4*stride { - panic("input buffer too small") - } - + const width = 4 + debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) } + +func decodeByteStreamSplitBatchWidth8AVX2(data []byte, nValues, stride int, out []byte) { + if nValues == 0 { + return + } + const width = 8 + debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + _decodeByteStreamSplitWidth8AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) +} diff --git a/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.go b/parquet/internal/encoding/byte_stream_split_arm64.go similarity index 50% rename from parquet/internal/encoding/byte_stream_split_decode_neon_arm64.go rename to parquet/internal/encoding/byte_stream_split_arm64.go index 
f8f1d8c6..25ed23cf 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.go +++ b/parquet/internal/encoding/byte_stream_split_arm64.go @@ -15,26 +15,44 @@ // limitations under the License. //go:build !noasm -// +build !noasm package encoding -import "unsafe" +import ( + "fmt" + "unsafe" + + "github.com/apache/arrow-go/v18/parquet/internal/debug" + "golang.org/x/sys/cpu" +) + +func init() { + if cpu.ARM64.HasASIMD { + } +} //go:noescape func _decodeByteStreamSplitWidth4NEON(data, out unsafe.Pointer, nValues, stride int) +//go:noescape +func _decodeByteStreamSplitWidth8NEON(data, out unsafe.Pointer, nValues, stride int) + func decodeByteStreamSplitBatchWidth4NEON(data []byte, nValues, stride int, out []byte) { if nValues == 0 { return } - // Bounds checking - if len(out) < nValues*4 { - panic("output buffer too small") - } - if len(data) < 4*stride { - panic("input buffer too small") - } - + const width = 4 + debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) _decodeByteStreamSplitWidth4NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) } + +func decodeByteStreamSplitBatchWidth8NEON(data []byte, nValues, stride int, out []byte) { + if nValues == 0 { + return + } + const width = 8 + debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + _decodeByteStreamSplitWidth8NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) +} diff --git a/parquet/internal/encoding/byte_stream_split_decode.go 
b/parquet/internal/encoding/byte_stream_split_decode.go index 138a34d7..48ca2ca6 100644 --- a/parquet/internal/encoding/byte_stream_split_decode.go +++ b/parquet/internal/encoding/byte_stream_split_decode.go @@ -24,14 +24,21 @@ import ( "github.com/apache/arrow-go/v18/parquet/internal/debug" ) -var decodeByteStreamSplitBatchWidth4SIMD func(data []byte, nValues, stride int, out []byte) +var ( + decodeByteStreamSplitBatchWidth4 func(data []byte, nValues, stride int, out []byte) = decodeByteStreamSplitBatchWidth4Default + decodeByteStreamSplitBatchWidth8 func(data []byte, nValues, stride int, out []byte) = decodeByteStreamSplitBatchWidth8Default + decodeByteStreamSplitBatchFLBAWidth2 func(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) = decodeByteStreamSplitBatchFLBAWidth2Default + decodeByteStreamSplitBatchFLBAWidth4 func(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) = decodeByteStreamSplitBatchFLBAWidth4Default + decodeByteStreamSplitBatchFLBAWidth8 func(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) = decodeByteStreamSplitBatchFLBAWidth8Default +) // decodeByteStreamSplitBatchWidth4 decodes the batch of nValues raw bytes representing a 4-byte datatype provided by 'data', // into the output buffer 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least len(data) bytes. 
-func decodeByteStreamSplitBatchWidth4(data []byte, nValues, stride int, out []byte) { +func decodeByteStreamSplitBatchWidth4Default(data []byte, nValues, stride int, out []byte) { const width = 4 debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) for element := 0; element < nValues; element++ { out[width*element] = data[element] out[width*element+1] = data[stride+element] @@ -42,6 +49,8 @@ func decodeByteStreamSplitBatchWidth4(data []byte, nValues, stride int, out []by func decodeByteStreamSplitBatchWidth4V2(data []byte, nValues, stride int, out []byte) { const width = 4 + debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) // Narrow slices to help the compiler eliminate bounds checks. s0 := data[:nValues] s1 := data[stride : stride+nValues] @@ -58,9 +67,10 @@ func decodeByteStreamSplitBatchWidth4V2(data []byte, nValues, stride int, out [] // decodeByteStreamSplitBatchWidth8 decodes the batch of nValues raw bytes representing a 8-byte datatype provided by 'data', // into the output buffer 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least len(data) bytes. 
-func decodeByteStreamSplitBatchWidth8(data []byte, nValues, stride int, out []byte) { +func decodeByteStreamSplitBatchWidth8Default(data []byte, nValues, stride int, out []byte) { const width = 8 debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) for element := 0; element < nValues; element++ { out[width*element] = data[element] out[width*element+1] = data[stride+element] @@ -73,11 +83,34 @@ func decodeByteStreamSplitBatchWidth8(data []byte, nValues, stride int, out []by } } +func decodeByteStreamSplitBatchWidth8V2(data []byte, nValues, stride int, out []byte) { + const width = 8 + debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + // Narrow slices to help the compiler eliminate bounds checks. 
+ s0 := data[:nValues] + s1 := data[stride : stride+nValues] + s2 := data[2*stride : 2*stride+nValues] + s3 := data[3*stride : 3*stride+nValues] + s4 := data[4*stride : 4*stride+nValues] + s5 := data[5*stride : 5*stride+nValues] + s6 := data[6*stride : 6*stride+nValues] + s7 := data[7*stride : 7*stride+nValues] + + out = out[:width*nValues] + out64 := unsafe.Slice((*uint64)(unsafe.Pointer(&out[0])), nValues) + for i := range nValues { + out64[i] = uint64(s0[i]) | uint64(s1[i])<<8 | uint64(s2[i])<<16 | uint64(s3[i])<<24 | + uint64(s4[i])<<32 | uint64(s5[i])<<40 | uint64(s6[i])<<48 | uint64(s7[i])<<56 + } +} + // decodeByteStreamSplitBatchFLBA decodes the batch of nValues FixedLenByteArrays provided by 'data', // into the output slice 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least nValues slices. func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out []parquet.FixedLenByteArray) { debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) for stream := 0; stream < width; stream++ { for element := 0; element < nValues; element++ { encLoc := stride*stream + element @@ -89,8 +122,10 @@ func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out // decodeByteStreamSplitBatchFLBAWidth2 decodes the batch of nValues FixedLenByteArrays of length 2 provided by 'data', // into the output slice 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least nValues slices. 
-func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { +func decodeByteStreamSplitBatchFLBAWidth2Default(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { + const width = 2 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + debug.Assert(len(data) >= 2*width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) for element := 0; element < nValues; element++ { out[element][0] = data[element] out[element][1] = data[stride+element] @@ -99,10 +134,10 @@ func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out func decodeByteStreamSplitBatchFLBAWidth2V2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { const width = 2 - // Narrow slices to help the compiler eliminate bounds checks. + debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) s0 := data[:nValues] s1 := data[stride : stride+nValues] - for i := range nValues { out16 := (*uint16)(unsafe.Pointer(&out[i][0])) *out16 = uint16(s0[i]) | uint16(s1[i])<<8 @@ -112,8 +147,10 @@ func decodeByteStreamSplitBatchFLBAWidth2V2(data []byte, nValues, stride int, ou // decodeByteStreamSplitBatchFLBAWidth4 decodes the batch of nValues FixedLenByteArrays of length 4 provided by 'data', // into the output slice 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least nValues slices. 
-func decodeByteStreamSplitBatchFLBAWidth4(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { +func decodeByteStreamSplitBatchFLBAWidth4Default(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { + const width = 4 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) for element := 0; element < nValues; element++ { out[element][0] = data[element] out[element][1] = data[stride+element] @@ -122,11 +159,29 @@ func decodeByteStreamSplitBatchFLBAWidth4(data []byte, nValues, stride int, out } } +func decodeByteStreamSplitBatchFLBAWidth4V2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { + const width = 4 + debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + // Narrow slices to help the compiler eliminate bounds checks. + s0 := data[:nValues] + s1 := data[stride : stride+nValues] + s2 := data[stride*2 : stride*2+nValues] + s3 := data[stride*3 : stride*3+nValues] + + for i := range nValues { + out32 := (*uint32)(unsafe.Pointer(&out[i][0])) + *out32 = uint32(s0[i]) | uint32(s1[i])<<8 | uint32(s2[i])<<16 | uint32(s3[i])<<24 + } +} + // decodeByteStreamSplitBatchFLBAWidth8 decodes the batch of nValues FixedLenByteArrays of length 8 provided by 'data', // into the output slice 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least nValues slices. 
-func decodeByteStreamSplitBatchFLBAWidth8(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { +func decodeByteStreamSplitBatchFLBAWidth8Default(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { + const width = 8 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) for element := 0; element < nValues; element++ { out[element][0] = data[element] out[element][1] = data[stride+element] @@ -138,3 +193,24 @@ func decodeByteStreamSplitBatchFLBAWidth8(data []byte, nValues, stride int, out out[element][7] = data[stride*7+element] } } + +func decodeByteStreamSplitBatchFLBAWidth8V2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { + const width = 8 + debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) + debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + // Narrow slices to help the compiler eliminate bounds checks. 
+ s0 := data[:nValues] + s1 := data[stride : stride+nValues] + s2 := data[stride*2 : stride*2+nValues] + s3 := data[stride*3 : stride*3+nValues] + s4 := data[stride*4 : stride*4+nValues] + s5 := data[stride*5 : stride*5+nValues] + s6 := data[stride*6 : stride*6+nValues] + s7 := data[stride*7 : stride*7+nValues] + + for i := range nValues { + out64 := (*uint64)(unsafe.Pointer(&out[i][0])) + *out64 = uint64(s0[i]) | uint64(s1[i])<<8 | uint64(s2[i])<<16 | uint64(s3[i])<<24 | + uint64(s4[i])<<32 | uint64(s5[i])<<40 | uint64(s6[i])<<48 | uint64(s7[i])<<56 + } +} diff --git a/parquet/internal/encoding/byte_stream_split_decode_amd64.go b/parquet/internal/encoding/byte_stream_split_decode_amd64.go deleted file mode 100644 index 7572d1dc..00000000 --- a/parquet/internal/encoding/byte_stream_split_decode_amd64.go +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -//go:build !noasm -// +build !noasm - -package encoding - -import "golang.org/x/sys/cpu" - -func init() { - if cpu.X86.HasAVX2 { - decodeByteStreamSplitBatchWidth4SIMD = decodeByteStreamSplitBatchWidth4AVX2 - } -} diff --git a/parquet/internal/encoding/byte_stream_split_decode_arm64.go b/parquet/internal/encoding/byte_stream_split_decode_arm64.go deleted file mode 100644 index 628c3d90..00000000 --- a/parquet/internal/encoding/byte_stream_split_decode_arm64.go +++ /dev/null @@ -1,30 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build !noasm -// +build !noasm - -package encoding - -import ( - "golang.org/x/sys/cpu" -) - -func init() { - if cpu.ARM64.HasASIMD { - decodeByteStreamSplitBatchWidth4SIMD = decodeByteStreamSplitBatchWidth4NEON - } -} diff --git a/parquet/internal/encoding/byte_stream_split_decode_asm_test.go b/parquet/internal/encoding/byte_stream_split_decode_asm_test.go deleted file mode 100644 index 6f6d1329..00000000 --- a/parquet/internal/encoding/byte_stream_split_decode_asm_test.go +++ /dev/null @@ -1,177 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build !noasm -// +build !noasm - -package encoding - -import ( - "bytes" - "fmt" - "testing" - - "golang.org/x/sys/cpu" -) - -// TestDecodeByteStreamSplitWidth4 validates correctness of all width-4 decoding implementations -func TestDecodeByteStreamSplitWidth4SIMD(t *testing.T) { - const width = 4 - - // Test various sizes including edge cases - sizes := []int{1, 2, 7, 8, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 512, 1024} - - for _, nValues := range sizes { - t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { - // Setup encoded data (byte stream split format) - stride := nValues - data := make([]byte, width*nValues) - - // Initialize with predictable pattern - for i := 0; i < nValues; i++ { - data[i] = byte(i % 256) // stream 0 - data[stride+i] = byte((i + 1) % 256) // stream 1 - data[2*stride+i] = byte((i + 2) % 256) // stream 2 - data[3*stride+i] = byte((i + 3) % 256) // stream 3 - } - - // Expected output: interleaved bytes - expected := make([]byte, width*nValues) - for i := 0; i < nValues; i++ { - expected[i*4] = byte(i % 256) - expected[i*4+1] = byte((i + 1) % 256) - expected[i*4+2] = byte((i + 2) % 256) - expected[i*4+3] = byte((i + 3) % 256) - } - - if cpu.X86.HasAVX2 { - // Test AVX2 assembly implementation - t.Run("AVX2", func(t *testing.T) 
{ - out := make([]byte, width*nValues) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("AVX2 implementation produced incorrect output") - for i := 0; i < len(expected) && i < 64; i++ { - if out[i] != expected[i] { - t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) - break - } - } - } - }) - } - - if cpu.ARM64.HasASIMD { - // Test NEON assembly implementation - t.Run("NEON", func(t *testing.T) { - out := make([]byte, width*nValues) - // _decodeByteStreamSplitWidth4NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("NEON implementation produced incorrect output") - for i := 0; i < len(expected) && i < 64; i++ { - if out[i] != expected[i] { - t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) - break - } - } - } - }) - } - }) - } -} - -// TestDecodeByteStreamSplitWidth4EdgeCases tests edge cases and boundary conditions -func TestDecodeByteStreamSplitWidth4EdgeCasesSIMD(t *testing.T) { - const width = 4 - - t.Run("ExactBlockBoundary32", func(t *testing.T) { - // Test exact block boundary (32 values = 1 complete AVX2 block) - nValues := 32 - stride := nValues - data := make([]byte, width*nValues) - - for i := 0; i < nValues; i++ { - data[i] = byte(i) - data[stride+i] = byte(i + 64) - data[2*stride+i] = byte(i + 128) - data[3*stride+i] = byte(i + 192) - } - - expected := make([]byte, width*nValues) - for i := 0; i < nValues; i++ { - expected[i*4] = byte(i) - expected[i*4+1] = byte(i + 64) - expected[i*4+2] = byte(i + 128) - expected[i*4+3] = byte(i + 192) - } - - out := make([]byte, width*nValues) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("Failed on exact 32-value block boundary") - } - }) - - t.Run("SingleValue", func(t *testing.T) { - // Test 
single value (all suffix, no vectorization) - nValues := 1 - stride := nValues - data := []byte{0xAA, 0xBB, 0xCC, 0xDD} - expected := []byte{0xAA, 0xBB, 0xCC, 0xDD} - - out := make([]byte, width*nValues) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("Failed on single value: got %v, want %v", out, expected) - } - }) - - t.Run("AllZeros", func(t *testing.T) { - // Test all zeros - nValues := 100 - stride := nValues - data := make([]byte, width*nValues) - expected := make([]byte, width*nValues) - - out := make([]byte, width*nValues) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("Failed on all-zero data") - } - }) - - t.Run("AllOnes", func(t *testing.T) { - // Test all 0xFF - nValues := 100 - stride := nValues - data := make([]byte, width*nValues) - for i := range data { - data[i] = 0xFF - } - expected := make([]byte, width*nValues) - for i := range expected { - expected[i] = 0xFF - } - - out := make([]byte, width*nValues) - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("Failed on all-0xFF data") - } - }) -} diff --git a/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s b/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s index 793261df..63473326 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s +++ b/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s @@ -131,3 +131,150 @@ block_loop_avx2: done_avx2: VZEROUPPER RET + +// func _decodeByteStreamSplitWidth8AVX2(data, out unsafe.Pointer, nValues, stride int) +// +// AVX2 implementation for width=8 (float64/int64) with 256-bit vectors: +// - Processes suffix FIRST, then vectorized blocks +// - Uses 256-bit AVX2 vectors (32 bytes per register) +// - Processes 16 float64 values per block (128 bytes total) +// - Uses 3-stage byte unpacking hierarchy (for 8 streams) 
+TEXT ·_decodeByteStreamSplitWidth8AVX2(SB), NOSPLIT, $0-32 + MOVQ data+0(FP), SI // SI = data pointer + MOVQ out+8(FP), DI // DI = out pointer + MOVQ nValues+16(FP), CX // CX = nValues + MOVQ stride+24(FP), DX // DX = stride + + // Setup 8 stream pointers + MOVQ SI, R9 // stream 0 + LEAQ (SI)(DX*1), R10 // stream 1 = data + stride + LEAQ (SI)(DX*2), R11 // stream 2 = data + 2*stride + LEAQ (R10)(DX*2), R12 // stream 3 = data + 3*stride + LEAQ (SI)(DX*4), R13 // stream 4 = data + 4*stride + LEAQ (R10)(DX*4), R14 // stream 5 = data + 5*stride + LEAQ (R11)(DX*4), R15 // stream 6 = data + 6*stride + LEAQ (R12)(DX*4), BX // stream 7 = data + 7*stride + + // Calculate num_blocks = nValues / 16 + MOVQ CX, AX + SHRQ $4, AX // AX = num_blocks + + // Calculate num_processed_elements = num_blocks * 16 + MOVQ AX, R8 + SHLQ $4, R8 // R8 = num_processed_elements + + // First handle suffix (elements beyond complete blocks) + MOVQ R8, SI // SI = i = num_processed_elements + JMP suffix_check_w8_avx2 + +suffix_loop_w8_avx2: + // Calculate output offset: i * 8 + MOVQ SI, AX + SHLQ $3, AX // AX = i * 8 + + // Load and store bytes from all 8 streams + MOVBQZX (R9)(SI*1), DX + MOVB DX, (DI)(AX*1) + + MOVBQZX (R10)(SI*1), DX + MOVB DX, 1(DI)(AX*1) + + MOVBQZX (R11)(SI*1), DX + MOVB DX, 2(DI)(AX*1) + + MOVBQZX (R12)(SI*1), DX + MOVB DX, 3(DI)(AX*1) + + MOVBQZX (R13)(SI*1), DX + MOVB DX, 4(DI)(AX*1) + + MOVBQZX (R14)(SI*1), DX + MOVB DX, 5(DI)(AX*1) + + MOVBQZX (R15)(SI*1), DX + MOVB DX, 6(DI)(AX*1) + + MOVBQZX (BX)(SI*1), DX + MOVB DX, 7(DI)(AX*1) + + INCQ SI + +suffix_check_w8_avx2: + CMPQ SI, CX + JL suffix_loop_w8_avx2 + + // Check if we have blocks to process + TESTQ R8, R8 // Check if num_processed_elements > 0 + JZ done_w8_avx2 + + // Process blocks with AVX2 + XORQ SI, SI // SI = block index i = 0 + SHRQ $4, R8 // R8 = num_blocks + +block_loop_w8_avx2: + // Calculate offset for this block: i * 16 + MOVQ SI, AX + SHLQ $4, AX // AX = i * 16 + + // Load 16 bytes from each of 8 
streams (using 128-bit loads for narrower data) + // We load 16 bytes but will use AVX2 operations + VMOVDQU (R9)(AX*1), X0 // stream 0 + VMOVDQU (R10)(AX*1), X1 // stream 1 + VMOVDQU (R11)(AX*1), X2 // stream 2 + VMOVDQU (R12)(AX*1), X3 // stream 3 + VMOVDQU (R13)(AX*1), X4 // stream 4 + VMOVDQU (R14)(AX*1), X5 // stream 5 + VMOVDQU (R15)(AX*1), X6 // stream 6 + VMOVDQU (BX)(AX*1), X7 // stream 7 + + // Stage 1: First level of byte interleaving (pairs 0-4, 1-5, 2-6, 3-7) + VPUNPCKLBW X4, X0, X8 // X8 = unpacklo_epi8(X0, X4) + VPUNPCKHBW X4, X0, X9 // X9 = unpackhi_epi8(X0, X4) + VPUNPCKLBW X5, X1, X10 // X10 = unpacklo_epi8(X1, X5) + VPUNPCKHBW X5, X1, X11 // X11 = unpackhi_epi8(X1, X5) + VPUNPCKLBW X6, X2, X12 // X12 = unpacklo_epi8(X2, X6) + VPUNPCKHBW X6, X2, X13 // X13 = unpackhi_epi8(X2, X6) + VPUNPCKLBW X7, X3, X14 // X14 = unpacklo_epi8(X3, X7) + VPUNPCKHBW X7, X3, X15 // X15 = unpackhi_epi8(X3, X7) + + // Stage 2: Second level of byte interleaving + VPUNPCKLBW X12, X8, X0 // X0 = unpacklo_epi8(X8, X12) + VPUNPCKHBW X12, X8, X1 // X1 = unpackhi_epi8(X8, X12) + VPUNPCKLBW X13, X9, X2 // X2 = unpacklo_epi8(X9, X13) + VPUNPCKHBW X13, X9, X3 // X3 = unpackhi_epi8(X9, X13) + VPUNPCKLBW X14, X10, X4 // X4 = unpacklo_epi8(X10, X14) + VPUNPCKHBW X14, X10, X5 // X5 = unpackhi_epi8(X10, X14) + VPUNPCKLBW X15, X11, X6 // X6 = unpacklo_epi8(X11, X15) + VPUNPCKHBW X15, X11, X7 // X7 = unpackhi_epi8(X11, X15) + + // Stage 3: Third level of byte interleaving + VPUNPCKLBW X4, X0, X8 // X8 = unpacklo_epi8(X0, X4) + VPUNPCKHBW X4, X0, X9 // X9 = unpackhi_epi8(X0, X4) + VPUNPCKLBW X5, X1, X10 // X10 = unpacklo_epi8(X1, X5) + VPUNPCKHBW X5, X1, X11 // X11 = unpackhi_epi8(X1, X5) + VPUNPCKLBW X6, X2, X12 // X12 = unpacklo_epi8(X2, X6) + VPUNPCKHBW X6, X2, X13 // X13 = unpackhi_epi8(X2, X6) + VPUNPCKLBW X7, X3, X14 // X14 = unpacklo_epi8(X3, X7) + VPUNPCKHBW X7, X3, X15 // X15 = unpackhi_epi8(X3, X7) + + // Store results: out[(i * 8 + j) * 16] = result[j] + // Calculate output 
base: i * 128 + MOVQ SI, AX + SHLQ $7, AX // AX = i * 128 + + VMOVDQU X8, (DI)(AX*1) // Store at offset 0 + VMOVDQU X9, 16(DI)(AX*1) // Store at offset 16 + VMOVDQU X10, 32(DI)(AX*1) // Store at offset 32 + VMOVDQU X11, 48(DI)(AX*1) // Store at offset 48 + VMOVDQU X12, 64(DI)(AX*1) // Store at offset 64 + VMOVDQU X13, 80(DI)(AX*1) // Store at offset 80 + VMOVDQU X14, 96(DI)(AX*1) // Store at offset 96 + VMOVDQU X15, 112(DI)(AX*1) // Store at offset 112 + + INCQ SI + CMPQ SI, R8 + JL block_loop_w8_avx2 + +done_w8_avx2: + VZEROUPPER + RET diff --git a/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.s b/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.s index cbee47b3..838d8d86 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.s +++ b/parquet/internal/encoding/byte_stream_split_decode_neon_arm64.s @@ -115,3 +115,175 @@ block_loop_neon: done_neon: RET + +// func _decodeByteStreamSplitWidth8NEON(data, out unsafe.Pointer, nValues, stride int) +// +// NEON implementation for width=8 following AVX2 algorithm structure: +// - Processes suffix FIRST, then vectorized blocks +// - Uses 128-bit NEON vectors (16 bytes per register) +// - Processes 16 float64 values per block (128 bytes total) +// - Uses 3-stage byte unpacking hierarchy (for 8 streams) +TEXT ·_decodeByteStreamSplitWidth8NEON(SB), NOSPLIT, $0-32 + MOVD data+0(FP), R0 // R0 = data pointer + MOVD out+8(FP), R1 // R1 = out pointer + MOVD nValues+16(FP), R2 // R2 = nValues + MOVD stride+24(FP), R3 // R3 = stride + + // Setup 8 stream pointers + MOVD R0, R4 // R4 = stream 0 + ADD R3, R0, R5 // R5 = stream 1 = data + stride + ADD R3, R5, R6 // R6 = stream 2 = data + 2*stride + ADD R3, R6, R7 // R7 = stream 3 = data + 3*stride + LSL $2, R3, R8 // R8 = 4*stride + ADD R0, R8, R9 // R9 = stream 4 = data + 4*stride + ADD R5, R8, R10 // R10 = stream 5 = data + 5*stride + ADD R6, R8, R11 // R11 = stream 6 = data + 6*stride + ADD R7, R8, R12 // R12 = stream 7 = data + 7*stride 
+ + // Calculate num_blocks = nValues / 16 + LSR $4, R2, R13 // R13 = num_blocks (divide by 16) + + // Calculate num_processed_elements = num_blocks * 16 + LSL $4, R13, R14 // R14 = num_processed_elements + + // First handle suffix (elements beyond complete blocks) + MOVD R14, R15 // R15 = i = num_processed_elements + B suffix_check_w8_neon + +suffix_loop_w8_neon: + // Calculate output offset: i * 8 + LSL $3, R15, R16 // R16 = i * 8 + ADD R1, R16, R17 // R17 = out + (i * 8) + + // Load and store bytes from all 8 streams + MOVBU (R4)(R15), R19 + MOVB R19, (R17) + + MOVBU (R5)(R15), R19 + MOVB R19, 1(R17) + + MOVBU (R6)(R15), R19 + MOVB R19, 2(R17) + + MOVBU (R7)(R15), R19 + MOVB R19, 3(R17) + + MOVBU (R9)(R15), R19 + MOVB R19, 4(R17) + + MOVBU (R10)(R15), R19 + MOVB R19, 5(R17) + + MOVBU (R11)(R15), R19 + MOVB R19, 6(R17) + + MOVBU (R12)(R15), R19 + MOVB R19, 7(R17) + + ADD $1, R15, R15 + +suffix_check_w8_neon: + CMP R2, R15 + BLT suffix_loop_w8_neon + + // Check if we have blocks to process + CBZ R14, done_w8_neon // if num_processed_elements == 0, done + + // Process blocks with NEON + MOVD $0, R15 // R15 = block index i = 0 + LSR $4, R14, R14 // R14 = num_blocks + +block_loop_w8_neon: + // Calculate offset for this block: i * 16 + LSL $4, R15, R16 // R16 = i * 16 + + // Load 16 bytes from each stream - REVERSED for little-endian! 
+ // V0 = stream 7 (MSB), V7 = stream 0 (LSB) + ADD R12, R16, R17 + VLD1 (R17), [V0.B16] // V0 = stream 7 (MSB in little-endian) + + ADD R11, R16, R17 + VLD1 (R17), [V1.B16] // V1 = stream 6 + + ADD R10, R16, R17 + VLD1 (R17), [V2.B16] // V2 = stream 5 + + ADD R9, R16, R17 + VLD1 (R17), [V3.B16] // V3 = stream 4 + + ADD R7, R16, R17 + VLD1 (R17), [V4.B16] // V4 = stream 3 + + ADD R6, R16, R17 + VLD1 (R17), [V5.B16] // V5 = stream 2 + + ADD R5, R16, R17 + VLD1 (R17), [V6.B16] // V6 = stream 1 + + ADD R4, R16, R17 + VLD1 (R17), [V7.B16] // V7 = stream 0 (LSB in little-endian) + + // Stage 1: First level of byte interleaving (pairs 0-4, 1-5, 2-6, 3-7) + VZIP1 V0.B16, V4.B16, V8.B16 // Interleave streams 3,7 low + VZIP2 V0.B16, V4.B16, V9.B16 // Interleave streams 3,7 high + VZIP1 V1.B16, V5.B16, V10.B16 // Interleave streams 2,6 low + VZIP2 V1.B16, V5.B16, V11.B16 // Interleave streams 2,6 high + VZIP1 V2.B16, V6.B16, V12.B16 // Interleave streams 1,5 low + VZIP2 V2.B16, V6.B16, V13.B16 // Interleave streams 1,5 high + VZIP1 V3.B16, V7.B16, V14.B16 // Interleave streams 0,4 low + VZIP2 V3.B16, V7.B16, V15.B16 // Interleave streams 0,4 high + + // Stage 2: Second level of byte interleaving + VZIP1 V8.B16, V12.B16, V0.B16 + VZIP2 V8.B16, V12.B16, V1.B16 + VZIP1 V9.B16, V13.B16, V2.B16 + VZIP2 V9.B16, V13.B16, V3.B16 + VZIP1 V10.B16, V14.B16, V4.B16 + VZIP2 V10.B16, V14.B16, V5.B16 + VZIP1 V11.B16, V15.B16, V6.B16 + VZIP2 V11.B16, V15.B16, V7.B16 + + // Stage 3: Third level of byte interleaving + VZIP1 V0.B16, V4.B16, V8.B16 + VZIP2 V0.B16, V4.B16, V9.B16 + VZIP1 V1.B16, V5.B16, V10.B16 + VZIP2 V1.B16, V5.B16, V11.B16 + VZIP1 V2.B16, V6.B16, V12.B16 + VZIP2 V2.B16, V6.B16, V13.B16 + VZIP1 V3.B16, V7.B16, V14.B16 + VZIP2 V3.B16, V7.B16, V15.B16 + + // Store results: out[(i * 8 + j) * 16] = result[j] + // Calculate output base: i * 128 + LSL $7, R15, R16 // R16 = i * 128 + + ADD R1, R16, R17 + VST1 [V8.B16], (R17) // Store at offset 0 + + ADD $16, R17 + VST1 [V9.B16], 
(R17) // Store at offset 16 + + ADD $16, R17 + VST1 [V10.B16], (R17) // Store at offset 32 + + ADD $16, R17 + VST1 [V11.B16], (R17) // Store at offset 48 + + ADD $16, R17 + VST1 [V12.B16], (R17) // Store at offset 64 + + ADD $16, R17 + VST1 [V13.B16], (R17) // Store at offset 80 + + ADD $16, R17 + VST1 [V14.B16], (R17) // Store at offset 96 + + ADD $16, R17 + VST1 [V15.B16], (R17) // Store at offset 112 + + ADD $1, R15, R15 + CMP R14, R15 + BLT block_loop_w8_neon + +done_w8_neon: + RET diff --git a/parquet/internal/encoding/byte_stream_split_decode_test.go b/parquet/internal/encoding/byte_stream_split_decode_test.go index 43c43261..68436baf 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_test.go +++ b/parquet/internal/encoding/byte_stream_split_decode_test.go @@ -22,10 +22,8 @@ import ( "testing" "github.com/apache/arrow-go/v18/parquet" - "golang.org/x/sys/cpu" ) -// TestDecodeByteStreamSplitWidth4 validates correctness of all width-4 decoding implementations func TestDecodeByteStreamSplitWidth4(t *testing.T) { const width = 4 @@ -33,118 +31,147 @@ func TestDecodeByteStreamSplitWidth4(t *testing.T) { sizes := []int{1, 2, 7, 8, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 512, 1024} for _, nValues := range sizes { - t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { - // Setup encoded data (byte stream split format) - stride := nValues - data := make([]byte, width*nValues) + stride := nValues + data := make([]byte, width*nValues) - // Initialize with predictable pattern - for i := 0; i < nValues; i++ { - data[i] = byte(i % 256) // stream 0 - data[stride+i] = byte((i + 1) % 256) // stream 1 - data[2*stride+i] = byte((i + 2) % 256) // stream 2 - data[3*stride+i] = byte((i + 3) % 256) // stream 3 - } + // Initialize with predictable pattern + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) // stream 0 + data[stride+i] = byte((i + 1) % 256) // stream 1 + data[2*stride+i] = byte((i + 2) % 256) // stream 2 + data[3*stride+i] = 
byte((i + 3) % 256) // stream 3 + } - // Expected output: interleaved bytes - expected := make([]byte, width*nValues) - for i := 0; i < nValues; i++ { - expected[i*4] = byte(i % 256) - expected[i*4+1] = byte((i + 1) % 256) - expected[i*4+2] = byte((i + 2) % 256) - expected[i*4+3] = byte((i + 3) % 256) - } + // Expected output: interleaved bytes + expected := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + expected[i*4] = byte(i % 256) + expected[i*4+1] = byte((i + 1) % 256) + expected[i*4+2] = byte((i + 2) % 256) + expected[i*4+3] = byte((i + 3) % 256) + } - // Test reference implementation - t.Run("Reference", func(t *testing.T) { - out := make([]byte, width*nValues) - decodeByteStreamSplitBatchWidth4(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("Reference implementation produced incorrect output") - for i := 0; i < len(expected) && i < 64; i++ { - if out[i] != expected[i] { - t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) - break - } - } - } - }) - - // Test V2 implementation - t.Run("V2", func(t *testing.T) { - out := make([]byte, width*nValues) - decodeByteStreamSplitBatchWidth4V2(data, nValues, stride, out) - if !bytes.Equal(out, expected) { - t.Errorf("V2 implementation produced incorrect output") - for i := 0; i < len(expected) && i < 64; i++ { - if out[i] != expected[i] { - t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) - break - } + out := make([]byte, width*nValues) + t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { + decodeByteStreamSplitBatchWidth4(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + for i := 0; i < len(expected); i++ { + if out[i] != expected[i] { + t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) + break } } - }) + } }) } } -// BenchmarkDecodeByteStreamSplitBatchWidth4 benchmarks all width-4 decoding implementations with sub-benchmarks. 
func BenchmarkDecodeByteStreamSplitBatchWidth4(b *testing.B) { const width = 4 - sizes := []int{8, 64, 512, 4096, 32768, 262144} + sizes := []int{8, 64, 512, 4096, 32768, 2097152, 16777216} for _, nValues := range sizes { + stride := nValues + data := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) + data[stride+i] = byte((i + 1) % 256) + data[2*stride+i] = byte((i + 2) % 256) + data[3*stride+i] = byte((i + 3) % 256) + } + out := make([]byte, width*nValues) + b.SetBytes(int64(width * nValues)) + b.Run(fmt.Sprintf("nValues=%d", nValues), func(b *testing.B) { - stride := nValues - data := make([]byte, width*nValues) - for i := 0; i < nValues; i++ { - data[i] = byte(i % 256) - data[stride+i] = byte((i + 1) % 256) - data[2*stride+i] = byte((i + 2) % 256) - data[3*stride+i] = byte((i + 3) % 256) + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchWidth4(data, nValues, stride, out) } - out := make([]byte, width*nValues) + }) + } +} - b.SetBytes(int64(width * nValues)) +func TestDecodeByteStreamSplitWidth8(t *testing.T) { + const width = 8 - b.Run("Reference", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - decodeByteStreamSplitBatchWidth4(data, nValues, stride, out) - } - }) + // Test various sizes including edge cases + sizes := []int{1, 2, 7, 8, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 512, 1024} - b.Run("V2", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - decodeByteStreamSplitBatchWidth4V2(data, nValues, stride, out) - } - }) - - if decodeByteStreamSplitBatchWidth4SIMD != nil { - if cpu.X86.HasAVX2 { - b.Run("AVX2", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - } - }) - } + for _, nValues := range sizes { + // Setup encoded data (byte stream split format) + stride := nValues + data := make([]byte, width*nValues) - if cpu.ARM64.HasASIMD { - b.Run("NEON", func(b 
*testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - decodeByteStreamSplitBatchWidth4SIMD(data, nValues, stride, out) - } - }) + // Initialize with predictable pattern + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) // stream 0 + data[stride+i] = byte((i + 1) % 256) // stream 1 + data[2*stride+i] = byte((i + 2) % 256) // stream 2 + data[3*stride+i] = byte((i + 3) % 256) // stream 3 + data[4*stride+i] = byte((i + 4) % 256) // stream 4 + data[5*stride+i] = byte((i + 5) % 256) // stream 5 + data[6*stride+i] = byte((i + 6) % 256) // stream 6 + data[7*stride+i] = byte((i + 7) % 256) // stream 7 + } + + // Expected output: interleaved bytes + expected := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + expected[i*8] = byte(i % 256) + expected[i*8+1] = byte((i + 1) % 256) + expected[i*8+2] = byte((i + 2) % 256) + expected[i*8+3] = byte((i + 3) % 256) + expected[i*8+4] = byte((i + 4) % 256) + expected[i*8+5] = byte((i + 5) % 256) + expected[i*8+6] = byte((i + 6) % 256) + expected[i*8+7] = byte((i + 7) % 256) + } + + t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { + out := make([]byte, width*nValues) + decodeByteStreamSplitBatchWidth8(data, nValues, stride, out) + if !bytes.Equal(out, expected) { + t.Errorf("Reference implementation produced incorrect output") + for i := 0; i < len(expected); i++ { + if out[i] != expected[i] { + t.Errorf("First mismatch at index %d: got %d, want %d", i, out[i], expected[i]) + break + } } } }) } } -// TestDecodeByteStreamSplitFLBAWidth2 validates correctness of all FLBA width-2 decoding implementations +func BenchmarkDecodeByteStreamSplitBatchWidth8(b *testing.B) { + const width = 8 + sizes := []int{8, 64, 512, 4096, 32768, 2097152, 16777216} + + for _, nValues := range sizes { + stride := nValues + data := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) + data[stride+i] = byte((i + 1) % 256) + data[2*stride+i] = byte((i + 2) % 256) + data[3*stride+i] = 
byte((i + 3) % 256) + data[4*stride+i] = byte((i + 4) % 256) + data[5*stride+i] = byte((i + 5) % 256) + data[6*stride+i] = byte((i + 6) % 256) + data[7*stride+i] = byte((i + 7) % 256) + } + out := make([]byte, width*nValues) + b.SetBytes(int64(width * nValues)) + + b.Run(fmt.Sprintf("nValues=%d", nValues), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchWidth8(data, nValues, stride, out) + } + }) + } +} + func TestDecodeByteStreamSplitFLBAWidth2(t *testing.T) { const width = 2 @@ -152,91 +179,216 @@ func TestDecodeByteStreamSplitFLBAWidth2(t *testing.T) { sizes := []int{1, 2, 7, 8, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 512, 1024} for _, nValues := range sizes { - t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { - // Setup encoded data (byte stream split format) - stride := nValues - data := make([]byte, width*nValues) + // Setup encoded data (byte stream split format) + stride := nValues + data := make([]byte, width*nValues) - // Initialize with predictable pattern - for i := 0; i < nValues; i++ { - data[i] = byte(i % 256) // stream 0 - data[stride+i] = byte((i + 1) % 256) // stream 1 - } + // Initialize with predictable pattern + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) // stream 0 + data[stride+i] = byte((i + 1) % 256) // stream 1 + } + + // Expected output: FixedLenByteArray slices with interleaved bytes + expected := make([]parquet.FixedLenByteArray, nValues) + for i := 0; i < nValues; i++ { + expected[i] = make(parquet.FixedLenByteArray, width) + expected[i][0] = byte(i % 256) + expected[i][1] = byte((i + 1) % 256) + } - // Expected output: FixedLenByteArray slices with interleaved bytes - expected := make([]parquet.FixedLenByteArray, nValues) + t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { + out := make([]parquet.FixedLenByteArray, nValues) + for i := range out { + out[i] = make(parquet.FixedLenByteArray, width) + } + 
decodeByteStreamSplitBatchFLBAWidth2(data, nValues, stride, out) for i := 0; i < nValues; i++ { - expected[i] = make(parquet.FixedLenByteArray, width) - expected[i][0] = byte(i % 256) - expected[i][1] = byte((i + 1) % 256) + if !bytes.Equal(out[i], expected[i]) { + t.Errorf("Reference implementation mismatch at index %d: got %v, want %v", i, out[i], expected[i]) + break + } } + }) + } +} - // Test reference implementation - t.Run("Reference", func(t *testing.T) { - out := make([]parquet.FixedLenByteArray, nValues) - for i := range out { - out[i] = make(parquet.FixedLenByteArray, width) - } +func BenchmarkDecodeByteStreamSplitBatchFLBAWidth2(b *testing.B) { + const width = 2 + sizes := []int{8, 64, 512, 4096, 32768, 262144, 2097152, 16777216} + + for _, nValues := range sizes { + stride := nValues + data := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) + data[stride+i] = byte((i + 1) % 256) + } + out := make([]parquet.FixedLenByteArray, nValues) + for i := range out { + out[i] = make(parquet.FixedLenByteArray, width) + } + b.SetBytes(int64(width * nValues)) + + b.Run(fmt.Sprintf("nValues=%d", nValues), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { decodeByteStreamSplitBatchFLBAWidth2(data, nValues, stride, out) - for i := 0; i < nValues; i++ { - if !bytes.Equal(out[i], expected[i]) { - t.Errorf("Reference implementation mismatch at index %d: got %v, want %v", i, out[i], expected[i]) - break - } - } - }) + } + }) + } +} - // Test V2 implementation - t.Run("V2", func(t *testing.T) { - out := make([]parquet.FixedLenByteArray, nValues) - for i := range out { - out[i] = make(parquet.FixedLenByteArray, width) - } - decodeByteStreamSplitBatchFLBAWidth2V2(data, nValues, stride, out) - for i := 0; i < nValues; i++ { - if !bytes.Equal(out[i], expected[i]) { - t.Errorf("V2 implementation mismatch at index %d: got %v, want %v", i, out[i], expected[i]) - break - } +func TestDecodeByteStreamSplitFLBAWidth4(t 
*testing.T) { + const width = 4 + // Test various sizes including edge cases and block boundaries + sizes := []int{1, 2, 7, 8, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 512, 1024} + + for _, nValues := range sizes { + // Setup encoded data (byte stream split format) + stride := nValues + data := make([]byte, width*nValues) + + // Initialize with predictable pattern + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) // stream 0 + data[stride+i] = byte((i + 1) % 256) // stream 1 + data[stride*2+i] = byte((i + 2) % 256) // stream 2 + data[stride*3+i] = byte((i + 3) % 256) // stream 3 + } + + // Expected output: FixedLenByteArray slices with interleaved bytes + expected := make([]parquet.FixedLenByteArray, nValues) + for i := 0; i < nValues; i++ { + expected[i] = make(parquet.FixedLenByteArray, width) + expected[i][0] = byte(i % 256) + expected[i][1] = byte((i + 1) % 256) + expected[i][2] = byte((i + 2) % 256) + expected[i][3] = byte((i + 3) % 256) + } + + t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { + out := make([]parquet.FixedLenByteArray, nValues) + for i := range out { + out[i] = make(parquet.FixedLenByteArray, width) + } + decodeByteStreamSplitBatchFLBAWidth4(data, nValues, stride, out) + for i := 0; i < nValues; i++ { + if !bytes.Equal(out[i], expected[i]) { + t.Errorf("Reference implementation mismatch at index %d: got %v, want %v", i, out[i], expected[i]) + break } - }) + } }) } } -// BenchmarkDecodeByteStreamSplitBatchFLBAWidth2 benchmarks all FLBA width-2 decoding implementations. 
-func BenchmarkDecodeByteStreamSplitBatchFLBAWidth2(b *testing.B) { - const width = 2 - sizes := []int{8, 64, 512, 4096, 32768, 262144} +func BenchmarkDecodeByteStreamSplitBatchFLBAWidth4(b *testing.B) { + const width = 4 + sizes := []int{8, 64, 512, 4096, 32768, 262144, 2097152, 16777216} for _, nValues := range sizes { + stride := nValues + data := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) + data[stride+i] = byte((i + 1) % 256) + data[stride*2+i] = byte((i + 2) % 256) + data[stride*3+i] = byte((i + 3) % 256) + } + out := make([]parquet.FixedLenByteArray, nValues) + for i := range out { + out[i] = make(parquet.FixedLenByteArray, width) + } + b.SetBytes(int64(width * nValues)) + b.Run(fmt.Sprintf("nValues=%d", nValues), func(b *testing.B) { - stride := nValues - data := make([]byte, width*nValues) - for i := 0; i < nValues; i++ { - data[i] = byte(i % 256) - data[stride+i] = byte((i + 1) % 256) + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchFLBAWidth4(data, nValues, stride, out) } + }) + } +} + +func TestDecodeByteStreamSplitFLBAWidth8(t *testing.T) { + const width = 8 + // Test various sizes including edge cases and block boundaries + sizes := []int{1, 2, 7, 8, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 512, 1024} + + for _, nValues := range sizes { + // Setup encoded data (byte stream split format) + stride := nValues + data := make([]byte, width*nValues) + // Initialize with predictable pattern + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) // stream 0 + data[stride+i] = byte((i + 1) % 256) // stream 1 + data[stride*2+i] = byte((i + 2) % 256) // stream 2 + data[stride*3+i] = byte((i + 3) % 256) // stream 3 + data[stride*4+i] = byte((i + 4) % 256) // stream 4 + data[stride*5+i] = byte((i + 5) % 256) // stream 5 + data[stride*6+i] = byte((i + 6) % 256) // stream 6 + data[stride*7+i] = byte((i + 7) % 256) // stream 7 + } + // Expected output: FixedLenByteArray slices with 
interleaved bytes + expected := make([]parquet.FixedLenByteArray, nValues) + for i := 0; i < nValues; i++ { + expected[i] = make(parquet.FixedLenByteArray, width) + expected[i][0] = byte(i % 256) + expected[i][1] = byte((i + 1) % 256) + expected[i][2] = byte((i + 2) % 256) + expected[i][3] = byte((i + 3) % 256) + expected[i][4] = byte((i + 4) % 256) + expected[i][5] = byte((i + 5) % 256) + expected[i][6] = byte((i + 6) % 256) + expected[i][7] = byte((i + 7) % 256) + } + + t.Run(fmt.Sprintf("nValues=%d", nValues), func(t *testing.T) { out := make([]parquet.FixedLenByteArray, nValues) for i := range out { out[i] = make(parquet.FixedLenByteArray, width) } + decodeByteStreamSplitBatchFLBAWidth8(data, nValues, stride, out) + for i := 0; i < nValues; i++ { + if !bytes.Equal(out[i], expected[i]) { + t.Errorf("Reference implementation mismatch at index %d: got %v, want %v", i, out[i], expected[i]) + break + } + } + }) + } +} - b.SetBytes(int64(width * nValues)) +func BenchmarkDecodeByteStreamSplitBatchFLBAWidth8(b *testing.B) { + const width = 8 + sizes := []int{8, 64, 512, 4096, 32768, 262144, 2097152, 16777216} - b.Run("Reference", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - decodeByteStreamSplitBatchFLBAWidth2(data, nValues, stride, out) - } - }) + for _, nValues := range sizes { + stride := nValues + data := make([]byte, width*nValues) + for i := 0; i < nValues; i++ { + data[i] = byte(i % 256) + data[stride+i] = byte((i + 1) % 256) + data[stride*2+i] = byte((i + 2) % 256) + data[stride*3+i] = byte((i + 3) % 256) + data[stride*4+i] = byte((i + 4) % 256) + data[stride*5+i] = byte((i + 5) % 256) + data[stride*6+i] = byte((i + 6) % 256) + data[stride*7+i] = byte((i + 7) % 256) + } + out := make([]parquet.FixedLenByteArray, nValues) + for i := range out { + out[i] = make(parquet.FixedLenByteArray, width) + } + b.SetBytes(int64(width * nValues)) - b.Run("V2", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - 
decodeByteStreamSplitBatchFLBAWidth2V2(data, nValues, stride, out) - } - }) + b.Run(fmt.Sprintf("nValues=%d", nValues), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + decodeByteStreamSplitBatchFLBAWidth8(data, nValues, stride, out) + } }) } } From 1b651fa28ff644bbf001aa4faf625de2479d207c Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Mon, 9 Feb 2026 18:26:44 +0100 Subject: [PATCH 5/8] Update byte-stream split decoding --- .../encoding/byte_stream_split_decode.go | 74 ------------------- 1 file changed, 74 deletions(-) diff --git a/parquet/internal/encoding/byte_stream_split_decode.go b/parquet/internal/encoding/byte_stream_split_decode.go index 48ca2ca6..07d3936b 100644 --- a/parquet/internal/encoding/byte_stream_split_decode.go +++ b/parquet/internal/encoding/byte_stream_split_decode.go @@ -39,24 +39,10 @@ func decodeByteStreamSplitBatchWidth4Default(data []byte, nValues, stride int, o const width = 4 debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) - for element := 0; element < nValues; element++ { - out[width*element] = data[element] - out[width*element+1] = data[stride+element] - out[width*element+2] = data[2*stride+element] - out[width*element+3] = data[3*stride+element] - } -} - -func decodeByteStreamSplitBatchWidth4V2(data []byte, nValues, stride int, out []byte) { - const width = 4 - debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) - // Narrow slices to help the compiler eliminate bounds checks. 
s0 := data[:nValues] s1 := data[stride : stride+nValues] s2 := data[2*stride : 2*stride+nValues] s3 := data[3*stride : 3*stride+nValues] - out = out[:width*nValues] out32 := unsafe.Slice((*uint32)(unsafe.Pointer(&out[0])), nValues) for i := range nValues { @@ -71,23 +57,6 @@ func decodeByteStreamSplitBatchWidth8Default(data []byte, nValues, stride int, o const width = 8 debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) - for element := 0; element < nValues; element++ { - out[width*element] = data[element] - out[width*element+1] = data[stride+element] - out[width*element+2] = data[2*stride+element] - out[width*element+3] = data[3*stride+element] - out[width*element+4] = data[4*stride+element] - out[width*element+5] = data[5*stride+element] - out[width*element+6] = data[6*stride+element] - out[width*element+7] = data[7*stride+element] - } -} - -func decodeByteStreamSplitBatchWidth8V2(data []byte, nValues, stride int, out []byte) { - const width = 8 - debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) - // Narrow slices to help the compiler eliminate bounds checks. 
s0 := data[:nValues] s1 := data[stride : stride+nValues] s2 := data[2*stride : 2*stride+nValues] @@ -96,7 +65,6 @@ func decodeByteStreamSplitBatchWidth8V2(data []byte, nValues, stride int, out [] s5 := data[5*stride : 5*stride+nValues] s6 := data[6*stride : 6*stride+nValues] s7 := data[7*stride : 7*stride+nValues] - out = out[:width*nValues] out64 := unsafe.Slice((*uint64)(unsafe.Pointer(&out[0])), nValues) for i := range nValues { @@ -123,16 +91,6 @@ func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out // into the output slice 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least nValues slices. func decodeByteStreamSplitBatchFLBAWidth2Default(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { - const width = 2 - debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - debug.Assert(len(data) >= 2*width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) - for element := 0; element < nValues; element++ { - out[element][0] = data[element] - out[element][1] = data[stride+element] - } -} - -func decodeByteStreamSplitBatchFLBAWidth2V2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { const width = 2 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) @@ -151,24 +109,10 @@ func decodeByteStreamSplitBatchFLBAWidth4Default(data []byte, nValues, stride in const width = 4 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough 
data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) - for element := 0; element < nValues; element++ { - out[element][0] = data[element] - out[element][1] = data[stride+element] - out[element][2] = data[stride*2+element] - out[element][3] = data[stride*3+element] - } -} - -func decodeByteStreamSplitBatchFLBAWidth4V2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { - const width = 4 - debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) - // Narrow slices to help the compiler eliminate bounds checks. s0 := data[:nValues] s1 := data[stride : stride+nValues] s2 := data[stride*2 : stride*2+nValues] s3 := data[stride*3 : stride*3+nValues] - for i := range nValues { out32 := (*uint32)(unsafe.Pointer(&out[i][0])) *out32 = uint32(s0[i]) | uint32(s1[i])<<8 | uint32(s2[i])<<16 | uint32(s3[i])<<24 @@ -182,23 +126,6 @@ func decodeByteStreamSplitBatchFLBAWidth8Default(data []byte, nValues, stride in const width = 8 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) - for element := 0; element < nValues; element++ { - out[element][0] = data[element] - out[element][1] = data[stride+element] - out[element][2] = data[stride*2+element] - out[element][3] = data[stride*3+element] - out[element][4] = data[stride*4+element] - out[element][5] = data[stride*5+element] - out[element][6] = data[stride*6+element] - out[element][7] = data[stride*7+element] - } -} - -func decodeByteStreamSplitBatchFLBAWidth8V2(data []byte, nValues, stride 
int, out []parquet.FixedLenByteArray) { - const width = 8 - debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) - // Narrow slices to help the compiler eliminate bounds checks. s0 := data[:nValues] s1 := data[stride : stride+nValues] s2 := data[stride*2 : stride*2+nValues] @@ -207,7 +134,6 @@ func decodeByteStreamSplitBatchFLBAWidth8V2(data []byte, nValues, stride int, ou s5 := data[stride*5 : stride*5+nValues] s6 := data[stride*6 : stride*6+nValues] s7 := data[stride*7 : stride*7+nValues] - for i := range nValues { out64 := (*uint64)(unsafe.Pointer(&out[i][0])) *out64 = uint64(s0[i]) | uint64(s1[i])<<8 | uint64(s2[i])<<16 | uint64(s3[i])<<24 | From 779a11b9f27719f859578307f8a7308158d8c675 Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Mon, 9 Feb 2026 18:40:13 +0100 Subject: [PATCH 6/8] Use SIMD implementations if available --- .../internal/encoding/byte_stream_split_amd64.go | 2 ++ .../internal/encoding/byte_stream_split_arm64.go | 2 ++ .../internal/encoding/byte_stream_split_decode.go | 13 +++++-------- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/parquet/internal/encoding/byte_stream_split_amd64.go b/parquet/internal/encoding/byte_stream_split_amd64.go index 9e0276cb..3c65241e 100644 --- a/parquet/internal/encoding/byte_stream_split_amd64.go +++ b/parquet/internal/encoding/byte_stream_split_amd64.go @@ -28,6 +28,8 @@ import ( func init() { if cpu.X86.HasAVX2 { + decodeByteStreamSplitBatchWidth4 = decodeByteStreamSplitBatchWidth4AVX2 + decodeByteStreamSplitBatchWidth8 = decodeByteStreamSplitBatchWidth8AVX2 } } diff --git a/parquet/internal/encoding/byte_stream_split_arm64.go b/parquet/internal/encoding/byte_stream_split_arm64.go index 25ed23cf..e6a88d20 100644 --- 
a/parquet/internal/encoding/byte_stream_split_arm64.go +++ b/parquet/internal/encoding/byte_stream_split_arm64.go @@ -28,6 +28,8 @@ import ( func init() { if cpu.ARM64.HasASIMD { + decodeByteStreamSplitBatchWidth4 = decodeByteStreamSplitBatchWidth4NEON + decodeByteStreamSplitBatchWidth8 = decodeByteStreamSplitBatchWidth8NEON } } diff --git a/parquet/internal/encoding/byte_stream_split_decode.go b/parquet/internal/encoding/byte_stream_split_decode.go index 07d3936b..d1c4b033 100644 --- a/parquet/internal/encoding/byte_stream_split_decode.go +++ b/parquet/internal/encoding/byte_stream_split_decode.go @@ -25,11 +25,8 @@ import ( ) var ( - decodeByteStreamSplitBatchWidth4 func(data []byte, nValues, stride int, out []byte) = decodeByteStreamSplitBatchWidth4Default - decodeByteStreamSplitBatchWidth8 func(data []byte, nValues, stride int, out []byte) = decodeByteStreamSplitBatchWidth8Default - decodeByteStreamSplitBatchFLBAWidth2 func(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) = decodeByteStreamSplitBatchFLBAWidth2Default - decodeByteStreamSplitBatchFLBAWidth4 func(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) = decodeByteStreamSplitBatchFLBAWidth4Default - decodeByteStreamSplitBatchFLBAWidth8 func(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) = decodeByteStreamSplitBatchFLBAWidth8Default + decodeByteStreamSplitBatchWidth4 func(data []byte, nValues, stride int, out []byte) = decodeByteStreamSplitBatchWidth4Default + decodeByteStreamSplitBatchWidth8 func(data []byte, nValues, stride int, out []byte) = decodeByteStreamSplitBatchWidth8Default ) // decodeByteStreamSplitBatchWidth4 decodes the batch of nValues raw bytes representing a 4-byte datatype provided by 'data', @@ -90,7 +87,7 @@ func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out // decodeByteStreamSplitBatchFLBAWidth2 decodes the batch of nValues FixedLenByteArrays of length 2 provided by 'data', // into the output slice 
'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least nValues slices. -func decodeByteStreamSplitBatchFLBAWidth2Default(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { +func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { const width = 2 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) @@ -105,7 +102,7 @@ func decodeByteStreamSplitBatchFLBAWidth2Default(data []byte, nValues, stride in // decodeByteStreamSplitBatchFLBAWidth4 decodes the batch of nValues FixedLenByteArrays of length 4 provided by 'data', // into the output slice 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least nValues slices. -func decodeByteStreamSplitBatchFLBAWidth4Default(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { +func decodeByteStreamSplitBatchFLBAWidth4(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { const width = 4 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) @@ -122,7 +119,7 @@ func decodeByteStreamSplitBatchFLBAWidth4Default(data []byte, nValues, stride in // decodeByteStreamSplitBatchFLBAWidth8 decodes the batch of nValues FixedLenByteArrays of length 8 provided by 'data', // into the output slice 'out' using BYTE_STREAM_SPLIT encoding. // 'out' must have space for at least nValues slices. 
-func decodeByteStreamSplitBatchFLBAWidth8Default(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { +func decodeByteStreamSplitBatchFLBAWidth8(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { const width = 8 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) From 1f8cc24d1087578f8decce788552d462dded06e2 Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Tue, 10 Feb 2026 11:08:18 +0100 Subject: [PATCH 7/8] Update suffix loop in _decodeByteStreamSplitWidth4AVX2 and _decodeByteStreamSplitWidth8AVX2 --- .../byte_stream_split_decode_avx2_amd64.s | 92 ++++++++++--------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s b/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s index 63473326..cde6a473 100644 --- a/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s +++ b/parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s @@ -35,23 +35,20 @@ TEXT ·_decodeByteStreamSplitWidth4AVX2(SB), NOSPLIT, $0-32 JMP suffix_check_avx2 suffix_loop_avx2: - // Gather bytes: gathered_byte_data[b] = data[b * stride + i] - MOVBQZX (R9)(R14*1), BX // byte from stream 0 - MOVBQZX (R10)(R14*1), R15 // byte from stream 1 - - // Calculate output offset: i * 4 + MOVBQZX (R9)(R14*1), BX // s0 + MOVBQZX (R10)(R14*1), R15 // s1 + SHLQ $8, R15 + ORQ R15, BX + MOVBQZX (R11)(R14*1), R15 // s2 + SHLQ $16, R15 + ORQ R15, BX + MOVBQZX (R12)(R14*1), R15 // s3 + SHLQ $24, R15 + ORQ R15, BX MOVQ R14, AX - SHLQ $2, AX // AX = i * 4 - - // Store gathered bytes - MOVB BX, (DI)(AX*1) - MOVB R15, 1(DI)(AX*1) - - MOVBQZX (R11)(R14*1), BX // byte from stream 2 - MOVBQZX (R12)(R14*1), R15 // byte from stream 3 - MOVB BX, 
2(DI)(AX*1) - MOVB R15, 3(DI)(AX*1) + SHLQ $2, AX // AX = i*4 + MOVL BX, (DI)(AX*1) // ← single 32-bit store INCQ R14 @@ -168,34 +165,47 @@ TEXT ·_decodeByteStreamSplitWidth8AVX2(SB), NOSPLIT, $0-32 JMP suffix_check_w8_avx2 suffix_loop_w8_avx2: - // Calculate output offset: i * 8 + // Load first byte (stream 0) and start accumulator + MOVBQZX (R9)(SI*1), DX // DX = s0[i] (lowest byte) + + // stream 1 << 8 + MOVBQZX (R10)(SI*1), AX + SHLQ $8, AX + ORQ AX, DX + + // stream 2 << 16 + MOVBQZX (R11)(SI*1), AX + SHLQ $16, AX + ORQ AX, DX + + // stream 3 << 24 + MOVBQZX (R12)(SI*1), AX + SHLQ $24, AX + ORQ AX, DX + + // stream 4 << 32 + MOVBQZX (R13)(SI*1), AX + SHLQ $32, AX + ORQ AX, DX + + // stream 5 << 40 + MOVBQZX (R14)(SI*1), AX + SHLQ $40, AX + ORQ AX, DX + + // stream 6 << 48 + MOVBQZX (R15)(SI*1), AX + SHLQ $48, AX + ORQ AX, DX + + // stream 7 << 56 + MOVBQZX (BX)(SI*1), AX + SHLQ $56, AX + ORQ AX, DX + MOVQ SI, AX SHLQ $3, AX // AX = i * 8 - - // Load and store bytes from all 8 streams - MOVBQZX (R9)(SI*1), DX - MOVB DX, (DI)(AX*1) - - MOVBQZX (R10)(SI*1), DX - MOVB DX, 1(DI)(AX*1) - - MOVBQZX (R11)(SI*1), DX - MOVB DX, 2(DI)(AX*1) - - MOVBQZX (R12)(SI*1), DX - MOVB DX, 3(DI)(AX*1) - - MOVBQZX (R13)(SI*1), DX - MOVB DX, 4(DI)(AX*1) - - MOVBQZX (R14)(SI*1), DX - MOVB DX, 5(DI)(AX*1) - - MOVBQZX (R15)(SI*1), DX - MOVB DX, 6(DI)(AX*1) - - MOVBQZX (BX)(SI*1), DX - MOVB DX, 7(DI)(AX*1) + MOVQ DX, (DI)(AX*1) INCQ SI From 0dc7aa6b88c83674a09f41e98a77438e024d5aab Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Tue, 10 Feb 2026 15:17:46 +0100 Subject: [PATCH 8/8] Fix assertions in byte-stream split decoding functions --- .../internal/encoding/byte_stream_split_amd64.go | 4 ++-- .../internal/encoding/byte_stream_split_arm64.go | 4 ++-- .../internal/encoding/byte_stream_split_decode.go | 13 +++++++------ 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/parquet/internal/encoding/byte_stream_split_amd64.go 
b/parquet/internal/encoding/byte_stream_split_amd64.go index 3c65241e..9e029df8 100644 --- a/parquet/internal/encoding/byte_stream_split_amd64.go +++ b/parquet/internal/encoding/byte_stream_split_amd64.go @@ -45,7 +45,7 @@ func decodeByteStreamSplitBatchWidth4AVX2(data []byte, nValues, stride int, out } const width = 4 debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + debug.Assert(len(data) >= 3*stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), 3*stride+nValues)) _decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) } @@ -55,6 +55,6 @@ func decodeByteStreamSplitBatchWidth8AVX2(data []byte, nValues, stride int, out } const width = 8 debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + debug.Assert(len(data) >= 7*stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), 7*stride+nValues)) _decodeByteStreamSplitWidth8AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) } diff --git a/parquet/internal/encoding/byte_stream_split_arm64.go b/parquet/internal/encoding/byte_stream_split_arm64.go index e6a88d20..df4a841f 100644 --- a/parquet/internal/encoding/byte_stream_split_arm64.go +++ b/parquet/internal/encoding/byte_stream_split_arm64.go @@ -45,7 +45,7 @@ func decodeByteStreamSplitBatchWidth4NEON(data []byte, nValues, stride int, out } const width = 4 
debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + debug.Assert(len(data) >= 3*stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), 3*stride+nValues)) _decodeByteStreamSplitWidth4NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) } @@ -55,6 +55,6 @@ func decodeByteStreamSplitBatchWidth8NEON(data []byte, nValues, stride int, out } const width = 8 debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + debug.Assert(len(data) >= 7*stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), 7*stride+nValues)) _decodeByteStreamSplitWidth8NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride) } diff --git a/parquet/internal/encoding/byte_stream_split_decode.go b/parquet/internal/encoding/byte_stream_split_decode.go index d1c4b033..7c3c910c 100644 --- a/parquet/internal/encoding/byte_stream_split_decode.go +++ b/parquet/internal/encoding/byte_stream_split_decode.go @@ -35,7 +35,8 @@ var ( func decodeByteStreamSplitBatchWidth4Default(data []byte, nValues, stride int, out []byte) { const width = 4 debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + 
// the end of the data slice can be truncated, but for valid encoding we need at least (width-1)*stride+nValues bytes + debug.Assert(len(data) >= 3*stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), 3*stride+nValues)) s0 := data[:nValues] s1 := data[stride : stride+nValues] s2 := data[2*stride : 2*stride+nValues] @@ -53,7 +54,7 @@ func decodeByteStreamSplitBatchWidth4Default(data []byte, nValues, stride int, o func decodeByteStreamSplitBatchWidth8Default(data []byte, nValues, stride int, out []byte) { const width = 8 debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data))) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + debug.Assert(len(data) >= 7*stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), 7*stride+nValues)) s0 := data[:nValues] s1 := data[stride : stride+nValues] s2 := data[2*stride : 2*stride+nValues] @@ -75,7 +76,7 @@ func decodeByteStreamSplitBatchWidth8Default(data []byte, nValues, stride int, o // 'out' must have space for at least nValues slices.
func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out []parquet.FixedLenByteArray) { debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + debug.Assert(len(data) >= (width-1)*stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), (width-1)*stride+nValues)) for stream := 0; stream < width; stream++ { for element := 0; element < nValues; element++ { encLoc := stride*stream + element @@ -90,7 +91,7 @@ func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { const width = 2 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + debug.Assert(len(data) >= stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), stride+nValues)) s0 := data[:nValues] s1 := data[stride : stride+nValues] for i := range nValues { @@ -105,7 +106,7 @@ func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out func decodeByteStreamSplitBatchFLBAWidth4(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { const width = 4 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d 
bytes", len(data), width*stride)) + debug.Assert(len(data) >= 3*stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), 3*stride+nValues)) s0 := data[:nValues] s1 := data[stride : stride+nValues] s2 := data[stride*2 : stride*2+nValues] @@ -122,7 +123,7 @@ func decodeByteStreamSplitBatchFLBAWidth4(data []byte, nValues, stride int, out func decodeByteStreamSplitBatchFLBAWidth8(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) { const width = 8 debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues)) - debug.Assert(len(data) >= width*stride, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), width*stride)) + debug.Assert(len(data) >= 7*stride+nValues, fmt.Sprintf("not enough data for decoding, data: %d bytes, expected at least: %d bytes", len(data), 7*stride+nValues)) s0 := data[:nValues] s1 := data[stride : stride+nValues] s2 := data[stride*2 : stride*2+nValues]