diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/FlatSegmentDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/FlatSegmentDecoder.java index 3972942f..6fe9d8cd 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/FlatSegmentDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/FlatSegmentDecoder.java @@ -2,6 +2,7 @@ import static io.github.dfa1.vortex.core.io.PTypeIO.LE_INT; +import io.github.dfa1.vortex.core.error.VortexException; import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.io.IoBounds; import io.github.dfa1.vortex.reader.array.Array; @@ -26,6 +27,13 @@ /// FlatBuffer parsing, buffer-offset arithmetic, and encoding-spec lookup. public final class FlatSegmentDecoder { + /// Hard cap on array-node recursion depth. The encoded array tree nests through child nodes + /// (validity, patches, run-ends, dictionary codes/values, …); a crafted or self-referential + /// FlatBuffer can drive [#convertArrayNode] into unbounded recursion and a [StackOverflowError] + /// — an `Error`, so it would bypass the [VortexException] + /// contract. 64 is well past any real encoding's nesting. + static final int MAX_ARRAY_TREE_DEPTH = 64; + private final ReadRegistry registry; /// Creates a decoder backed by the given registry. @@ -64,20 +72,25 @@ public Array decode(MemorySegment seg, List encodingSpecs, dataOffset += bufDesc.length(); } - ArrayNode rootNode = convertArrayNode(fbArray.root(), encodingSpecs); + ArrayNode rootNode = convertArrayNode(fbArray.root(), encodingSpecs, 0); var ctx = new DecodeContext(rootNode, dtype, rowCount, bufs, registry, arena); return registry.decode(ctx); } private static ArrayNode convertArrayNode( io.github.dfa1.vortex.core.fbs.FbsArrayNode fbs, - List encodingSpecs + List encodingSpecs, + int depth ) { + if (depth > MAX_ARRAY_TREE_DEPTH) { + throw new VortexException( + "array tree depth exceeds limit (" + MAX_ARRAY_TREE_DEPTH + ")"); + } String rawEncodingId = encodingSpecs.get(fbs.encoding()); ArrayNode[] children = new ArrayNode[fbs.childrenLength()]; for (int i = 0; i < children.length; i++) { - children[i] = convertArrayNode(fbs.children(i), encodingSpecs); + children[i] = convertArrayNode(fbs.children(i), encodingSpecs, depth + 1); } int[] bufferIndices = new int[fbs.buffersLength()]; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java b/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java index 9c0f053d..044f631d 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java @@ -34,6 +34,14 @@ final class PostscriptParser { /// real encoding's metadata footprint (the largest is FSST's symbol table at ~32 KiB). static final int MAX_LAYOUT_METADATA_BYTES = 4 * 1024 * 1024; + /// Hard cap on DType-tree recursion depth. A `DType` nests through Struct fields, List/ + /// FixedSizeList element types, and Extension storage types; like the layout tree, a crafted + /// or self-referential FlatBuffer can drive [#convertDType(io.github.dfa1.vortex.core.fbs.FbsDType, int)] + /// into unbounded recursion and a [StackOverflowError] — which, being an `Error`, would escape + /// the [VortexException] sanitization and leak the reader's memory-mapped Arena. 64 is well past + /// any real schema's nesting. + static final int MAX_DTYPE_DEPTH = 64; + private PostscriptParser() { } @@ -110,7 +118,7 @@ static ParsedFile parseBlobs(MemorySegment footerBuf, MemorySegment layoutBuf, M DType dtype = null; if (dtypeBuf != null && dtypeBuf.byteSize() > 0) { - dtype = convertDType(io.github.dfa1.vortex.core.fbs.FbsDType.getRootAsFbsDType(dtypeBuf)); + dtype = convertDType(io.github.dfa1.vortex.core.fbs.FbsDType.getRootAsFbsDType(dtypeBuf), 0); } return new ParsedFile(footer, dtype, layout); @@ -188,7 +196,11 @@ private static Layout convertLayout(io.github.dfa1.vortex.core.fbs.FbsLayout l, return new Layout(encodingId, l.rowCount(), metadata, List.copyOf(children), List.copyOf(segments)); } - private static DType convertDType(io.github.dfa1.vortex.core.fbs.FbsDType fbs) { + private static DType convertDType(io.github.dfa1.vortex.core.fbs.FbsDType fbs, int depth) { + if (depth > MAX_DTYPE_DEPTH) { + throw new VortexException( + "DType tree depth exceeds limit (" + MAX_DTYPE_DEPTH + ")"); + } int typeType = fbs.typeType(); return switch (typeType) { case FbsType.FbsNull -> new DType.Null(true); @@ -224,21 +236,21 @@ private static DType convertDType(io.github.dfa1.vortex.core.fbs.FbsDType fbs) { names.add(s.names(i)); } for (int i = 0; i < s.dtypesLength(); i++) { - types.add(convertDType(s.dtypes(i))); + types.add(convertDType(s.dtypes(i), depth + 1)); } yield new DType.Struct(List.copyOf(names), List.copyOf(types), s.nullable()); } case FbsType.FbsList -> { var l = fbs.type(new io.github.dfa1.vortex.core.fbs.FbsList()); - yield new DType.List(convertDType(l.elementType()), l.nullable()); + yield new DType.List(convertDType(l.elementType(), depth + 1), l.nullable()); } case FbsType.FbsFixedSizeList -> { var fsl = fbs.type(new FbsFixedSizeList()); - yield new DType.FixedSizeList(convertDType(fsl.elementType()), (int) fsl.size(), fsl.nullable()); + yield new DType.FixedSizeList(convertDType(fsl.elementType(), depth + 1), (int) fsl.size(), fsl.nullable()); } case FbsType.FbsExtension -> { var e = fbs.type(new FbsExtension()); - DType storage = convertDType(e.storageDtype()); + DType storage = convertDType(e.storageDtype(), depth + 1); yield new DType.Extension( e.id(), storage, diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java index 4fdc05c4..15054899 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java @@ -124,6 +124,10 @@ public static VortexHttpReader open(URI uri, ReadRegistry registry, HttpClient c : null; var parsed = PostscriptParser.parseBlobs(footerBuf, layoutBuf, dtypeBuf); + // Reject footer segmentSpecs that fall outside the file before any rawSegment() builds a + // Range request from them. The local-file path runs this inside PostscriptParser.parse; + // the HTTP path calls parseBlobs directly and must validate here too. + PostscriptParser.validateSegmentSpecs(parsed.footer().segmentSpecs(), fileSize); return new VortexHttpReader( uri, client, fileSize, trailer.version(), @@ -155,11 +159,36 @@ private static TailFetch fetchTail(URI uri, HttpClient client) throws IOExceptio // Content-Range: bytes -/ String cr = resp.headers().firstValue("Content-Range") .orElseThrow(() -> new VortexException("206 response missing Content-Range from " + uri)); - String spec = cr.substring("bytes ".length()); // "-/" + return parseContentRange(cr, body, uri); + } + + if (status == 200) { + // Server returned full file (no Range support) + return new TailFetch(body, 0L, body.length); + } + + throw new VortexException("HTTP " + status + " fetching tail of " + uri); + } + + /// Parses a `bytes -/` Content-Range header from an untrusted server. + /// Any structural defect (missing `bytes ` prefix, missing `-`/`/`, non-numeric fields) + /// surfaces as a [VortexException] rather than a raw [NumberFormatException] or + /// [StringIndexOutOfBoundsException]. + private static TailFetch parseContentRange(String contentRange, byte[] body, URI uri) { + try { + String prefix = "bytes "; + if (!contentRange.startsWith(prefix)) { + throw new VortexException("malformed Content-Range '" + contentRange + "' from " + uri); + } + String spec = contentRange.substring(prefix.length()); // "-/" + int dash = spec.indexOf('-'); int slash = spec.indexOf('/'); + if (dash < 0 || slash < 0 || dash > slash) { + throw new VortexException("malformed Content-Range '" + contentRange + "' from " + uri); + } + long start = Long.parseLong(spec.substring(0, dash)); + long end = Long.parseLong(spec.substring(dash + 1, slash)); long total = Long.parseLong(spec.substring(slash + 1)); - long start = Long.parseLong(spec.substring(0, spec.indexOf('-'))); - long end = Long.parseLong(spec.substring(spec.indexOf('-') + 1, slash)); long expected = end - start + 1; if (body.length != expected) { throw new VortexException( @@ -167,14 +196,9 @@ private static TailFetch fetchTail(URI uri, HttpClient client) throws IOExceptio .formatted(uri, expected, body.length)); } return new TailFetch(body, start, total); + } catch (NumberFormatException e) { + throw new VortexException("malformed Content-Range '" + contentRange + "' from " + uri, e); } - - if (status == 200) { - // Server returned full file (no Range support) - return new TailFetch(body, 0L, body.length); - } - - throw new VortexException("HTTP " + status + " fetching tail of " + uri); } private static byte[] fetchRange(URI uri, long from, long to, HttpClient client) throws IOException { @@ -211,8 +235,8 @@ private static MemorySegment fetchBlob( URI uri, HttpClient client ) throws IOException { if (offset >= tailStart) { - int relOffset = (int) (offset - tailStart); - return MemorySegment.ofArray(tail).asSlice(relOffset, length); + long relOffset = offset - tailStart; + return IoBounds.slice(MemorySegment.ofArray(tail), relOffset, length); } byte[] bytes = fetchRange(uri, offset, offset + length - 1, client); return MemorySegment.ofArray(bytes); diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ZstdEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ZstdEncodingDecoder.java index 47a2cb60..007c42b5 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ZstdEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ZstdEncodingDecoder.java @@ -4,6 +4,7 @@ import io.github.dfa1.vortex.core.model.PType; import io.github.dfa1.vortex.core.error.VortexException; import io.github.dfa1.vortex.core.model.EncodingId; +import io.github.dfa1.vortex.core.io.IoBounds; import io.github.dfa1.vortex.core.io.PTypeIO; import io.github.dfa1.vortex.core.proto.ProtoZstdMetadata; import io.github.dfa1.vortex.reader.array.Array; @@ -63,7 +64,17 @@ public Array decode(DecodeContext ctx) { int frameCount = meta.frames().size(); long totalUncompressed = 0; for (int i = 0; i < frameCount; i++) { - totalUncompressed += meta.frames().get(i).uncompressed_size(); + // Validate each frame's declared size (rejects negative / >2 GB) and accumulate + // overflow-safely, so a crafted metadata cannot wrap the total to a small positive + // value and under-allocate, nor drive arena.allocate negative. The per-frame cap also + // guards the (int) narrowing at the asSlice call site in decompressFrames. + int frameSize = IoBounds.toIntSize(meta.frames().get(i).uncompressed_size()); + try { + totalUncompressed = Math.addExact(totalUncompressed, frameSize); + } catch (ArithmeticException e) { + throw new VortexException(EncodingId.VORTEX_ZSTD, + "total uncompressed size overflows", e); + } } MemorySegment decompressed = decompressFrames(ctx, meta, frameCount, totalUncompressed); @@ -112,7 +123,7 @@ private static VarBinArray buildScatteredVarBin( long scanPos = 0; for (long i = 0; i < rowCount; i++) { if (validity.getBoolean(i)) { - int len = validValues.get(PTypeIO.LE_INT, scanPos); + int len = readVarBinLen(validValues, scanPos); scanPos += 4L + len; totalDataBytes += len; } @@ -122,6 +133,9 @@ private static VarBinArray buildScatteredVarBin( MemorySegment offsets = ctx.arena().allocate((rowCount + 1) * 4L, 4); offsets.setAtIndex(PTypeIO.LE_INT, 0, 0); + // Second pass reads the same positions the first pass already bounds-checked via + // readVarBinLen, so a raw get/copy here cannot overrun; values is sized to the validated + // total. Keep both passes in lockstep — any edit to the cursor advance must stay identical. long readPos = 0; long dataPos = 0; for (long i = 0; i < rowCount; i++) { @@ -163,7 +177,7 @@ private static MemorySegment decompressFrames( long outOffset = 0; for (int i = 0; i < frameCount; i++) { MemorySegment src = asNative(ctx.buffer(frameBufferBase + i), scratch); - int uncompSize = (int) meta.frames().get(i).uncompressed_size(); + int uncompSize = IoBounds.toIntSize(meta.frames().get(i).uncompressed_size()); MemorySegment dst = out.asSlice(outOffset, uncompSize); long written = dictionary == null ? dctx.decompress(dst, src) @@ -236,11 +250,28 @@ private static Array buildPrimitive(DType.Primitive dt, long n, MemorySegment de }; } + /// Reads a 4-byte little-endian length prefix at `pos` from a decompressed VarBin payload and + /// validates that both the prefix and the `len` bytes that follow lie within `src`. Without this, + /// a crafted payload with a negative or oversized length would advance the cursor out of bounds + /// and surface as a raw [IndexOutOfBoundsException] instead of a + /// [io.github.dfa1.vortex.core.error.VortexException]. + /// + /// @param src the decompressed VarBin payload segment + /// @param pos byte offset of the length prefix within `src` + /// @return the validated element length in bytes + private static int readVarBinLen(MemorySegment src, long pos) { + IoBounds.checkRange(pos, 4, src.byteSize()); + int len = src.get(PTypeIO.LE_INT, pos); + // checkRange rejects len < 0 and a [pos+4, pos+4+len) range that overruns src. + IoBounds.checkRange(pos + 4L, len, src.byteSize()); + return len; + } + private static VarBinArray buildVarBin(DType dtype, long n, MemorySegment decompressed, DecodeContext ctx) { long totalDataBytes = 0; long pos = 0; for (long i = 0; i < n; i++) { - int len = decompressed.get(PTypeIO.LE_INT, pos); + int len = readVarBinLen(decompressed, pos); pos += 4 + len; totalDataBytes += len; } @@ -249,6 +280,9 @@ private static VarBinArray buildVarBin(DType dtype, long n, MemorySegment decomp MemorySegment offsets = ctx.arena().allocate((n + 1) * 4L, 4); offsets.setAtIndex(PTypeIO.LE_INT, 0, 0); + // Second pass reads the same positions the first pass already bounds-checked via + // readVarBinLen, so a raw get/copy here cannot overrun; values is sized to the validated + // total. Keep both passes in lockstep — any edit to the cursor advance must stay identical. pos = 0; long dataPos = 0; for (long i = 0; i < n; i++) { diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/ArrayNodeDepthBombSecurityTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/ArrayNodeDepthBombSecurityTest.java new file mode 100644 index 00000000..3a12f63a --- /dev/null +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/ArrayNodeDepthBombSecurityTest.java @@ -0,0 +1,66 @@ +package io.github.dfa1.vortex.reader; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.fbs.FbsArray; +import io.github.dfa1.vortex.core.fbs.FbsArrayNode; +import io.github.dfa1.vortex.core.fbs.FbsBuilder; +import io.github.dfa1.vortex.core.io.PTypeIO; +import io.github.dfa1.vortex.core.model.DType; +import org.junit.jupiter.api.Test; + +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/// Adversarial test for the encoded-array-tree recursion in +/// [FlatSegmentDecoder]'s `convertArrayNode`. +/// +/// The decoder walks the array node tree recursively (validity, patches, run-ends, dictionary +/// codes/values, …). Without the [FlatSegmentDecoder#MAX_ARRAY_TREE_DEPTH] cap a crafted segment +/// with thousands of nested children produces a [StackOverflowError] — an `Error` that escapes the +/// "malformed input must surface as [VortexException]" contract (ADR 0003). This pins that contract. +class ArrayNodeDepthBombSecurityTest { + + private static final DType DTYPE = DType.I32; + + private final FlatSegmentDecoder sut = new FlatSegmentDecoder(ReadRegistry.empty()); + + @Test + void deeplyNestedArrayTree_throwsVortexException() { + try (Arena arena = Arena.ofConfined()) { + // Given — a flat segment whose FbsArray root nests 65536 levels of single-child nodes. + // Real encodings nest only a handful of levels; 65536 reliably blows the JVM stack on + // the recursive convertArrayNode walk if the depth cap is removed. + byte[] fb = deeplyNestedArrayFlatBuffer(65536); + MemorySegment seg = arena.allocate(fb.length + 4L); + MemorySegment.copy(MemorySegment.ofArray(fb), 0, seg, 0, fb.length); + seg.set(PTypeIO.LE_INT, fb.length, fb.length); + + // When / Then — must surface as VortexException, not StackOverflowError + assertThatThrownBy(() -> sut.decode(seg, List.of("vortex.flat"), DTYPE, 1, arena)) + .isInstanceOf(VortexException.class); + } + } + + /// Builds a minimal `FbsArray` whose root node has `depth` levels of single-child nesting, + /// each level a buffer-less node referencing encoding index 0. + private static byte[] deeplyNestedArrayFlatBuffer(int depth) { + FbsBuilder b = new FbsBuilder(depth * 32); + // Leaf first; FlatBuffer requires children be finished before parents. + int emptyChildren = FbsArrayNode.createChildrenVector(b, new int[0]); + int emptyBuffers = FbsArrayNode.createBuffersVector(b, new int[0]); + int current = FbsArrayNode.createFbsArrayNode(b, 0, 0, emptyChildren, emptyBuffers, 0); + for (int i = 0; i < depth; i++) { + int childV = FbsArrayNode.createChildrenVector(b, new int[]{current}); + int bufV = FbsArrayNode.createBuffersVector(b, new int[0]); + current = FbsArrayNode.createFbsArrayNode(b, 0, 0, childV, bufV, 0); + } + FbsArray.startBuffersVector(b, 0); + int buffers = b.endVector(); + int array = FbsArray.createFbsArray(b, current, buffers); + FbsArray.finishFbsArrayBuffer(b, array); + return b.sizedByteArray(); + } +} diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/DTypeDepthBombSecurityTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/DTypeDepthBombSecurityTest.java new file mode 100644 index 00000000..b93f811f --- /dev/null +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/DTypeDepthBombSecurityTest.java @@ -0,0 +1,87 @@ +package io.github.dfa1.vortex.reader; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.io.VortexFormat; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Files; +import java.nio.file.Path; + +import static io.github.dfa1.vortex.reader.MalformedFiles.buildDeeplyNestedListDtype; +import static io.github.dfa1.vortex.reader.MalformedFiles.buildFlatLayout; +import static io.github.dfa1.vortex.reader.MalformedFiles.buildFooter; +import static io.github.dfa1.vortex.reader.MalformedFiles.buildPostscript; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/// Adversarial test for the DType-tree recursion in [PostscriptParser]'s `convertDType`. +/// +/// `convertDType` walks Struct fields, List/FixedSizeList element types, and Extension storage +/// types recursively. Without the [PostscriptParser#MAX_DTYPE_DEPTH] cap a crafted file with +/// thousands of nested `List` types produces a [StackOverflowError] during `VortexReader.open` — +/// an `Error` that escapes the [VortexException] sanitization and leaks the reader's memory-mapped +/// Arena (`open`'s `catch (Exception)` never runs `arena.close()`). This pins the contract: deeply +/// nested dtypes must be rejected as [VortexException], never a [StackOverflowError]. +class DTypeDepthBombSecurityTest { + + private static final ReadRegistry REGISTRY = ReadRegistry.empty(); + + @Test + void deeplyNestedDtype_throwsVortexException(@TempDir Path tmp) throws Exception { + // Given — a file whose DType nests 65536 levels of List. Real schemas nest a handful of + // levels; 65536 reliably blows the JVM stack on the recursive convertDType walk. + Path file = buildDeeplyNestedDtypeFile(tmp, 65536); + + // When / Then — must surface as VortexException, not StackOverflowError + assertThatThrownBy(() -> VortexReader.open(file, REGISTRY)) + .isInstanceOf(VortexException.class); + } + + private static Path buildDeeplyNestedDtypeFile(Path dir, int depth) throws Exception { + byte[] body = new byte[8]; // unused placeholder + ByteBuffer footerBuf = buildFooter( + new String[]{"vortex.primitive"}, + new String[]{"vortex.flat"}, + new long[]{0L}, + new long[]{(long) body.length}); + ByteBuffer dtypeBuf = buildDeeplyNestedListDtype(depth); + ByteBuffer layoutBuf = buildFlatLayout(0, 1L, 0); + + long footerOff = body.length; + long dtypeOff = footerOff + footerBuf.remaining(); + long layoutOff = dtypeOff + dtypeBuf.remaining(); + + ByteBuffer psBuf = buildPostscript( + footerOff, footerBuf.remaining(), + dtypeOff, dtypeBuf.remaining(), + layoutOff, layoutBuf.remaining()); + + int psLen = psBuf.remaining(); + ByteBuffer trailer = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN); + trailer.putShort((short) 1); + trailer.putShort((short) psLen); + trailer.put(VortexFormat.MAGIC.asByteBuffer()); + trailer.flip(); + + Path file = dir.resolve("deep_dtype.vtx"); + try (OutputStream out = Files.newOutputStream(file)) { + out.write(body); + writeBuf(out, footerBuf); + writeBuf(out, dtypeBuf); + writeBuf(out, layoutBuf); + writeBuf(out, psBuf); + out.write(trailer.array()); + } + return file; + } + + private static void writeBuf(OutputStream out, ByteBuffer buf) throws Exception { + buf = buf.duplicate(); + byte[] bytes = new byte[buf.remaining()]; + buf.get(bytes); + out.write(bytes); + } +} diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/HttpSegmentSpecBoundsSecurityTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/HttpSegmentSpecBoundsSecurityTest.java new file mode 100644 index 00000000..8b9d5352 --- /dev/null +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/HttpSegmentSpecBoundsSecurityTest.java @@ -0,0 +1,148 @@ +package io.github.dfa1.vortex.reader; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.io.VortexFormat; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.io.ByteArrayOutputStream; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpHeaders; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import javax.net.ssl.SSLSession; + +import static io.github.dfa1.vortex.reader.MalformedFiles.buildFlatLayout; +import static io.github.dfa1.vortex.reader.MalformedFiles.buildFooter; +import static io.github.dfa1.vortex.reader.MalformedFiles.buildI64Dtype; +import static io.github.dfa1.vortex.reader.MalformedFiles.buildPostscript; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doReturn; + +/// Pins that the HTTP reader validates footer `segmentSpecs` against the file size, mirroring the +/// local-file path. `VortexHttpReader.open` calls `PostscriptParser.parseBlobs` directly (not +/// `parse`), so it must run `validateSegmentSpecs` itself — otherwise an out-of-bounds remote +/// footer turns into out-of-range HTTP Range requests at scan time instead of a [VortexException]. +@ExtendWith(MockitoExtension.class) +class HttpSegmentSpecBoundsSecurityTest { + + @Mock + private HttpClient client; + + private static final URI URI = java.net.URI.create("http://example.com/oob_segment.vortex"); + + @Test + void open_segmentSpecPastEof_throwsVortexException() throws Exception { + // Given — a well-formed remote file whose single footer segmentSpec declares an offset far + // beyond the file, served as a 206 that fits the tail window. + byte[] file = buildFileWithOobSegment(); + doReturn(response206("bytes 0-" + (file.length - 1) + "/" + file.length, file)) + .when(client).send(any(), any()); + + // When / Then + assertThatThrownBy(() -> VortexHttpReader.open(URI, ReadRegistry.empty(), client)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("out of bounds"); + } + + private static byte[] buildFileWithOobSegment() throws Exception { + byte[] body = new byte[8]; // unused placeholder + // Segment offset 2^40 sits far past any real file size, so validateSegmentSpecs must reject. + ByteBuffer footerBuf = buildFooter( + new String[]{"vortex.primitive"}, + new String[]{"vortex.flat"}, + new long[]{1L << 40}, + new long[]{0L}); + ByteBuffer dtypeBuf = buildI64Dtype(); + ByteBuffer layoutBuf = buildFlatLayout(0, 1L, 0); + + long footerOff = body.length; + long dtypeOff = footerOff + footerBuf.remaining(); + long layoutOff = dtypeOff + dtypeBuf.remaining(); + + ByteBuffer psBuf = buildPostscript( + footerOff, footerBuf.remaining(), + dtypeOff, dtypeBuf.remaining(), + layoutOff, layoutBuf.remaining()); + + int psLen = psBuf.remaining(); + ByteBuffer trailer = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN); + trailer.putShort((short) 1); + trailer.putShort((short) psLen); + trailer.put(VortexFormat.MAGIC.asByteBuffer()); + trailer.flip(); + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + out.write(body); + writeBuf(out, footerBuf); + writeBuf(out, dtypeBuf); + writeBuf(out, layoutBuf); + writeBuf(out, psBuf); + out.write(trailer.array()); + return out.toByteArray(); + } + + private static void writeBuf(ByteArrayOutputStream out, ByteBuffer buf) { + buf = buf.duplicate(); + byte[] bytes = new byte[buf.remaining()]; + buf.get(bytes); + out.write(bytes, 0, bytes.length); + } + + @SuppressWarnings("unchecked") + private static HttpResponse response206(String contentRange, byte[] body) { + return new HttpResponse<>() { + @Override + public int statusCode() { + return 206; + } + + @Override + public byte[] body() { + return body; + } + + @Override + public HttpHeaders headers() { + Map> map = contentRange == null + ? Map.of() + : Map.of("content-range", List.of(contentRange)); + return HttpHeaders.of(map, (k, v) -> true); + } + + @Override + public HttpRequest request() { + return null; + } + + @Override + public Optional> previousResponse() { + return Optional.empty(); + } + + @Override + public Optional sslSession() { + return Optional.empty(); + } + + @Override + public java.net.URI uri() { + return URI; + } + + @Override + public HttpClient.Version version() { + return HttpClient.Version.HTTP_1_1; + } + }; + } +} diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/MalformedFiles.java b/reader/src/test/java/io/github/dfa1/vortex/reader/MalformedFiles.java index 3f14eed2..c1bf82b5 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/MalformedFiles.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/MalformedFiles.java @@ -5,6 +5,7 @@ import io.github.dfa1.vortex.core.fbs.FbsFooter; import io.github.dfa1.vortex.core.fbs.FbsLayout; import io.github.dfa1.vortex.core.fbs.FbsLayoutSpec; +import io.github.dfa1.vortex.core.fbs.FbsList; import io.github.dfa1.vortex.core.fbs.FbsPostscript; import io.github.dfa1.vortex.core.fbs.FbsPostscriptSegment; import io.github.dfa1.vortex.core.fbs.FbsPrimitive; @@ -34,6 +35,23 @@ static ByteBuffer buildI64Dtype() { return slice(fbb); } + /// Builds a DType blob nesting `depth` levels of `List` around an I64 primitive leaf. + /// + /// @param depth number of nested `List` wrappers around the leaf + /// @return the finished DType FlatBuffer + static ByteBuffer buildDeeplyNestedListDtype(int depth) { + var fbb = new FbsBuilder(depth * 32); + // Leaf first; FlatBuffer requires children be finished before parents. + int prim = FbsPrimitive.createFbsPrimitive(fbb, io.github.dfa1.vortex.core.fbs.FbsPType.I64, false); + int current = io.github.dfa1.vortex.core.fbs.FbsDType.createFbsDType(fbb, FbsType.FbsPrimitive, prim); + for (int i = 0; i < depth; i++) { + int list = FbsList.createFbsList(fbb, current, false); + current = io.github.dfa1.vortex.core.fbs.FbsDType.createFbsDType(fbb, FbsType.FbsList, list); + } + io.github.dfa1.vortex.core.fbs.FbsDType.finishFbsDTypeBuffer(fbb, current); + return slice(fbb); + } + /// Builds a FbsFooter with the given array/layout spec ids and inline segment specs. /// /// @param arraySpecs encoding ids, one per array spec diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/MalformedHttpResponseTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/MalformedHttpResponseTest.java index 8e50a6a5..0bb0e529 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/MalformedHttpResponseTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/MalformedHttpResponseTest.java @@ -3,6 +3,8 @@ import io.github.dfa1.vortex.core.error.VortexException; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; @@ -67,6 +69,29 @@ void tailFetch_missingContentRange_throws() throws Exception { .hasMessageContaining("Content-Range"); } + @ParameterizedTest(name = "[{index}] {0}") + @ValueSource(strings = { + "0-63/64", // missing the "bytes " prefix + "bytes 0631/64", // no '-' and no '/' separators + "bytes 0-63", // missing the '/total' field + "bytes x-63/64", // non-numeric start + "bytes 0-y/64", // non-numeric end + "bytes 0-63/z", // non-numeric total + "bytes 63/0-/64", // '/' before '-' — reversed separators (dash > slash) + "bytes -/64" // empty start and end fields + }) + void tailFetch_malformedContentRange_throws(String contentRange) throws Exception { + // Given — a 206 whose server-controlled Content-Range is structurally broken. The parse + // must surface as a VortexException, never a raw NumberFormatException or + // StringIndexOutOfBoundsException leaking from substring/Long.parseLong. + doReturn(response206(contentRange, new byte[64])).when(client).send(any(), any()); + + // When / Then + assertThatThrownBy(() -> VortexHttpReader.open(URI, ReadRegistry.empty(), client)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("Content-Range"); + } + // ── helpers ─────────────────────────────────────────────────────────────── @SuppressWarnings("unchecked") diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoderTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoderTest.java index f57feeae..af141032 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoderTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoderTest.java @@ -409,6 +409,78 @@ void decode_missingMetadata_throwsVortexException() { .isInstanceOf(VortexException.class) .hasMessageContaining("missing metadata"); } + + @Test + void decode_negativeFrameSize_throwsVortexException() { + // Given — metadata declares a negative per-frame uncompressed_size. A raw + // arena.allocate(negative) would throw IllegalArgumentException; the per-frame + // IoBounds.toIntSize guard must convert it to a VortexException first. + byte[] compressed = compress(toLeBytes(new int[]{0})); + byte[] meta = metaNoDict(new long[]{-1}, new long[]{1}); + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_ZSTD, MemorySegment.ofArray(meta), + new ArrayNode[0], new int[]{0}); + DecodeContext ctx = new DecodeContext(node, DTypes.I32, 1, + new MemorySegment[]{MemorySegment.ofArray(compressed)}, ReadRegistry.empty(), Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> DECODER.decode(ctx)) + .isInstanceOf(VortexException.class); + } + + @Test + void decode_oversizedFrameSize_throwsVortexException() { + // Given — metadata declares a per-frame uncompressed_size above the 2 GB int cap. + // Caught by IoBounds.toIntSize before it can drive the (int) narrowing negative at the + // asSlice site in decompressFrames. + byte[] compressed = compress(toLeBytes(new int[]{0})); + byte[] meta = metaNoDict(new long[]{(long) Integer.MAX_VALUE + 1}, new long[]{1}); + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_ZSTD, MemorySegment.ofArray(meta), + new ArrayNode[0], new int[]{0}); + DecodeContext ctx = new DecodeContext(node, DTypes.I32, 1, + new MemorySegment[]{MemorySegment.ofArray(compressed)}, ReadRegistry.empty(), Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> DECODER.decode(ctx)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("2 GB"); + } + + @Test + void decode_varBinOversizedLengthPrefix_throwsVortexException() { + // Given — a non-nullable VarBin payload whose single 4-byte length prefix claims + // 1 000 000 bytes that the 4-byte decompressed buffer cannot hold. readVarBinLen must + // reject the overrun as a VortexException instead of leaking an IndexOutOfBoundsException + // when the cursor advances past the segment. + byte[] raw = toLeBytes(new int[]{1_000_000}); + byte[] compressed = compress(raw); + byte[] meta = metaNoDict(new long[]{raw.length}, new long[]{1}); + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_ZSTD, MemorySegment.ofArray(meta), + new ArrayNode[0], new int[]{0}); + DecodeContext ctx = new DecodeContext(node, new DType.Utf8(false), 1, + new MemorySegment[]{MemorySegment.ofArray(compressed)}, ReadRegistry.empty(), Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> DECODER.decode(ctx)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("out of bounds"); + } + + @Test + void decode_scatteredVarBinOversizedLengthPrefix_throwsVortexException() { + // Given — the nullable (scattered) VarBin path: one valid element whose length prefix + // claims 1 000 000 bytes the buffer cannot hold. The same readVarBinLen guard must fire + // on the scatter scan, not only the contiguous path. + boolean[] validityBits = {true}; + byte[] raw = toLeBytes(new int[]{1_000_000}); + byte[] compressed = compress(raw); + byte[] meta = metaNoDict(new long[]{raw.length}, new long[]{1}); + DecodeContext ctx = makeNullableCtx(meta, new DType.Utf8(true), 1, validityBits, compressed); + + // When / Then + assertThatThrownBy(() -> DECODER.decode(ctx)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("out of bounds"); + } } @Nested