diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d35463c..0413f23e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Writing a nullable `Utf8`/`Binary` column no longer throws `NullPointerException` (or silently drops nulls): nullable string columns now carry their validity like nullable primitives and round-trip through `vortex.masked`. As a result they decode as `MaskedArray` (validity + values child) rather than a bare `VarBinArray`. ([#168](https://github.com/dfa1/vortex-java/pull/168)) +- CSV export now handles nullable columns (`MaskedArray`): null rows export as an empty field instead of failing with "unsupported array type for CSV export". ([#168](https://github.com/dfa1/vortex-java/pull/168)) - Zone-map pruning now compares filter values in the *column's* type domain rather than by the boxed value's type. A predicate whose value is boxed at a different width (e.g. `Integer` on an `I64` column) — or any value on a `U64` column — previously pruned nothing and silently degraded to a full scan; it now prunes correctly (unsigned columns by unsigned order). As part of this, a filter value genuinely incomparable to its column (e.g. a `String` against a numeric column) now raises `VortexException` during the scan instead of silently disabling pruning — a behaviour change for callers that relied on the previous silent full scan. ([#159](https://github.com/dfa1/vortex-java/issues/159)) ## [0.9.0] — 2026-06-24 diff --git a/csv/src/main/java/io/github/dfa1/vortex/csv/CsvExporter.java b/csv/src/main/java/io/github/dfa1/vortex/csv/CsvExporter.java index 611f955c..12c2b351 100644 --- a/csv/src/main/java/io/github/dfa1/vortex/csv/CsvExporter.java +++ b/csv/src/main/java/io/github/dfa1/vortex/csv/CsvExporter.java @@ -11,6 +11,7 @@ import io.github.dfa1.vortex.reader.array.IntArray; import io.github.dfa1.vortex.reader.array.LongArray; import io.github.dfa1.vortex.reader.array.ShortArray; +import io.github.dfa1.vortex.reader.array.MaskedArray; import io.github.dfa1.vortex.reader.array.VarBinArray; import io.github.dfa1.vortex.reader.VortexReader; import io.github.dfa1.vortex.reader.ScanIterator; @@ -144,6 +145,9 @@ private static String cellValue(Array arr, long rowIdx) { case FloatArray fa -> Float.toString(fa.getFloat(rowIdx)); case BoolArray ba -> Boolean.toString(ba.getBoolean(rowIdx)); case VarBinArray va -> va.getString(rowIdx); + // Nullable columns decode as MaskedArray: null rows export as an empty field, valid + // rows defer to the inner values array. + case MaskedArray ma -> ma.isValid(rowIdx) ? cellValue(ma.inner(), rowIdx) : ""; default -> throw new VortexException( "unsupported array type for CSV export: " + arr.getClass().getSimpleName()); }; diff --git a/integration/src/test/java/io/github/dfa1/vortex/integration/JavaWritesRustReadsIntegrationTest.java b/integration/src/test/java/io/github/dfa1/vortex/integration/JavaWritesRustReadsIntegrationTest.java index c2c19bf6..44323a8e 100644 --- a/integration/src/test/java/io/github/dfa1/vortex/integration/JavaWritesRustReadsIntegrationTest.java +++ b/integration/src/test/java/io/github/dfa1/vortex/integration/JavaWritesRustReadsIntegrationTest.java @@ -1087,6 +1087,41 @@ void javaWriter_rustReader_masked_nullableI64(@TempDir Path tmp) throws IOExcept assertThat(values).containsExactly(10L, null, 30L, null, 50L, 60L); } + @Test + void javaWriter_rustReader_masked_nullableUtf8(@TempDir Path tmp) throws IOException { + // Given — nullable Utf8 via the default write path. Nullable utf8 now unifies on the + // NullableData carrier, so it routes through MaskedEncoding → VarBin (values + validity) + // like nullable primitives. Regression guard: this path used to NPE in VarBin on the null + // element because no masked wrapper was inserted for String[]-with-nulls. + Path file = tmp.resolve("java_masked_utf8.vtx"); + DType.Struct schema = new DType.Struct(List.of("s"), List.of(new DType.Utf8(true)), false); + String[] data = {"alpha", null, "gamma", null, "epsilon"}; + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, schema, WriteOptions.defaults())) { + // When + sut.writeChunk(Map.of("s", data)); + } + + // Then — Rust reads the nullable Utf8 vector; null positions survive, values round-trip + String uri = file.toAbsolutePath().toUri().toString(); + DataSource ds = DataSource.open(SESSION, uri); + Scan scan = ds.scan(ScanOptions.of()); + var values = new ArrayList(); + while (scan.hasNext()) { + Partition partition = scan.next(); + try (ArrowReader reader = partition.scanArrow(ALLOCATOR)) { + while (reader.loadNextBatch()) { + VectorSchemaRoot root = reader.getVectorSchemaRoot(); + VarCharVector vec = (VarCharVector) root.getVector("s"); + for (int i = 0; i < root.getRowCount(); i++) { + values.add(vec.isNull(i) ? null : vec.getObject(i).toString()); + } + } + } + } + assertThat(values).containsExactly("alpha", null, "gamma", null, "epsilon"); + } + @Test void javaWriter_rustReader_bool_boolEncoding(@TempDir Path tmp) throws IOException { // Given — BoolEncoding: bit-packed boolean column diff --git a/integration/src/test/java/io/github/dfa1/vortex/integration/ParquetImportIntegrationTest.java b/integration/src/test/java/io/github/dfa1/vortex/integration/ParquetImportIntegrationTest.java index 25747e0f..e840e260 100644 --- a/integration/src/test/java/io/github/dfa1/vortex/integration/ParquetImportIntegrationTest.java +++ b/integration/src/test/java/io/github/dfa1/vortex/integration/ParquetImportIntegrationTest.java @@ -5,6 +5,7 @@ import dev.hardwood.reader.RowReader; import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.reader.array.LongArray; +import io.github.dfa1.vortex.reader.array.MaskedArray; import io.github.dfa1.vortex.reader.array.VarBinArray; import io.github.dfa1.vortex.reader.VortexReader; import io.github.dfa1.vortex.parquet.ParquetImporter; @@ -148,10 +149,12 @@ void stringColumnValuesMatch(@TempDir Path tmp) throws Exception { ScanIterator iter = reader.scan(ScanOptions.all())) { assertThat(iter.hasNext()).isTrue(); try (Chunk first = iter.next()) { - VarBinArray col = first.column("c_first_name"); - assertThat(col.getString(0)).isEqualTo("Jeannette"); - assertThat(col.getString(1)).isEqualTo("Austin"); - assertThat(col.getString(2)).isEqualTo("David"); + // Nullable Utf8 decodes as MaskedArray (validity + VarBin values); rows 0-2 are non-null. + MaskedArray col = first.column("c_first_name"); + VarBinArray values = (VarBinArray) col.inner(); + assertThat(values.getString(0)).isEqualTo("Jeannette"); + assertThat(values.getString(1)).isEqualTo("Austin"); + assertThat(values.getString(2)).isEqualTo("David"); } } } diff --git a/parquet/src/test/java/io/github/dfa1/vortex/parquet/ParquetImporterTest.java b/parquet/src/test/java/io/github/dfa1/vortex/parquet/ParquetImporterTest.java index b4c19f33..142c5b73 100644 --- a/parquet/src/test/java/io/github/dfa1/vortex/parquet/ParquetImporterTest.java +++ b/parquet/src/test/java/io/github/dfa1/vortex/parquet/ParquetImporterTest.java @@ -12,6 +12,7 @@ import io.github.dfa1.vortex.reader.ScanOptions; import io.github.dfa1.vortex.reader.VortexReader; import io.github.dfa1.vortex.reader.array.LongArray; +import io.github.dfa1.vortex.reader.array.MaskedArray; import io.github.dfa1.vortex.reader.array.VarBinArray; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -218,10 +219,13 @@ void importsFixture_columnValuesRoundTrip(@TempDir Path tmp) throws Exception { assertThat(sk.getLong(1)).isEqualTo(99L); assertThat(sk.getLong(2)).isEqualTo(98L); - VarBinArray name = first.column("c_first_name"); - assertThat(name.getString(0)).isEqualTo("Jeannette"); - assertThat(name.getString(1)).isEqualTo("Austin"); - assertThat(name.getString(2)).isEqualTo("David"); + // c_first_name is nullable Utf8, so it round-trips as a MaskedArray (validity + + // VarBin values child), like nullable primitives. The first three rows are non-null. + MaskedArray name = first.column("c_first_name"); + VarBinArray nameValues = (VarBinArray) name.inner(); + assertThat(nameValues.getString(0)).isEqualTo("Jeannette"); + assertThat(nameValues.getString(1)).isEqualTo("Austin"); + assertThat(nameValues.getString(2)).isEqualTo("David"); } } } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/ChunkImpl.java b/writer/src/main/java/io/github/dfa1/vortex/writer/ChunkImpl.java index 40978d0c..8fab18de 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/ChunkImpl.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/ChunkImpl.java @@ -124,8 +124,16 @@ private static Object adaptUtf8(String column, DType.Utf8 dtype, Object value) { "non-nullable column '" + column + "' received null at row " + i); } } + return arr; } - return arr; + // Nullable utf8/binary unifies on the NullableData carrier like the primitive paths: the + // raw String[] (null elements preserved) plus a derived validity bitmap. The writer then + // routes it through MaskedEncoding (or a nullable-capable encoder such as vortex.zstd). + boolean[] validity = new boolean[arr.length]; + for (int i = 0; i < arr.length; i++) { + validity[i] = arr[i] != null; + } + return new NullableData(arr, validity); } private static Object adaptBool(String column, DType.Bool dtype, Object value) { diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VarBinEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VarBinEncodingEncoder.java index 72a2c74e..6ee60b06 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VarBinEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VarBinEncodingEncoder.java @@ -15,6 +15,8 @@ /// Write-only encoder for `vortex.varbin`. public final class VarBinEncodingEncoder implements EncodingEncoder { + private static final byte[] EMPTY_BYTES = new byte[0]; + /// Public no-arg constructor required by [java.util.ServiceLoader]. public VarBinEncodingEncoder() { } @@ -37,7 +39,9 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { byte[][] byteArrays = new byte[n][]; int totalBytes = 0; for (int i = 0; i < n; i++) { - byteArrays[i] = strings[i].getBytes(StandardCharsets.UTF_8); + // Null entries (this encoder is the values child of a masked/nullable layout, where + // validity carries nullity) contribute a zero-length slot, not an NPE. + byteArrays[i] = strings[i] == null ? EMPTY_BYTES : strings[i].getBytes(StandardCharsets.UTF_8); totalBytes += byteArrays[i].length; } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoder.java index 4335669e..a2076f9a 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoder.java @@ -35,19 +35,22 @@ public boolean accepts(DType dtype) { @Override public boolean acceptsNullable(DType dtype) { - // Nullable utf8/binary arrive as a String[] carrying nulls (handled in encode), not a - // NullableData carrier; only primitive nullable columns are routed here as NullableData. - return dtype instanceof DType.Primitive; + // Nullable primitive, utf8 and binary columns all arrive as a NullableData carrier and are + // encoded directly here (validity emitted as Bool child[0]) rather than masked-wrapped. + return dtype instanceof DType.Primitive || dtype instanceof DType.Utf8 || dtype instanceof DType.Binary; } @Override public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { if (data instanceof NullableData nd) { - if (!(dtype instanceof DType.Primitive dt)) { - throw new VortexException(EncodingId.VORTEX_ZSTD, - "NullableData is only supported for primitive dtypes, got " + dtype); + if (dtype instanceof DType.Primitive dt) { + return encodeNullablePrimitive(dt, nd, ctx); + } + if (dtype instanceof DType.Utf8 || dtype instanceof DType.Binary) { + return encodeNullableVarBin(nd, ctx); } - return encodeNullablePrimitive(dt, nd, ctx); + throw new VortexException(EncodingId.VORTEX_ZSTD, + "NullableData is unsupported for dtype: " + dtype); } if (dtype instanceof DType.Primitive dt) { return encodePrimitive(dt, data, ctx.arena()); @@ -55,11 +58,8 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { if (dtype instanceof DType.Utf8 || dtype instanceof DType.Binary) { String[] strings = (String[]) data; if (containsNull(strings)) { - if (!dtype.nullable()) { - throw new VortexException(EncodingId.VORTEX_ZSTD, - "non-nullable " + dtype + " contains null"); - } - return encodeNullableVarBin(strings, ctx); + throw new VortexException(EncodingId.VORTEX_ZSTD, + "non-nullable " + dtype + " contains null"); } return encodeVarBin(strings, ctx.arena()); } @@ -104,11 +104,12 @@ private static EncodeResult encodeNullablePrimitive(DType.Primitive dt, Nullable return buildNullableResult(packed, countValid(validity), validity, ctx); } - private static EncodeResult encodeNullableVarBin(String[] strings, EncodeContext ctx) { - boolean[] validity = validityOf(strings); - String[] valid = stripNulls(strings); + private static EncodeResult encodeNullableVarBin(NullableData nd, EncodeContext ctx) { + // Strip null positions: only valid strings reach the compressed payload (mirrors the Rust + // reference). The decoder scatters them back over the validity mask carried by child[0]. + String[] valid = stripNulls((String[]) nd.values()); MemorySegment packed = buildLengthPrefixed(valid, ctx.arena()); - return buildNullableResult(packed, valid.length, validity, ctx); + return buildNullableResult(packed, valid.length, nd.validity(), ctx); } private static EncodeResult buildNullableResult( @@ -170,14 +171,6 @@ private static boolean containsNull(String[] strings) { return false; } - private static boolean[] validityOf(String[] strings) { - boolean[] validity = new boolean[strings.length]; - for (int i = 0; i < strings.length; i++) { - validity[i] = strings[i] != null; - } - return validity; - } - private static String[] stripNulls(String[] strings) { String[] valid = new String[countNonNull(strings)]; int j = 0; diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/ChunkImplTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/ChunkImplTest.java index 97d4fe94..816af945 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/ChunkImplTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/ChunkImplTest.java @@ -178,8 +178,14 @@ void stringArrayAccepted() { } @Test - void nullableAllowsNullElements() { - assertThat(putGet(new DType.Utf8(true), new String[]{"a", null})).isInstanceOf(String[].class); + void nullableConvertsToNullableData() { + // Nullable utf8 unifies on the NullableData carrier (like nullable primitives/bools): + // the raw String[] with null elements preserved, plus a derived validity bitmap. + Object result = putGet(new DType.Utf8(true), new String[]{"a", null, "c"}); + + assertThat(result).isInstanceOf(NullableData.class); + assertThat(((NullableData) result).validity()).containsExactly(true, false, true); + assertThat((String[]) ((NullableData) result).values()).containsExactly("a", null, "c"); } @Test diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoderTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoderTest.java index fbfc741d..c89d8d9a 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoderTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/ZstdEncodingEncoderTest.java @@ -115,15 +115,18 @@ void encode_nullablePrimitive_roundTrips() { @Test void encode_nullableUtf8_roundTrips() { - // Given — a String[] carrying nulls; the encoder must strip them, compress only the - // valid strings, and emit the validity bitmap as child[0]. - String[] data = {"hello", null, "world", null}; + // Given — nullable utf8 as a NullableData carrier (String[] with null elements + the + // derived validity), the unified nullable shape. The encoder strips nulls, compresses + // only the valid strings, and emits the validity bitmap as child[0]. + String[] storage = {"hello", null, "world", null}; + boolean[] validity = {true, false, true, false}; DType utf8Nullable = new DType.Utf8(true); + NullableData data = new NullableData(storage, validity); // When EncodeResult result = ENCODER.encode(utf8Nullable, data, EncodeTestHelper.testCtx()); DecodeContext ctx = DecodeTestHelper.toDecodeContext( - result, data.length, utf8Nullable, TestRegistry.ofDecoders(new BoolEncodingDecoder())); + result, validity.length, utf8Nullable, TestRegistry.ofDecoders(new BoolEncodingDecoder())); MaskedArray decoded = (MaskedArray) DECODER.decode(ctx); // Then @@ -164,13 +167,15 @@ void encode_allNullPrimitive_roundTrips() { void encode_allNullUtf8_roundTrips() { // Given — every string null: stripNulls yields an empty array, so the length-prefixed // payload is 0 bytes. Guards the empty-payload corner of the nullable varbin path. - String[] data = {null, null, null}; + String[] storage = {null, null, null}; + boolean[] validity = {false, false, false}; DType utf8Nullable = new DType.Utf8(true); + NullableData data = new NullableData(storage, validity); // When EncodeResult result = ENCODER.encode(utf8Nullable, data, EncodeTestHelper.testCtx()); DecodeContext ctx = DecodeTestHelper.toDecodeContext( - result, data.length, utf8Nullable, TestRegistry.ofDecoders(new BoolEncodingDecoder())); + result, validity.length, utf8Nullable, TestRegistry.ofDecoders(new BoolEncodingDecoder())); MaskedArray decoded = (MaskedArray) DECODER.decode(ctx); // Then