Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- Writing a nullable `Utf8`/`Binary` column no longer throws `NullPointerException` (or silently drops nulls): nullable string columns now carry their validity like nullable primitives and round-trip through `vortex.masked`. As a result they decode as `MaskedArray` (validity + values child) rather than a bare `VarBinArray`. ([#168](https://github.com/dfa1/vortex-java/pull/168))
- CSV export now handles nullable columns (`MaskedArray`): null rows export as an empty field instead of failing with "unsupported array type for CSV export". ([#168](https://github.com/dfa1/vortex-java/pull/168))
- Zone-map pruning now compares filter values in the *column's* type domain rather than by the boxed value's type. A predicate whose value is boxed at a different width (e.g. `Integer` on an `I64` column) — or any value on a `U64` column — previously pruned nothing and silently degraded to a full scan; it now prunes correctly (unsigned columns by unsigned order). As part of this, a filter value genuinely incomparable to its column (e.g. a `String` against a numeric column) now raises `VortexException` during the scan instead of silently disabling pruning — a behaviour change for callers that relied on the previous silent full scan. ([#159](https://github.com/dfa1/vortex-java/issues/159))

## [0.9.0] — 2026-06-24
Expand Down
4 changes: 4 additions & 0 deletions csv/src/main/java/io/github/dfa1/vortex/csv/CsvExporter.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import io.github.dfa1.vortex.reader.array.IntArray;
import io.github.dfa1.vortex.reader.array.LongArray;
import io.github.dfa1.vortex.reader.array.ShortArray;
import io.github.dfa1.vortex.reader.array.MaskedArray;
import io.github.dfa1.vortex.reader.array.VarBinArray;
import io.github.dfa1.vortex.reader.VortexReader;
import io.github.dfa1.vortex.reader.ScanIterator;
Expand Down Expand Up @@ -144,6 +145,9 @@ private static String cellValue(Array arr, long rowIdx) {
case FloatArray fa -> Float.toString(fa.getFloat(rowIdx));
case BoolArray ba -> Boolean.toString(ba.getBoolean(rowIdx));
case VarBinArray va -> va.getString(rowIdx);
// Nullable columns decode as MaskedArray: null rows export as an empty field, valid
// rows defer to the inner values array.
case MaskedArray ma -> ma.isValid(rowIdx) ? cellValue(ma.inner(), rowIdx) : "";
default -> throw new VortexException(
"unsupported array type for CSV export: " + arr.getClass().getSimpleName());
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,41 @@ void javaWriter_rustReader_masked_nullableI64(@TempDir Path tmp) throws IOExcept
assertThat(values).containsExactly(10L, null, 30L, null, 50L, 60L);
}

@Test
void javaWriter_rustReader_masked_nullableUtf8(@TempDir Path tmp) throws IOException {
// Given — nullable Utf8 via the default write path. Nullable utf8 now unifies on the
// NullableData carrier, so it routes through MaskedEncoding → VarBin (values + validity)
// like nullable primitives. Regression guard: this path used to NPE in VarBin on the null
// element because no masked wrapper was inserted for String[]-with-nulls.
Path file = tmp.resolve("java_masked_utf8.vtx");
DType.Struct schema = new DType.Struct(List.of("s"), List.of(new DType.Utf8(true)), false);
String[] data = {"alpha", null, "gamma", null, "epsilon"};
try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
var sut = VortexWriter.create(ch, schema, WriteOptions.defaults())) {
// When
sut.writeChunk(Map.of("s", data));
}

// Then — Rust reads the nullable Utf8 vector; null positions survive, values round-trip
String uri = file.toAbsolutePath().toUri().toString();
DataSource ds = DataSource.open(SESSION, uri);
Scan scan = ds.scan(ScanOptions.of());
var values = new ArrayList<String>();
while (scan.hasNext()) {
Partition partition = scan.next();
try (ArrowReader reader = partition.scanArrow(ALLOCATOR)) {
while (reader.loadNextBatch()) {
VectorSchemaRoot root = reader.getVectorSchemaRoot();
VarCharVector vec = (VarCharVector) root.getVector("s");
for (int i = 0; i < root.getRowCount(); i++) {
values.add(vec.isNull(i) ? null : vec.getObject(i).toString());
}
}
}
}
assertThat(values).containsExactly("alpha", null, "gamma", null, "epsilon");
}

@Test
void javaWriter_rustReader_bool_boolEncoding(@TempDir Path tmp) throws IOException {
// Given — BoolEncoding: bit-packed boolean column
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import dev.hardwood.reader.RowReader;
import io.github.dfa1.vortex.core.model.DType;
import io.github.dfa1.vortex.reader.array.LongArray;
import io.github.dfa1.vortex.reader.array.MaskedArray;
import io.github.dfa1.vortex.reader.array.VarBinArray;
import io.github.dfa1.vortex.reader.VortexReader;
import io.github.dfa1.vortex.parquet.ParquetImporter;
Expand Down Expand Up @@ -148,10 +149,12 @@ void stringColumnValuesMatch(@TempDir Path tmp) throws Exception {
ScanIterator iter = reader.scan(ScanOptions.all())) {
assertThat(iter.hasNext()).isTrue();
try (Chunk first = iter.next()) {
VarBinArray col = first.column("c_first_name");
assertThat(col.getString(0)).isEqualTo("Jeannette");
assertThat(col.getString(1)).isEqualTo("Austin");
assertThat(col.getString(2)).isEqualTo("David");
// Nullable Utf8 decodes as MaskedArray (validity + VarBin values); rows 0-2 are non-null.
MaskedArray col = first.column("c_first_name");
VarBinArray values = (VarBinArray) col.inner();
assertThat(values.getString(0)).isEqualTo("Jeannette");
assertThat(values.getString(1)).isEqualTo("Austin");
assertThat(values.getString(2)).isEqualTo("David");
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import io.github.dfa1.vortex.reader.ScanOptions;
import io.github.dfa1.vortex.reader.VortexReader;
import io.github.dfa1.vortex.reader.array.LongArray;
import io.github.dfa1.vortex.reader.array.MaskedArray;
import io.github.dfa1.vortex.reader.array.VarBinArray;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -218,10 +219,13 @@ void importsFixture_columnValuesRoundTrip(@TempDir Path tmp) throws Exception {
assertThat(sk.getLong(1)).isEqualTo(99L);
assertThat(sk.getLong(2)).isEqualTo(98L);

VarBinArray name = first.column("c_first_name");
assertThat(name.getString(0)).isEqualTo("Jeannette");
assertThat(name.getString(1)).isEqualTo("Austin");
assertThat(name.getString(2)).isEqualTo("David");
// c_first_name is nullable Utf8, so it round-trips as a MaskedArray (validity +
// VarBin values child), like nullable primitives. The first three rows are non-null.
MaskedArray name = first.column("c_first_name");
VarBinArray nameValues = (VarBinArray) name.inner();
assertThat(nameValues.getString(0)).isEqualTo("Jeannette");
assertThat(nameValues.getString(1)).isEqualTo("Austin");
assertThat(nameValues.getString(2)).isEqualTo("David");
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,16 @@ private static Object adaptUtf8(String column, DType.Utf8 dtype, Object value) {
"non-nullable column '" + column + "' received null at row " + i);
}
}
return arr;
}
return arr;
// Nullable utf8/binary unifies on the NullableData carrier like the primitive paths: the
// raw String[] (null elements preserved) plus a derived validity bitmap. The writer then
// routes it through MaskedEncoding (or a nullable-capable encoder such as vortex.zstd).
boolean[] validity = new boolean[arr.length];
for (int i = 0; i < arr.length; i++) {
validity[i] = arr[i] != null;
}
return new NullableData(arr, validity);
}

private static Object adaptBool(String column, DType.Bool dtype, Object value) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
/// Write-only encoder for `vortex.varbin`.
public final class VarBinEncodingEncoder implements EncodingEncoder {

private static final byte[] EMPTY_BYTES = new byte[0];

/// Public no-arg constructor required by [java.util.ServiceLoader].
public VarBinEncodingEncoder() {
}
Expand All @@ -37,7 +39,9 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
byte[][] byteArrays = new byte[n][];
int totalBytes = 0;
for (int i = 0; i < n; i++) {
byteArrays[i] = strings[i].getBytes(StandardCharsets.UTF_8);
// Null entries (this encoder is the values child of a masked/nullable layout, where
// validity carries nullity) contribute a zero-length slot, not an NPE.
byteArrays[i] = strings[i] == null ? EMPTY_BYTES : strings[i].getBytes(StandardCharsets.UTF_8);
totalBytes += byteArrays[i].length;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,31 +35,31 @@ public boolean accepts(DType dtype) {

@Override
public boolean acceptsNullable(DType dtype) {
// Nullable utf8/binary arrive as a String[] carrying nulls (handled in encode), not a
// NullableData carrier; only primitive nullable columns are routed here as NullableData.
return dtype instanceof DType.Primitive;
// Nullable primitive, utf8 and binary columns all arrive as a NullableData carrier and are
// encoded directly here (validity emitted as Bool child[0]) rather than masked-wrapped.
return dtype instanceof DType.Primitive || dtype instanceof DType.Utf8 || dtype instanceof DType.Binary;
}

@Override
public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) {
if (data instanceof NullableData nd) {
if (!(dtype instanceof DType.Primitive dt)) {
throw new VortexException(EncodingId.VORTEX_ZSTD,
"NullableData is only supported for primitive dtypes, got " + dtype);
if (dtype instanceof DType.Primitive dt) {
return encodeNullablePrimitive(dt, nd, ctx);
}
if (dtype instanceof DType.Utf8 || dtype instanceof DType.Binary) {
return encodeNullableVarBin(nd, ctx);
}
return encodeNullablePrimitive(dt, nd, ctx);
throw new VortexException(EncodingId.VORTEX_ZSTD,
"NullableData is unsupported for dtype: " + dtype);
}
if (dtype instanceof DType.Primitive dt) {
return encodePrimitive(dt, data, ctx.arena());
}
if (dtype instanceof DType.Utf8 || dtype instanceof DType.Binary) {
String[] strings = (String[]) data;
if (containsNull(strings)) {
if (!dtype.nullable()) {
throw new VortexException(EncodingId.VORTEX_ZSTD,
"non-nullable " + dtype + " contains null");
}
return encodeNullableVarBin(strings, ctx);
throw new VortexException(EncodingId.VORTEX_ZSTD,
"non-nullable " + dtype + " contains null");
}
return encodeVarBin(strings, ctx.arena());
}
Expand Down Expand Up @@ -104,11 +104,12 @@ private static EncodeResult encodeNullablePrimitive(DType.Primitive dt, Nullable
return buildNullableResult(packed, countValid(validity), validity, ctx);
}

private static EncodeResult encodeNullableVarBin(String[] strings, EncodeContext ctx) {
boolean[] validity = validityOf(strings);
String[] valid = stripNulls(strings);
private static EncodeResult encodeNullableVarBin(NullableData nd, EncodeContext ctx) {
// Strip null positions: only valid strings reach the compressed payload (mirrors the Rust
// reference). The decoder scatters them back over the validity mask carried by child[0].
String[] valid = stripNulls((String[]) nd.values());
MemorySegment packed = buildLengthPrefixed(valid, ctx.arena());
return buildNullableResult(packed, valid.length, validity, ctx);
return buildNullableResult(packed, valid.length, nd.validity(), ctx);
}

private static EncodeResult buildNullableResult(
Expand Down Expand Up @@ -170,14 +171,6 @@ private static boolean containsNull(String[] strings) {
return false;
}

private static boolean[] validityOf(String[] strings) {
boolean[] validity = new boolean[strings.length];
for (int i = 0; i < strings.length; i++) {
validity[i] = strings[i] != null;
}
return validity;
}

private static String[] stripNulls(String[] strings) {
String[] valid = new String[countNonNull(strings)];
int j = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,14 @@ void stringArrayAccepted() {
}

@Test
void nullableAllowsNullElements() {
assertThat(putGet(new DType.Utf8(true), new String[]{"a", null})).isInstanceOf(String[].class);
void nullableConvertsToNullableData() {
// Nullable utf8 unifies on the NullableData carrier (like nullable primitives/bools):
// the raw String[] with null elements preserved, plus a derived validity bitmap.
Object result = putGet(new DType.Utf8(true), new String[]{"a", null, "c"});

assertThat(result).isInstanceOf(NullableData.class);
assertThat(((NullableData) result).validity()).containsExactly(true, false, true);
assertThat((String[]) ((NullableData) result).values()).containsExactly("a", null, "c");
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,18 @@ void encode_nullablePrimitive_roundTrips() {

@Test
void encode_nullableUtf8_roundTrips() {
// Given — a String[] carrying nulls; the encoder must strip them, compress only the
// valid strings, and emit the validity bitmap as child[0].
String[] data = {"hello", null, "world", null};
// Given — nullable utf8 as a NullableData carrier (String[] with null elements + the
// derived validity), the unified nullable shape. The encoder strips nulls, compresses
// only the valid strings, and emits the validity bitmap as child[0].
String[] storage = {"hello", null, "world", null};
boolean[] validity = {true, false, true, false};
DType utf8Nullable = new DType.Utf8(true);
NullableData data = new NullableData(storage, validity);

// When
EncodeResult result = ENCODER.encode(utf8Nullable, data, EncodeTestHelper.testCtx());
DecodeContext ctx = DecodeTestHelper.toDecodeContext(
result, data.length, utf8Nullable, TestRegistry.ofDecoders(new BoolEncodingDecoder()));
result, validity.length, utf8Nullable, TestRegistry.ofDecoders(new BoolEncodingDecoder()));
MaskedArray decoded = (MaskedArray) DECODER.decode(ctx);

// Then
Expand Down Expand Up @@ -164,13 +167,15 @@ void encode_allNullPrimitive_roundTrips() {
void encode_allNullUtf8_roundTrips() {
// Given — every string null: stripNulls yields an empty array, so the length-prefixed
// payload is 0 bytes. Guards the empty-payload corner of the nullable varbin path.
String[] data = {null, null, null};
String[] storage = {null, null, null};
boolean[] validity = {false, false, false};
DType utf8Nullable = new DType.Utf8(true);
NullableData data = new NullableData(storage, validity);

// When
EncodeResult result = ENCODER.encode(utf8Nullable, data, EncodeTestHelper.testCtx());
DecodeContext ctx = DecodeTestHelper.toDecodeContext(
result, data.length, utf8Nullable, TestRegistry.ofDecoders(new BoolEncodingDecoder()));
result, validity.length, utf8Nullable, TestRegistry.ofDecoders(new BoolEncodingDecoder()));
MaskedArray decoded = (MaskedArray) DECODER.decode(ctx);

// Then
Expand Down
Loading