From f870c9c2c518ab075213534866acac62138774ff Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 20 Jun 2026 19:51:59 +0200 Subject: [PATCH] fix(registry): deterministic encoder order + honest accepts() (fixes Windows build) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VortexWriter.create(.., WriteRegistry) with no cascade selects the first encoder whose accepts() matches the dtype, iterating WriteRegistry.encoderMap().values(). Two coupled defects made that selection wrong and unstable: 1. The map was Map.copyOf(HashMap) — non-deterministic iteration order — so the chosen encoder varied across runs and platforms. 2. ChunkedEncodingEncoder.accepts(Primitive) returned true, but encode() requires a ChunkedData wrapper the dtype-based write path never produces. It is not in DEFAULT_CODECS or the cascade and nothing dtype-dispatches to it — it is only reachable here. On Windows the hash order put it first for a plain long[], throwing ClassCastException; other platforms happened to order a leaf encoder first. Fixes (root, not symptom): - Both WriteRegistry and ReadRegistry back their maps with a TreeMap ordered by encoding name (Comparator.comparing(id) — Enum.compareTo is final, so natural enum order is ordinal and a Comparator is needed). Order is now a pure function of the registered set, independent of registration sequence or ServiceLoader's unspecified iteration order. - ChunkedEncodingEncoder.accepts() returns false: it is carrier-dispatched (selected explicitly when a caller holds ChunkedData), never dtype-dispatched, so it must not be a first-match candidate. Its explicit-encode path is unchanged and still covered by ChunkedEncodingEncoderTest. Tests: - WriteRegistryTest pins name-ordered iteration regardless of registration sequence, and that loadAll() is sorted by name. - ReadRegistryTest covers that a TreeMap-backed registry retains all decoders. - create_withFullRegistry_roundTrips drives the exact failing combination — create(loadAll()) -> writeChunk -> read back — as an end-to-end regression guard, rather than dodging it with a hand-picked single-encoder registry. Co-Authored-By: Claude Opus 4.8 --- .../dfa1/vortex/reader/ReadRegistry.java | 13 ++++++-- .../dfa1/vortex/reader/ReadRegistryTest.java | 24 ++++++++++++++ .../dfa1/vortex/writer/WriteRegistry.java | 33 +++++++++++++++---- .../writer/encode/ChunkedEncodingEncoder.java | 7 +++- .../dfa1/vortex/writer/VortexWriterTest.java | 10 ++++-- .../dfa1/vortex/writer/WriteRegistryTest.java | 31 +++++++++++++++++ 6 files changed, 104 insertions(+), 14 deletions(-) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java b/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java index 62cae7c1..b7f28ed9 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java @@ -11,9 +11,11 @@ import io.github.dfa1.vortex.reader.decode.UnknownArrayNode; import java.lang.foreign.MemorySegment; -import java.util.HashMap; +import java.util.Collections; +import java.util.Comparator; import java.util.Map; import java.util.ServiceLoader; +import java.util.TreeMap; /// Read-side registry: maps [EncodingId] to [EncodingDecoder] implementations. /// @@ -25,7 +27,12 @@ public final class ReadRegistry { private final boolean allowUnknown; private ReadRegistry(Map decoders, boolean allowUnknown) { - this.decoders = Map.copyOf(decoders); + // Order by encoding name, mirroring WriteRegistry (Enum.compareTo is final, so a Comparator + // is required to sort by id rather than ordinal). Decode dispatch is keyed, so order is not + // load-bearing here, but a stable order keeps the two registries consistent. + var sorted = new TreeMap(Comparator.comparing(EncodingId::id)); + sorted.putAll(decoders); + this.decoders = Collections.unmodifiableMap(sorted); this.allowUnknown = allowUnknown; } @@ -136,7 +143,7 @@ private static UnknownArray decodeUnknown(DecodeContext ctx, ArrayNode node) { /// Not thread-safe. Build once, use everywhere — the produced [ReadRegistry] is immutable. public static final class Builder { - private final Map decoders = new HashMap<>(); + private final Map decoders = new TreeMap<>(); private boolean allowUnknown = false; private Builder() { diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/ReadRegistryTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/ReadRegistryTest.java index 5dc17841..203a4056 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/ReadRegistryTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/ReadRegistryTest.java @@ -7,6 +7,7 @@ import io.github.dfa1.vortex.encoding.EncodingId; import io.github.dfa1.vortex.reader.decode.ArrayNode; import io.github.dfa1.vortex.reader.decode.DecodeContext; +import io.github.dfa1.vortex.reader.decode.EncodingDecoder; import io.github.dfa1.vortex.reader.decode.UnknownArrayNode; import org.junit.jupiter.api.Test; @@ -16,6 +17,8 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.BDDMockito.given; +import static org.mockito.Mockito.mock; class ReadRegistryTest { @@ -116,4 +119,25 @@ void decodeUnknownEncodingWrapsChildrenAsUnknown() { assertThat(((UnknownArray) unknown.children()[0]).encodingId()).isEqualTo("vortex.primitive"); assertThat(sut.isAllowUnknown()).isTrue(); } + + @Test + void builder_retainsAllRegisteredDecoders() { + // Given — decoders registered out of natural id order (the registry sorts into a TreeMap) + EncodingDecoder bool = decoder(EncodingId.VORTEX_BOOL); + EncodingDecoder primitive = decoder(EncodingId.VORTEX_PRIMITIVE); + + // When + ReadRegistry result = ReadRegistry.builder().register(bool).register(primitive).build(); + + // Then — every registered decoder is retained, regardless of registration order + assertThat(result.hasDecoder(EncodingId.VORTEX_PRIMITIVE)).isTrue(); + assertThat(result.hasDecoder(EncodingId.VORTEX_BOOL)).isTrue(); + assertThat(result.hasDecoder(EncodingId.VORTEX_CONSTANT)).isFalse(); + } + + private static EncodingDecoder decoder(EncodingId id) { + EncodingDecoder decoder = mock(EncodingDecoder.class); + given(decoder.encodingId()).willReturn(id); + return decoder; + } } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/WriteRegistry.java b/writer/src/main/java/io/github/dfa1/vortex/writer/WriteRegistry.java index c47066e4..ce29796f 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/WriteRegistry.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/WriteRegistry.java @@ -5,9 +5,12 @@ import io.github.dfa1.vortex.extension.ExtensionId; import io.github.dfa1.vortex.writer.encode.EncodingEncoder; -import java.util.HashMap; +import java.util.Collections; +import java.util.Comparator; import java.util.Map; import java.util.ServiceLoader; +import java.util.TreeMap; +import java.util.function.Function; /// Write-side registry: maps [EncodingId] to [EncodingEncoder] implementations, /// and [ExtensionId] to [ExtensionEncoder] implementations. @@ -27,8 +30,21 @@ public final class WriteRegistry { private WriteRegistry(Map encoders, Map extensions) { - this.encoders = Map.copyOf(encoders); - this.extensions = Map.copyOf(extensions); + // Order by encoding name so it is stable regardless of enum declaration order. (The id + // enums sort by ordinal naturally — Enum.compareTo is final — so a Comparator is required.) + // VortexWriter.create(.., WriteRegistry) selects the first encoder whose accepts() matches + // the dtype, so iteration order is significant: a HashMap — or registration order, which + // depends on ServiceLoader's unspecified iteration and on how register* calls interleave — + // would make selection vary across runs and platforms. The order is now a pure function of + // the registered set, however it was assembled. + this.encoders = sortedByName(encoders, EncodingId::id); + this.extensions = sortedByName(extensions, ExtensionId::id); + } + + private static Map sortedByName(Map src, Function name) { + var sorted = new TreeMap(Comparator.comparing(name)); + sorted.putAll(src); + return Collections.unmodifiableMap(sorted); } /// Loads all service-discovered [EncodingEncoder] and [ExtensionEncoder] implementations. @@ -72,8 +88,8 @@ public ExtensionEncoder lookup(ExtensionId extensionId) { /// Not thread-safe. Build once, use everywhere — the produced [WriteRegistry] is immutable. public static final class Builder { - private final Map encoders = new HashMap<>(); - private final Map extensions = new HashMap<>(); + private final Map encoders = new TreeMap<>(); + private final Map extensions = new TreeMap<>(); private Builder() { } @@ -104,8 +120,11 @@ public Builder register(ExtensionEncoder extension) { return this; } - /// Registers every [EncodingEncoder] and [ExtensionEncoder] discovered via - /// [ServiceLoader]. + /// Registers every [EncodingEncoder] and [ExtensionEncoder] discovered via [ServiceLoader]. + /// + /// The order in which `ServiceLoader` yields providers is unspecified by the JDK, but it + /// does not matter here: [#build()] sorts the registered set by id, so the resulting + /// registry has the same deterministic order no matter how it was populated. /// /// @return this builder, for chaining /// @throws VortexException if a service-loaded entry collides with one already registered diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/ChunkedEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/ChunkedEncodingEncoder.java index 2d98de54..deb8e949 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/ChunkedEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/ChunkedEncodingEncoder.java @@ -35,7 +35,12 @@ public EncodingId encodingId() { @Override public boolean accepts(DType dtype) { - return dtype instanceof DType.Primitive || dtype instanceof DType.Struct; + // Carrier-dispatched, not dtype-dispatched: encode() requires a ChunkedData wrapper, which + // the dtype-based write path never produces. Returning false keeps this encoder out of + // first-match selection (VortexWriter.findEncoder and the cascade), where it would + // otherwise be handed a raw primitive array and throw ClassCastException. It stays usable + // by explicit construction / keyed lookup for callers that already hold ChunkedData. + return false; } @Override diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java index 2131213e..ded0e0c8 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java @@ -474,9 +474,13 @@ void writeAndRead_columnProjection_returnsOnlyRequestedColumns(@TempDir Path tmp } @Test - void create_withWriteRegistry_roundTrips(@TempDir Path tmp) throws IOException { - // Given — the WriteRegistry factory overload (global dict forced off, encoders taken from - // the registry). A round-trip exercises this otherwise-untested create path end to end. + void create_withFullRegistry_roundTrips(@TempDir Path tmp) throws IOException { + // Given — create(WriteRegistry) over loadAll(), the exact combination that broke on Windows. + // With no cascade, writeSegment picks the first registered encoder whose accepts() matches + // the dtype. loadAll() pulls in every service encoder, so this is the end-to-end guard that + // a deterministic order (WriteRegistry's TreeMap) plus an honest accepts() (composite + // encoders like ChunkedEncodingEncoder no longer claim raw primitive dtypes) keep selection + // both stable across platforms and correct. var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I64, false)), false); Path file = tmp.resolve("registry.vortex"); diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/WriteRegistryTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/WriteRegistryTest.java index dbcb41ba..1e1defe5 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/WriteRegistryTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/WriteRegistryTest.java @@ -6,6 +6,8 @@ import io.github.dfa1.vortex.writer.encode.EncodingEncoder; import org.junit.jupiter.api.Test; +import java.util.List; + import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.BDDMockito.given; @@ -125,4 +127,33 @@ void loadAll_discoversServiceLoadedEncoders() { // Then assertThat(sut.encoderMap()).isNotEmpty(); } + + @Test + void build_ordersEncodersByEncodingNameRegardlessOfRegistrationSequence() { + // Given — three encoders registered out of name order ("vortex.bool" < "vortex.constant" + // < "vortex.primitive") + EncodingEncoder bool = encoder(EncodingId.VORTEX_BOOL); + EncodingEncoder primitive = encoder(EncodingId.VORTEX_PRIMITIVE); + EncodingEncoder constant = encoder(EncodingId.VORTEX_CONSTANT); + + // When + WriteRegistry result = WriteRegistry.builder().register(primitive).register(bool).register(constant).build(); + + // Then — encoderMap iterates by encoding name no matter the registration sequence, so + // VortexWriter's first-match encoder selection is deterministic + assertThat(result.encoderMap().keySet()) + .containsExactly(EncodingId.VORTEX_BOOL, EncodingId.VORTEX_CONSTANT, EncodingId.VORTEX_PRIMITIVE); + } + + @Test + void loadAll_encoderOrderIsDeterministicallySortedByName() { + // Given — all service-loaded encoders + WriteRegistry registry = WriteRegistry.loadAll(); + + // When — the encoder names in iteration order + List result = registry.encoderMap().keySet().stream().map(EncodingId::id).toList(); + + // Then — sorted by name, independent of ServiceLoader's unspecified iteration order + assertThat(result).isSorted(); + } }