Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,19 @@ Trunk-based. PRs fine but always squash or rebase — no merge commits. Keep com
./bench JavaVsJniReadBenchmark.javaReadVolume # benchmark — always ClassName.methodName filter
```

Regenerate after editing `.fbs`/`.proto`:
Regenerate after editing `.fbs`/`.proto` (both generators are in-house, no external tools):

```bash
brew install flatbuffers # only for .fbs edits (any flatc version; guard auto-stripped)
./mvnw compile -pl proto-gen # only on .proto edits
./mvnw compile -pl fbs-gen,proto-gen # build the generators
./mvnw generate-sources -pl core -P regenerate-sources # then commit
```

`flatc` runs whenever the profile is active; if you only changed `.proto`, revert spurious
`fbs/` diffs: `git checkout -- core/src/main/java/io/github/dfa1/vortex/fbs/`. Proto-to-Java
is in-process via `proto-gen` (no `protoc`/`protobuf-java`): one record per message with static
`decode(MemorySegment, long, long)` + `encode()` operating directly on a segment.
Both schema languages are compiled in-process to MemorySegment-native Java, with no
`flatc`/`protoc` and no `com.google.flatbuffers`/`protobuf-java` runtime (ADR 0017):
- **`.fbs` → `fbs-gen`** (`io.github.dfa1.vortex.fbsgen`): generates readers extending
`FbsTable`/`FbsStruct` and builders over `FbsBuilder` (all in `io.github.dfa1.vortex.fbsrt`).
- **`.proto` → `proto-gen`**: one record per message with static `decode(MemorySegment, long,
long)` + `encode()` operating directly on a segment.

### Mutation testing

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,9 @@ private void indexStatsChildren(InspectorTree.Node node) {
}

private static boolean hasLeadingStats(Layout layout) {
java.nio.ByteBuffer meta = layout.metadata();
return meta != null && meta.hasRemaining() && meta.get(meta.position()) == 1;
MemorySegment meta = layout.metadata();
return meta != null && meta.byteSize() > 0
&& meta.get(java.lang.foreign.ValueLayout.JAVA_BYTE, 0) == 1;
}

private void prefetchTopColumns() {
Expand Down Expand Up @@ -671,8 +672,7 @@ private void runStatsLoad(InspectorTree.Node anchor) {
statsCache.put(anchor, new DataState.Failed("no column dtype"));
return;
}
DType.Struct statsDtype = ZonedStatsSchema.statsTableDtype(
columnDtype, anchorLayout.metadata());
DType.Struct statsDtype = ZonedStatsSchema.statsTableDtype(columnDtype, anchorLayout.metadata());
if (statsDtype.fieldNames().isEmpty()) {
statsCache.put(anchor, new DataState.Failed("no stats present in metadata"));
return;
Expand Down
94 changes: 21 additions & 73 deletions core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@
</description>

<dependencies>
<!-- production -->
<dependency>
<groupId>com.google.flatbuffers</groupId>
<artifactId>flatbuffers-java</artifactId>
</dependency>
<!-- testing -->
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down Expand Up @@ -59,53 +54,51 @@

<!--
Generated sources (src/main/java/…/fbs and …/proto) are committed to the repo.
Normal builds need no external tools.
Normal builds need no external tools — both generators are pure in-house Java.

To regenerate after editing .fbs or .proto schemas:
brew install flatbuffers
./mvnw compile -pl fbs-gen,proto-gen
./mvnw generate-sources -pl core -P regenerate-sources
Then commit the updated files.
Any flatc version works — the profile strips the version guard automatically.
-->
<profiles>
<profile>
<id>regenerate-sources</id>
<build>
<plugins>
<!-- 1. Generate Java from FlatBuffer schemas -->
<!--
Generate MemorySegment-native Java from the .fbs (FlatBuffers) and .proto
(Protobuf) schemas via the in-house vortex-fbs-gen / vortex-proto-gen tools.

Pre-step: run `./mvnw compile -pl fbs-gen,proto-gen` once so these execs find
the classes. We use direct execs (rather than declaring the generators as Maven
deps) to avoid artificial provided-scope deps leaking into the published core POM.
-->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>flatc-generate</id>
<id>fbsgen-generate</id>
<phase>generate-sources</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>flatc</executable>
<workingDirectory>${project.basedir}/src/main/flatbuffers</workingDirectory>
<executable>java</executable>
<arguments>
<argument>--java</argument>
<argument>-I</argument>
<argument>.</argument>
<argument>-o</argument>
<argument>${project.basedir}/src/main/java</argument>
<argument>vortex-file/footer.fbs</argument>
<argument>vortex-layout/layout.fbs</argument>
<argument>vortex-array/array.fbs</argument>
<argument>vortex-dtype/dtype.fbs</argument>
<argument>-cp</argument>
<argument>${project.basedir}/../fbs-gen/target/classes</argument>
<argument>io.github.dfa1.vortex.fbsgen.Main</argument>
<argument>--out</argument>
<argument>${project.basedir}/src/main/java/io/github/dfa1/vortex/fbs</argument>
<argument>${project.basedir}/src/main/flatbuffers/vortex-array/array.fbs</argument>
<argument>${project.basedir}/src/main/flatbuffers/vortex-dtype/dtype.fbs</argument>
<argument>${project.basedir}/src/main/flatbuffers/vortex-file/footer.fbs</argument>
<argument>${project.basedir}/src/main/flatbuffers/vortex-layout/layout.fbs</argument>
</arguments>
</configuration>
</execution>
<!--
2. Generate MemorySegment-native Java from Protobuf schemas via vortex-proto-gen.

Pre-step: run `./mvnw compile -pl proto-gen` once so this exec finds the classes.
We use a direct exec (rather than declaring vortex-proto-gen as a Maven dep) to avoid
an artificial provided-scope dep that would leak into the published core POM.
-->
<execution>
<id>protogen-generate</id>
<phase>generate-sources</phase>
Expand All @@ -128,51 +121,6 @@
</execution>
</executions>
</plugin>

<!--
Why we strip ValidateVersion():
flatc injects a `public static void ValidateVersion()` into every generated
class that calls `Constants.FLATBUFFERS_25_X_Y()` — a method that only exists
in the flatbuffers-java jar built from the *same* flatc release. This is a
developer-experience guard: it prevents accidentally pairing a stale generated
file with a newer runtime jar.

The problem: flatbuffers-java on Maven Central lags the flatc CLI by months.
As of 2026-06, Maven Central tops out at 25.2.10 while `brew install flatbuffers`
installs 25.12.19. Any attempt to compile freshly generated code against the
Maven Central jar fails with "cannot find symbol: Constants.FLATBUFFERS_25_12_19".

The strip is safe because:
- The FlatBuffers binary wire format is stable across releases; 25.2.10 can
read bytes written by code generated with any 25.x flatc.
- We commit the stripped files, so the check is irrelevant at compile time for
normal builds (no flatc involved).
- The strip only runs inside the `regenerate-sources` profile, not during
normal `./mvnw verify` builds.

Re-evaluate if flatbuffers-java is ever published to Maven Central in lock-step
with flatc releases — at that point the antrun step can be removed.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<executions>
<execution>
<id>flatc-strip-version-guard</id>
<phase>generate-sources</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<target>
<replaceregexp match=" public static void ValidateVersion\(\) \{ Constants\.[^}]+\}&#xA;" replace="" flags="g">
<fileset dir="${project.basedir}/src/main/java/io/github/dfa1/vortex/fbs" includes="*.java" />
</replaceregexp>
</target>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
Expand Down
49 changes: 45 additions & 4 deletions core/src/main/java/io/github/dfa1/vortex/core/DType.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package io.github.dfa1.vortex.core;

import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.ArrayList;
import java.util.LinkedHashMap;

Expand Down Expand Up @@ -267,7 +267,7 @@ record FixedSizeList(DType elementType, int fixedSize, boolean nullable) impleme
record Extension(
String extensionId,
DType storageDType,
ByteBuffer metadata,
MemorySegment metadata,
boolean nullable
) implements DType {

Expand All @@ -280,11 +280,52 @@ record Extension(
/// @throws VortexException if `metadata` carries more than
/// [#MAX_METADATA_SIZE] readable bytes
public Extension {
if (metadata != null && metadata.remaining() > MAX_METADATA_SIZE) {
if (metadata != null && metadata.byteSize() > MAX_METADATA_SIZE) {
throw new VortexException("extension metadata too large: "
+ metadata.remaining() + " > " + MAX_METADATA_SIZE);
+ metadata.byteSize() + " > " + MAX_METADATA_SIZE);
}
}

/// Value equality with content-based metadata comparison.
///
/// [MemorySegment] uses identity equality, so the default record `equals` would
/// treat a heap-built dtype and the same dtype read back from a mapped file as
/// unequal. Extension is a value type (compared in schema round-trips and as a
/// component of [Struct]/[List] dtypes), so metadata is compared by bytes —
/// matching the content semantics the previous `ByteBuffer` component had.
///
/// @param o other object
/// @return whether `o` is an Extension with equal id, storage, nullability and metadata bytes
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof Extension other)) {
return false;
}
return nullable == other.nullable
&& extensionId.equals(other.extensionId)
&& storageDType.equals(other.storageDType)
&& metadataEquals(metadata, other.metadata);
}

/// @return a hash consistent with [#equals(Object)] (metadata hashed by content)
@Override
public int hashCode() {
int h = java.util.Objects.hash(extensionId, storageDType, nullable);
if (metadata != null) {
h = 31 * h + java.util.Arrays.hashCode(metadata.toArray(java.lang.foreign.ValueLayout.JAVA_BYTE));
}
return h;
}

private static boolean metadataEquals(MemorySegment a, MemorySegment b) {
if (a == null || b == null) {
return a == b;
}
return a.byteSize() == b.byteSize() && a.mismatch(b) == -1;
}
}

/// Variant logical type for semi-structured data (analogous to Parquet variant / JSON).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import io.github.dfa1.vortex.core.VortexException;
import io.github.dfa1.vortex.encoding.TimeUnit;

import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;

/// Static factories and metadata accessors for `vortex.time` extension dtypes.
///
Expand Down Expand Up @@ -36,8 +37,7 @@ public static DType.Extension of(TimeUnit unit, boolean nullable) {
case Microseconds, Nanoseconds -> PType.I64;
case Days -> throw new IllegalArgumentException("Days unit not valid for vortex.time");
};
ByteBuffer meta = ByteBuffer.allocate(1);
meta.put(0, (byte) unit.ordinal());
MemorySegment meta = MemorySegment.ofArray(new byte[]{(byte) unit.ordinal()});
return new DType.Extension(
ExtensionId.VORTEX_TIME.id(),
new DType.Primitive(storage, nullable),
Expand All @@ -51,10 +51,10 @@ public static DType.Extension of(TimeUnit unit, boolean nullable) {
/// @return the recorded time unit
/// @throws VortexException if metadata is missing or empty
public static TimeUnit readUnit(DType.Extension ext) {
ByteBuffer meta = ext.metadata();
if (meta == null || !meta.hasRemaining()) {
MemorySegment meta = ext.metadata();
if (meta == null || meta.byteSize() == 0) {
throw new VortexException("missing TimeUnit metadata byte for " + ext.extensionId());
}
return TimeUnit.fromTag(meta.get(meta.position()));
return TimeUnit.fromTag(meta.get(ValueLayout.JAVA_BYTE, 0));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import io.github.dfa1.vortex.core.VortexException;
import io.github.dfa1.vortex.encoding.TimeUnit;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import static io.github.dfa1.vortex.encoding.PTypeIO.LE_SHORT;

import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.Optional;
Expand Down Expand Up @@ -37,12 +39,10 @@ public static DType.Extension of(boolean nullable) {
/// @return matching extension dtype
public static DType.Extension of(TimeUnit unit, ZoneId zone, boolean nullable) {
byte[] tzBytes = zone == null ? new byte[0] : zone.getId().getBytes(StandardCharsets.UTF_8);
ByteBuffer meta = ByteBuffer.allocate(3 + tzBytes.length).order(ByteOrder.LITTLE_ENDIAN);
meta.put(0, (byte) unit.ordinal());
meta.putShort(1, (short) tzBytes.length);
for (int k = 0; k < tzBytes.length; k++) {
meta.put(3 + k, tzBytes[k]);
}
MemorySegment meta = MemorySegment.ofArray(new byte[3 + tzBytes.length]);
meta.set(ValueLayout.JAVA_BYTE, 0, (byte) unit.ordinal());
meta.set(LE_SHORT, 1, (short) tzBytes.length);
MemorySegment.copy(tzBytes, 0, meta, ValueLayout.JAVA_BYTE, 3, tzBytes.length);
return new DType.Extension(
ExtensionId.VORTEX_TIMESTAMP.id(),
new DType.Primitive(PType.I64, nullable),
Expand All @@ -56,11 +56,11 @@ public static DType.Extension of(TimeUnit unit, ZoneId zone, boolean nullable) {
/// @return the recorded time unit
/// @throws VortexException if metadata is missing or empty
public static TimeUnit readUnit(DType.Extension ext) {
ByteBuffer meta = ext.metadata();
if (meta == null || !meta.hasRemaining()) {
MemorySegment meta = ext.metadata();
if (meta == null || meta.byteSize() == 0) {
throw new VortexException("missing TimeUnit metadata byte for " + ext.extensionId());
}
return TimeUnit.fromTag(meta.get(meta.position()));
return TimeUnit.fromTag(meta.get(ValueLayout.JAVA_BYTE, 0));
}

/// Reads the optional IANA timezone from the metadata's UTF-8 suffix.
Expand All @@ -69,25 +69,20 @@ public static TimeUnit readUnit(DType.Extension ext) {
/// @return parsed zone id, or empty when no timezone is recorded
/// @throws VortexException if the metadata is truncated mid-string
public static Optional<ZoneId> timezone(DType.Extension ext) {
ByteBuffer meta = ext.metadata();
if (meta == null || meta.remaining() < 3) {
MemorySegment meta = ext.metadata();
if (meta == null || meta.byteSize() < 3) {
return Optional.empty();
}
ByteBuffer le = meta.duplicate().order(ByteOrder.LITTLE_ENDIAN);
int basePos = le.position();
int tzLen = Short.toUnsignedInt(le.getShort(basePos + 1));
int tzLen = Short.toUnsignedInt(meta.get(LE_SHORT, 1));
if (tzLen == 0) {
return Optional.empty();
}
if (le.remaining() < 3 + tzLen) {
if (meta.byteSize() < 3 + tzLen) {
throw new VortexException(
"timestamp metadata truncated: declared tz_len="
+ tzLen + " but only " + (le.remaining() - 3) + " bytes available");
}
byte[] tzBytes = new byte[tzLen];
for (int k = 0; k < tzLen; k++) {
tzBytes[k] = le.get(basePos + 3 + k);
+ tzLen + " but only " + (meta.byteSize() - 3) + " bytes available");
}
byte[] tzBytes = meta.asSlice(3, tzLen).toArray(ValueLayout.JAVA_BYTE);
return Optional.of(ZoneId.of(new String(tzBytes, StandardCharsets.UTF_8)));
}
}
Loading
Loading