Skip to content

Commit b219a2d

Browse files
authored
add DataSetInfo and some guards around loading errors (#637)
* add DataSetInfo * expand type-safe metadata support for datasets * add docs to dataset_metadata * fix inheritDoc directive * update rat settings * restore correct dataset layering
1 parent 2adb12d commit b219a2d

24 files changed

Lines changed: 1171 additions & 58 deletions

File tree

jvector-examples/pom.xml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@
1616
</properties>
1717
<build>
1818
<plugins>
19+
<plugin>
20+
<groupId>org.apache.maven.plugins</groupId>
21+
<artifactId>maven-surefire-plugin</artifactId>
22+
<configuration>
23+
<skip>false</skip>
24+
<workingDirectory>${project.parent.basedir}</workingDirectory>
25+
</configuration>
26+
</plugin>
1927
<plugin>
2028
<groupId>org.codehaus.mojo</groupId>
2129
<artifactId>exec-maven-plugin</artifactId>

jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ public static void main(String[] args) throws IOException {
132132
try {
133133
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
134134
() -> new RuntimeException("Dataset " + datasetName + " not found")
135-
);
135+
).getDataSet();
136136
logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size());
137137

138138
String normalizedDatasetName = datasetName;

jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ private static void execute(Pattern pattern, boolean enableIndexCache, List<Func
9393
for (var datasetName : datasetNames) {
9494
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
9595
() -> new RuntimeException("Dataset " + datasetName + " not found")
96-
);
96+
).getDataSet();
9797
Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
9898
}
9999
}

jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ public static void main(String[] args) throws IOException {
120120
String datasetName = config.dataset;
121121
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
122122
() -> new RuntimeException("Could not load dataset:" + datasetName)
123-
);
123+
).getDataSet();
124124
// Register dataset info the first time we actually load the dataset for benchmarking
125125
artifacts.registerDataset(datasetName, ds);
126126

jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ public static void main(String[] args) throws IOException {
3838

3939
// Load dataset
4040
var ds = new DataSetLoaderMFD().loadDataSet(datasetName)
41-
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"));
41+
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"))
42+
.getDataSet();
4243

4344
// Run artifacts + selections (sys_info/dataset_info/experiments.csv)
4445
RunArtifacts artifacts = RunArtifacts.open(runCfg, List.of(config));
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/*
2+
* Copyright DataStax, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package io.github.jbellis.jvector.example.benchmarks.datasets;
18+
19+
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
20+
import java.util.Optional;
21+
import java.util.function.Supplier;
22+
23+
/// A lightweight, lazy handle that separates *identifying* a dataset from *loading* its data.
24+
///
25+
/// Metadata such as the dataset name and similarity function are available immediately
26+
/// without any I/O, while the expensive work of reading vectors, deduplicating, scrubbing
27+
/// zero vectors, and normalizing is deferred until the first call to {@link #getDataSet()}.
28+
///
29+
/// This design allows callers to enumerate or filter available datasets cheaply, and
30+
/// ensures that the full load-and-scrub pipeline runs at most once per handle thanks to
31+
/// thread-safe caching.
32+
///
33+
/// Instances are created by {@link DataSetLoader} implementations; callers obtain them
34+
/// through {@link DataSets#loadDataSet(String)}.
35+
///
36+
/// ### Typical usage
37+
/// ```java
38+
/// DataSetInfo info = DataSets.loadDataSet("ada002-100k").orElseThrow();
39+
///
40+
/// // Cheap — no vectors loaded yet
41+
/// System.out.println(info.getName());
42+
/// System.out.println(info.similarityFunction());
43+
///
44+
/// // First call triggers full load; subsequent calls return the cached DataSet
45+
/// DataSet ds = info.getDataSet();
46+
/// ```
47+
///
48+
/// @see DataSet
49+
/// @see DataSetLoader
50+
/// @see DataSets
51+
public class DataSetInfo implements DataSetProperties {
52+
private final Supplier<DataSet> loader;
53+
private final DataSetProperties baseProperties;
54+
private volatile DataSet cached;
55+
56+
/// Creates a new dataset info handle.
57+
///
58+
/// The supplied {@code loader} will not be invoked until {@link #getDataSet()} is called.
59+
/// It should perform the full load-and-scrub pipeline (read vectors, remove duplicates /
60+
/// zero vectors, filter queries, normalize) and return a ready-to-use {@link DataSet}.
61+
///
62+
/// @param baseProperties the dataset properties (name, similarity function, etc.)
63+
/// @param loader a supplier that performs the deferred load; invoked at most once
64+
public DataSetInfo(DataSetProperties baseProperties, Supplier<DataSet> loader) {
65+
this.baseProperties = baseProperties;
66+
this.loader = loader;
67+
}
68+
69+
/**
70+
* {@inheritDoc}
71+
*/
72+
@Override
73+
public Optional<VectorSimilarityFunction> similarityFunction() {
74+
return baseProperties.similarityFunction();
75+
}
76+
77+
/**
78+
* {@inheritDoc}
79+
*/
80+
@Override
81+
public int numVectors() {
82+
return this.baseProperties.numVectors();
83+
}
84+
85+
/**
86+
* {@inheritDoc}
87+
*/
88+
@Override
89+
public String getName() {
90+
return baseProperties.getName();
91+
}
92+
93+
/**
94+
* {@inheritDoc}
95+
*/
96+
@Override
97+
public boolean isNormalized() {
98+
return baseProperties.isNormalized();
99+
}
100+
101+
/**
102+
* {@inheritDoc}
103+
*/
104+
@Override
105+
public boolean isZeroVectorFree() {
106+
return baseProperties.isZeroVectorFree();
107+
}
108+
109+
/**
110+
* {@inheritDoc}
111+
*/
112+
@Override
113+
public boolean isDuplicateVectorFree() {
114+
return baseProperties.isDuplicateVectorFree();
115+
}
116+
117+
/// Returns the fully loaded and scrubbed {@link DataSet}.
118+
///
119+
/// On the first invocation this triggers the deferred load pipeline, which may involve
120+
/// reading large vector files from disk, deduplication, zero-vector removal, and
121+
/// normalization. The result is cached so that subsequent calls return immediately.
122+
///
123+
/// This method is thread-safe: concurrent callers will block until the first load
124+
/// completes, after which all callers share the same cached instance.
125+
///
126+
/// @return the ready-to-use {@link DataSet}
127+
public DataSet getDataSet() {
128+
if (cached == null) {
129+
synchronized (this) {
130+
if (cached == null) {
131+
cached = loader.get();
132+
}
133+
}
134+
}
135+
return cached;
136+
}
137+
}

jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,28 @@
2323
*/
2424
public interface DataSetLoader {
2525
/**
26-
* Implementations of this method <EM>MUST NOT</EM> throw exceptions related to the presence or absence of a
26+
* Looks up a dataset by name and returns a lightweight {@link DataSetInfo} handle.
27+
*
28+
* <p>The returned handle provides the dataset name and similarity function immediately,
29+
* without loading vector data into memory. The full {@link DataSet} (vectors, ground truth,
30+
* etc.) is loaded lazily on the first call to {@link DataSetInfo#getDataSet()}.
31+
*
32+
* <p>Implementations <em>MUST NOT</em> throw exceptions related to the presence or absence of a
2733
* requested dataset. Instead, {@link Optional} should be used. Other errors should still be indicated with
28-
* exceptions as usual, including any errors loading a dataset which has been found. Implementors should reliably
29-
* return from this method, avoiding any {@link System#exit(int)} or similar calls.
34+
* exceptions as usual, including any errors downloading or preparing a dataset which has been found.
35+
* Implementors should reliably return from this method, avoiding any {@link System#exit(int)} or similar calls.
36+
*
37+
* <p>Implementations may perform file downloads or other preparation work before returning the handle,
38+
* but should defer the expensive parsing and scrubbing of vector data to the {@link DataSetInfo} supplier.
3039
*
3140
* <HR/>
3241
*
33-
* Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
42+
* <p>Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
3443
* not found, and info level for when datasets are found and loaded. This can assist users troubleshooting
3544
* diverse data sources.
3645
*
37-
* @param dataSetName
38-
* @return a {@link DataSet}, if found
46+
* @param dataSetName the logical dataset name (not a filename; do not include extensions like {@code .hdf5})
47+
* @return a {@link DataSetInfo} handle for the dataset, if found
3948
*/
40-
Optional<DataSet> loadDataSet(String dataSetName);
49+
Optional<DataSetInfo> loadDataSet(String dataSetName);
4150
}

jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java

Lines changed: 53 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -34,30 +34,42 @@
3434
import java.util.ArrayList;
3535
import java.util.Arrays;
3636
import java.util.List;
37+
import java.util.Map;
3738
import java.util.Optional;
3839
import java.util.stream.IntStream;
3940

4041
/**
4142
* This dataset loader will get and load hdf5 files from <a href="https://ann-benchmarks.com/">ann-benchmarks</a>.
43+
*
44+
* <p>The vector similarity function is first inferred from the filename (e.g. {@code -angular},
45+
* {@code -euclidean}). If the filename does not contain a recognized suffix, the loader falls
46+
* back to looking up the dataset in {@code dataset_metadata.yml} via {@link DataSetMetadataReader}.
47+
* If neither source provides a similarity function, an error is thrown.
4248
*/
4349
public class DataSetLoaderHDF5 implements DataSetLoader {
4450
public static final Path HDF5_DIR = Path.of("hdf5");
4551
private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
4652
public static final String HDF5_EXTN = ".hdf5";
53+
private static final DataSetMetadataReader metadata = DataSetMetadataReader.load();
4754

4855
/**
4956
* {@inheritDoc}
5057
*/
51-
public Optional<DataSet> loadDataSet(String datasetName) {
52-
return maybeDownloadHdf5(datasetName).map(this::readHdf5Data);
58+
public Optional<DataSetInfo> loadDataSet(String datasetName) {
59+
return maybeDownloadHdf5(datasetName).map(path -> {
60+
var props = getProperties(datasetName, path);
61+
var similarity = props.similarityFunction()
62+
.orElseThrow(() -> new IllegalArgumentException(
63+
"No similarity function found for HDF5 dataset: " + datasetName
64+
+ ". Either include -angular, -dot, or -euclidean in the filename,"
65+
+ " or add an entry in dataset_metadata.yml"));
66+
return new DataSetInfo(props, () -> readHdf5Data(path, similarity));
67+
});
5368
}
5469

55-
private DataSet readHdf5Data(Path path) {
56-
57-
// infer the similarity
58-
VectorSimilarityFunction similarityFunction = getVectorSimilarityFunction(path);
59-
60-
// read the data
70+
/// Reads base vectors, query vectors, and ground truth from an HDF5 file
71+
/// and returns a scrubbed {@link DataSet}.
72+
private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunction) {
6173
VectorFloat<?>[] baseVectors;
6274
VectorFloat<?>[] queryVectors;
6375
var gtSets = new ArrayList<List<Integer>>();
@@ -94,27 +106,43 @@ private DataSet readHdf5Data(Path path) {
94106
return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets);
95107
}
96108

97-
/**
98-
* Derive the similarity function from the dataset name.
99-
* @param filename filename of the dataset AKA "name"
100-
* @return The matching similarity function, or throw an error
101-
*/
102-
private static VectorSimilarityFunction getVectorSimilarityFunction(Path filename) {
103-
VectorSimilarityFunction similarityFunction;
104-
if (filename.toString().contains("-angular") || filename.toString().contains("-dot")) {
105-
similarityFunction = VectorSimilarityFunction.COSINE;
109+
/// Derives dataset properties from the filename, falling back to {@link DataSetMetadataReader}.
110+
///
111+
/// The filename is checked first for known suffixes ({@code -angular}, {@code -dot},
112+
/// {@code -euclidean}) to infer the similarity function. If none match, the dataset name
113+
/// is looked up in {@code dataset_metadata.yml}. If neither source provides properties,
114+
/// a minimal {@link DataSetProperties} with an empty similarity function is returned
115+
/// so that the caller can produce a clear error.
116+
///
117+
/// @param datasetName the logical dataset name (without {@code .hdf5} extension)
118+
/// @param filename the resolved file path including the {@code .hdf5} extension
119+
/// @return the dataset properties
120+
private static DataSetProperties getProperties(String datasetName, Path filename) {
121+
String filenameStr = filename.toString();
122+
VectorSimilarityFunction inferred = null;
123+
if (filenameStr.contains("-angular") || filenameStr.contains("-dot")) {
124+
inferred = VectorSimilarityFunction.COSINE;
125+
} else if (filenameStr.contains("-euclidean")) {
126+
inferred = VectorSimilarityFunction.EUCLIDEAN;
106127
}
107-
else if (filename.toString().contains("-euclidean")) {
108-
similarityFunction = VectorSimilarityFunction.EUCLIDEAN;
109-
}
110-
else {
111-
throw new IllegalArgumentException("Unknown similarity function -- expected angular or euclidean for " + filename);
128+
129+
// If filename inference succeeded, build properties with just the SF
130+
if (inferred != null) {
131+
return new DataSetProperties.PropertyMap(Map.of(
132+
DataSetProperties.KEY_NAME, datasetName,
133+
DataSetProperties.KEY_SIMILARITY_FUNCTION, inferred));
112134
}
113-
return similarityFunction;
135+
136+
// Fall back to metadata YAML
137+
return metadata.getProperties(datasetName)
138+
.orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName)));
114139
}
115140

141+
/// Downloads the HDF5 file for the given dataset if it is not already present locally.
142+
///
143+
/// @param datasetName the logical dataset name (without {@code .hdf5} extension)
144+
/// @return the local path to the HDF5 file, or empty if the remote file was not found
116145
private Optional<Path> maybeDownloadHdf5(String datasetName) {
117-
118146
var dsFilePath = HDF5_DIR.resolve(datasetName+HDF5_EXTN);
119147

120148
if (Files.exists(dsFilePath)) {
@@ -123,7 +151,6 @@ private Optional<Path> maybeDownloadHdf5(String datasetName) {
123151

124152
// Download from https://ann-benchmarks.com/datasetName
125153
var url = "https://ann-benchmarks.com/" + datasetName + HDF5_EXTN;
126-
System.out.println("Downloading: " + url);
127154

128155
HttpURLConnection connection;
129156
while (true) {
@@ -148,6 +175,7 @@ private Optional<Path> maybeDownloadHdf5(String datasetName) {
148175

149176
try (InputStream in = connection.getInputStream()) {
150177
Files.createDirectories(dsFilePath.getParent());
178+
System.out.println("Downloading: " + url);
151179
Files.copy(in, dsFilePath, StandardCopyOption.REPLACE_EXISTING);
152180
} catch (IOException e) {
153181
throw new RuntimeException("Error downloading data:" + e.getMessage(),e);

0 commit comments

Comments
 (0)