-
Notifications
You must be signed in to change notification settings - Fork 2.5k
feat(lance): round-trip Hudi VECTOR columns as native Lance fixed-size lists #18497
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2ceee88
ee93e5a
099aadf
ff5beb3
66276af
4a0acc8
e703935
aa47b75
40bef3c
2ed72a5
21c4186
92d75af
574aca0
e1d6825
01b6733
158b2d0
a8f2c55
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,9 @@ | |
| import org.apache.hudi.common.engine.TaskContextSupplier; | ||
| import org.apache.hudi.common.model.HoodieKey; | ||
| import org.apache.hudi.common.model.HoodieRecord; | ||
| import org.apache.hudi.common.schema.HoodieSchema; | ||
| import org.apache.hudi.common.util.Option; | ||
| import org.apache.hudi.exception.HoodieNotSupportedException; | ||
| import org.apache.hudi.io.lance.HoodieBaseLanceWriter; | ||
| import org.apache.hudi.io.storage.row.HoodieBloomFilterRowWriteSupport; | ||
| import org.apache.hudi.io.storage.row.HoodieInternalRowFileWriter; | ||
|
|
@@ -36,11 +38,16 @@ | |
| import org.apache.arrow.vector.VectorSchemaRoot; | ||
| import org.apache.arrow.vector.types.pojo.Schema; | ||
| import org.apache.spark.sql.catalyst.InternalRow; | ||
| import org.apache.spark.sql.types.Metadata; | ||
| import org.apache.spark.sql.types.MetadataBuilder; | ||
| import org.apache.spark.sql.types.StructField; | ||
| import org.apache.spark.sql.types.StructType; | ||
| import org.apache.spark.sql.util.LanceArrowUtils; | ||
| import org.apache.spark.unsafe.types.UTF8String; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.Collections; | ||
| import java.util.Map; | ||
| import java.util.function.Function; | ||
|
|
||
| import static org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField.COMMIT_SEQNO_METADATA_FIELD; | ||
|
|
@@ -120,8 +127,8 @@ private HoodieSparkLanceWriter(StoragePath file, | |
| Option<BloomFilter> bloomFilterOpt, | ||
| long maxFileSize) { | ||
| super(file, DEFAULT_BATCH_SIZE, bloomFilterOpt.map(HoodieBloomFilterRowWriteSupport::new)); | ||
| this.sparkSchema = sparkSchema; | ||
| this.arrowSchema = LanceArrowUtils.toArrowSchema(sparkSchema, DEFAULT_TIMEZONE, true); | ||
| this.sparkSchema = enrichSparkSchemaForLanceVectors(sparkSchema); | ||
| this.arrowSchema = LanceArrowUtils.toArrowSchema(this.sparkSchema, DEFAULT_TIMEZONE, true); | ||
| this.fileName = UTF8String.fromString(file.getName()); | ||
| this.instantTime = UTF8String.fromString(instantTime); | ||
| this.populateMetaFields = populateMetaFields; | ||
|
|
@@ -132,6 +139,55 @@ private HoodieSparkLanceWriter(StoragePath file, | |
| }; | ||
| } | ||
|
|
||
| /** | ||
| * For every field carrying a Hudi VECTOR logical type annotation | ||
| * (Spark metadata key {@link HoodieSchema#TYPE_METADATA_FIELD} starting with {@code "VECTOR"}), | ||
| * auto-attach the lance-spark metadata key {@link LanceArrowUtils#ARROW_FIXED_SIZE_LIST_SIZE_KEY()} | ||
| * with the vector's dimension so that {@link LanceArrowUtils#toArrowSchema} emits a native | ||
| * Arrow {@code FixedSizeList<elem, dim>} (Lance's vector column encoding) and | ||
| * {@link LanceArrowWriter} selects its fixed-size-list field writer when serializing values. | ||
| * | ||
| * <p>Lance-spark keys vector columns off the per-field | ||
| * {@link LanceArrowUtils#ARROW_FIXED_SIZE_LIST_SIZE_KEY()} (literal: | ||
| * {@code arrow.fixed-size-list.size}) metadata entry (see Lance Spark CREATE TABLE docs); | ||
| * we derive it from the VECTOR dimension so users don't have to set it alongside the | ||
| * Hudi descriptor. | ||
| * | ||
| * <p>Currently only FLOAT and DOUBLE element vectors are supported on Lance, matching | ||
| * lance-spark's {@code VectorUtils.shouldBeFixedSizeList}. Other element types would | ||
| * silently fall through to a plain list write, so we fail fast instead. | ||
| */ | ||
| private static StructType enrichSparkSchemaForLanceVectors(StructType sparkSchema) { | ||
| Map<Integer, HoodieSchema.Vector> vectorColumns = | ||
| VectorConversionUtils.detectVectorColumnsFromMetadata(sparkSchema); | ||
| if (vectorColumns.isEmpty()) { | ||
| return sparkSchema; | ||
| } | ||
| StructField[] fields = sparkSchema.fields(); | ||
| StructField[] newFields = new StructField[fields.length]; | ||
| for (int i = 0; i < fields.length; i++) { | ||
| StructField field = fields[i]; | ||
| HoodieSchema.Vector vec = vectorColumns.get(i); | ||
| if (vec == null) { | ||
| newFields[i] = field; | ||
| continue; | ||
| } | ||
| HoodieSchema.Vector.VectorElementType elemType = vec.getVectorElementType(); | ||
| if (elemType != HoodieSchema.Vector.VectorElementType.FLOAT | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 - Generated by an AI agent and may contain mistakes. Please verify any suggestions before applying.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it is ok to add this defensive check here to throw |
||
| && elemType != HoodieSchema.Vector.VectorElementType.DOUBLE) { | ||
| throw new HoodieNotSupportedException( | ||
| "Lance base-file format currently supports FLOAT/DOUBLE VECTOR columns only; " | ||
| + "got element type " + elemType + " for field '" + field.name() + "'"); | ||
| } | ||
| Metadata enriched = new MetadataBuilder() | ||
| .withMetadata(field.metadata()) | ||
| .putLong(LanceArrowUtils.ARROW_FIXED_SIZE_LIST_SIZE_KEY(), vec.getDimension()) | ||
|
yihua marked this conversation as resolved.
|
||
| .build(); | ||
| newFields[i] = new StructField(field.name(), field.dataType(), field.nullable(), enriched); | ||
| } | ||
| return new StructType(newFields); | ||
| } | ||
|
rahil-c marked this conversation as resolved.
|
||
|
|
||
| @Override | ||
| public void writeRowWithMetadata(HoodieKey key, InternalRow row) throws IOException { | ||
| UTF8String recordKey = UTF8String.fromString(key.getRecordKey()); | ||
|
|
@@ -198,6 +254,27 @@ protected Schema getArrowSchema() { | |
| return arrowSchema; | ||
| } | ||
|
|
||
| /** | ||
| * Emit Hudi's {@code hoodie.vector.columns} footer entry alongside any | ||
| * bloom-filter metadata. Mirrors the Parquet writer (see | ||
| * {@code HoodieRowParquetWriteSupport#init}) so Lance files carry the same | ||
| * self-describing VECTOR descriptor list that Parquet files do. | ||
| * | ||
| * <p>The read side today derives VECTOR identity from the Arrow | ||
| * {@code FixedSizeList<Float/Double, N>} type — this footer entry is a | ||
| * forward-compat guard: it lets future readers recover the exact descriptor | ||
| * (including fields the Arrow type cannot express, e.g. quantization tags) | ||
| * without a writer bump. | ||
| */ | ||
| @Override | ||
| protected Map<String, String> additionalSchemaMetadata() { | ||
| String value = VectorConversionUtils.buildVectorColumnsFooterValue(sparkSchema); | ||
|
yihua marked this conversation as resolved.
|
||
| if (value.isEmpty()) { | ||
| return Collections.emptyMap(); | ||
| } | ||
| return Collections.singletonMap(HoodieSchema.VECTOR_COLUMNS_METADATA_KEY, value); | ||
| } | ||
|
|
||
| /** | ||
| * Update Hudi metadata fields in the InternalRow. | ||
| * | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
VECTOR_COLUMNS_METADATA_KEY in the footer is now used. We can remove this usage later.