-
Notifications
You must be signed in to change notification settings - Fork 2.5k
feat(blob): default blob.inline.mode to DESCRIPTOR for Lance #18744
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
3c7300a
e96f620
a18b113
97bc5de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -208,16 +208,44 @@ class BatchedBlobReader( | |
| // Dispatch based on storage_type (field 0) | ||
| val storageType = accessor.getString(blobStruct, 0) | ||
| if (storageType == HoodieSchema.Blob.INLINE) { | ||
| // Case 1: Inline — bytes are in field 1 | ||
| val bytes = accessor.getBytes(blobStruct, 1) | ||
| batch += RowInfo[R]( | ||
| originalRow = row, | ||
| filePath = "", | ||
| offset = -1, | ||
| length = -1, | ||
| index = rowIndex, | ||
| inlineBytes = Some(bytes) | ||
| ) | ||
| // INLINE rows can arrive in two shapes: | ||
| // - CONTENT-mode read: inline_data populated with raw bytes (field 1). | ||
| // - DESCRIPTOR-mode read: inline_data is null and the reference struct | ||
| // (field 2) carries a synthetic positional pointer into the base file. | ||
| // read_blob() must return bytes in both cases, so we fall back to the | ||
| // descriptor's range read when inline_data is absent. | ||
| val inlineIsNull = accessor.isNullAt(blobStruct, 1) | ||
| if (!inlineIsNull) { | ||
| val bytes = accessor.getBytes(blobStruct, 1) | ||
| batch += RowInfo[R]( | ||
| originalRow = row, | ||
| filePath = "", | ||
| offset = -1, | ||
| length = -1, | ||
| index = rowIndex, | ||
| inlineBytes = Some(bytes) | ||
| ) | ||
| } else { | ||
| require(!accessor.isNullAt(blobStruct, 2), | ||
| s"INLINE blob at row $rowIndex has null inline_data and null reference; cannot resolve bytes") | ||
| val referenceStruct = accessor.getStruct(blobStruct, 2, HoodieSchema.Blob.getReferenceFieldCount) | ||
| val filePath = accessor.getString(referenceStruct, 0) | ||
| require(filePath != null && filePath.nonEmpty, | ||
| s"INLINE blob descriptor at row $rowIndex must have non-empty external_path") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 The INLINE-with-null-inline_data branch silently performs a remote pread against the synthetic descriptor's […] - AI-generated; verify before applying. React 👍/👎 to flag quality. |
||
| require(!accessor.isNullAt(referenceStruct, 1) && !accessor.isNullAt(referenceStruct, 2), | ||
| s"INLINE blob descriptor at row $rowIndex must set both offset and length") | ||
| val offset = accessor.getLong(referenceStruct, 1) | ||
| val length = accessor.getLong(referenceStruct, 2) | ||
| require(offset >= 0, s"INLINE blob descriptor offset must be non-negative for '$filePath': $offset") | ||
| require(length >= 0, s"INLINE blob descriptor length must be non-negative for '$filePath': $length") | ||
| batch += RowInfo[R]( | ||
| originalRow = row, | ||
| filePath = filePath, | ||
| offset = offset, | ||
| length = length, | ||
| index = rowIndex | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 The […] - AI-generated; verify before applying. React 👍/👎 to flag quality. |
||
| ) | ||
| } | ||
| } else if (storageType == HoodieSchema.Blob.OUT_OF_LINE) { | ||
| // Case 2 or 3: Out-of-line — get reference struct (field 2) | ||
| require(!accessor.isNullAt(blobStruct, 2), s"Out-of-line blob at row $rowIndex must set reference") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤖 This is a user-visible behavior change for
`SELECT *` on INLINE blob columns. The config carries `sinceVersion("1.2.0")` — has 1.2.0 already shipped CONTENT as the default? If yes, the flip should bump `sinceVersion`/document the breaking change explicitly; if no, it would help to call that out so reviewers can confirm no released version locks in the old default. Either way, a short "Migration / Upgrade" subsection in the RFC pointing users to set `hoodie.read.blob.inline.mode=CONTENT` to preserve prior behavior would make this easier to roll out. - AI-generated; verify before applying. React 👍/👎 to flag quality.