From 1d5b8a1ab6c3ed0c9239db0c2b1d0ac0bbecbd9a Mon Sep 17 00:00:00 2001 From: prrao87 <35005448+prrao87@users.noreply.github.com> Date: Thu, 5 Mar 2026 21:43:40 -0500 Subject: [PATCH 1/2] Add ts/rs snippets for table ingestion --- docs/snippets/tables.mdx | 28 ++++ docs/tables/create.mdx | 148 ++++++++++++++----- tests/rs/Cargo.toml | 4 + tests/rs/tables.rs | 304 +++++++++++++++++++++++++++++++++++++++ tests/ts/tables.test.ts | 161 +++++++++++++++++++++ 5 files changed, 609 insertions(+), 36 deletions(-) create mode 100644 tests/rs/tables.rs create mode 100644 tests/ts/tables.test.ts diff --git a/docs/snippets/tables.mdx b/docs/snippets/tables.mdx index c79f6e0..a2cc4c2 100644 --- a/docs/snippets/tables.mdx +++ b/docs/snippets/tables.mdx @@ -112,3 +112,31 @@ export const PyVersioningRollback = "# Let's roll back to before we added the ve export const PyVersioningUpdateData = "# Update author names to be more specific\ntable.update(where=\"author='Richard'\", values={\"author\": \"Richard Daniel Sanchez\"})\nrows_after_update = table.count_rows()\nprint(f\"Number of rows after update: {rows_after_update}\")\n"; +export const TsCreateEmptyTable = "const emptySchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 2,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"item\", new arrow.Utf8()),\n new arrow.Field(\"price\", new arrow.Float32()),\n]);\nconst emptyTable = await db.createEmptyTable(\n \"test_empty_table\",\n emptySchema,\n {\n mode: \"overwrite\",\n },\n);\n"; + +export const TsCreateTableCustomSchema = "const customSchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 4,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"lat\", new arrow.Float32()),\n new arrow.Field(\"long\", new arrow.Float32()),\n]);\n\nconst customSchemaData = lancedb.makeArrowTable(\n [\n { vector: [1.1, 1.2, 1.3, 1.4], lat: 45.5, long: -122.7 
},\n { vector: [0.2, 1.8, 0.4, 3.6], lat: 40.1, long: -74.1 },\n ],\n { schema: customSchema },\n);\nconst customSchemaTable = await db.createTable(\n \"my_table_custom_schema\",\n customSchemaData,\n { mode: \"overwrite\" },\n);\n"; + +export const TsCreateTableFromArrow = "const arrowSchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 16,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"text\", new arrow.Utf8()),\n]);\nconst arrowData = lancedb.makeArrowTable(\n [\n { vector: Array(16).fill(0.1), text: \"foo\" },\n { vector: Array(16).fill(0.2), text: \"bar\" },\n ],\n { schema: arrowSchema },\n);\nconst arrowTable = await db.createTable(\"f32_tbl\", arrowData, {\n mode: \"overwrite\",\n});\n"; + +export const TsCreateTableFromDicts = "type Location = {\n vector: number[];\n lat: number;\n long: number;\n};\n\nconst data: Location[] = [\n { vector: [1.1, 1.2], lat: 45.5, long: -122.7 },\n { vector: [0.2, 1.8], lat: 40.1, long: -74.1 },\n];\nconst table = await db.createTable(\"test_table\", data, {\n mode: \"overwrite\",\n});\n"; + +export const TsCreateTableFromIterator = "const batchSchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 4,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"item\", new arrow.Utf8()),\n new arrow.Field(\"price\", new arrow.Float32()),\n]);\n\nconst tableForBatches = await db.createEmptyTable(\n \"batched_table\",\n batchSchema,\n {\n mode: \"overwrite\",\n },\n);\n\nconst rows = Array.from({ length: 10 }, (_, i) => ({\n vector: [i + 0.1, i + 0.2, i + 0.3, i + 0.4],\n item: `item-${i + 1}`,\n price: (i + 1) * 10,\n}));\n\nconst chunkSize = 2;\nfor (let i = 0; i < rows.length; i += chunkSize) {\n const batch = lancedb.makeArrowTable(rows.slice(i, i + chunkSize), {\n schema: batchSchema,\n });\n await tableForBatches.add(batch);\n}\n"; + +export const TsDropTable = "await 
db.createTable(\"my_table\", [{ vector: [1.1, 1.2], lat: 45.5 }], {\n mode: \"overwrite\",\n});\n\nawait db.dropTable(\"my_table\");\n"; + +export const TsOpenExistingTable = "const openTableData = [{ vector: [1.1, 1.2], lat: 45.5, long: -122.7 }];\nawait db.createTable(\"test_table_open\", openTableData, {\n mode: \"overwrite\",\n});\n\nconsole.log(await db.tableNames());\n\nconst openedTable = await db.openTable(\"test_table_open\");\n"; + +export const RsCreateEmptyTable = "let empty_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 2),\n false,\n ),\n Field::new(\"item\", DataType::Utf8, false),\n Field::new(\"price\", DataType::Float32, false),\n]));\nlet empty_table = db\n .create_empty_table(\"test_empty_table\", empty_schema)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; + +export const RsCreateTableCustomSchema = "let custom_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 4),\n false,\n ),\n Field::new(\"lat\", DataType::Float32, false),\n Field::new(\"long\", DataType::Float32, false),\n]));\n\nlet custom_batch = RecordBatch::try_new(\n custom_schema.clone(),\n vec![\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![\n Some(vec![Some(1.1), Some(1.2), Some(1.3), Some(1.4)]),\n Some(vec![Some(0.2), Some(1.8), Some(0.4), Some(3.6)]),\n ],\n 4,\n ),\n ),\n Arc::new(Float32Array::from(vec![45.5, 40.1])),\n Arc::new(Float32Array::from(vec![-122.7, -74.1])),\n ],\n)\n.unwrap();\nlet custom_reader =\n RecordBatchIterator::new(vec![Ok(custom_batch)].into_iter(), custom_schema.clone());\nlet custom_table = db\n .create_table(\"my_table_custom_schema\", custom_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; + +export const RsCreateTableFromArrow = "let arrow_schema = Arc::new(Schema::new(vec![\n 
Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 16),\n false,\n ),\n Field::new(\"text\", DataType::Utf8, false),\n]));\n\nlet arrow_batch = RecordBatch::try_new(\n arrow_schema.clone(),\n vec![\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![Some(vec![Some(0.1); 16]), Some(vec![Some(0.2); 16])],\n 16,\n ),\n ),\n Arc::new(StringArray::from(vec![\"foo\", \"bar\"])),\n ],\n)\n.unwrap();\nlet arrow_reader =\n RecordBatchIterator::new(vec![Ok(arrow_batch)].into_iter(), arrow_schema.clone());\nlet arrow_table = db\n .create_table(\"arrow_table_example\", arrow_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; + +export const RsCreateTableFromDicts = "struct Location {\n vector: [f32; 2],\n lat: f32,\n long: f32,\n}\n\nlet data = vec![\n Location {\n vector: [1.1, 1.2],\n lat: 45.5,\n long: -122.7,\n },\n Location {\n vector: [0.2, 1.8],\n lat: 40.1,\n long: -74.1,\n },\n];\n\nlet schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 2),\n false,\n ),\n Field::new(\"lat\", DataType::Float32, false),\n Field::new(\"long\", DataType::Float32, false),\n]));\n\nlet batch = RecordBatch::try_new(\n schema.clone(),\n vec![\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n data.iter()\n .map(|row| Some(row.vector.iter().copied().map(Some).collect::>())),\n 2,\n ),\n ),\n Arc::new(Float32Array::from_iter_values(\n data.iter().map(|row| row.lat),\n )),\n Arc::new(Float32Array::from_iter_values(\n data.iter().map(|row| row.long),\n )),\n ],\n)\n.unwrap();\nlet reader = RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema.clone());\nlet table = db\n .create_table(\"test_table\", reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; + +export const RsCreateTableFromIterator = "let batch_schema = Arc::new(Schema::new(vec![\n Field::new(\n 
\"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 4),\n false,\n ),\n Field::new(\"item\", DataType::Utf8, false),\n Field::new(\"price\", DataType::Float32, false),\n]));\n\nlet batches = (0..5)\n .map(|i| {\n RecordBatch::try_new(\n batch_schema.clone(),\n vec![\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![\n Some(vec![Some(3.1 + i as f32), Some(4.1), Some(5.1), Some(6.1)]),\n Some(vec![\n Some(5.9),\n Some(26.5 + i as f32),\n Some(4.7),\n Some(32.8),\n ]),\n ],\n 4,\n ),\n ),\n Arc::new(StringArray::from(vec![\n format!(\"item{}\", i * 2 + 1),\n format!(\"item{}\", i * 2 + 2),\n ])),\n Arc::new(Float32Array::from(vec![\n ((i * 2 + 1) * 10) as f32,\n ((i * 2 + 2) * 10) as f32,\n ])),\n ],\n )\n .unwrap()\n })\n .collect::>();\n\nlet batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), batch_schema.clone());\nlet batch_table = db\n .create_table(\"batched_table\", batch_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; + +export const RsDropTable = "let drop_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 2),\n false,\n ),\n Field::new(\"lat\", DataType::Float32, false),\n]));\nlet drop_batch = RecordBatch::try_new(\n drop_schema.clone(),\n vec![\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![Some(vec![Some(1.1), Some(1.2)])],\n 2,\n ),\n ),\n Arc::new(Float32Array::from(vec![45.5])),\n ],\n)\n.unwrap();\nlet drop_reader =\n RecordBatchIterator::new(vec![Ok(drop_batch)].into_iter(), drop_schema.clone());\ndb.create_table(\"my_table\", drop_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n\ndb.drop_table(\"my_table\", &[]).await.unwrap();\n"; + +export const RsOpenExistingTable = "let open_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", 
DataType::Float32, true)), 2),\n false,\n ),\n Field::new(\"lat\", DataType::Float32, false),\n Field::new(\"long\", DataType::Float32, false),\n]));\nlet open_batch = RecordBatch::try_new(\n open_schema.clone(),\n vec![\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![Some(vec![Some(1.1), Some(1.2)])],\n 2,\n ),\n ),\n Arc::new(Float32Array::from(vec![45.5])),\n Arc::new(Float32Array::from(vec![-122.7])),\n ],\n)\n.unwrap();\nlet open_reader =\n RecordBatchIterator::new(vec![Ok(open_batch)].into_iter(), open_schema.clone());\ndb.create_table(\"test_table\", open_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n\nprintln!(\"{:?}\", db.table_names().execute().await.unwrap());\n\nlet opened_table = db.open_table(\"test_table\").execute().await.unwrap();\n"; + diff --git a/docs/tables/create.mdx b/docs/tables/create.mdx index 7fbd2ed..b80950a 100644 --- a/docs/tables/create.mdx +++ b/docs/tables/create.mdx @@ -4,31 +4,40 @@ sidebarTitle: "Ingesting data" description: Learn about different methods to ingest data into tables in LanceDB, including from various data sources and empty tables. 
icon: "cookie" --- +import { TsConnect, RsConnect } from '/snippets/connection.mdx'; import { PyCreateTableFromDicts as CreateTableFromDicts, + TsCreateTableFromDicts as TsCreateTableFromDicts, + RsCreateTableFromDicts as RsCreateTableFromDicts, PyCreateTableFromPandas as CreateTableFromPandas, PyCreateTableCustomSchema as CreateTableCustomSchema, + TsCreateTableCustomSchema as TsCreateTableCustomSchema, + RsCreateTableCustomSchema as RsCreateTableCustomSchema, PyCreateTableFromPolars as CreateTableFromPolars, PyCreateTableFromArrow as CreateTableFromArrow, + TsCreateTableFromArrow as TsCreateTableFromArrow, + RsCreateTableFromArrow as RsCreateTableFromArrow, PyCreateTableFromPydantic as CreateTableFromPydantic, PyCreateTableNestedSchema as CreateTableNestedSchema, PyCreateTableFromIterator as CreateTableFromIterator, + TsCreateTableFromIterator as TsCreateTableFromIterator, + RsCreateTableFromIterator as RsCreateTableFromIterator, PyOpenExistingTable as OpenExistingTable, + TsOpenExistingTable as TsOpenExistingTable, + RsOpenExistingTable as RsOpenExistingTable, PyCreateEmptyTable as CreateEmptyTable, + TsCreateEmptyTable as TsCreateEmptyTable, + RsCreateEmptyTable as RsCreateEmptyTable, PyCreateEmptyTablePydantic as CreateEmptyTablePydantic, PyDropTable as DropTable, + TsDropTable as TsDropTable, + RsDropTable as RsDropTable, PyTablesBasicConnect as TablesBasicConnect, PyTablesDocumentModel as TablesDocumentModel, PyTablesTzValidator as TablesTzValidator, } from '/snippets/tables.mdx'; -In LanceDB, tables store records with a defined schema that specifies column names and types. You can create LanceDB tables from these data formats: - -- Pandas DataFrames -- [Polars](https://pola.rs/) DataFrames -- Apache Arrow Tables - -The Python SDK additionally supports: +In LanceDB, tables store records with a defined schema that specifies column names and types. Across the SDKs, you can create tables from row-oriented data and Apache Arrow data structures. 
The Python SDK additionally supports: - PyArrow schemas for explicit schema control - `LanceModel` for Pydantic-based validation @@ -37,24 +46,80 @@ The Python SDK additionally supports: Initialize a LanceDB connection and create a table - {TablesBasicConnect} + + + {TsConnect} + + + + {RsConnect} + -LanceDB allows ingesting data from various sources - `dict`, `list[dict]`, `pd.DataFrame`, `pa.Table` or a `Iterator[pa.RecordBatch]`. Let's take a look at some of the these. +Depending on the SDK, LanceDB can ingest arrays of records, Arrow tables or record batches, and Arrow batch iterators or readers. Let's take a look at some of the common patterns. + +### From list of objects -### From list of tuples or dictionaries +You can provide a list of objects to create a table. The Python and TypeScript SDKs +support lists/arrays of dictionaries, while the Rust SDK supports lists of structs. {CreateTableFromDicts} + + + {TsCreateTableFromDicts} + + + + {RsCreateTableFromDicts} + + + +### From a custom schema + +You can define a custom Arrow schema for the table. This is useful when you want to have more control over the column types and metadata. + + + + {CreateTableCustomSchema} + + + + {TsCreateTableCustomSchema} + + + + {RsCreateTableCustomSchema} + + + +### From an Arrow Table +You can also create LanceDB tables directly from Arrow tables. +Rust uses an Arrow `RecordBatchReader` for the same Arrow-native ingest flow. + + + + {CreateTableFromArrow} + + + + {TsCreateTableFromArrow} + + + + {RsCreateTableFromArrow} + + ### From a Pandas DataFrame +Python Only @@ -70,15 +135,8 @@ Data is converted to Arrow before being written to disk. For maximum control ove The **`vector`** column needs to be a [Vector](/integrations/data/pydantic#vector-field) (defined as [pyarrow.FixedSizeList](https://arrow.apache.org/docs/python/generated/pyarrow.list_.html)) type. 
-#### From a custom schema - - - - {CreateTableCustomSchema} - - - ### From a Polars DataFrame +Python Only LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow @@ -91,17 +149,8 @@ is on the way. -### From an Arrow Table -You can also create LanceDB tables directly from Arrow tables. -LanceDB supports float16 data type! - - - - {CreateTableFromArrow} - - - ### From Pydantic Models +Python Only When you create an empty table without data, you must specify the table schema. LanceDB supports creating tables by specifying a PyArrow schema or a specialized @@ -170,19 +219,23 @@ When you run this code it, should raise the `ValidationError`. ### Using Iterators / Writing Large Datasets -It is recommended to use iterators to add large datasets in batches when creating your table in one go. This does not create multiple versions of your dataset unlike manually adding batches using `table.add()` - -LanceDB additionally supports PyArrow's `RecordBatch` Iterators or other generators producing supported data types. - -Here's an example using using `RecordBatch` iterator for creating tables. +For large ingests, prefer batching instead of adding one row at a time. Python and Rust can create a table directly from Arrow batch iterators or readers. In TypeScript, the practical pattern today is to create an empty table and append Arrow batches in chunks. {CreateTableFromIterator} + + + {TsCreateTableFromIterator} + + + + {RsCreateTableFromIterator} + -You can also use iterators of other types like Pandas DataFrame or Pylists directly in the above example. +Python can also consume iterators of other supported types like Pandas DataFrames or Python lists. 
## Open existing tables @@ -192,6 +245,14 @@ If you forget the name of your table, you can always get a listing of all table {OpenExistingTable} + + + {TsOpenExistingTable} + + + + {RsOpenExistingTable} + ## Creating empty table @@ -199,12 +260,20 @@ You can create an empty table for scenarios where you want to add data to the ta An example would be when you want to collect data from a stream/external file and then add it to a table in batches. -An empty table can be initialized via a PyArrow schema. +An empty table can be initialized via an Arrow schema. {CreateEmptyTable} + + + {TsCreateEmptyTable} + + + + {RsCreateEmptyTable} + Alternatively, you can also use Pydantic to specify the schema for the empty table. Note that we do not @@ -228,9 +297,16 @@ Use the `drop_table()` method on the database to remove a table. {DropTable} + + + {TsDropTable} + + + + {RsDropTable} + This permanently removes the table and is not recoverable, unlike deleting rows. By default, if the table does not exist an exception is raised. To suppress this, you can pass in `ignore_missing=True`. 
- diff --git a/tests/rs/Cargo.toml b/tests/rs/Cargo.toml index 5efbf96..10f7537 100644 --- a/tests/rs/Cargo.toml +++ b/tests/rs/Cargo.toml @@ -28,3 +28,7 @@ path = "connection.rs" [[example]] name = "quickstart" path = "quickstart.rs" + +[[example]] +name = "tables" +path = "tables.rs" diff --git a/tests/rs/tables.rs b/tests/rs/tables.rs new file mode 100644 index 0000000..ea7a35f --- /dev/null +++ b/tests/rs/tables.rs @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The LanceDB Authors + +use std::sync::Arc; + +use arrow_array::types::Float32Type; +use arrow_array::{ + FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator, StringArray, +}; +use arrow_schema::{DataType, Field, Schema}; +use lancedb::connect; +use lancedb::database::CreateTableMode; + +#[tokio::main] +async fn main() { + let temp_dir = tempfile::tempdir().unwrap(); + let db = connect(temp_dir.path().to_str().unwrap()) + .execute() + .await + .unwrap(); + + // --8<-- [start:create_table_from_dicts] + struct Location { + vector: [f32; 2], + lat: f32, + long: f32, + } + + let data = vec![ + Location { + vector: [1.1, 1.2], + lat: 45.5, + long: -122.7, + }, + Location { + vector: [0.2, 1.8], + lat: 40.1, + long: -74.1, + }, + ]; + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 2), + false, + ), + Field::new("lat", DataType::Float32, false), + Field::new("long", DataType::Float32, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new( + FixedSizeListArray::from_iter_primitive::( + data.iter() + .map(|row| Some(row.vector.iter().copied().map(Some).collect::>())), + 2, + ), + ), + Arc::new(Float32Array::from_iter_values( + data.iter().map(|row| row.lat), + )), + Arc::new(Float32Array::from_iter_values( + data.iter().map(|row| row.long), + )), + ], + ) + .unwrap(); + let reader = 
RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema.clone()); + let table = db + .create_table("test_table", reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:create_table_from_dicts] + assert_eq!(table.count_rows(None).await.unwrap(), 2); + + // --8<-- [start:create_table_custom_schema] + let custom_schema = Arc::new(Schema::new(vec![ + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4), + false, + ), + Field::new("lat", DataType::Float32, false), + Field::new("long", DataType::Float32, false), + ])); + + let custom_batch = RecordBatch::try_new( + custom_schema.clone(), + vec![ + Arc::new( + FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1.1), Some(1.2), Some(1.3), Some(1.4)]), + Some(vec![Some(0.2), Some(1.8), Some(0.4), Some(3.6)]), + ], + 4, + ), + ), + Arc::new(Float32Array::from(vec![45.5, 40.1])), + Arc::new(Float32Array::from(vec![-122.7, -74.1])), + ], + ) + .unwrap(); + let custom_reader = + RecordBatchIterator::new(vec![Ok(custom_batch)].into_iter(), custom_schema.clone()); + let custom_table = db + .create_table("my_table_custom_schema", custom_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:create_table_custom_schema] + assert_eq!(custom_table.count_rows(None).await.unwrap(), 2); + + // --8<-- [start:create_table_from_arrow] + let arrow_schema = Arc::new(Schema::new(vec![ + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 16), + false, + ), + Field::new("text", DataType::Utf8, false), + ])); + + let arrow_batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![ + Arc::new( + FixedSizeListArray::from_iter_primitive::( + vec![Some(vec![Some(0.1); 16]), Some(vec![Some(0.2); 16])], + 16, + ), + ), + Arc::new(StringArray::from(vec!["foo", "bar"])), + ], + ) + .unwrap(); + let arrow_reader = + 
RecordBatchIterator::new(vec![Ok(arrow_batch)].into_iter(), arrow_schema.clone()); + let arrow_table = db + .create_table("arrow_table_example", arrow_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:create_table_from_arrow] + assert_eq!(arrow_table.count_rows(None).await.unwrap(), 2); + + // --8<-- [start:create_table_from_iterator] + let batch_schema = Arc::new(Schema::new(vec![ + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4), + false, + ), + Field::new("item", DataType::Utf8, false), + Field::new("price", DataType::Float32, false), + ])); + + let batches = (0..5) + .map(|i| { + RecordBatch::try_new( + batch_schema.clone(), + vec![ + Arc::new( + FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(3.1 + i as f32), Some(4.1), Some(5.1), Some(6.1)]), + Some(vec![ + Some(5.9), + Some(26.5 + i as f32), + Some(4.7), + Some(32.8), + ]), + ], + 4, + ), + ), + Arc::new(StringArray::from(vec![ + format!("item{}", i * 2 + 1), + format!("item{}", i * 2 + 2), + ])), + Arc::new(Float32Array::from(vec![ + ((i * 2 + 1) * 10) as f32, + ((i * 2 + 2) * 10) as f32, + ])), + ], + ) + .unwrap() + }) + .collect::>(); + + let batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), batch_schema.clone()); + let batch_table = db + .create_table("batched_table", batch_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:create_table_from_iterator] + assert_eq!(batch_table.count_rows(None).await.unwrap(), 10); + + // --8<-- [start:open_existing_table] + let open_schema = Arc::new(Schema::new(vec![ + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 2), + false, + ), + Field::new("lat", DataType::Float32, false), + Field::new("long", DataType::Float32, false), + ])); + let open_batch = RecordBatch::try_new( + open_schema.clone(), + vec![ + Arc::new( + 
FixedSizeListArray::from_iter_primitive::( + vec![Some(vec![Some(1.1), Some(1.2)])], + 2, + ), + ), + Arc::new(Float32Array::from(vec![45.5])), + Arc::new(Float32Array::from(vec![-122.7])), + ], + ) + .unwrap(); + let open_reader = + RecordBatchIterator::new(vec![Ok(open_batch)].into_iter(), open_schema.clone()); + db.create_table("test_table", open_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + println!("{:?}", db.table_names().execute().await.unwrap()); + + let opened_table = db.open_table("test_table").execute().await.unwrap(); + // --8<-- [end:open_existing_table] + assert_eq!(opened_table.count_rows(None).await.unwrap(), 1); + + // --8<-- [start:create_empty_table] + let empty_schema = Arc::new(Schema::new(vec![ + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 2), + false, + ), + Field::new("item", DataType::Utf8, false), + Field::new("price", DataType::Float32, false), + ])); + let empty_table = db + .create_empty_table("test_empty_table", empty_schema) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:create_empty_table] + assert_eq!(empty_table.count_rows(None).await.unwrap(), 0); + + // --8<-- [start:drop_table] + let drop_schema = Arc::new(Schema::new(vec![ + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 2), + false, + ), + Field::new("lat", DataType::Float32, false), + ])); + let drop_batch = RecordBatch::try_new( + drop_schema.clone(), + vec![ + Arc::new( + FixedSizeListArray::from_iter_primitive::( + vec![Some(vec![Some(1.1), Some(1.2)])], + 2, + ), + ), + Arc::new(Float32Array::from(vec![45.5])), + ], + ) + .unwrap(); + let drop_reader = + RecordBatchIterator::new(vec![Ok(drop_batch)].into_iter(), drop_schema.clone()); + db.create_table("my_table", drop_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + db.drop_table("my_table", 
&[]).await.unwrap(); + // --8<-- [end:drop_table] + assert!( + !db.table_names() + .execute() + .await + .unwrap() + .contains(&"my_table".to_string()) + ); +} diff --git a/tests/ts/tables.test.ts b/tests/ts/tables.test.ts new file mode 100644 index 0000000..2b173a4 --- /dev/null +++ b/tests/ts/tables.test.ts @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The LanceDB Authors +import { expect, test } from "@jest/globals"; +import * as arrow from "apache-arrow"; +import * as lancedb from "@lancedb/lancedb"; +import { withTempDirectory } from "./util.ts"; + +test("table creation snippets (async)", async () => { + await withTempDirectory(async (databaseDir) => { + const db = await lancedb.connect(databaseDir); + + // --8<-- [start:create_table_from_dicts] + type Location = { + vector: number[]; + lat: number; + long: number; + }; + + const data: Location[] = [ + { vector: [1.1, 1.2], lat: 45.5, long: -122.7 }, + { vector: [0.2, 1.8], lat: 40.1, long: -74.1 }, + ]; + const table = await db.createTable("test_table", data, { + mode: "overwrite", + }); + // --8<-- [end:create_table_from_dicts] + expect(await table.countRows()).toBe(2); + + // --8<-- [start:create_table_custom_schema] + const customSchema = new arrow.Schema([ + new arrow.Field( + "vector", + new arrow.FixedSizeList( + 4, + new arrow.Field("item", new arrow.Float32(), true), + ), + ), + new arrow.Field("lat", new arrow.Float32()), + new arrow.Field("long", new arrow.Float32()), + ]); + + const customSchemaData = lancedb.makeArrowTable( + [ + { vector: [1.1, 1.2, 1.3, 1.4], lat: 45.5, long: -122.7 }, + { vector: [0.2, 1.8, 0.4, 3.6], lat: 40.1, long: -74.1 }, + ], + { schema: customSchema }, + ); + const customSchemaTable = await db.createTable( + "my_table_custom_schema", + customSchemaData, + { mode: "overwrite" }, + ); + // --8<-- [end:create_table_custom_schema] + expect(await customSchemaTable.countRows()).toBe(2); + + // --8<-- 
[start:create_table_from_arrow] + const arrowSchema = new arrow.Schema([ + new arrow.Field( + "vector", + new arrow.FixedSizeList( + 16, + new arrow.Field("item", new arrow.Float32(), true), + ), + ), + new arrow.Field("text", new arrow.Utf8()), + ]); + const arrowData = lancedb.makeArrowTable( + [ + { vector: Array(16).fill(0.1), text: "foo" }, + { vector: Array(16).fill(0.2), text: "bar" }, + ], + { schema: arrowSchema }, + ); + const arrowTable = await db.createTable("f32_tbl", arrowData, { + mode: "overwrite", + }); + // --8<-- [end:create_table_from_arrow] + expect(await arrowTable.countRows()).toBe(2); + + // --8<-- [start:create_table_from_iterator] + const batchSchema = new arrow.Schema([ + new arrow.Field( + "vector", + new arrow.FixedSizeList( + 4, + new arrow.Field("item", new arrow.Float32(), true), + ), + ), + new arrow.Field("item", new arrow.Utf8()), + new arrow.Field("price", new arrow.Float32()), + ]); + + const tableForBatches = await db.createEmptyTable( + "batched_table", + batchSchema, + { + mode: "overwrite", + }, + ); + + const rows = Array.from({ length: 10 }, (_, i) => ({ + vector: [i + 0.1, i + 0.2, i + 0.3, i + 0.4], + item: `item-${i + 1}`, + price: (i + 1) * 10, + })); + + const chunkSize = 2; + for (let i = 0; i < rows.length; i += chunkSize) { + const batch = lancedb.makeArrowTable(rows.slice(i, i + chunkSize), { + schema: batchSchema, + }); + await tableForBatches.add(batch); + } + // --8<-- [end:create_table_from_iterator] + expect(await tableForBatches.countRows()).toBe(10); + + // --8<-- [start:open_existing_table] + const openTableData = [{ vector: [1.1, 1.2], lat: 45.5, long: -122.7 }]; + await db.createTable("test_table_open", openTableData, { + mode: "overwrite", + }); + + console.log(await db.tableNames()); + + const openedTable = await db.openTable("test_table_open"); + // --8<-- [end:open_existing_table] + expect(await openedTable.countRows()).toBe(1); + + // --8<-- [start:create_empty_table] + const emptySchema = new 
arrow.Schema([ + new arrow.Field( + "vector", + new arrow.FixedSizeList( + 2, + new arrow.Field("item", new arrow.Float32(), true), + ), + ), + new arrow.Field("item", new arrow.Utf8()), + new arrow.Field("price", new arrow.Float32()), + ]); + const emptyTable = await db.createEmptyTable( + "test_empty_table", + emptySchema, + { + mode: "overwrite", + }, + ); + // --8<-- [end:create_empty_table] + expect(await emptyTable.countRows()).toBe(0); + + // --8<-- [start:drop_table] + await db.createTable("my_table", [{ vector: [1.1, 1.2], lat: 45.5 }], { + mode: "overwrite", + }); + + await db.dropTable("my_table"); + // --8<-- [end:drop_table] + expect(await db.tableNames()).not.toContain("my_table"); + }); +}); From 39721c73ce556aab5aee2408d53842a08957366a Mon Sep 17 00:00:00 2001 From: prrao87 <35005448+prrao87@users.noreply.github.com> Date: Thu, 5 Mar 2026 22:14:42 -0500 Subject: [PATCH 2/2] Add ts/rs snippets to schema evolution --- AGENTS.md | 6 +- docs/snippets/tables.mdx | 60 ++++++++ docs/tables/schema.mdx | 268 ++++++++++++++++++++++++-------- tests/py/test_tables.py | 31 +++- tests/rs/tables.rs | 324 ++++++++++++++++++++++++++++++++++++++- tests/ts/tables.test.ts | 238 ++++++++++++++++++++++++++++ 6 files changed, 856 insertions(+), 71 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 6b5156d..0cf09a6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,4 +17,8 @@ When running Python code, we have to cater to users of both pip and uv. 
- Look for a local virtual environment (typically in `.venv` or `venv`) - Activate the environment, so that you can run multiple code exampes in the same environment - Avoid using `uv run` directly, as you have issues running it in your sandbox -- Only fall back to the system `python3` to run code if the above steps don't work \ No newline at end of file +- Only fall back to the system `python3` to run code if the above steps don't work + +## Generate snippets + +- Generate the required code snippets using the provided Makefile: `make snippets` \ No newline at end of file diff --git a/docs/snippets/tables.mdx b/docs/snippets/tables.mdx index a2cc4c2..ba7655c 100644 --- a/docs/snippets/tables.mdx +++ b/docs/snippets/tables.mdx @@ -20,6 +20,8 @@ export const PyAlterColumnsNullable = "# Make the name column nullable\ntable.al export const PyAlterColumnsRename = "# Rename discount_price to sale_price\ntable.alter_columns({\"path\": \"discount_price\", \"rename\": \"sale_price\"})\n"; +export const PyAlterColumnsWithExpression = "# For custom transforms, create a new column from a SQL expression.\nexpression_table = tmp_db.create_table(\n \"schema_evolution_expression_example\",\n [{\"id\": 1, \"price_text\": \"$100\"}],\n mode=\"overwrite\",\n)\n\nexpression_table.add_columns(\n {\"price_numeric\": \"cast(replace(price_text, '$', '') as int)\"}\n)\nexpression_table.drop_columns([\"price_text\"])\nexpression_table.alter_columns({\"path\": \"price_numeric\", \"rename\": \"price\"})\n"; + export const PyAlterVectorColumn = "vector_dim = 768 # Your embedding dimension\ntable_name = \"vector_alter_example\"\ndb = tmp_db\ndata = [\n {\n \"id\": 1,\n \"embedding\": np.random.random(vector_dim).tolist(),\n },\n]\ntable = db.create_table(table_name, data, mode=\"overwrite\")\n\ntable.alter_columns(\n dict(path=\"embedding\", data_type=pa.list_(pa.float32(), vector_dim))\n)\n"; export const PyBatchDataInsertion = "import pyarrow as pa\n\ndef make_batches():\n for i in range(5): 
# Create 5 batches\n yield pa.RecordBatch.from_arrays(\n [\n pa.array([[3.1, 4.1], [5.9, 26.5]], pa.list_(pa.float32(), 2)),\n pa.array([f\"item{i*2+1}\", f\"item{i*2+2}\"]),\n pa.array([float((i * 2 + 1) * 10), float((i * 2 + 2) * 10)]),\n ],\n [\"vector\", \"item\", \"price\"],\n )\n\nschema = pa.schema(\n [\n pa.field(\"vector\", pa.list_(pa.float32(), 2)),\n pa.field(\"item\", pa.utf8()),\n pa.field(\"price\", pa.float32()),\n ]\n)\n# Create table with batches\ntable_name = \"batch_ingestion_example\"\ntable = db.create_table(table_name, make_batches(), schema=schema, mode=\"overwrite\")\n"; @@ -74,6 +76,8 @@ export const PySchemaAddSetup = "table_name = \"schema_evolution_add_example\"\n export const PySchemaAlterSetup = "table_name = \"schema_evolution_alter_example\"\nif data is None:\n data = [\n {\n \"id\": 1,\n \"name\": \"Laptop\",\n \"price\": 1200,\n \"discount_price\": 1080.0,\n \"vector\": np.random.random(128).tolist(),\n },\n {\n \"id\": 2,\n \"name\": \"Smartphone\",\n \"price\": 800,\n \"discount_price\": 720.0,\n \"vector\": np.random.random(128).tolist(),\n },\n ]\nschema = pa.schema(\n {\n \"id\": pa.int64(),\n \"name\": pa.string(),\n \"price\": pa.int32(),\n \"discount_price\": pa.float64(),\n \"vector\": pa.list_(pa.float32(), 128),\n }\n)\ntable = tmp_db.create_table(table_name, data, schema=schema, mode=\"overwrite\")\n"; +export const PySchemaDropSetup = "if data is None:\n data = [\n {\n \"id\": 1,\n \"name\": \"Laptop\",\n \"price\": 1200.00,\n \"temp_col1\": \"X\",\n \"temp_col2\": 100,\n \"vector\": np.random.random(128).tolist(),\n },\n {\n \"id\": 2,\n \"name\": \"Smartphone\",\n \"price\": 800.00,\n \"temp_col1\": \"Y\",\n \"temp_col2\": 200,\n \"vector\": np.random.random(128).tolist(),\n },\n {\n \"id\": 3,\n \"name\": \"Headphones\",\n \"price\": 150.00,\n \"temp_col1\": \"Z\",\n \"temp_col2\": 300,\n \"vector\": np.random.random(128).tolist(),\n },\n ]\ntable = tmp_db.create_table(\"schema_evolution_drop_example\", data, 
mode=\"overwrite\")\n"; + export const PyTablesBasicConnect = "import lancedb\n\nuri = \"data/sample-lancedb\"\ndb = lancedb.connect(uri)\n"; export const PyTablesDocumentModel = "from pydantic import BaseModel\n\nclass Document(BaseModel):\n content: str\n source: str\n"; @@ -112,6 +116,24 @@ export const PyVersioningRollback = "# Let's roll back to before we added the ve export const PyVersioningUpdateData = "# Update author names to be more specific\ntable.update(where=\"author='Richard'\", values={\"author\": \"Richard Daniel Sanchez\"})\nrows_after_update = table.count_rows()\nprint(f\"Number of rows after update: {rows_after_update}\")\n"; +export const TsAddColumnsCalculated = "// Add a discounted price column (10% discount)\nawait schemaAddTable.addColumns([\n {\n name: \"discounted_price\",\n valueSql: \"cast((price * 0.9) as float)\",\n },\n]);\n"; + +export const TsAddColumnsDefaultValues = "// Add a stock status column with default value\nawait schemaAddTable.addColumns([\n {\n name: \"in_stock\",\n valueSql: \"cast(true as boolean)\",\n },\n]);\n"; + +export const TsAddColumnsNullable = "// Add a nullable timestamp column\nawait schemaAddTable.addColumns([\n {\n name: \"last_ordered\",\n valueSql: \"cast(NULL as timestamp)\",\n },\n]);\n"; + +export const TsAlterColumnsDataType = "// Change price from int32 to int64 for larger numbers\nawait schemaAlterTable.alterColumns([\n { path: \"price\", dataType: new arrow.Int64() },\n]);\n"; + +export const TsAlterColumnsMultiple = "// Rename, change type, and make nullable in one operation\nawait schemaAlterTable.alterColumns([\n {\n path: \"sale_price\",\n rename: \"final_price\",\n dataType: new arrow.Float64(),\n nullable: true,\n },\n]);\n"; + +export const TsAlterColumnsNullable = "// Make the name column nullable\nawait schemaAlterTable.alterColumns([{ path: \"name\", nullable: true }]);\n"; + +export const TsAlterColumnsRename = "// Rename discount_price to sale_price\nawait 
schemaAlterTable.alterColumns([\n { path: \"discount_price\", rename: \"sale_price\" },\n]);\n"; + +export const TsAlterColumnsWithExpression = "// For custom transforms, create a new column from a SQL expression.\nconst expressionTable = await db.createTable(\n \"schema_evolution_expression_example\",\n [{ id: 1, price_text: \"$100\" }],\n { mode: \"overwrite\" },\n);\n\nawait expressionTable.addColumns([\n {\n name: \"price_numeric\",\n valueSql: \"cast(replace(price_text, '$', '') as int)\",\n },\n]);\nawait expressionTable.dropColumns([\"price_text\"]);\nawait expressionTable.alterColumns([\n { path: \"price_numeric\", rename: \"price\" },\n]);\n"; + +export const TsAlterVectorColumn = "const oldDim = 384;\nconst newDim = 1024;\nconst vectorSchema = new arrow.Schema([\n new arrow.Field(\"id\", new arrow.Int64()),\n new arrow.Field(\n \"embedding\",\n new arrow.FixedSizeList(\n oldDim,\n new arrow.Field(\"item\", new arrow.Float16(), true),\n ),\n true,\n ),\n]);\nconst vectorData = lancedb.makeArrowTable(\n [{ id: 1, embedding: Array.from({ length: oldDim }, () => Math.random()) }],\n { schema: vectorSchema },\n);\nconst vectorTable = await db.createTable(\"vector_alter_example\", vectorData, {\n mode: \"overwrite\",\n});\n\n// Changing FixedSizeList dimensions (384 -> 1024) is not supported via alterColumns.\n// Use addColumns + dropColumns + alterColumns(rename) to replace the column.\nawait vectorTable.addColumns([\n {\n name: \"embedding_v2\",\n valueSql: `arrow_cast(NULL, 'FixedSizeList(${newDim}, Float16)')`,\n },\n]);\nawait vectorTable.dropColumns([\"embedding\"]);\nawait vectorTable.alterColumns([{ path: \"embedding_v2\", rename: \"embedding\" }]);\n"; + export const TsCreateEmptyTable = "const emptySchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 2,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"item\", new arrow.Utf8()),\n new arrow.Field(\"price\", new 
arrow.Float32()),\n]);\nconst emptyTable = await db.createEmptyTable(\n \"test_empty_table\",\n emptySchema,\n {\n mode: \"overwrite\",\n },\n);\n"; export const TsCreateTableCustomSchema = "const customSchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 4,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"lat\", new arrow.Float32()),\n new arrow.Field(\"long\", new arrow.Float32()),\n]);\n\nconst customSchemaData = lancedb.makeArrowTable(\n [\n { vector: [1.1, 1.2, 1.3, 1.4], lat: 45.5, long: -122.7 },\n { vector: [0.2, 1.8, 0.4, 3.6], lat: 40.1, long: -74.1 },\n ],\n { schema: customSchema },\n);\nconst customSchemaTable = await db.createTable(\n \"my_table_custom_schema\",\n customSchemaData,\n { mode: \"overwrite\" },\n);\n"; @@ -122,10 +144,38 @@ export const TsCreateTableFromDicts = "type Location = {\n vector: number[];\n export const TsCreateTableFromIterator = "const batchSchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 4,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"item\", new arrow.Utf8()),\n new arrow.Field(\"price\", new arrow.Float32()),\n]);\n\nconst tableForBatches = await db.createEmptyTable(\n \"batched_table\",\n batchSchema,\n {\n mode: \"overwrite\",\n },\n);\n\nconst rows = Array.from({ length: 10 }, (_, i) => ({\n vector: [i + 0.1, i + 0.2, i + 0.3, i + 0.4],\n item: `item-${i + 1}`,\n price: (i + 1) * 10,\n}));\n\nconst chunkSize = 2;\nfor (let i = 0; i < rows.length; i += chunkSize) {\n const batch = lancedb.makeArrowTable(rows.slice(i, i + chunkSize), {\n schema: batchSchema,\n });\n await tableForBatches.add(batch);\n}\n"; +export const TsDropColumnsMultiple = "// Remove the second temporary column\nawait schemaDropTable.dropColumns([\"temp_col2\"]);\n"; + +export const TsDropColumnsSingle = "// Remove the first temporary column\nawait schemaDropTable.dropColumns([\"temp_col1\"]);\n"; 
+ export const TsDropTable = "await db.createTable(\"my_table\", [{ vector: [1.1, 1.2], lat: 45.5 }], {\n mode: \"overwrite\",\n});\n\nawait db.dropTable(\"my_table\");\n"; export const TsOpenExistingTable = "const openTableData = [{ vector: [1.1, 1.2], lat: 45.5, long: -122.7 }];\nawait db.createTable(\"test_table_open\", openTableData, {\n mode: \"overwrite\",\n});\n\nconsole.log(await db.tableNames());\n\nconst openedTable = await db.openTable(\"test_table_open\");\n"; +export const TsSchemaAddSetup = "const schemaAddData = [\n {\n id: 1,\n name: \"Laptop\",\n price: 1200.0,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n {\n id: 2,\n name: \"Smartphone\",\n price: 800.0,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n {\n id: 3,\n name: \"Headphones\",\n price: 150.0,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n];\nconst schemaAddTable = await db.createTable(\n \"schema_evolution_add_example\",\n schemaAddData,\n { mode: \"overwrite\" },\n);\n"; + +export const TsSchemaAlterSetup = "const schemaAlter = new arrow.Schema([\n new arrow.Field(\"id\", new arrow.Int64()),\n new arrow.Field(\"name\", new arrow.Utf8()),\n new arrow.Field(\"price\", new arrow.Int32()),\n new arrow.Field(\"discount_price\", new arrow.Float64()),\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 128,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n]);\nconst schemaAlterData = lancedb.makeArrowTable(\n [\n {\n id: 1,\n name: \"Laptop\",\n price: 1200,\n discount_price: 1080.0,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n {\n id: 2,\n name: \"Smartphone\",\n price: 800,\n discount_price: 720.0,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n ],\n { schema: schemaAlter },\n);\nconst schemaAlterTable = await db.createTable(\n \"schema_evolution_alter_example\",\n schemaAlterData,\n { mode: \"overwrite\" },\n);\n"; + +export const TsSchemaDropSetup = "const 
schemaDropData = [\n {\n id: 1,\n name: \"Laptop\",\n price: 1200.0,\n temp_col1: \"X\",\n temp_col2: 100,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n {\n id: 2,\n name: \"Smartphone\",\n price: 800.0,\n temp_col1: \"Y\",\n temp_col2: 200,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n {\n id: 3,\n name: \"Headphones\",\n price: 150.0,\n temp_col1: \"Z\",\n temp_col2: 300,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n];\nconst schemaDropTable = await db.createTable(\n \"schema_evolution_drop_example\",\n schemaDropData,\n { mode: \"overwrite\" },\n);\n"; + +export const RsAddColumnsCalculated = "// Add a discounted price column (10% discount)\nschema_add_table\n .add_columns(\n NewColumnTransform::SqlExpressions(vec![(\n \"discounted_price\".to_string(),\n \"cast((price * 0.9) as float)\".to_string(),\n )]),\n None,\n )\n .await\n .unwrap();\n"; + +export const RsAddColumnsDefaultValues = "// Add a stock status column with default value\nschema_add_table\n .add_columns(\n NewColumnTransform::SqlExpressions(vec![(\n \"in_stock\".to_string(),\n \"cast(true as boolean)\".to_string(),\n )]),\n None,\n )\n .await\n .unwrap();\n"; + +export const RsAddColumnsNullable = "// Add a nullable timestamp column\nschema_add_table\n .add_columns(\n NewColumnTransform::SqlExpressions(vec![(\n \"last_ordered\".to_string(),\n \"cast(NULL as timestamp)\".to_string(),\n )]),\n None,\n )\n .await\n .unwrap();\n"; + +export const RsAlterColumnsDataType = "// Change price from int32 to int64 for larger numbers\nschema_alter_table\n .alter_columns(&[ColumnAlteration::new(\"price\".to_string()).cast_to(DataType::Int64)])\n .await\n .unwrap();\n"; + +export const RsAlterColumnsMultiple = "// Rename, change type, and make nullable in one operation\nschema_alter_table\n .alter_columns(&[ColumnAlteration::new(\"sale_price\".to_string())\n .rename(\"final_price\".to_string())\n .cast_to(DataType::Float64)\n .set_nullable(true)])\n 
.await\n .unwrap();\n"; + +export const RsAlterColumnsNullable = "// Make the name column nullable\nschema_alter_table\n .alter_columns(&[ColumnAlteration::new(\"name\".to_string()).set_nullable(true)])\n .await\n .unwrap();\n"; + +export const RsAlterColumnsRename = "// Rename discount_price to sale_price\nschema_alter_table\n .alter_columns(&[ColumnAlteration::new(\"discount_price\".to_string())\n .rename(\"sale_price\".to_string())])\n .await\n .unwrap();\n"; + +export const RsAlterColumnsWithExpression = "// For custom transforms, create a new column from a SQL expression.\nlet expression_schema = Arc::new(Schema::new(vec![\n Field::new(\"id\", DataType::Int64, false),\n Field::new(\"price_text\", DataType::Utf8, false),\n]));\nlet expression_batch = RecordBatch::try_new(\n expression_schema.clone(),\n vec![\n Arc::new(Int64Array::from(vec![1])),\n Arc::new(StringArray::from(vec![\"$100\"])),\n ],\n)\n.unwrap();\nlet expression_reader = RecordBatchIterator::new(\n vec![Ok(expression_batch)].into_iter(),\n expression_schema.clone(),\n);\nlet expression_table = db\n .create_table(\"schema_evolution_expression_example\", expression_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n\nexpression_table\n .add_columns(\n NewColumnTransform::SqlExpressions(vec![(\n \"price_numeric\".to_string(),\n \"cast(replace(price_text, '$', '') as int)\".to_string(),\n )]),\n None,\n )\n .await\n .unwrap();\nexpression_table.drop_columns(&[\"price_text\"]).await.unwrap();\nexpression_table\n .alter_columns(&[ColumnAlteration::new(\"price_numeric\".to_string())\n .rename(\"price\".to_string())])\n .await\n .unwrap();\n"; + +export const RsAlterVectorColumn = "let old_dim = 384;\nlet new_dim = 1024;\nlet vector_schema = Arc::new(Schema::new(vec![\n Field::new(\"id\", DataType::Int64, false),\n Field::new(\n \"embedding\",\n DataType::FixedSizeList(\n Arc::new(Field::new(\"item\", DataType::Float32, true)),\n old_dim,\n ),\n true,\n ),\n]));\nlet 
vector_batch = RecordBatch::try_new(\n vector_schema.clone(),\n vec![\n Arc::new(Int64Array::from(vec![1])),\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![Some(vec![Some(0.1_f32); old_dim as usize])],\n old_dim,\n ),\n ),\n ],\n)\n.unwrap();\nlet vector_reader =\n RecordBatchIterator::new(vec![Ok(vector_batch)].into_iter(), vector_schema.clone());\nlet vector_table = db\n .create_table(\"vector_alter_example\", vector_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n\n// Changing FixedSizeList dimensions (384 -> 1024) is not supported via alter_columns.\n// Use add_columns + drop_columns + alter_columns(rename) to replace the column.\nvector_table\n .add_columns(\n NewColumnTransform::SqlExpressions(vec![(\n \"embedding_v2\".to_string(),\n format!(\"arrow_cast(NULL, 'FixedSizeList({}, Float32)')\", new_dim),\n )]),\n None,\n )\n .await\n .unwrap();\nvector_table.drop_columns(&[\"embedding\"]).await.unwrap();\nvector_table\n .alter_columns(&[ColumnAlteration::new(\"embedding_v2\".to_string())\n .rename(\"embedding\".to_string())])\n .await\n .unwrap();\n"; + export const RsCreateEmptyTable = "let empty_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 2),\n false,\n ),\n Field::new(\"item\", DataType::Utf8, false),\n Field::new(\"price\", DataType::Float32, false),\n]));\nlet empty_table = db\n .create_empty_table(\"test_empty_table\", empty_schema)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; export const RsCreateTableCustomSchema = "let custom_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 4),\n false,\n ),\n Field::new(\"lat\", DataType::Float32, false),\n Field::new(\"long\", DataType::Float32, false),\n]));\n\nlet custom_batch = RecordBatch::try_new(\n custom_schema.clone(),\n vec![\n 
Arc::new(\n FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n vec![\n Some(vec![Some(1.1), Some(1.2), Some(1.3), Some(1.4)]),\n Some(vec![Some(0.2), Some(1.8), Some(0.4), Some(3.6)]),\n ],\n 4,\n ),\n ),\n Arc::new(Float32Array::from(vec![45.5, 40.1])),\n Arc::new(Float32Array::from(vec![-122.7, -74.1])),\n ],\n)\n.unwrap();\nlet custom_reader =\n RecordBatchIterator::new(vec![Ok(custom_batch)].into_iter(), custom_schema.clone());\nlet custom_table = db\n .create_table(\"my_table_custom_schema\", custom_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; @@ -136,7 +186,17 @@ export const RsCreateTableFromDicts = "struct Location {\n vector: [f32; 2],\ export const RsCreateTableFromIterator = "let batch_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 4),\n false,\n ),\n Field::new(\"item\", DataType::Utf8, false),\n Field::new(\"price\", DataType::Float32, false),\n]));\n\nlet batches = (0..5)\n .map(|i| {\n RecordBatch::try_new(\n batch_schema.clone(),\n vec![\n Arc::new(\n FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n vec![\n Some(vec![Some(3.1 + i as f32), Some(4.1), Some(5.1), Some(6.1)]),\n Some(vec![\n Some(5.9),\n Some(26.5 + i as f32),\n Some(4.7),\n Some(32.8),\n ]),\n ],\n 4,\n ),\n ),\n Arc::new(StringArray::from(vec![\n format!(\"item{}\", i * 2 + 1),\n format!(\"item{}\", i * 2 + 2),\n ])),\n Arc::new(Float32Array::from(vec![\n ((i * 2 + 1) * 10) as f32,\n ((i * 2 + 2) * 10) as f32,\n ])),\n ],\n )\n .unwrap()\n })\n .collect::<Vec<_>>();\n\nlet batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), batch_schema.clone());\nlet batch_table = db\n .create_table(\"batched_table\", batch_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; +export const RsDropColumnsMultiple = "// Remove the second temporary column\nschema_drop_table.drop_columns(&[\"temp_col2\"]).await.unwrap();\n"; + +export const 
RsDropColumnsSingle = "// Remove the first temporary column\nschema_drop_table.drop_columns(&[\"temp_col1\"]).await.unwrap();\n"; + export const RsDropTable = "let drop_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 2),\n false,\n ),\n Field::new(\"lat\", DataType::Float32, false),\n]));\nlet drop_batch = RecordBatch::try_new(\n drop_schema.clone(),\n vec![\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![Some(vec![Some(1.1), Some(1.2)])],\n 2,\n ),\n ),\n Arc::new(Float32Array::from(vec![45.5])),\n ],\n)\n.unwrap();\nlet drop_reader =\n RecordBatchIterator::new(vec![Ok(drop_batch)].into_iter(), drop_schema.clone());\ndb.create_table(\"my_table\", drop_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n\ndb.drop_table(\"my_table\", &[]).await.unwrap();\n"; export const RsOpenExistingTable = "let open_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 2),\n false,\n ),\n Field::new(\"lat\", DataType::Float32, false),\n Field::new(\"long\", DataType::Float32, false),\n]));\nlet open_batch = RecordBatch::try_new(\n open_schema.clone(),\n vec![\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![Some(vec![Some(1.1), Some(1.2)])],\n 2,\n ),\n ),\n Arc::new(Float32Array::from(vec![45.5])),\n Arc::new(Float32Array::from(vec![-122.7])),\n ],\n)\n.unwrap();\nlet open_reader =\n RecordBatchIterator::new(vec![Ok(open_batch)].into_iter(), open_schema.clone());\ndb.create_table(\"test_table\", open_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n\nprintln!(\"{:?}\", db.table_names().execute().await.unwrap());\n\nlet opened_table = db.open_table(\"test_table\").execute().await.unwrap();\n"; +export const RsSchemaAddSetup = "let schema_add_schema = Arc::new(Schema::new(vec![\n Field::new(\"id\", 
DataType::Int64, false),\n Field::new(\"name\", DataType::Utf8, false),\n Field::new(\"price\", DataType::Float64, false),\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 128),\n false,\n ),\n]));\nlet schema_add_batch = RecordBatch::try_new(\n schema_add_schema.clone(),\n vec![\n Arc::new(Int64Array::from(vec![1, 2, 3])),\n Arc::new(StringArray::from(vec![\"Laptop\", \"Smartphone\", \"Headphones\"])),\n Arc::new(Float64Array::from(vec![1200.0, 800.0, 150.0])),\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![\n Some(vec![Some(0.1_f32); 128]),\n Some(vec![Some(0.2_f32); 128]),\n Some(vec![Some(0.3_f32); 128]),\n ],\n 128,\n ),\n ),\n ],\n)\n.unwrap();\nlet schema_add_reader = RecordBatchIterator::new(\n vec![Ok(schema_add_batch)].into_iter(),\n schema_add_schema.clone(),\n);\nlet schema_add_table = db\n .create_table(\"schema_evolution_add_example\", schema_add_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; + +export const RsSchemaAlterSetup = "let schema_alter_schema = Arc::new(Schema::new(vec![\n Field::new(\"id\", DataType::Int64, false),\n Field::new(\"name\", DataType::Utf8, false),\n Field::new(\"price\", DataType::Int32, false),\n Field::new(\"discount_price\", DataType::Float64, false),\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 128),\n false,\n ),\n]));\nlet schema_alter_batch = RecordBatch::try_new(\n schema_alter_schema.clone(),\n vec![\n Arc::new(Int64Array::from(vec![1, 2])),\n Arc::new(StringArray::from(vec![\"Laptop\", \"Smartphone\"])),\n Arc::new(Int32Array::from(vec![1200, 800])),\n Arc::new(Float64Array::from(vec![1080.0, 720.0])),\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![Some(vec![Some(0.1_f32); 128]), Some(vec![Some(0.2_f32); 128])],\n 128,\n ),\n ),\n ],\n)\n.unwrap();\nlet schema_alter_reader = RecordBatchIterator::new(\n 
vec![Ok(schema_alter_batch)].into_iter(),\n schema_alter_schema.clone(),\n);\nlet schema_alter_table = db\n .create_table(\"schema_evolution_alter_example\", schema_alter_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; + +export const RsSchemaDropSetup = "let schema_drop_schema = Arc::new(Schema::new(vec![\n Field::new(\"id\", DataType::Int64, false),\n Field::new(\"name\", DataType::Utf8, false),\n Field::new(\"price\", DataType::Float64, false),\n Field::new(\"temp_col1\", DataType::Utf8, false),\n Field::new(\"temp_col2\", DataType::Int32, false),\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 128),\n false,\n ),\n]));\nlet schema_drop_batch = RecordBatch::try_new(\n schema_drop_schema.clone(),\n vec![\n Arc::new(Int64Array::from(vec![1, 2, 3])),\n Arc::new(StringArray::from(vec![\"Laptop\", \"Smartphone\", \"Headphones\"])),\n Arc::new(Float64Array::from(vec![1200.0, 800.0, 150.0])),\n Arc::new(StringArray::from(vec![\"X\", \"Y\", \"Z\"])),\n Arc::new(Int32Array::from(vec![100, 200, 300])),\n Arc::new(\n FixedSizeListArray::from_iter_primitive::(\n vec![\n Some(vec![Some(0.1_f32); 128]),\n Some(vec![Some(0.2_f32); 128]),\n Some(vec![Some(0.3_f32); 128]),\n ],\n 128,\n ),\n ),\n ],\n)\n.unwrap();\nlet schema_drop_reader = RecordBatchIterator::new(\n vec![Ok(schema_drop_batch)].into_iter(),\n schema_drop_schema.clone(),\n);\nlet schema_drop_table = db\n .create_table(\"schema_evolution_drop_example\", schema_drop_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; + diff --git a/docs/tables/schema.mdx b/docs/tables/schema.mdx index 8659e5a..83f48ac 100644 --- a/docs/tables/schema.mdx +++ b/docs/tables/schema.mdx @@ -6,17 +6,47 @@ icon: "boxes-stacked" --- import { PySchemaAddSetup as SchemaAddSetup, + TsSchemaAddSetup as TsSchemaAddSetup, + RsSchemaAddSetup as RsSchemaAddSetup, PyAddColumnsCalculated as AddColumnsCalculated, + 
TsAddColumnsCalculated as TsAddColumnsCalculated, + RsAddColumnsCalculated as RsAddColumnsCalculated, PyAddColumnsDefaultValues as AddColumnsDefaultValues, + TsAddColumnsDefaultValues as TsAddColumnsDefaultValues, + RsAddColumnsDefaultValues as RsAddColumnsDefaultValues, PyAddColumnsNullable as AddColumnsNullable, + TsAddColumnsNullable as TsAddColumnsNullable, + RsAddColumnsNullable as RsAddColumnsNullable, PySchemaAlterSetup as SchemaAlterSetup, + TsSchemaAlterSetup as TsSchemaAlterSetup, + RsSchemaAlterSetup as RsSchemaAlterSetup, PyAlterColumnsRename as AlterColumnsRename, + TsAlterColumnsRename as TsAlterColumnsRename, + RsAlterColumnsRename as RsAlterColumnsRename, PyAlterColumnsDataType as AlterColumnsDataType, + TsAlterColumnsDataType as TsAlterColumnsDataType, + RsAlterColumnsDataType as RsAlterColumnsDataType, PyAlterColumnsNullable as AlterColumnsNullable, + TsAlterColumnsNullable as TsAlterColumnsNullable, + RsAlterColumnsNullable as RsAlterColumnsNullable, PyAlterColumnsMultiple as AlterColumnsMultiple, + TsAlterColumnsMultiple as TsAlterColumnsMultiple, + PyAlterColumnsWithExpression as AlterColumnsWithExpression, + TsAlterColumnsWithExpression as TsAlterColumnsWithExpression, + RsAlterColumnsMultiple as RsAlterColumnsMultiple, + RsAlterColumnsWithExpression as RsAlterColumnsWithExpression, + PySchemaDropSetup as SchemaDropSetup, + TsSchemaDropSetup as TsSchemaDropSetup, + RsSchemaDropSetup as RsSchemaDropSetup, PyDropColumnsSingle as DropColumnsSingle, + TsDropColumnsSingle as TsDropColumnsSingle, + RsDropColumnsSingle as RsDropColumnsSingle, PyDropColumnsMultiple as DropColumnsMultiple, + TsDropColumnsMultiple as TsDropColumnsMultiple, + RsDropColumnsMultiple as RsDropColumnsMultiple, PyAlterVectorColumn as AlterVectorColumn, + TsAlterVectorColumn as TsAlterVectorColumn, + RsAlterVectorColumn as RsAlterVectorColumn, } from '/snippets/tables.mdx'; Schema evolution enables non-breaking modifications to a database table's structure — such as adding 
columns, altering data types, or dropping fields — to adapt to evolving data requirements without service interruptions. @@ -26,7 +56,7 @@ LanceDB supports ACID-compliant schema evolution through granular operations (ad * Scale Seamlessly: Handle ML model iterations, regulatory changes, or feature additions * Optimize Continuously: Remove unused fields or enforce new constraints without downtime -## Schema Evolution Operations +## Schema evolution operations LanceDB supports three primary schema evolution operations: @@ -39,14 +69,14 @@ LanceDB supports three primary schema evolution operations: Schema evolution operations are applied immediately but do not typically require rewriting all data. However, data type changes may involve more substantial operations. -## Adding New Columns +## Add new columns You can add new columns to a table with the [`add_columns`](https://lancedb.github.io/lancedb/python/python/#lancedb.table.Table.add_columns) -method in Python or [`addColumns`](https://lancedb.github.io/lancedb/js/classes/Table/#addcolumns) in TypeScript/JavaScript. +method in Python, [`addColumns`](https://lancedb.github.io/lancedb/js/classes/Table/#addcolumns) in TypeScript/JavaScript, or `add_columns` in Rust. New columns are populated based on SQL expressions you provide. 
-### Setting Up the Example Table +### Set up the example table First, let's create a sample table with product data to demonstrate schema evolution: @@ -54,19 +84,35 @@ First, let's create a sample table with product data to demonstrate schema evolu {SchemaAddSetup} + + + {TsSchemaAddSetup} + + + + {RsSchemaAddSetup} + -### Adding Calculated Columns +### Add derived columns -You can add new columns that are calculated from existing data using SQL expressions: +You can add new columns that are derived from existing data using SQL expressions: {AddColumnsCalculated} + + + {TsAddColumnsCalculated} + + + + {RsAddColumnsCalculated} + -### Adding Columns with Default Values +### Add columns with default values Add boolean columns with default values for status tracking: @@ -74,9 +120,17 @@ Add boolean columns with default values for status tracking: {AddColumnsDefaultValues} + + + {TsAddColumnsDefaultValues} + + + + {RsAddColumnsDefaultValues} + -### Adding Nullable Columns +### Add nullable columns Add timestamp columns that can contain NULL values: @@ -84,23 +138,31 @@ Add timestamp columns that can contain NULL values: {AddColumnsNullable} + + + {TsAddColumnsNullable} + + + + {RsAddColumnsNullable} + When adding columns that should contain NULL values, be sure to cast the NULL to the appropriate type, e.g., `cast(NULL as timestamp)`. -## Altering Existing Columns +## Alter existing columns You can alter columns using the [`alter_columns`](https://lancedb.github.io/lancedb/python/python/#lancedb.table.Table.alter_columns) -method in Python or [`alterColumns`](https://lancedb.github.io/lancedb/js/classes/Table/#altercolumns) in TypeScript/JavaScript. This allows you to: +method in Python, [`alterColumns`](https://lancedb.github.io/lancedb/js/classes/Table/#altercolumns) in TypeScript/JavaScript, or `alter_columns` in Rust. 
This allows you to: - Rename a column - Change a column's data type - Modify nullability (whether a column can contain NULL values) -### Setting Up the Example Table +### Set up the example table Create a table with a custom schema to demonstrate column alterations: @@ -108,9 +170,17 @@ Create a table with a custom schema to demonstrate column alterations: {SchemaAlterSetup} + + + {TsSchemaAlterSetup} + + + + {RsSchemaAlterSetup} + -### Renaming Columns +### Rename columns Change column names to better reflect their purpose: @@ -118,9 +188,17 @@ Change column names to better reflect their purpose: {AlterColumnsRename} + + + {TsAlterColumnsRename} + + + + {RsAlterColumnsRename} + -### Changing Data Types +### Change data types Convert column data types for better performance or compatibility: @@ -128,19 +206,35 @@ Convert column data types for better performance or compatibility: {AlterColumnsDataType} + + + {TsAlterColumnsDataType} + + + + {RsAlterColumnsDataType} + -### Making Columns Nullable +### Make columns nullable -Allow columns to contain NULL values: +You can alter columns to contain NULL values: {AlterColumnsNullable} + + + {TsAlterColumnsNullable} + + + + {RsAlterColumnsNullable} + -### Multiple Changes at Once +### Multiple changes at once Apply several alterations in a single operation: @@ -148,56 +242,92 @@ Apply several alterations in a single operation: {AlterColumnsMultiple} + + + {TsAlterColumnsMultiple} + + + + {RsAlterColumnsMultiple} + + + +### Expression-based type changes + +For transformations that are not simple casts (for example, converting `"$100"` to an integer), use a SQL-expression column add, then drop and rename: + + + + {AlterColumnsWithExpression} + + + + {TsAlterColumnsWithExpression} + + + + {RsAlterColumnsWithExpression} + +### Alter embedding types and dimensions + +It's quite common to need to change an embedding column's schema, in case a new model becomes available with a different embedding dimension. 
+- In Python, the example shows an in-place type update when the cast is compatible. +- In TypeScript and Rust, the example shows a dimension change (`384 -> 1024`), which cannot be cast in-place. + +For dimension changes, use this 3-step pattern: add a new column with the target type, drop the old column, then rename the new column to the original name. + + + + {AlterVectorColumn} + + + + {TsAlterVectorColumn} + + + + {RsAlterVectorColumn} + + + + +**`FixedSizeList` Dimension Changes in TypeScript and Rust** + +`alterColumns` / `alter_columns` can cast between compatible types, but changing `FixedSizeList` dimensions (for example `384 -> 1024`) is not a compatible cast. +For such cases, use `addColumns` / `add_columns` (with `arrow_cast`), then `dropColumns` / `drop_columns`, then rename the replacement column. + + Changing data types requires rewriting the column data and may be resource-intensive for large tables. Renaming columns or changing nullability is more efficient as it only updates metadata. -## Dropping Columns +## Drop columns You can remove columns using the [`drop_columns`](https://lancedb.github.io/lancedb/python/python/#lancedb.table.Table.drop_columns) - method in Python or [`dropColumns`](https://lancedb.github.io/lancedb/js/classes/Table/#dropcolumns) in TypeScript/JavaScript. + method in Python, [`dropColumns`](https://lancedb.github.io/lancedb/js/classes/Table/#dropcolumns) in TypeScript/JavaScript, or `drop_columns` in Rust. 
-### Setting Up the Example Table +### Set up the example table Create a table with temporary columns that we'll remove: -```python -table_name = "schema_evolution_drop_example" - -data = [ - { - "id": 1, - "name": "Laptop", - "price": 1200.00, - "temp_col1": "X", - "temp_col2": 100, - "vector": np.random.random(128).tolist(), - }, - { - "id": 2, - "name": "Smartphone", - "price": 800.00, - "temp_col1": "Y", - "temp_col2": 200, - "vector": np.random.random(128).tolist(), - }, - { - "id": 3, - "name": "Headphones", - "price": 150.00, - "temp_col1": "Z", - "temp_col2": 300, - "vector": np.random.random(128).tolist(), - }, -] - -table = db.create_table(table_name, data, mode="overwrite") -``` - -### Dropping Single Columns + + + {SchemaDropSetup} + + + + {TsSchemaDropSetup} + + + + {RsSchemaDropSetup} + + + +### Drop single columns Remove individual columns that are no longer needed: @@ -205,9 +335,17 @@ Remove individual columns that are no longer needed: {DropColumnsSingle} + + + {TsDropColumnsSingle} + + + + {RsDropColumnsSingle} + -### Dropping Multiple Columns +### Drop multiple columns Remove several columns at once for efficiency: @@ -215,23 +353,17 @@ Remove several columns at once for efficiency: {DropColumnsMultiple} + + + {TsDropColumnsMultiple} + + + + {RsDropColumnsMultiple} + Dropping columns cannot be undone. Make sure you have backups or are certain before removing columns. -## Vector Column Considerations - -Vector columns (used for embeddings) have special considerations. When altering vector columns, you should ensure consistent dimensionality. 
- -### Converting List to FixedSizeList - -A common schema evolution task is converting a generic list column to a fixed-size list for performance: - - - - {AlterVectorColumn} - - - diff --git a/tests/py/test_tables.py b/tests/py/test_tables.py index aa56dfd..4fcf5ed 100644 --- a/tests/py/test_tables.py +++ b/tests/py/test_tables.py @@ -850,6 +850,7 @@ def _setup_schema_alter_table(tmp_db, data=None): def _setup_schema_drop_table(tmp_db, data=None): + # --8<-- [start:schema_drop_setup] if data is None: data = [ { @@ -868,8 +869,18 @@ def _setup_schema_drop_table(tmp_db, data=None): "temp_col2": 200, "vector": np.random.random(128).tolist(), }, + { + "id": 3, + "name": "Headphones", + "price": 150.00, + "temp_col1": "Z", + "temp_col2": 300, + "vector": np.random.random(128).tolist(), + }, ] - return tmp_db.create_table("schema_evolution_drop_example", data, mode="overwrite") + table = tmp_db.create_table("schema_evolution_drop_example", data, mode="overwrite") + # --8<-- [end:schema_drop_setup] + return table def test_add_columns_calculated(tmp_db): @@ -995,6 +1006,24 @@ def test_alter_columns_multiple(tmp_db): assert table.schema.field("final_price").nullable is True +def test_alter_columns_with_expression(tmp_db): + # --8<-- [start:alter_columns_with_expression] + # For custom transforms, create a new column from a SQL expression. 
+ expression_table = tmp_db.create_table( + "schema_evolution_expression_example", + [{"id": 1, "price_text": "$100"}], + mode="overwrite", + ) + + expression_table.add_columns( + {"price_numeric": "cast(replace(price_text, '$', '') as int)"} + ) + expression_table.drop_columns(["price_text"]) + expression_table.alter_columns({"path": "price_numeric", "rename": "price"}) + # --8<-- [end:alter_columns_with_expression] + assert "price" in expression_table.schema.names + + def test_drop_columns_single(tmp_db): table = _setup_schema_drop_table(tmp_db) diff --git a/tests/rs/tables.rs b/tests/rs/tables.rs index ea7a35f..fa167e7 100644 --- a/tests/rs/tables.rs +++ b/tests/rs/tables.rs @@ -5,11 +5,13 @@ use std::sync::Arc; use arrow_array::types::Float32Type; use arrow_array::{ - FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator, StringArray, + FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, + RecordBatchIterator, StringArray, }; use arrow_schema::{DataType, Field, Schema}; use lancedb::connect; use lancedb::database::CreateTableMode; +use lancedb::table::{ColumnAlteration, NewColumnTransform}; #[tokio::main] async fn main() { @@ -301,4 +303,324 @@ async fn main() { .unwrap() .contains(&"my_table".to_string()) ); + + // --8<-- [start:schema_add_setup] + let schema_add_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + Field::new("price", DataType::Float64, false), + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 128), + false, + ), + ])); + let schema_add_batch = RecordBatch::try_new( + schema_add_schema.clone(), + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["Laptop", "Smartphone", "Headphones"])), + Arc::new(Float64Array::from(vec![1200.0, 800.0, 150.0])), + Arc::new( + FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(0.1_f32); 
128]), + Some(vec![Some(0.2_f32); 128]), + Some(vec![Some(0.3_f32); 128]), + ], + 128, + ), + ), + ], + ) + .unwrap(); + let schema_add_reader = RecordBatchIterator::new( + vec![Ok(schema_add_batch)].into_iter(), + schema_add_schema.clone(), + ); + let schema_add_table = db + .create_table("schema_evolution_add_example", schema_add_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:schema_add_setup] + assert_eq!(schema_add_table.count_rows(None).await.unwrap(), 3); + + // --8<-- [start:add_columns_calculated] + // Add a discounted price column (10% discount) + schema_add_table + .add_columns( + NewColumnTransform::SqlExpressions(vec![( + "discounted_price".to_string(), + "cast((price * 0.9) as float)".to_string(), + )]), + None, + ) + .await + .unwrap(); + // --8<-- [end:add_columns_calculated] + + // --8<-- [start:add_columns_default_values] + // Add a stock status column with default value + schema_add_table + .add_columns( + NewColumnTransform::SqlExpressions(vec![( + "in_stock".to_string(), + "cast(true as boolean)".to_string(), + )]), + None, + ) + .await + .unwrap(); + // --8<-- [end:add_columns_default_values] + + // --8<-- [start:add_columns_nullable] + // Add a nullable timestamp column + schema_add_table + .add_columns( + NewColumnTransform::SqlExpressions(vec![( + "last_ordered".to_string(), + "cast(NULL as timestamp)".to_string(), + )]), + None, + ) + .await + .unwrap(); + // --8<-- [end:add_columns_nullable] + + // --8<-- [start:schema_alter_setup] + let schema_alter_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + Field::new("price", DataType::Int32, false), + Field::new("discount_price", DataType::Float64, false), + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 128), + false, + ), + ])); + let schema_alter_batch = RecordBatch::try_new( + schema_alter_schema.clone(), + 
vec![ + Arc::new(Int64Array::from(vec![1, 2])), + Arc::new(StringArray::from(vec!["Laptop", "Smartphone"])), + Arc::new(Int32Array::from(vec![1200, 800])), + Arc::new(Float64Array::from(vec![1080.0, 720.0])), + Arc::new( + FixedSizeListArray::from_iter_primitive::( + vec![Some(vec![Some(0.1_f32); 128]), Some(vec![Some(0.2_f32); 128])], + 128, + ), + ), + ], + ) + .unwrap(); + let schema_alter_reader = RecordBatchIterator::new( + vec![Ok(schema_alter_batch)].into_iter(), + schema_alter_schema.clone(), + ); + let schema_alter_table = db + .create_table("schema_evolution_alter_example", schema_alter_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:schema_alter_setup] + assert_eq!(schema_alter_table.count_rows(None).await.unwrap(), 2); + + // --8<-- [start:alter_columns_rename] + // Rename discount_price to sale_price + schema_alter_table + .alter_columns(&[ColumnAlteration::new("discount_price".to_string()) + .rename("sale_price".to_string())]) + .await + .unwrap(); + // --8<-- [end:alter_columns_rename] + + // --8<-- [start:alter_columns_data_type] + // Change price from int32 to int64 for larger numbers + schema_alter_table + .alter_columns(&[ColumnAlteration::new("price".to_string()).cast_to(DataType::Int64)]) + .await + .unwrap(); + // --8<-- [end:alter_columns_data_type] + + // --8<-- [start:alter_columns_nullable] + // Make the name column nullable + schema_alter_table + .alter_columns(&[ColumnAlteration::new("name".to_string()).set_nullable(true)]) + .await + .unwrap(); + // --8<-- [end:alter_columns_nullable] + + // --8<-- [start:alter_columns_multiple] + // Rename, change type, and make nullable in one operation + schema_alter_table + .alter_columns(&[ColumnAlteration::new("sale_price".to_string()) + .rename("final_price".to_string()) + .cast_to(DataType::Float64) + .set_nullable(true)]) + .await + .unwrap(); + // --8<-- [end:alter_columns_multiple] + + // --8<-- [start:alter_columns_with_expression] + // For 
custom transforms, create a new column from a SQL expression. + let expression_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("price_text", DataType::Utf8, false), + ])); + let expression_batch = RecordBatch::try_new( + expression_schema.clone(), + vec![ + Arc::new(Int64Array::from(vec![1])), + Arc::new(StringArray::from(vec!["$100"])), + ], + ) + .unwrap(); + let expression_reader = RecordBatchIterator::new( + vec![Ok(expression_batch)].into_iter(), + expression_schema.clone(), + ); + let expression_table = db + .create_table("schema_evolution_expression_example", expression_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + expression_table + .add_columns( + NewColumnTransform::SqlExpressions(vec![( + "price_numeric".to_string(), + "cast(replace(price_text, '$', '') as int)".to_string(), + )]), + None, + ) + .await + .unwrap(); + expression_table.drop_columns(&["price_text"]).await.unwrap(); + expression_table + .alter_columns(&[ColumnAlteration::new("price_numeric".to_string()) + .rename("price".to_string())]) + .await + .unwrap(); + // --8<-- [end:alter_columns_with_expression] + assert_eq!(expression_table.count_rows(None).await.unwrap(), 1); + + // --8<-- [start:schema_drop_setup] + let schema_drop_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + Field::new("price", DataType::Float64, false), + Field::new("temp_col1", DataType::Utf8, false), + Field::new("temp_col2", DataType::Int32, false), + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 128), + false, + ), + ])); + let schema_drop_batch = RecordBatch::try_new( + schema_drop_schema.clone(), + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["Laptop", "Smartphone", "Headphones"])), + Arc::new(Float64Array::from(vec![1200.0, 800.0, 150.0])), + 
Arc::new(StringArray::from(vec!["X", "Y", "Z"])), + Arc::new(Int32Array::from(vec![100, 200, 300])), + Arc::new( + FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(0.1_f32); 128]), + Some(vec![Some(0.2_f32); 128]), + Some(vec![Some(0.3_f32); 128]), + ], + 128, + ), + ), + ], + ) + .unwrap(); + let schema_drop_reader = RecordBatchIterator::new( + vec![Ok(schema_drop_batch)].into_iter(), + schema_drop_schema.clone(), + ); + let schema_drop_table = db + .create_table("schema_evolution_drop_example", schema_drop_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:schema_drop_setup] + assert_eq!(schema_drop_table.count_rows(None).await.unwrap(), 3); + + // --8<-- [start:drop_columns_single] + // Remove the first temporary column + schema_drop_table.drop_columns(&["temp_col1"]).await.unwrap(); + // --8<-- [end:drop_columns_single] + + // --8<-- [start:drop_columns_multiple] + // Remove the second temporary column + schema_drop_table.drop_columns(&["temp_col2"]).await.unwrap(); + // --8<-- [end:drop_columns_multiple] + + // --8<-- [start:alter_vector_column] + let old_dim = 384; + let new_dim = 1024; + let vector_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new( + "embedding", + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), + old_dim, + ), + true, + ), + ])); + let vector_batch = RecordBatch::try_new( + vector_schema.clone(), + vec![ + Arc::new(Int64Array::from(vec![1])), + Arc::new( + FixedSizeListArray::from_iter_primitive::( + vec![Some(vec![Some(0.1_f32); old_dim as usize])], + old_dim, + ), + ), + ], + ) + .unwrap(); + let vector_reader = + RecordBatchIterator::new(vec![Ok(vector_batch)].into_iter(), vector_schema.clone()); + let vector_table = db + .create_table("vector_alter_example", vector_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + // Changing FixedSizeList dimensions (384 -> 1024) 
is not supported via alter_columns. + // Use add_columns + drop_columns + alter_columns(rename) to replace the column. + vector_table + .add_columns( + NewColumnTransform::SqlExpressions(vec![( + "embedding_v2".to_string(), + format!("arrow_cast(NULL, 'FixedSizeList({}, Float32)')", new_dim), + )]), + None, + ) + .await + .unwrap(); + vector_table.drop_columns(&["embedding"]).await.unwrap(); + vector_table + .alter_columns(&[ColumnAlteration::new("embedding_v2".to_string()) + .rename("embedding".to_string())]) + .await + .unwrap(); + // --8<-- [end:alter_vector_column] + assert_eq!(vector_table.count_rows(None).await.unwrap(), 1); } diff --git a/tests/ts/tables.test.ts b/tests/ts/tables.test.ts index 2b173a4..b565522 100644 --- a/tests/ts/tables.test.ts +++ b/tests/ts/tables.test.ts @@ -159,3 +159,241 @@ test("table creation snippets (async)", async () => { expect(await db.tableNames()).not.toContain("my_table"); }); }); + +test("schema evolution snippets (async)", async () => { + await withTempDirectory(async (databaseDir) => { + const db = await lancedb.connect(databaseDir); + + // --8<-- [start:schema_add_setup] + const schemaAddData = [ + { + id: 1, + name: "Laptop", + price: 1200.0, + vector: Array.from({ length: 128 }, () => Math.random()), + }, + { + id: 2, + name: "Smartphone", + price: 800.0, + vector: Array.from({ length: 128 }, () => Math.random()), + }, + { + id: 3, + name: "Headphones", + price: 150.0, + vector: Array.from({ length: 128 }, () => Math.random()), + }, + ]; + const schemaAddTable = await db.createTable( + "schema_evolution_add_example", + schemaAddData, + { mode: "overwrite" }, + ); + // --8<-- [end:schema_add_setup] + expect(await schemaAddTable.countRows()).toBe(3); + + // --8<-- [start:add_columns_calculated] + // Add a discounted price column (10% discount) + await schemaAddTable.addColumns([ + { + name: "discounted_price", + valueSql: "cast((price * 0.9) as float)", + }, + ]); + // --8<-- [end:add_columns_calculated] + + // --8<-- 
[start:add_columns_default_values] + // Add a stock status column with default value + await schemaAddTable.addColumns([ + { + name: "in_stock", + valueSql: "cast(true as boolean)", + }, + ]); + // --8<-- [end:add_columns_default_values] + + // --8<-- [start:add_columns_nullable] + // Add a nullable timestamp column + await schemaAddTable.addColumns([ + { + name: "last_ordered", + valueSql: "cast(NULL as timestamp)", + }, + ]); + // --8<-- [end:add_columns_nullable] + + // --8<-- [start:schema_alter_setup] + const schemaAlter = new arrow.Schema([ + new arrow.Field("id", new arrow.Int64()), + new arrow.Field("name", new arrow.Utf8()), + new arrow.Field("price", new arrow.Int32()), + new arrow.Field("discount_price", new arrow.Float64()), + new arrow.Field( + "vector", + new arrow.FixedSizeList( + 128, + new arrow.Field("item", new arrow.Float32(), true), + ), + ), + ]); + const schemaAlterData = lancedb.makeArrowTable( + [ + { + id: 1, + name: "Laptop", + price: 1200, + discount_price: 1080.0, + vector: Array.from({ length: 128 }, () => Math.random()), + }, + { + id: 2, + name: "Smartphone", + price: 800, + discount_price: 720.0, + vector: Array.from({ length: 128 }, () => Math.random()), + }, + ], + { schema: schemaAlter }, + ); + const schemaAlterTable = await db.createTable( + "schema_evolution_alter_example", + schemaAlterData, + { mode: "overwrite" }, + ); + // --8<-- [end:schema_alter_setup] + expect(await schemaAlterTable.countRows()).toBe(2); + + // --8<-- [start:alter_columns_rename] + // Rename discount_price to sale_price + await schemaAlterTable.alterColumns([ + { path: "discount_price", rename: "sale_price" }, + ]); + // --8<-- [end:alter_columns_rename] + + // --8<-- [start:alter_columns_data_type] + // Change price from int32 to int64 for larger numbers + await schemaAlterTable.alterColumns([ + { path: "price", dataType: new arrow.Int64() }, + ]); + // --8<-- [end:alter_columns_data_type] + + // --8<-- [start:alter_columns_nullable] + // Make the name 
column nullable + await schemaAlterTable.alterColumns([{ path: "name", nullable: true }]); + // --8<-- [end:alter_columns_nullable] + + // --8<-- [start:alter_columns_multiple] + // Rename, change type, and make nullable in one operation + await schemaAlterTable.alterColumns([ + { + path: "sale_price", + rename: "final_price", + dataType: new arrow.Float64(), + nullable: true, + }, + ]); + // --8<-- [end:alter_columns_multiple] + + // --8<-- [start:alter_columns_with_expression] + // For custom transforms, create a new column from a SQL expression. + const expressionTable = await db.createTable( + "schema_evolution_expression_example", + [{ id: 1, price_text: "$100" }], + { mode: "overwrite" }, + ); + + await expressionTable.addColumns([ + { + name: "price_numeric", + valueSql: "cast(replace(price_text, '$', '') as int)", + }, + ]); + await expressionTable.dropColumns(["price_text"]); + await expressionTable.alterColumns([ + { path: "price_numeric", rename: "price" }, + ]); + // --8<-- [end:alter_columns_with_expression] + expect(await expressionTable.countRows()).toBe(1); + + // --8<-- [start:schema_drop_setup] + const schemaDropData = [ + { + id: 1, + name: "Laptop", + price: 1200.0, + temp_col1: "X", + temp_col2: 100, + vector: Array.from({ length: 128 }, () => Math.random()), + }, + { + id: 2, + name: "Smartphone", + price: 800.0, + temp_col1: "Y", + temp_col2: 200, + vector: Array.from({ length: 128 }, () => Math.random()), + }, + { + id: 3, + name: "Headphones", + price: 150.0, + temp_col1: "Z", + temp_col2: 300, + vector: Array.from({ length: 128 }, () => Math.random()), + }, + ]; + const schemaDropTable = await db.createTable( + "schema_evolution_drop_example", + schemaDropData, + { mode: "overwrite" }, + ); + // --8<-- [end:schema_drop_setup] + expect(await schemaDropTable.countRows()).toBe(3); + + // --8<-- [start:drop_columns_single] + // Remove the first temporary column + await schemaDropTable.dropColumns(["temp_col1"]); + // --8<-- 
[end:drop_columns_single] + + // --8<-- [start:drop_columns_multiple] + // Remove the second temporary column + await schemaDropTable.dropColumns(["temp_col2"]); + // --8<-- [end:drop_columns_multiple] + + // --8<-- [start:alter_vector_column] + const oldDim = 384; + const newDim = 1024; + const vectorSchema = new arrow.Schema([ + new arrow.Field("id", new arrow.Int64()), + new arrow.Field( + "embedding", + new arrow.FixedSizeList( + oldDim, + new arrow.Field("item", new arrow.Float16(), true), + ), + true, + ), + ]); + const vectorData = lancedb.makeArrowTable( + [{ id: 1, embedding: Array.from({ length: oldDim }, () => Math.random()) }], + { schema: vectorSchema }, + ); + const vectorTable = await db.createTable("vector_alter_example", vectorData, { + mode: "overwrite", + }); + + // Changing FixedSizeList dimensions (384 -> 1024) is not supported via alterColumns. + // Use addColumns + dropColumns + alterColumns(rename) to replace the column. + await vectorTable.addColumns([ + { + name: "embedding_v2", + valueSql: `arrow_cast(NULL, 'FixedSizeList(${newDim}, Float16)')`, + }, + ]); + await vectorTable.dropColumns(["embedding"]); + await vectorTable.alterColumns([{ path: "embedding_v2", rename: "embedding" }]); + // --8<-- [end:alter_vector_column] + expect(await vectorTable.countRows()).toBe(1); + }); +});