diff --git a/Cargo.toml b/Cargo.toml index ff2dc4d..65b23f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -195,6 +195,11 @@ size = [] # Adds Arena, Table::from_arena, and an optimised consolidate path. arena = [] +# Adds an optional `metadata: BTreeMap` field to `Table`. +# Captures schema-level metadata that Arrow producers like PyArrow embed +# in the top-level ArrowSchema.metadata, e.g. pandas categorical ordering. +table_metadata = [] + # Adds pandas-style selection for Table and TableV with .c() and .r() methods select = [] diff --git a/pyo3/Cargo.lock b/pyo3/Cargo.lock index 2150879..e99e775 100644 --- a/pyo3/Cargo.lock +++ b/pyo3/Cargo.lock @@ -46,7 +46,7 @@ dependencies = [ [[package]] name = "minarrow" -version = "0.9.1" +version = "0.9.2" dependencies = [ "libc", "num-traits", diff --git a/pyo3/Cargo.toml b/pyo3/Cargo.toml index ce6b928..8c61b7b 100644 --- a/pyo3/Cargo.toml +++ b/pyo3/Cargo.toml @@ -17,6 +17,8 @@ keywords = [ categories = ["external-ffi-bindings", "api-bindings"] description = "PyO3 bindings for MinArrow - zero-copy Arrow interop with Python via PyArrow" +[workspace] + [lib] name = "minarrow_pyo3" crate-type = ["cdylib", "rlib"] @@ -27,23 +29,16 @@ pyo3 = { version = "0.23" } thiserror = "2" [features] -default = ["extension-module", "datetime", "extended_numeric_types", "extended_categorical"] +default = ["datetime", "extended_numeric_types", "extended_categorical"] extension-module = ["pyo3/extension-module"] datetime = ["minarrow/datetime"] extended_numeric_types = ["minarrow/extended_numeric_types"] extended_categorical = ["minarrow/extended_categorical"] +table_metadata = ["minarrow/table_metadata"] [build-dependencies] pyo3-build-config = "0.23" -[[example]] -name = "atomic_tests" -path = "tests/atomic_tests.rs" - -[[example]] -name = "python_roundtrip" -path = "tests/python_roundtrip.rs" - [[example]] name = "pycapsule_exchange" path = "examples/pycapsule_exchange.rs" diff --git a/pyo3/examples/pycapsule_exchange.rs b/pyo3/examples/pycapsule_exchange.rs index c0ea3d8..c2c9e97 100644 --- a/pyo3/examples/pycapsule_exchange.rs +++ b/pyo3/examples/pycapsule_exchange.rs @@ -248,7 +248,7 @@ fn example_4_import_from_pyarrow_table(py: Python<'_>) -> PyResult<()> { let result = to_rust::try_capsule_record_batch_stream(&py_table); match result { - Some(Ok(batches)) => { + Some(Ok((batches, _metadata))) => { println!(" Imported {} batch(es) into MinArrow", batches.len()); for (batch_idx, batch) in batches.iter().enumerate() { println!(" Batch {}: {} columns", batch_idx, batch.len()); diff --git a/pyo3/src/ffi/to_py.rs b/pyo3/src/ffi/to_py.rs index 398e33d..fe76e91 100644 --- a/pyo3/src/ffi/to_py.rs +++ b/pyo3/src/ffi/to_py.rs @@ -127,15 +127,20 @@ fn arrow_type_to_pyarrow<'py>( } } -/// Builds an Arrow metadata map containing the table name, if non-empty. -fn table_name_metadata(name: &str) -> Option> { - if name.is_empty() { - None - } else { - let mut m = std::collections::BTreeMap::new(); - m.insert(TABLE_NAME_KEY.to_string(), name.to_string()); - Some(m) +/// Builds the Arrow schema metadata map for a stream export. +/// +/// Always includes the table name when non-empty. When the `table_metadata` +/// feature is enabled, the table's metadata entries are included too. +fn build_stream_metadata(table: &Table) -> Option> { + #[cfg(feature = "table_metadata")] + let mut m = table.metadata.clone(); + #[cfg(not(feature = "table_metadata"))] + let mut m = std::collections::BTreeMap::new(); + + if !table.name.is_empty() { + m.insert(TABLE_NAME_KEY.to_string(), table.name.clone()); } + if m.is_empty() { None } else { Some(m) } } // PyArrow conversion - legacy C data interface @@ -218,8 +223,24 @@ pub fn table_to_py<'py>(table: &Table, py: Python<'py>) -> PyResult = table + .metadata + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + if !table.name.is_empty() { + meta_entries.push((TABLE_NAME_KEY.to_string(), table.name.clone())); + } + if !meta_entries.is_empty() { + let metadata = meta_entries.into_py_dict(py)?; + schema = schema.call_method1("with_metadata", (metadata,))?; + } + } + #[cfg(not(feature = "table_metadata"))] if !table.name.is_empty() { let metadata = [(TABLE_NAME_KEY, &table.name)].into_py_dict(py)?; schema = schema.call_method1("with_metadata", (metadata,))?; @@ -254,6 +275,22 @@ pub fn super_table_to_py<'py>( } let py_fields_list = PyList::new(py, &py_fields)?; let mut schema = pyarrow.call_method1("schema", (py_fields_list,))?; + #[cfg(feature = "table_metadata")] + { + let mut meta_entries: Vec<(String, String)> = super_table + .metadata() + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + if !super_table.name.is_empty() { + meta_entries.push((TABLE_NAME_KEY.to_string(), super_table.name.clone())); + } + if !meta_entries.is_empty() { + let metadata = meta_entries.into_py_dict(py)?; + schema = schema.call_method1("with_metadata", (metadata,))?; + } + } + #[cfg(not(feature = "table_metadata"))] if !super_table.name.is_empty() { let metadata = [(TABLE_NAME_KEY, &super_table.name)].into_py_dict(py)?; schema = schema.call_method1("with_metadata", (metadata,))?; @@ -287,7 +324,28 @@ pub fn super_table_to_py<'py>( PyMinarrowError::PyArrow(format!("Failed to create PyArrow Table: {}", e)) })?; - // Attach table name as schema metadata if present + // Attach table name and metadata as schema metadata if present + #[cfg(feature = "table_metadata")] + { + let mut meta_entries: Vec<(String, String)> = super_table + .metadata() + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + if !super_table.name.is_empty() { + meta_entries.push((TABLE_NAME_KEY.to_string(), super_table.name.clone())); + } + if !meta_entries.is_empty() { + let metadata = meta_entries.into_py_dict(py)?; + return py_table + .call_method1("replace_schema_metadata", (metadata,)) + .map_err(|e| { + PyMinarrowError::PyArrow(format!("Failed to set schema metadata: {}", e)) + .into() + }); + } + } + #[cfg(not(feature = "table_metadata"))] if !super_table.name.is_empty() { let metadata = [(TABLE_NAME_KEY, &super_table.name)] .into_py_dict(py)?; @@ -477,7 +535,7 @@ pub fn table_to_stream_capsule<'py>(table: &Table, py: Python<'py>) -> PyResult< }) .collect(); - let metadata = table_name_metadata(&table.name); + let metadata = build_stream_metadata(table); let stream = export_record_batch_stream_with_metadata(vec![columns], fields, metadata); let stream_ptr = Box::into_raw(stream); @@ -543,7 +601,7 @@ pub fn super_table_to_stream_capsule<'py>( }) .collect(); - let metadata = table_name_metadata(&super_table.name); + let metadata = build_stream_metadata(&super_table.batches[0]); let stream = export_record_batch_stream_with_metadata(batches, fields, metadata); let stream_ptr = Box::into_raw(stream); diff --git a/pyo3/src/ffi/to_rust.rs b/pyo3/src/ffi/to_rust.rs index 07ae3de..e50aea5 100644 --- a/pyo3/src/ffi/to_rust.rs +++ b/pyo3/src/ffi/to_rust.rs @@ -49,6 +49,35 @@ fn extract_table_name_from_pyarrow_schema(schema: &Bound) -> String { .unwrap_or_default() } +/// Splits imported stream metadata into the table name and remaining entries. +/// +/// The `minarrow:table_name` key is extracted as the table name. All other +/// entries are returned as remaining metadata for `Table::new_with_metadata`. +#[cfg(feature = "table_metadata")] +fn split_stream_metadata( + metadata: Option>, +) -> (String, std::collections::BTreeMap) { + match metadata { + None => (String::new(), std::collections::BTreeMap::new()), + Some(mut m) => { + let name = m.remove(TABLE_NAME_KEY).unwrap_or_default(); + (name, m) + } + } +} + +/// Extracts the table name from imported stream metadata. +#[cfg(not(feature = "table_metadata"))] +fn extract_table_name( + metadata: &Option>, +) -> String { + metadata + .as_ref() + .and_then(|m| m.get(TABLE_NAME_KEY)) + .cloned() + .unwrap_or_default() +} + // PyCapsule helpers /// Attempts to import a single array via the `__arrow_c_array__` PyCapsule protocol. @@ -305,11 +334,12 @@ pub fn record_batch_to_rust(obj: &Bound) -> PyMinarrowResult) -> PyMinarrowResult) -> PyMinarrowResult { // Try PyCapsule stream if let Some(result) = try_capsule_record_batch_stream(obj) { let (batches, metadata) = result?; - let table_name = metadata - .as_ref() - .and_then(|m| m.get(TABLE_NAME_KEY)) - .cloned() - .unwrap_or_default(); + + #[cfg(feature = "table_metadata")] + let (table_name, remaining_meta) = split_stream_metadata(metadata); + #[cfg(not(feature = "table_metadata"))] + let table_name = extract_table_name(&metadata); + if batches.is_empty() { return Ok(SuperTable::new(table_name)); } @@ -414,7 +457,19 @@ pub fn table_to_rust(obj: &Bound) -> PyMinarrowResult { .into_iter() .map(|(array, field)| FieldArray::new(field, (*array).clone())) .collect(); - tables.push(Arc::new(minarrow::Table::new(table_name.clone(), Some(cols)))); + #[cfg(feature = "table_metadata")] + let table = if remaining_meta.is_empty() { + minarrow::Table::new(table_name.clone(), Some(cols)) + } else { + minarrow::Table::new_with_metadata( + table_name.clone(), + Some(cols), + remaining_meta.clone(), + ) + }; + #[cfg(not(feature = "table_metadata"))] + let table = minarrow::Table::new(table_name.clone(), Some(cols)); + tables.push(Arc::new(table)); } return Ok(SuperTable::from_batches(tables, None)); diff --git a/pyo3/src/lib.rs b/pyo3/src/lib.rs index 2941b06..e67d41a 100644 --- a/pyo3/src/lib.rs +++ b/pyo3/src/lib.rs @@ -103,9 +103,6 @@ pub mod error; pub mod ffi; pub mod types; -#[cfg(test)] -mod tests; - // Re-export the main types for ease of use pub use error::{PyMinarrowError, PyMinarrowResult}; pub use types::{PyArray, PyChunkedArray, PyField, PyRecordBatch, PyTable}; diff --git a/src/ffi/arrow_c_ffi.rs b/src/ffi/arrow_c_ffi.rs index 274320f..67b6ffa 100644 --- a/src/ffi/arrow_c_ffi.rs +++ b/src/ffi/arrow_c_ffi.rs @@ -1739,8 +1739,20 @@ struct ArrayStreamHolder { /// /// The resulting `ArrowArray` has format `"+s"` with one child per column. /// Callers must eventually call the release callback on the returned pointers. +/// +/// When `metadata` is provided, it is encoded and attached to the top-level +/// struct ArrowSchema. The `table_metadata` feature controls whether metadata +/// is preserved on the minarrow rust side during import. pub fn export_struct_to_c( columns: Vec<(Arc, Schema)>, + metadata: Option>, +) -> (*mut ArrowArray, *mut ArrowSchema) { + export_struct_to_c_inner(columns, metadata) +} + +fn export_struct_to_c_inner( + columns: Vec<(Arc, Schema)>, + metadata: Option>, ) -> (*mut ArrowArray, *mut ArrowSchema) { let n_cols = columns.len(); let n_rows = if n_cols > 0 { @@ -1783,16 +1795,22 @@ pub fn export_struct_to_c( let format_cstr = CString::new("+s").unwrap(); let name_cstr = CString::new("").unwrap(); + let metadata_bytes = metadata.map(|m| encode_arrow_metadata(&m)); + let metadata_ptr = metadata_bytes + .as_ref() + .map(|b| b.as_ptr() as *const i8) + .unwrap_or(ptr::null()); + let struct_holder = Box::new(StructSchemaHolder { format_cstr, name_cstr, - metadata_bytes: None, + metadata_bytes, }); let struct_sch = Box::new(ArrowSchema { format: struct_holder.format_cstr.as_ptr(), name: struct_holder.name_cstr.as_ptr(), - metadata: ptr::null(), + metadata: metadata_ptr, flags: 0, n_children: n_cols as i64, children: children_sch_ptr, @@ -2096,7 +2114,7 @@ unsafe extern "C" fn rb_stream_get_next( let batch = holder.batches[holder.cursor].clone(); holder.cursor += 1; - let (arr_ptr, _sch_ptr) = export_struct_to_c(batch); + let (arr_ptr, _sch_ptr) = export_struct_to_c_inner(batch, None); // Copy the exported struct array into the caller's out pointer unsafe { @@ -3092,4 +3110,60 @@ mod tests { ); } } + + #[test] + #[cfg(feature = "table_metadata")] + fn test_table_metadata_round_trip() { + use super::{ + export_record_batch_stream_with_metadata, import_record_batch_stream_with_metadata, + }; + use std::collections::BTreeMap; + + // Build a simple column + let mut arr = IntegerArray::::default(); + arr.push(1); + arr.push(2); + arr.push(3); + let array = Arc::new(Array::from_int32(arr)); + let field = Field::new("col1", ArrowType::Int32, false, None); + let col_schema = Schema { + fields: vec![field.clone()], + metadata: Default::default(), + }; + + // Schema-level metadata simulating pandas categorical info + let mut table_meta = BTreeMap::new(); + table_meta.insert( + "pandas".to_string(), + r#"{"columns":[{"name":"col1","pandas_type":"categorical","metadata":{"ordered":true}}]}"#.to_string(), + ); + + // Export as stream with metadata + let stream = export_record_batch_stream_with_metadata( + vec![vec![(array, col_schema)]], + vec![field], + Some(table_meta.clone()), + ); + let stream_ptr = Box::into_raw(stream); + + // Import with metadata + let (batches, schema_meta) = + unsafe { import_record_batch_stream_with_metadata(stream_ptr) }; + + // Verify metadata survived the roundtrip + assert_eq!(schema_meta, Some(table_meta.clone())); + + // Verify we can construct a Table with the imported metadata + let imported_cols: Vec<_> = batches[0] + .iter() + .map(|(arr, f)| crate::FieldArray::from_arr(f.name.as_str(), (**arr).clone())) + .collect(); + let table = crate::Table::new_with_metadata( + "test".to_string(), + Some(imported_cols), + table_meta.clone(), + ); + assert_eq!(table.metadata(), &table_meta); + assert_eq!(table.n_rows(), 3); + } } diff --git a/src/kernels/broadcast/array.rs b/src/kernels/broadcast/array.rs index 7f8a66d..8efb8f5 100644 --- a/src/kernels/broadcast/array.rs +++ b/src/kernels/broadcast/array.rs @@ -208,7 +208,15 @@ pub fn broadcast_array_to_table( }) .collect(); - Ok(Table::new(table.name.clone(), Some(new_cols?))) + let table_out = Table::new(table.name.clone(), Some(new_cols?)); + #[cfg(feature = "table_metadata")] + { + let mut t = table_out; + t.metadata = table.metadata.clone(); + return Ok(t); + } + #[cfg(not(feature = "table_metadata"))] + Ok(table_out) } /// Helper function for Array-SuperTable broadcasting - broadcast array to each table batch @@ -546,8 +554,8 @@ mod tests { let table_arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); let table_arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table_arr1, @@ -557,9 +565,9 @@ mod tests { table_arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let result = broadcast_array_to_table(ArithmeticOperator::Add, &arr, &table).unwrap(); @@ -587,14 +595,14 @@ mod tests { let arr = Array::from_int32(IntegerArray::from_slice(&vec64![2, 3, 4])); let table_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 10, 10])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let result = broadcast_array_to_table(ArithmeticOperator::Multiply, &arr, &table).unwrap(); @@ -616,25 +624,25 @@ mod tests { // Create SuperTableView with 2 slices let table1_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table1_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let table2_arr = Array::from_int32(IntegerArray::from_slice(&vec64![40, 50, 60])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table2_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -690,24 +698,24 @@ mod tests { // Create SuperTable with 2 batches let table1_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table1_arr, )], - n_rows: 3, - name: "batch1".to_string(), - }; + 3, + "batch1".to_string(), + ); let table2_arr = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table2_arr, )], - n_rows: 3, - name: "batch2".to_string(), - }; + 3, + "batch2".to_string(), + ); let super_table = SuperTable::from_batches( vec![Arc::new(table1), Arc::new(table2)], @@ -744,24 +752,24 @@ mod tests { // Create Cube with 2 tables let table1_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table1_arr, )], - n_rows: 3, - name: "table1".to_string(), - }; + 3, + "table1".to_string(), + ); let table2_arr = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table2_arr, )], - n_rows: 3, - name: "table2".to_string(), - }; + 3, + "table2".to_string(), + ); let cube = Cube { tables: vec![table1, table2], diff --git a/src/kernels/broadcast/array_view.rs b/src/kernels/broadcast/array_view.rs index 370c43d..1ce10b9 100644 --- a/src/kernels/broadcast/array_view.rs +++ b/src/kernels/broadcast/array_view.rs @@ -52,7 +52,15 @@ pub fn broadcast_arrayview_to_table( }) .collect(); - Ok(Table::new(table.name.clone(), Some(field_arrays))) + let table_out = Table::new(table.name.clone(), Some(field_arrays)); + #[cfg(feature = "table_metadata")] + { + let mut t = table_out; + t.metadata = table.metadata.clone(); + return Ok(t); + } + #[cfg(not(feature = "table_metadata"))] + Ok(table_out) } /// Helper function for arrayview-tableview broadcasting - work with views @@ -154,8 +162,8 @@ mod tests { // Create a table with 2 columns let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -165,9 +173,9 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let result = broadcast_arrayview_to_table(ArithmeticOperator::Add, &array_view, &table).unwrap(); @@ -198,14 +206,14 @@ mod tests { // Create a table and table view let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 10, 10])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let result = broadcast_arrayview_to_tableview( @@ -234,8 +242,8 @@ mod tests { // Create a table view let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -245,9 +253,9 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let result = broadcast_arrayview_to_tableview( @@ -283,25 +291,25 @@ mod tests { // Create SuperTableView with 2 slices let table1_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table1_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let table2_arr = Array::from_int32(IntegerArray::from_slice(&vec64![40, 50, 60])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table2_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -347,25 +355,25 @@ mod tests { // Create a SuperTableView with 6 total rows (mismatch) let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![40, 50, 60])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { diff --git a/src/kernels/broadcast/cube.rs b/src/kernels/broadcast/cube.rs index 4a3483d..f6b238b 100644 --- a/src/kernels/broadcast/cube.rs +++ b/src/kernels/broadcast/cube.rs @@ -727,7 +727,15 @@ fn merge_supertable_batches(super_table: &crate::SuperTable) -> Result Table { - Table { - cols: vec![ + Table::build( + vec![ create_field_array("col1", data1), create_field_array("col2", data2), ], - n_rows: 3, - name: name.to_string(), - } + 3, + name.to_string(), + ) } fn create_super_table(batches: Vec) -> SuperTable { diff --git a/src/kernels/broadcast/super_table_view.rs b/src/kernels/broadcast/super_table_view.rs index 8e1b96b..296fe93 100644 --- a/src/kernels/broadcast/super_table_view.rs +++ b/src/kernels/broadcast/super_table_view.rs @@ -243,25 +243,25 @@ mod tests { fn test_supertableview_to_scalar_add() { // Create SuperTableView with 2 slices let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![4, 5, 6])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -299,25 +299,25 @@ mod tests { fn test_supertableview_to_arrayview_multiply() { // Create SuperTableView with 2 slices let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![2, 3, 4])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![5, 6, 7])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -360,25 +360,25 @@ mod tests { fn test_supertableview_to_arrayview_length_mismatch() { // Create SuperTableView with 6 elements let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![4, 5, 6])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -422,14 +422,14 @@ mod tests { // Create Table: [[10, 20, 30, 40, 50, 60]] let arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30, 40, 50, 60])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr, )], - n_rows: 6, - name: "test".to_string(), - }; + 6, + "test".to_string(), + ); let result = broadcast_superarrayview_to_table( ArithmeticOperator::Subtract, @@ -476,14 +476,14 @@ mod tests { // Create Table with 5 rows (mismatch) let arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30, 40, 50])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr, )], - n_rows: 5, - name: "test".to_string(), - }; + 5, + "test".to_string(), + ); let result = broadcast_superarrayview_to_table(ArithmeticOperator::Add, &super_array_view, &table); @@ -500,25 +500,25 @@ mod tests { fn test_supertableview_to_array_divide() { // Create SuperTableView with 2 slices let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![400, 500, 600])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { diff --git a/src/kernels/broadcast/table.rs b/src/kernels/broadcast/table.rs index ae3d7cb..cb2f506 100644 --- a/src/kernels/broadcast/table.rs +++ b/src/kernels/broadcast/table.rs @@ -51,7 +51,15 @@ pub fn broadcast_table_with_operator( result_field_arrays.push(result_field_array); } - Ok(Table::new(table_l.name.clone(), Some(result_field_arrays))) + let table = Table::new(table_l.name.clone(), Some(result_field_arrays)); + #[cfg(feature = "table_metadata")] + { + let mut t = table; + t.metadata = table_l.metadata.clone(); + return Ok(t); + } + #[cfg(not(feature = "table_metadata"))] + Ok(table) } /// Broadcasts addition over table columns element-wise @@ -206,7 +214,15 @@ pub fn broadcast_table_to_array( }) .collect(); - Ok(Table::new(table.name.clone(), Some(new_cols?))) + let table_out = Table::new(table.name.clone(), Some(new_cols?)); + #[cfg(feature = "table_metadata")] + { + let mut t = table_out; + t.metadata = table.metadata.clone(); + return Ok(t); + } + #[cfg(not(feature = "table_metadata"))] + Ok(table_out) } /// Helper function for table-scalar broadcasting - apply table columns to scalar @@ -249,7 +265,15 @@ pub fn broadcast_table_to_scalar( }) .collect(); - Ok(Table::new(table.name.clone(), Some(new_cols?))) + let table_out = Table::new(table.name.clone(), Some(new_cols?)); + #[cfg(feature = "table_metadata")] + { + let mut t = table_out; + t.metadata = table.metadata.clone(); + return Ok(t); + } + #[cfg(not(feature = "table_metadata"))] + Ok(table_out) } /// Helper function for table-arrayview broadcasting - work directly with view without conversion @@ -294,7 +318,15 @@ pub fn broadcast_table_to_arrayview( }) .collect(); - Ok(Table::new(table.name.clone(), Some(new_cols?))) + let table_out = Table::new(table.name.clone(), Some(new_cols?)); + #[cfg(feature = "table_metadata")] + { + let mut t = table_out; + t.metadata = table.metadata.clone(); + return Ok(t); + } + #[cfg(not(feature = "table_metadata"))] + Ok(table_out) } /// Helper function for Table-SuperArrayView broadcasting - promote Table to aligned SuperTableView @@ -630,14 +662,14 @@ mod tests { use crate::ffi::arrow_dtype::ArrowType; // Table with 3 rows to match each chunk size - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![2, 3, 4])), )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let field = Field::new("data".to_string(), ArrowType::Int32, false, None); let chunks = vec![ @@ -676,14 +708,14 @@ mod tests { fn test_broadcast_table_to_superarrayview() { use crate::ffi::arrow_dtype::ArrowType; - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3, 4, 5, 6])), )], - n_rows: 6, - name: "test".to_string(), - }; + 6, + "test".to_string(), + ); let arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30, 40, 50, 60])); let field = Field::new("data".to_string(), ArrowType::Int32, false, None); diff --git a/src/kernels/broadcast/table_view.rs b/src/kernels/broadcast/table_view.rs index 7edf4af..ccfebf1 100644 --- a/src/kernels/broadcast/table_view.rs +++ b/src/kernels/broadcast/table_view.rs @@ -186,8 +186,8 @@ mod tests { // Create two tables let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![ + let table1 = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -197,15 +197,15 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr3 = Array::from_int32(IntegerArray::from_slice(&vec64![5, 5, 5])); let arr4 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 100, 100])); - let table2 = Table { - cols: vec![ + let table2 = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr3, @@ -215,9 +215,9 @@ mod tests { arr4, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let result = @@ -246,20 +246,20 @@ mod tests { fn test_tableview_to_tableview_column_mismatch() { // Create tables with different numbers of columns let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![5, 5, 5])); let arr3 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 10, 10])); - let table2 = Table { - cols: vec![ + let table2 = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, @@ -269,9 +269,9 @@ mod tests { arr3, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let result = @@ -290,8 +290,8 @@ mod tests { fn test_tableview_to_scalar_multiply() { let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![2, 3, 4])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![5, 6, 7])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -301,9 +301,9 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let scalar = Scalar::Int32(10); @@ -330,14 +330,14 @@ mod tests { #[test] fn test_tableview_to_arrayview_subtract() { let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); @@ -369,14 +369,14 @@ mod tests { // Create table with 6 rows let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3, 4, 5, 6])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 6, - name: "test".to_string(), - }; + 6, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 6); // Create SuperArrayView with 2 chunks of 3 elements each @@ -426,14 +426,14 @@ mod tests { use crate::{FieldArray as FA, SuperArray, SuperArrayV}; let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3, 4, 5])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 5, - name: "test".to_string(), - }; + 5, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 5); // Create SuperArrayView with 6 elements (mismatch) diff --git a/src/structs/chunked/super_table.rs b/src/structs/chunked/super_table.rs index 81482ea..2ef9a95 100644 --- a/src/structs/chunked/super_table.rs +++ b/src/structs/chunked/super_table.rs @@ -348,6 +348,15 @@ impl SuperTable { self.batches.get(idx) } + /// Returns the schema-level metadata from the first batch, or an empty map + /// if there are no batches. + #[cfg(feature = "table_metadata")] + pub fn metadata(&self) -> &std::collections::BTreeMap { + static EMPTY: std::sync::LazyLock> = + std::sync::LazyLock::new(std::collections::BTreeMap::new); + self.batches.first().map(|b| b.metadata()).unwrap_or(&EMPTY) + } + // Return a new BatchedTable over a sub-range of rows. #[cfg(feature = "views")] pub fn view(&self, offset: usize, len: usize) -> SuperTableV { @@ -688,11 +697,13 @@ impl SuperTable { }); } - Table { - cols: unified_cols, - n_rows: self.n_rows, - name: self.name, + #[allow(unused_mut)] + let mut table = Table::build(unified_cols, self.n_rows, self.name); + #[cfg(feature = "table_metadata")] + { + table.metadata = self.batches[0].metadata.clone(); } + table } /// Arena-based consolidation: writes all column buffers into a single @@ -874,11 +885,7 @@ mod tests { for c in &cols { assert_eq!(c.len(), n_rows, "all columns must have same len for Table"); } - Table { - cols, - n_rows, - name: "batch".to_string(), - } + Table::build(cols, n_rows, "batch".to_string()) } #[test] @@ -1460,6 +1467,8 @@ mod tests { ], n_rows: 2, name: "batch".into(), + #[cfg(feature = "table_metadata")] + metadata: std::collections::BTreeMap::new(), }); let b2 = Arc::new(Table { cols: vec![ @@ -1482,6 +1491,8 @@ mod tests { ], n_rows: 1, name: "batch".into(), + #[cfg(feature = "table_metadata")] + metadata: std::collections::BTreeMap::new(), }); let st = SuperTable::from_batches(vec![b1, b2], None); let result = st.consolidate(); @@ -1594,6 +1605,8 @@ mod tests { ], n_rows: 3, name: "batch".into(), + #[cfg(feature = "table_metadata")] + metadata: std::collections::BTreeMap::new(), }); let b2 = Arc::new(Table { cols: vec![ @@ -1603,6 +1616,8 @@ mod tests { ], n_rows: 2, name: "batch".into(), + #[cfg(feature = "table_metadata")] + metadata: std::collections::BTreeMap::new(), }); SuperTable::from_batches(vec![b1, b2], None) }; diff --git a/src/structs/table.rs b/src/structs/table.rs index b4a92c2..b211adf 100644 --- a/src/structs/table.rs +++ b/src/structs/table.rs @@ -92,9 +92,28 @@ pub struct Table { pub n_rows: usize, /// Table name pub name: String, + /// Schema-level metadata as key-value pairs. + /// Captures metadata that Arrow producers like PyArrow embed + /// in the top-level ArrowSchema.metadata, e.g. pandas categorical ordering. + #[cfg(feature = "table_metadata")] + pub metadata: std::collections::BTreeMap, } impl Table { + /// Internal constructor handling the conditional metadata field. + /// All code paths that build a `Table` from parts should go through here + /// so the `#[cfg]` lives in one place. + #[inline(always)] + pub(crate) fn build(cols: Vec, n_rows: usize, name: String) -> Self { + Self { + cols, + n_rows, + name, + #[cfg(feature = "table_metadata")] + metadata: std::collections::BTreeMap::new(), + } + } + /// Constructs a new Table with a specified name and optional columns. /// If `cols` is provided, the number of rows will be inferred from the first column. pub fn new(name: String, cols: Option>) -> Self { @@ -108,19 +127,35 @@ impl Table { name }; - Self { cols, n_rows, name } + Self::build(cols, n_rows, name) + } + + /// Constructs a new Table with schema-level metadata. + /// + /// Use this when importing data from Arrow producers that embed metadata + /// in the top-level schema, e.g. pandas categorical ordering via PyArrow. + #[cfg(feature = "table_metadata")] + pub fn new_with_metadata( + name: String, + cols: Option>, + metadata: std::collections::BTreeMap, + ) -> Self { + let mut table = Self::new(name, cols); + table.metadata = metadata; + table + } + + /// Returns a reference to the schema-level metadata. + #[cfg(feature = "table_metadata")] + pub fn metadata(&self) -> &std::collections::BTreeMap { + &self.metadata } /// Constructs a new, empty Table with a globally unique name. pub fn new_empty() -> Self { let id = UNNAMED_COUNTER.fetch_add(1, Ordering::Relaxed); let name = format!("UnnamedTable{}", id); - - Self { - cols: Vec::new(), - n_rows: 0, - name, - } + Self::build(Vec::new(), 0, name) } /// Build a Table from an Arena and its collected array regions. @@ -155,7 +190,7 @@ impl Table { }) .collect(); - Self { cols, n_rows, name } + Self::build(cols, n_rows, name) } /// Adds a column with a name. @@ -314,11 +349,13 @@ impl Table { .map(|fa| fa.slice_clone(offset, len)) .collect(); let name = format!("{}[{}, {})", self.name, offset, offset + len); - Table { - cols, - n_rows: len, - name, + #[allow(unused_mut)] + let mut table = Table::build(cols, len, name); + #[cfg(feature = "table_metadata")] + { + table.metadata = self.metadata.clone(); } + table } /// Returns a zero-copy view over rows `[offset, offset+len)`. @@ -548,16 +585,23 @@ impl Table { right_cols.push(right_field); } - let left_table = Table { - cols: left_cols, - n_rows: index, - name: format!("{}_left", self.name), + let left_table = Table::build(left_cols, index, format!("{}_left", self.name)); + let right_table = Table::build( + right_cols, + self.n_rows - index, + format!("{}_right", self.name), + ); + #[cfg(feature = "table_metadata")] + let left_table = { + let mut t = left_table; + t.metadata = self.metadata.clone(); + t }; - - let right_table = Table { - cols: right_cols, - n_rows: self.n_rows - index, - name: format!("{}_right", self.name), + #[cfg(feature = "table_metadata")] + let right_table = { + let mut t = right_table; + t.metadata = self.metadata.clone(); + t }; Ok(SuperTable::from_batches( @@ -754,12 +798,15 @@ impl Concatenate for Table { // Create result table let n_rows = result_cols.first().map(|c| c.len()).unwrap_or(0); let name = format!("{}+{}", self.name, other.name); + let table = Table::build(result_cols, n_rows, name); + #[cfg(feature = "table_metadata")] + let table = { + let mut t = table; + t.metadata = self.metadata; + t + }; - Ok(Table { - cols: result_cols, - n_rows, - name, - }) + Ok(table) } } @@ -821,11 +868,15 @@ fn consolidate_vec_concat(tables: Vec
) -> Table { let n_rows = unified_cols.first().map(|c| c.len()).unwrap_or(0); let name = tables[0].name.clone(); - Table { - cols: unified_cols, - n_rows, - name, + let table = Table::build(unified_cols, n_rows, name); + #[cfg(feature = "table_metadata")] + { + let mut t = table; + t.metadata = tables[0].metadata.clone(); + t } + #[cfg(not(feature = "table_metadata"))] + table } impl Display for Table { diff --git a/src/structs/views/chunked/super_table_view.rs b/src/structs/views/chunked/super_table_view.rs index 88ab716..a1cd5de 100644 --- a/src/structs/views/chunked/super_table_view.rs +++ b/src/structs/views/chunked/super_table_view.rs @@ -256,11 +256,7 @@ mod tests { /// One-column `Table` with Int32 data fn table(name: &str, vals: &[i32]) -> Table { - Table { - cols: Vec::from(vec![fa_i32(name, vals)]), - n_rows: vals.len(), - name: name.to_string(), - } + Table::build(vec![fa_i32(name, vals)], vals.len(), name.to_string()) } /// Handy lens into the first column of a 1-column table fn col_vals(t: &Table) -> Vec { diff --git a/src/structs/views/table_view.rs b/src/structs/views/table_view.rs index 81ecfa5..2fd44f7 100644 --- a/src/structs/views/table_view.rs +++ b/src/structs/views/table_view.rs @@ -261,12 +261,7 @@ impl TableV { .collect(); let n_rows = self.len; - - Table { - cols, - n_rows, - name: self.name.clone(), - } + Table::build(cols, n_rows, self.name.clone()) } /// Apply a transformation to each column view, producing a new table. @@ -669,11 +664,7 @@ impl TableV { }) .collect(); - Table { - cols, - n_rows: indices.len(), - name: self.name.clone(), - } + Table::build(cols, indices.len(), self.name.clone()) } }