From 51fe78fdedf2432bd57b9f55c7282f88e6a9c674 Mon Sep 17 00:00:00 2001 From: PB <37089506+pbower@users.noreply.github.com> Date: Sun, 8 Mar 2026 11:24:13 +0000 Subject: [PATCH 1/3] Improve Pandas FFI compatibility --- pyo3/src/ffi/to_rust.rs | 34 +++++++++++++++-- src/ffi/arrow_c_ffi.rs | 82 ++++++++++++++++++++++++++--------------- 2 files changed, 82 insertions(+), 34 deletions(-) diff --git a/pyo3/src/ffi/to_rust.rs b/pyo3/src/ffi/to_rust.rs index 89f61f2..07ae3de 100644 --- a/pyo3/src/ffi/to_rust.rs +++ b/pyo3/src/ffi/to_rust.rs @@ -224,10 +224,12 @@ fn capsule_to_ptr(capsule: &Bound, name: &std::ffi::CStr) -> PyMinarrowRe // Public import functions -/// Converts a PyArrow Array (or any Arrow-compatible Python object) to a -/// MinArrow FieldArray. +/// Converts a PyArrow Array, pd.Series, pl.Series, or any Arrow-compatible +/// Python object to a MinArrow FieldArray. /// -/// Tries `__arrow_c_array__` first, then falls back to `_export_to_c`. +/// Tries `__arrow_c_array__` first, then `__arrow_c_stream__` for objects +/// that only expose a stream interface e.g. pandas and Polars Series, then +/// falls back to the legacy `_export_to_c` approach. /// /// # Arguments /// * `obj` - A Python object implementing the Arrow array interface @@ -235,11 +237,35 @@ fn capsule_to_ptr(capsule: &Bound, name: &std::ffi::CStr) -> PyMinarrowRe /// # Returns /// * `PyMinarrowResult` - The converted MinArrow FieldArray pub fn array_to_rust(obj: &Bound) -> PyMinarrowResult { - // Try PyCapsule protocol first + // Try PyCapsule array protocol first if let Some(result) = try_capsule_array(obj) { return result; } + // Try PyCapsule stream protocol for objects like pd.Series / pl.Series + // that expose __arrow_c_stream__ but not __arrow_c_array__ + if let Some(result) = try_capsule_array_stream(obj) { + let (arrays, field) = result?; + if arrays.is_empty() { + return Ok(FieldArray::new(field, minarrow::Array::Null)); + } + if arrays.len() == 1 { + let array = Arc::try_unwrap(arrays.into_iter().next().unwrap()) + .unwrap_or_else(|arc| (*arc).clone()); + return Ok(FieldArray::new(field, array)); + } + // Concatenate multiple chunks into a single array + use minarrow::Concatenate; + let mut iter = arrays.into_iter(); + let first = Arc::try_unwrap(iter.next().unwrap()) + .unwrap_or_else(|arc| (*arc).clone()); + let combined = iter.fold(first, |acc, chunk| { + let arr = Arc::try_unwrap(chunk).unwrap_or_else(|arc| (*arc).clone()); + acc.concat(arr).expect("Failed to concatenate array chunks") + }); + return Ok(FieldArray::new(field, combined)); + } + // Fall back to _export_to_c approach array_to_rust_c(obj) } diff --git a/src/ffi/arrow_c_ffi.rs b/src/ffi/arrow_c_ffi.rs index e80b2f5..01e8cdf 100644 --- a/src/ffi/arrow_c_ffi.rs +++ b/src/ffi/arrow_c_ffi.rs @@ -2387,7 +2387,9 @@ pub unsafe fn import_record_batch_stream_with_metadata( /// imported arrays along with the field metadata. /// /// Uses the zero-copy import path since each yielded array is independently -/// owned. +/// owned. The stream schema is kept alive for the duration of the import so +/// that dictionary-encoded arrays can reference it directly, avoiding the need +/// to reconstruct synthetic schemas per chunk. /// /// # Safety /// `stream` must be a valid, non-null pointer to an initialised ArrowArrayStream. @@ -2405,15 +2407,12 @@ pub unsafe fn import_array_stream( rc ); - // Extract complete field including metadata before releasing the schema - let field = field_from_c_schema(&schema_c); + // Extract complete field including metadata + let mut field = field_from_c_schema(&schema_c); - // Release the schema as we have extracted what we need - if let Some(release) = schema_c.release { - release(&mut schema_c as *mut ArrowSchema); - } - - // 2. Consume arrays + // 2. Consume arrays using the original stream schema directly. + // This is essential for dictionary-encoded arrays where import_categorical + // needs schema_c.dictionary to describe the dictionary value type. let mut arrays = Vec::new(); let get_next = ((*stream).get_next).expect("stream has no get_next callback"); @@ -2430,30 +2429,27 @@ pub unsafe fn import_array_stream( break; } - // Create schema for this array to import with ownership - let schema_box = { - let fmt_cstr = fmt_c(field.dtype.clone()); - let name_cstr = CString::new(field.name.clone()).unwrap_or_default(); - let flags = if field.nullable { 2 } else { 0 }; - Box::new(ArrowSchema { - format: fmt_cstr.into_raw(), - name: name_cstr.into_raw(), - metadata: ptr::null(), - flags, - n_children: 0, - children: ptr::null_mut(), - dictionary: ptr::null_mut(), - release: Some(release_arrow_schema), - private_data: ptr::null_mut(), - }) - }; - let arr_box = Box::new(arr); - let (imported, _) = import_from_c_owned(arr_box, schema_box); + let imported = import_array_zero_copy( + arr_box, + field.dtype.clone(), + &schema_c as *const ArrowSchema, + ); arrays.push(imported); } - // 3. Release the stream + // Utf8View is stored internally as String since the data is + // restructured during import from views to offsets+data. + if field.dtype == ArrowType::Utf8View { + field.dtype = ArrowType::String; + } + + // 3. Release the schema now that all chunks have been imported + if let Some(release) = schema_c.release { + release(&mut schema_c as *mut ArrowSchema); + } + + // 4. Release the stream if let Some(release) = (*stream).release { release(stream); } @@ -2464,6 +2460,10 @@ pub unsafe fn import_array_stream( /// Extracts a complete Field from an ArrowSchema, including metadata. /// +/// When the schema has a non-null `dictionary` pointer, the field is recognised +/// as dictionary-encoded and the returned ArrowType is `Dictionary(...)` rather +/// than the raw index type. +/// /// # Safety /// `schema` must point to a valid ArrowSchema with valid name and format pointers. unsafe fn field_from_c_schema(schema: &ArrowSchema) -> crate::Field { @@ -2476,7 +2476,29 @@ unsafe fn field_from_c_schema(schema: &ArrowSchema) -> crate::Field { }; let nullable = (schema.flags & 2) != 0; let fmt = unsafe { std::ffi::CStr::from_ptr(schema.format).to_bytes() }; - let dtype = parse_arrow_format(fmt); + + let dtype = if !schema.dictionary.is_null() { + // Dictionary-encoded: format string describes the index type, the + // dictionary field describes the value type. + use crate::ffi::arrow_dtype::CategoricalIndexType; + let index_type = match fmt { + #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))] + b"c" | b"C" => CategoricalIndexType::UInt8, + #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))] + b"s" | b"S" => CategoricalIndexType::UInt16, + b"i" | b"I" => CategoricalIndexType::UInt32, + #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))] + b"l" | b"L" => CategoricalIndexType::UInt64, + _ => panic!( + "Unsupported dictionary index format: {:?}", + std::str::from_utf8(fmt).unwrap_or("??") + ), + }; + ArrowType::Dictionary(index_type) + } else { + parse_arrow_format(fmt) + }; + let metadata = unsafe { decode_arrow_metadata(schema.metadata) }; crate::Field::new(name, dtype, nullable, metadata) } From 8f5f7139e5d4e3739bb46d3cb4305c701bc866b9 Mon Sep 17 00:00:00 2001 From: PB <37089506+pbower@users.noreply.github.com> Date: Sun, 15 Mar 2026 21:44:11 +0000 Subject: [PATCH 2/3] Improve categorical type handling for arrow-stream FFI --- pyo3/Cargo.lock | 8 +++----- pyo3/Cargo.toml | 2 +- src/ffi/arrow_c_ffi.rs | 32 +++++++++++++++++++++++++++----- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/pyo3/Cargo.lock b/pyo3/Cargo.lock index 709ca48..2150879 100644 --- a/pyo3/Cargo.lock +++ b/pyo3/Cargo.lock @@ -46,9 +46,7 @@ dependencies = [ [[package]] name = "minarrow" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dec83afe7627050e33e0dc8b8c343f26a55c26950456e3d98165ce3e860f6954" +version = "0.9.1" dependencies = [ "libc", "num-traits", @@ -224,6 +222,6 @@ checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" [[package]] name = "vec64" -version = "0.2.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83cdc51f4ec332213b4850d6fefaa78a365498739d5e5e5a7399f9641789e03d" +checksum = "98adb8aed5f3be39cddb565bf782bcb5798a6b1318d9e1b256575a08ed14b983" diff --git a/pyo3/Cargo.toml b/pyo3/Cargo.toml index 5eb9c15..08762c3 100644 --- a/pyo3/Cargo.toml +++ b/pyo3/Cargo.toml @@ -22,7 +22,7 @@ name = "minarrow_pyo3" crate-type = ["cdylib", "rlib"] [dependencies] -minarrow = { version = "0.8.1", features = ["large_string"] } +minarrow = { version = "0.9.1", path = "..", features = ["large_string"] } pyo3 = { version = "0.23" } thiserror = "2" diff --git a/src/ffi/arrow_c_ffi.rs b/src/ffi/arrow_c_ffi.rs index 01e8cdf..bdb5fec 100644 --- a/src/ffi/arrow_c_ffi.rs +++ b/src/ffi/arrow_c_ffi.rs @@ -842,8 +842,8 @@ pub unsafe fn import_from_c_owned( let is_dict = !arr.dictionary.is_null() || !sch.dictionary.is_null(); // For categorical (dictionary-encoded) types, the codes buffer is zero-copy - // via ForeignBuffer. Dictionary strings are copied due to structural mismatch - // between Arrow's contiguous offsets+data and MinArrow's Vec64. + // via ForeignBuffer. Dictionary strings are currently copied into Vec64, + // however this will be addressed in a future enhancement. if is_dict { drop(sch_box); let result = unsafe { @@ -1441,9 +1441,31 @@ unsafe fn import_categorical( let null_ptr = buffers[0]; let codes_ptr = buffers[1]; - // Import dictionary strings (always copied — structural mismatch between - // Arrow's contiguous offsets+data and MinArrow's Vec64). - let dict = unsafe { import_from_c(arr.dictionary as *const _, sch.dictionary as *const _) }; + // Import dictionary strings into Vec64. + // + // When the schema carries a dictionary pointer we use it directly. + // Some producers (e.g. pandas Series via __arrow_c_stream__) leave + // sch.dictionary null while still populating arr.dictionary. In that + // case we construct a synthetic UTF-8 schema so import_from_c can + // interpret the dictionary values array. + let synthetic_schema; + let dict_sch_ptr: *const ArrowSchema = if !sch.dictionary.is_null() { + sch.dictionary as *const _ + } else { + synthetic_schema = ArrowSchema { + format: b"u\0".as_ptr() as *const i8, + name: b"\0".as_ptr() as *const i8, + metadata: ptr::null(), + flags: 0, + n_children: 0, + children: ptr::null_mut(), + dictionary: ptr::null_mut(), + release: None, + private_data: ptr::null_mut(), + }; + &synthetic_schema as *const _ + }; + let dict = unsafe { import_from_c(arr.dictionary as *const _, dict_sch_ptr) }; let dict_strings = match dict.as_ref() { Array::TextArray(TextArray::String32(s)) => (0..s.len()) .map(|i| s.get(i).unwrap_or_default().to_string()) From 0755cd4723f96e2bcdf790359d8737e6e5a1b909 Mon Sep 17 00:00:00 2001 From: PB <37089506+pbower@users.noreply.github.com> Date: Sun, 15 Mar 2026 22:04:51 +0000 Subject: [PATCH 3/3] Bump version --- Cargo.lock | 2 +- Cargo.toml | 2 +- pyo3/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 603cd06..18088d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1659,7 +1659,7 @@ dependencies = [ [[package]] name = "minarrow" -version = "0.9.1" +version = "0.9.2" dependencies = [ "ahash", "arrow", diff --git a/Cargo.toml b/Cargo.toml index 886ad44..1683b11 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "minarrow" -version = "0.9.1" +version = "0.9.2" edition = "2024" authors = ['Peter G. Bower'] build = "build.rs" diff --git a/pyo3/Cargo.toml b/pyo3/Cargo.toml index 08762c3..ce6b928 100644 --- a/pyo3/Cargo.toml +++ b/pyo3/Cargo.toml @@ -22,7 +22,7 @@ name = "minarrow_pyo3" crate-type = ["cdylib", "rlib"] [dependencies] -minarrow = { version = "0.9.1", path = "..", features = ["large_string"] } +minarrow = { version = "0.9.2", path = "..", features = ["large_string"] } pyo3 = { version = "0.23" } thiserror = "2"