From 51fe78fdedf2432bd57b9f55c7282f88e6a9c674 Mon Sep 17 00:00:00 2001
From: PB <37089506+pbower@users.noreply.github.com>
Date: Sun, 8 Mar 2026 11:24:13 +0000
Subject: [PATCH 1/3] Improve Pandas FFI compatibility

---
 pyo3/src/ffi/to_rust.rs | 34 +++++++++++++++--
 src/ffi/arrow_c_ffi.rs  | 82 ++++++++++++++++++++++++++---------------
 2 files changed, 82 insertions(+), 34 deletions(-)
diff --git a/pyo3/src/ffi/to_rust.rs b/pyo3/src/ffi/to_rust.rs
index 89f61f2..07ae3de 100644
--- a/pyo3/src/ffi/to_rust.rs
+++ b/pyo3/src/ffi/to_rust.rs
@@ -224,10 +224,12 @@ fn capsule_to_ptr(capsule: &Bound<PyAny>, name: &std::ffi::CStr) -> PyMinarrowRe
 
 // Public import functions
 
-/// Converts a PyArrow Array (or any Arrow-compatible Python object) to a
-/// MinArrow FieldArray.
+/// Converts a PyArrow Array, pd.Series, pl.Series, or any Arrow-compatible
+/// Python object to a MinArrow FieldArray.
 ///
-/// Tries `__arrow_c_array__` first, then falls back to `_export_to_c`.
+/// Tries `__arrow_c_array__` first, then `__arrow_c_stream__` for objects
+/// that only expose a stream interface e.g. pandas and Polars Series, then
+/// falls back to the legacy `_export_to_c` approach.
 ///
 /// # Arguments
 /// * `obj` - A Python object implementing the Arrow array interface
@@ -235,11 +237,35 @@ fn capsule_to_ptr(capsule: &Bound<PyAny>, name: &std::ffi::CStr) -> PyMinarrowRe
 /// # Returns
 /// * `PyMinarrowResult<FieldArray>` - The converted MinArrow FieldArray
 pub fn array_to_rust(obj: &Bound<PyAny>) -> PyMinarrowResult<FieldArray> {
-    // Try PyCapsule protocol first
+    // Try PyCapsule array protocol first
     if let Some(result) = try_capsule_array(obj) {
         return result;
     }
 
+    // Try PyCapsule stream protocol for objects like pd.Series / pl.Series
+    // that expose __arrow_c_stream__ but not __arrow_c_array__
+    if let Some(result) = try_capsule_array_stream(obj) {
+        let (arrays, field) = result?;
+        if arrays.is_empty() {
+            return Ok(FieldArray::new(field, minarrow::Array::Null));
+        }
+        if arrays.len() == 1 {
+            let array = Arc::try_unwrap(arrays.into_iter().next().unwrap())
+                .unwrap_or_else(|arc| (*arc).clone());
+            return Ok(FieldArray::new(field, array));
+        }
+        // Concatenate multiple chunks into a single array
+        use minarrow::Concatenate;
+        let mut iter = arrays.into_iter();
+        let first = Arc::try_unwrap(iter.next().unwrap())
+            .unwrap_or_else(|arc| (*arc).clone());
+        let combined = iter.fold(first, |acc, chunk| {
+            let arr = Arc::try_unwrap(chunk).unwrap_or_else(|arc| (*arc).clone());
+            acc.concat(arr).expect("Failed to concatenate array chunks")
+        });
+        return Ok(FieldArray::new(field, combined));
+    }
+
     // Fall back to _export_to_c approach
     array_to_rust_c(obj)
 }
diff --git a/src/ffi/arrow_c_ffi.rs b/src/ffi/arrow_c_ffi.rs
index e80b2f5..01e8cdf 100644
--- a/src/ffi/arrow_c_ffi.rs
+++ b/src/ffi/arrow_c_ffi.rs
@@ -2387,7 +2387,9 @@ pub unsafe fn import_record_batch_stream_with_metadata(
 /// imported arrays along with the field metadata.
 ///
 /// Uses the zero-copy import path since each yielded array is independently
-/// owned.
+/// owned. The stream schema is kept alive for the duration of the import so
+/// that dictionary-encoded arrays can reference it directly, avoiding the need
+/// to reconstruct synthetic schemas per chunk.
 ///
 /// # Safety
 /// `stream` must be a valid, non-null pointer to an initialised ArrowArrayStream.
@@ -2405,15 +2407,12 @@ pub unsafe fn import_array_stream(
             rc
         );
 
-        // Extract complete field including metadata before releasing the schema
-        let field = field_from_c_schema(&schema_c);
+        // Extract complete field including metadata
+        let mut field = field_from_c_schema(&schema_c);
 
-        // Release the schema as we have extracted what we need
-        if let Some(release) = schema_c.release {
-            release(&mut schema_c as *mut ArrowSchema);
-        }
-
-        // 2. Consume arrays
+        // 2. Consume arrays using the original stream schema directly.
+        // This is essential for dictionary-encoded arrays where import_categorical
+        // needs schema_c.dictionary to describe the dictionary value type.
         let mut arrays = Vec::new();
         let get_next = ((*stream).get_next).expect("stream has no get_next callback");
 
@@ -2430,30 +2429,27 @@ pub unsafe fn import_array_stream(
                 break;
             }
 
-            // Create schema for this array to import with ownership
-            let schema_box = {
-                let fmt_cstr = fmt_c(field.dtype.clone());
-                let name_cstr = CString::new(field.name.clone()).unwrap_or_default();
-                let flags = if field.nullable { 2 } else { 0 };
-                Box::new(ArrowSchema {
-                    format: fmt_cstr.into_raw(),
-                    name: name_cstr.into_raw(),
-                    metadata: ptr::null(),
-                    flags,
-                    n_children: 0,
-                    children: ptr::null_mut(),
-                    dictionary: ptr::null_mut(),
-                    release: Some(release_arrow_schema),
-                    private_data: ptr::null_mut(),
-                })
-            };
-
             let arr_box = Box::new(arr);
-            let (imported, _) = import_from_c_owned(arr_box, schema_box);
+            let imported = import_array_zero_copy(
+                arr_box,
+                field.dtype.clone(),
+                &schema_c as *const ArrowSchema,
+            );
             arrays.push(imported);
         }
 
-        // 3. Release the stream
+        // Utf8View is stored internally as String since the data is
+        // restructured during import from views to offsets+data.
+        if field.dtype == ArrowType::Utf8View {
+            field.dtype = ArrowType::String;
+        }
+
+        // 3. Release the schema now that all chunks have been imported
+        if let Some(release) = schema_c.release {
+            release(&mut schema_c as *mut ArrowSchema);
+        }
+
+        // 4. Release the stream
         if let Some(release) = (*stream).release {
             release(stream);
         }
@@ -2464,6 +2460,10 @@ pub unsafe fn import_array_stream(
 
 /// Extracts a complete Field from an ArrowSchema, including metadata.
 ///
+/// When the schema has a non-null `dictionary` pointer, the field is recognised
+/// as dictionary-encoded and the returned ArrowType is `Dictionary(...)` rather
+/// than the raw index type.
+///
 /// # Safety
 /// `schema` must point to a valid ArrowSchema with valid name and format pointers.
 unsafe fn field_from_c_schema(schema: &ArrowSchema) -> crate::Field {
@@ -2476,7 +2476,29 @@ unsafe fn field_from_c_schema(schema: &ArrowSchema) -> crate::Field {
     };
     let nullable = (schema.flags & 2) != 0;
     let fmt = unsafe { std::ffi::CStr::from_ptr(schema.format).to_bytes() };
-    let dtype = parse_arrow_format(fmt);
+
+    let dtype = if !schema.dictionary.is_null() {
+        // Dictionary-encoded: format string describes the index type, the
+        // dictionary field describes the value type.
+        use crate::ffi::arrow_dtype::CategoricalIndexType;
+        let index_type = match fmt {
+            #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))]
+            b"c" | b"C" => CategoricalIndexType::UInt8,
+            #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))]
+            b"s" | b"S" => CategoricalIndexType::UInt16,
+            b"i" | b"I" => CategoricalIndexType::UInt32,
+            #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))]
+            b"l" | b"L" => CategoricalIndexType::UInt64,
+            _ => panic!(
+                "Unsupported dictionary index format: {:?}",
+                std::str::from_utf8(fmt).unwrap_or("??")
+            ),
+        };
+        ArrowType::Dictionary(index_type)
+    } else {
+        parse_arrow_format(fmt)
+    };
+
     let metadata = unsafe { decode_arrow_metadata(schema.metadata) };
     crate::Field::new(name, dtype, nullable, metadata)
 }

From 8f5f7139e5d4e3739bb46d3cb4305c701bc866b9 Mon Sep 17 00:00:00 2001
From: PB <37089506+pbower@users.noreply.github.com>
Date: Sun, 15 Mar 2026 21:44:11 +0000
Subject: [PATCH 2/3] Improve categorical type handling for arrow-stream FFI

---
 pyo3/Cargo.lock        |  8 +++-----
 pyo3/Cargo.toml        |  2 +-
 src/ffi/arrow_c_ffi.rs | 32 +++++++++++++++++++++++++++-----
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/pyo3/Cargo.lock b/pyo3/Cargo.lock
index 709ca48..2150879 100644
--- a/pyo3/Cargo.lock
+++ b/pyo3/Cargo.lock
@@ -46,9 +46,7 @@ dependencies = [
 
 [[package]]
 name = "minarrow"
-version = "0.8.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dec83afe7627050e33e0dc8b8c343f26a55c26950456e3d98165ce3e860f6954"
+version = "0.9.1"
 dependencies = [
  "libc",
  "num-traits",
@@ -224,6 +222,6 @@ checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
 
 [[package]]
 name = "vec64"
-version = "0.2.1"
+version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83cdc51f4ec332213b4850d6fefaa78a365498739d5e5e5a7399f9641789e03d"
+checksum = "98adb8aed5f3be39cddb565bf782bcb5798a6b1318d9e1b256575a08ed14b983"
diff --git a/pyo3/Cargo.toml b/pyo3/Cargo.toml
index 5eb9c15..08762c3 100644
--- a/pyo3/Cargo.toml
+++ b/pyo3/Cargo.toml
@@ -22,7 +22,7 @@ name = "minarrow_pyo3"
 crate-type = ["cdylib", "rlib"]
 
 [dependencies]
-minarrow = { version = "0.8.1", features = ["large_string"] }
+minarrow = { version = "0.9.1", path = "..", features = ["large_string"] }
 pyo3 = { version = "0.23" }
 thiserror = "2"
 
diff --git a/src/ffi/arrow_c_ffi.rs b/src/ffi/arrow_c_ffi.rs
index 01e8cdf..bdb5fec 100644
--- a/src/ffi/arrow_c_ffi.rs
+++ b/src/ffi/arrow_c_ffi.rs
@@ -842,8 +842,8 @@ pub unsafe fn import_from_c_owned(
     let is_dict = !arr.dictionary.is_null() || !sch.dictionary.is_null();
 
     // For categorical (dictionary-encoded) types, the codes buffer is zero-copy
-    // via ForeignBuffer. Dictionary strings are copied due to structural mismatch
-    // between Arrow's contiguous offsets+data and MinArrow's Vec64<String>.
+    // via ForeignBuffer. Dictionary strings are currently copied into Vec64<String>,
+    // however this will be addressed in a future enhancement.
     if is_dict {
         drop(sch_box);
         let result = unsafe {
@@ -1441,9 +1441,31 @@ unsafe fn import_categorical(
     let null_ptr = buffers[0];
     let codes_ptr = buffers[1];
 
-    // Import dictionary strings (always copied — structural mismatch between
-    // Arrow's contiguous offsets+data and MinArrow's Vec64<String>).
-    let dict = unsafe { import_from_c(arr.dictionary as *const _, sch.dictionary as *const _) };
+    // Import dictionary strings into Vec64<String>.
+    //
+    // When the schema carries a dictionary pointer we use it directly.
+    // Some producers (e.g. pandas Series via __arrow_c_stream__) leave
+    // sch.dictionary null while still populating arr.dictionary. In that
+    // case we construct a synthetic UTF-8 schema so import_from_c can
+    // interpret the dictionary values array.
+    let synthetic_schema;
+    let dict_sch_ptr: *const ArrowSchema = if !sch.dictionary.is_null() {
+        sch.dictionary as *const _
+    } else {
+        synthetic_schema = ArrowSchema {
+            format: b"u\0".as_ptr() as *const i8,
+            name: b"\0".as_ptr() as *const i8,
+            metadata: ptr::null(),
+            flags: 0,
+            n_children: 0,
+            children: ptr::null_mut(),
+            dictionary: ptr::null_mut(),
+            release: None,
+            private_data: ptr::null_mut(),
+        };
+        &synthetic_schema as *const _
+    };
+    let dict = unsafe { import_from_c(arr.dictionary as *const _, dict_sch_ptr) };
     let dict_strings = match dict.as_ref() {
         Array::TextArray(TextArray::String32(s)) => (0..s.len())
             .map(|i| s.get(i).unwrap_or_default().to_string())

From 0755cd4723f96e2bcdf790359d8737e6e5a1b909 Mon Sep 17 00:00:00 2001
From: PB <37089506+pbower@users.noreply.github.com>
Date: Sun, 15 Mar 2026 22:04:51 +0000
Subject: [PATCH 3/3] Bump version

---
 Cargo.lock      | 2 +-
 Cargo.toml      | 2 +-
 pyo3/Cargo.toml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 603cd06..18088d3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1659,7 +1659,7 @@ dependencies = [
 
 [[package]]
 name = "minarrow"
-version = "0.9.1"
+version = "0.9.2"
 dependencies = [
  "ahash",
  "arrow",
diff --git a/Cargo.toml b/Cargo.toml
index 886ad44..1683b11 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "minarrow"
-version = "0.9.1"
+version = "0.9.2"
 edition = "2024"
 authors = ['Peter G. Bower']
 build = "build.rs"
diff --git a/pyo3/Cargo.toml b/pyo3/Cargo.toml
index 08762c3..ce6b928 100644
--- a/pyo3/Cargo.toml
+++ b/pyo3/Cargo.toml
@@ -22,7 +22,7 @@ name = "minarrow_pyo3"
 crate-type = ["cdylib", "rlib"]
 
 [dependencies]
-minarrow = { version = "0.9.1", path = "..", features = ["large_string"] }
+minarrow = { version = "0.9.2", path = "..", features = ["large_string"] }
 pyo3 = { version = "0.23" }
 thiserror = "2"