pbower · pbower · Mar 15, 2026 · Mar 8, 2026 · Mar 15, 2026 · Mar 15, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "minarrow"
-version = "0.9.1"
+version = "0.9.2"
 edition = "2024"
 authors = ['Peter G. Bower']
 build = "build.rs"

diff --git a/pyo3/Cargo.lock b/pyo3/Cargo.lock
diff --git a/pyo3/Cargo.toml b/pyo3/Cargo.toml
@@ -22,7 +22,7 @@ name = "minarrow_pyo3"
 crate-type = ["cdylib", "rlib"]
 
 [dependencies]
-minarrow = { version = "0.9.1", features = ["large_string"] }
+minarrow = { version = "0.9.2", path = "..", features = ["large_string"] }
 pyo3 = { version = "0.23" }
 thiserror = "2"
 

diff --git a/pyo3/src/ffi/to_rust.rs b/pyo3/src/ffi/to_rust.rs
@@ -224,22 +224,48 @@ fn capsule_to_ptr(capsule: &Bound<PyAny>, name: &std::ffi::CStr) -> PyMinarrowRe
 
 // Public import functions
 
-/// Converts a PyArrow Array (or any Arrow-compatible Python object) to a
-/// MinArrow FieldArray.
+/// Converts a PyArrow Array, pd.Series, pl.Series, or any Arrow-compatible
+/// Python object to a MinArrow FieldArray.
 ///
-/// Tries `__arrow_c_array__` first, then falls back to `_export_to_c`.
+/// Tries `__arrow_c_array__` first, then `__arrow_c_stream__` for objects
+/// that only expose a stream interface e.g. pandas and Polars Series, then
+/// falls back to the legacy `_export_to_c` approach.
 ///
 /// # Arguments
 /// * `obj` - A Python object implementing the Arrow array interface
 ///
 /// # Returns
 /// * `PyMinarrowResult<FieldArray>` - The converted MinArrow FieldArray
 pub fn array_to_rust(obj: &Bound<PyAny>) -> PyMinarrowResult<FieldArray> {
-    // Try PyCapsule protocol first
+    // Try PyCapsule array protocol first
     if let Some(result) = try_capsule_array(obj) {
         return result;
     }
 
+    // Try PyCapsule stream protocol for objects like pd.Series / pl.Series
+    // that expose __arrow_c_stream__ but not __arrow_c_array__
+    if let Some(result) = try_capsule_array_stream(obj) {
+        let (arrays, field) = result?;
+        if arrays.is_empty() {
+            return Ok(FieldArray::new(field, minarrow::Array::Null));
+        }
+        if arrays.len() == 1 {
+            let array = Arc::try_unwrap(arrays.into_iter().next().unwrap())
+                .unwrap_or_else(|arc| (*arc).clone());
+            return Ok(FieldArray::new(field, array));
+        }
+        // Concatenate multiple chunks into a single array
+        use minarrow::Concatenate;
+        let mut iter = arrays.into_iter();
+        let first = Arc::try_unwrap(iter.next().unwrap())
+            .unwrap_or_else(|arc| (*arc).clone());
+        let combined = iter.fold(first, |acc, chunk| {
+            let arr = Arc::try_unwrap(chunk).unwrap_or_else(|arc| (*arc).clone());
+            acc.concat(arr).expect("Failed to concatenate array chunks")
+        });
+        return Ok(FieldArray::new(field, combined));
+    }
+
     // Fall back to _export_to_c approach
     array_to_rust_c(obj)
 }

diff --git a/src/ffi/arrow_c_ffi.rs b/src/ffi/arrow_c_ffi.rs
@@ -842,8 +842,8 @@ pub unsafe fn import_from_c_owned(
     let is_dict = !arr.dictionary.is_null() || !sch.dictionary.is_null();
 
     // For categorical (dictionary-encoded) types, the codes buffer is zero-copy
-    // via ForeignBuffer. Dictionary strings are copied due to structural mismatch
-    // between Arrow's contiguous offsets+data and MinArrow's Vec64<String>.
+    // via ForeignBuffer. Dictionary strings are currently copied into Vec64<String>,
+    // however this will be addressed in a future enhancement.
     if is_dict {
         drop(sch_box);
         let result = unsafe {
@@ -1441,9 +1441,31 @@ unsafe fn import_categorical(
     let null_ptr = buffers[0];
     let codes_ptr = buffers[1];
 
-    // Import dictionary strings (always copied — structural mismatch between
-    // Arrow's contiguous offsets+data and MinArrow's Vec64<String>).
-    let dict = unsafe { import_from_c(arr.dictionary as *const _, sch.dictionary as *const _) };
+    // Import dictionary strings into Vec64<String>.
+    //
+    // When the schema carries a dictionary pointer we use it directly.
+    // Some producers (e.g. pandas Series via __arrow_c_stream__) leave
+    // sch.dictionary null while still populating arr.dictionary. In that
+    // case we construct a synthetic UTF-8 schema so import_from_c can
+    // interpret the dictionary values array.
+    let synthetic_schema;
+    let dict_sch_ptr: *const ArrowSchema = if !sch.dictionary.is_null() {
+        sch.dictionary as *const _
+    } else {
+        synthetic_schema = ArrowSchema {
+            format: b"u\0".as_ptr() as *const i8,
+            name: b"\0".as_ptr() as *const i8,
+            metadata: ptr::null(),
+            flags: 0,
+            n_children: 0,
+            children: ptr::null_mut(),
+            dictionary: ptr::null_mut(),
+            release: None,
+            private_data: ptr::null_mut(),
+        };
+        &synthetic_schema as *const _
+    };
+    let dict = unsafe { import_from_c(arr.dictionary as *const _, dict_sch_ptr) };
     let dict_strings = match dict.as_ref() {
         Array::TextArray(TextArray::String32(s)) => (0..s.len())
             .map(|i| s.get(i).unwrap_or_default().to_string())
@@ -2384,7 +2406,9 @@ pub unsafe fn import_record_batch_stream_with_metadata(
 /// imported arrays along with the field metadata.
 ///
 /// Uses the zero-copy import path since each yielded array is independently
-/// owned.
+/// owned. The stream schema is kept alive for the duration of the import so
+/// that dictionary-encoded arrays can reference it directly, avoiding the need
+/// to reconstruct synthetic schemas per chunk.
 ///
 /// # Safety
 /// `stream` must be a valid, non-null pointer to an initialised ArrowArrayStream.
@@ -2402,15 +2426,12 @@ pub unsafe fn import_array_stream(
             rc
         );
 
-        // Extract complete field including metadata before releasing the schema
-        let field = field_from_c_schema(&schema_c);
-
-        // Release the schema as we have extracted what we need
-        if let Some(release) = schema_c.release {
-            release(&mut schema_c as *mut ArrowSchema);
-        }
+        // Extract complete field including metadata
+        let mut field = field_from_c_schema(&schema_c);
 
-        // 2. Consume arrays
+        // 2. Consume arrays using the original stream schema directly.
+        // This is essential for dictionary-encoded arrays where import_categorical
+        // needs schema_c.dictionary to describe the dictionary value type.
         let mut arrays = Vec::new();
         let get_next = ((*stream).get_next).expect("stream has no get_next callback");
 
@@ -2427,30 +2448,27 @@ pub unsafe fn import_array_stream(
                 break;
             }
 
-            // Create schema for this array to import with ownership
-            let schema_box = {
-                let fmt_cstr = fmt_c(field.dtype.clone());
-                let name_cstr = CString::new(field.name.clone()).unwrap_or_default();
-                let flags = if field.nullable { 2 } else { 0 };
-                Box::new(ArrowSchema {
-                    format: fmt_cstr.into_raw(),
-                    name: name_cstr.into_raw(),
-                    metadata: ptr::null(),
-                    flags,
-                    n_children: 0,
-                    children: ptr::null_mut(),
-                    dictionary: ptr::null_mut(),
-                    release: Some(release_arrow_schema),
-                    private_data: ptr::null_mut(),
-                })
-            };
-
             let arr_box = Box::new(arr);
-            let (imported, _) = import_from_c_owned(arr_box, schema_box);
+            let imported = import_array_zero_copy(
+                arr_box,
+                field.dtype.clone(),
+                &schema_c as *const ArrowSchema,
+            );
             arrays.push(imported);
         }
 
-        // 3. Release the stream
+        // Utf8View is stored internally as String since the data is
+        // restructured during import from views to offsets+data.
+        if field.dtype == ArrowType::Utf8View {
+            field.dtype = ArrowType::String;
+        }
+
+        // 3. Release the schema now that all chunks have been imported
+        if let Some(release) = schema_c.release {
+            release(&mut schema_c as *mut ArrowSchema);
+        }
+
+        // 4. Release the stream
         if let Some(release) = (*stream).release {
             release(stream);
         }
@@ -2461,6 +2479,10 @@ pub unsafe fn import_array_stream(
 
 /// Extracts a complete Field from an ArrowSchema, including metadata.
 ///
+/// When the schema has a non-null `dictionary` pointer, the field is recognised
+/// as dictionary-encoded and the returned ArrowType is `Dictionary(...)` rather
+/// than the raw index type.
+///
 /// # Safety
 /// `schema` must point to a valid ArrowSchema with valid name and format pointers.
 unsafe fn field_from_c_schema(schema: &ArrowSchema) -> crate::Field {
@@ -2473,7 +2495,29 @@ unsafe fn field_from_c_schema(schema: &ArrowSchema) -> crate::Field {
     };
     let nullable = (schema.flags & 2) != 0;
     let fmt = unsafe { std::ffi::CStr::from_ptr(schema.format).to_bytes() };
-    let dtype = parse_arrow_format(fmt);
+
+    let dtype = if !schema.dictionary.is_null() {
+        // Dictionary-encoded: format string describes the index type, the
+        // dictionary field describes the value type.
+        use crate::ffi::arrow_dtype::CategoricalIndexType;
+        let index_type = match fmt {
+            #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))]
+            b"c" | b"C" => CategoricalIndexType::UInt8,
+            #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))]
+            b"s" | b"S" => CategoricalIndexType::UInt16,
+            b"i" | b"I" => CategoricalIndexType::UInt32,
+            #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))]
+            b"l" | b"L" => CategoricalIndexType::UInt64,
+            _ => panic!(
+                "Unsupported dictionary index format: {:?}",
+                std::str::from_utf8(fmt).unwrap_or("??")
+            ),
+        };
+        ArrowType::Dictionary(index_type)
+    } else {
+        parse_arrow_format(fmt)
+    };
+
     let metadata = unsafe { decode_arrow_metadata(schema.metadata) };
     crate::Field::new(name, dtype, nullable, metadata)
 }