Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "minarrow"
version = "0.9.1"
version = "0.9.2"
edition = "2024"
authors = ['Peter G. Bower']
build = "build.rs"
Expand Down
8 changes: 3 additions & 5 deletions pyo3/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyo3/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ name = "minarrow_pyo3"
crate-type = ["cdylib", "rlib"]

[dependencies]
minarrow = { version = "0.9.1", features = ["large_string"] }
minarrow = { version = "0.9.2", path = "..", features = ["large_string"] }
pyo3 = { version = "0.23" }
thiserror = "2"

Expand Down
34 changes: 30 additions & 4 deletions pyo3/src/ffi/to_rust.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,22 +224,48 @@ fn capsule_to_ptr(capsule: &Bound<PyAny>, name: &std::ffi::CStr) -> PyMinarrowRe

// Public import functions

/// Converts a PyArrow Array (or any Arrow-compatible Python object) to a
/// MinArrow FieldArray.
/// Converts a PyArrow Array, pd.Series, pl.Series, or any Arrow-compatible
/// Python object to a MinArrow FieldArray.
///
/// Tries `__arrow_c_array__` first, then falls back to `_export_to_c`.
/// Tries `__arrow_c_array__` first, then `__arrow_c_stream__` for objects
/// that only expose a stream interface e.g. pandas and Polars Series, then
/// falls back to the legacy `_export_to_c` approach.
///
/// # Arguments
/// * `obj` - A Python object implementing the Arrow array interface
///
/// # Returns
/// * `PyMinarrowResult<FieldArray>` - The converted MinArrow FieldArray
pub fn array_to_rust(obj: &Bound<PyAny>) -> PyMinarrowResult<FieldArray> {
// Try PyCapsule protocol first
// Try PyCapsule array protocol first
if let Some(result) = try_capsule_array(obj) {
return result;
}

// Try PyCapsule stream protocol for objects like pd.Series / pl.Series
// that expose __arrow_c_stream__ but not __arrow_c_array__
if let Some(result) = try_capsule_array_stream(obj) {
let (arrays, field) = result?;
if arrays.is_empty() {
return Ok(FieldArray::new(field, minarrow::Array::Null));
}
if arrays.len() == 1 {
let array = Arc::try_unwrap(arrays.into_iter().next().unwrap())
.unwrap_or_else(|arc| (*arc).clone());
return Ok(FieldArray::new(field, array));
}
// Concatenate multiple chunks into a single array
use minarrow::Concatenate;
let mut iter = arrays.into_iter();
let first = Arc::try_unwrap(iter.next().unwrap())
.unwrap_or_else(|arc| (*arc).clone());
let combined = iter.fold(first, |acc, chunk| {
let arr = Arc::try_unwrap(chunk).unwrap_or_else(|arc| (*arc).clone());
acc.concat(arr).expect("Failed to concatenate array chunks")
});
return Ok(FieldArray::new(field, combined));
}

// Fall back to _export_to_c approach
array_to_rust_c(obj)
}
Expand Down
114 changes: 79 additions & 35 deletions src/ffi/arrow_c_ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -842,8 +842,8 @@ pub unsafe fn import_from_c_owned(
let is_dict = !arr.dictionary.is_null() || !sch.dictionary.is_null();

// For categorical (dictionary-encoded) types, the codes buffer is zero-copy
// via ForeignBuffer. Dictionary strings are copied due to structural mismatch
// between Arrow's contiguous offsets+data and MinArrow's Vec64<String>.
// via ForeignBuffer. Dictionary strings are currently copied into Vec64<String>,
// however this will be addressed in a future enhancement.
if is_dict {
drop(sch_box);
let result = unsafe {
Expand Down Expand Up @@ -1441,9 +1441,31 @@ unsafe fn import_categorical(
let null_ptr = buffers[0];
let codes_ptr = buffers[1];

// Import dictionary strings (always copied — structural mismatch between
// Arrow's contiguous offsets+data and MinArrow's Vec64<String>).
let dict = unsafe { import_from_c(arr.dictionary as *const _, sch.dictionary as *const _) };
// Import dictionary strings into Vec64<String>.
//
// When the schema carries a dictionary pointer we use it directly.
// Some producers (e.g. pandas Series via __arrow_c_stream__) leave
// sch.dictionary null while still populating arr.dictionary. In that
// case we construct a synthetic UTF-8 schema so import_from_c can
// interpret the dictionary values array.
let synthetic_schema;
let dict_sch_ptr: *const ArrowSchema = if !sch.dictionary.is_null() {
sch.dictionary as *const _
} else {
synthetic_schema = ArrowSchema {
format: b"u\0".as_ptr() as *const i8,
name: b"\0".as_ptr() as *const i8,
metadata: ptr::null(),
flags: 0,
n_children: 0,
children: ptr::null_mut(),
dictionary: ptr::null_mut(),
release: None,
private_data: ptr::null_mut(),
};
&synthetic_schema as *const _
};
let dict = unsafe { import_from_c(arr.dictionary as *const _, dict_sch_ptr) };
let dict_strings = match dict.as_ref() {
Array::TextArray(TextArray::String32(s)) => (0..s.len())
.map(|i| s.get(i).unwrap_or_default().to_string())
Expand Down Expand Up @@ -2384,7 +2406,9 @@ pub unsafe fn import_record_batch_stream_with_metadata(
/// imported arrays along with the field metadata.
///
/// Uses the zero-copy import path since each yielded array is independently
/// owned.
/// owned. The stream schema is kept alive for the duration of the import so
/// that dictionary-encoded arrays can reference it directly, avoiding the need
/// to reconstruct synthetic schemas per chunk.
///
/// # Safety
/// `stream` must be a valid, non-null pointer to an initialised ArrowArrayStream.
Expand All @@ -2402,15 +2426,12 @@ pub unsafe fn import_array_stream(
rc
);

// Extract complete field including metadata before releasing the schema
let field = field_from_c_schema(&schema_c);

// Release the schema as we have extracted what we need
if let Some(release) = schema_c.release {
release(&mut schema_c as *mut ArrowSchema);
}
// Extract complete field including metadata
let mut field = field_from_c_schema(&schema_c);

// 2. Consume arrays
// 2. Consume arrays using the original stream schema directly.
// This is essential for dictionary-encoded arrays where import_categorical
// needs schema_c.dictionary to describe the dictionary value type.
let mut arrays = Vec::new();
let get_next = ((*stream).get_next).expect("stream has no get_next callback");

Expand All @@ -2427,30 +2448,27 @@ pub unsafe fn import_array_stream(
break;
}

// Create schema for this array to import with ownership
let schema_box = {
let fmt_cstr = fmt_c(field.dtype.clone());
let name_cstr = CString::new(field.name.clone()).unwrap_or_default();
let flags = if field.nullable { 2 } else { 0 };
Box::new(ArrowSchema {
format: fmt_cstr.into_raw(),
name: name_cstr.into_raw(),
metadata: ptr::null(),
flags,
n_children: 0,
children: ptr::null_mut(),
dictionary: ptr::null_mut(),
release: Some(release_arrow_schema),
private_data: ptr::null_mut(),
})
};

let arr_box = Box::new(arr);
let (imported, _) = import_from_c_owned(arr_box, schema_box);
let imported = import_array_zero_copy(
arr_box,
field.dtype.clone(),
&schema_c as *const ArrowSchema,
);
arrays.push(imported);
}

// 3. Release the stream
// Utf8View is stored internally as String since the data is
// restructured during import from views to offsets+data.
if field.dtype == ArrowType::Utf8View {
field.dtype = ArrowType::String;
}

// 3. Release the schema now that all chunks have been imported
if let Some(release) = schema_c.release {
release(&mut schema_c as *mut ArrowSchema);
}

// 4. Release the stream
if let Some(release) = (*stream).release {
release(stream);
}
Expand All @@ -2461,6 +2479,10 @@ pub unsafe fn import_array_stream(

/// Extracts a complete Field from an ArrowSchema, including metadata.
///
/// When the schema has a non-null `dictionary` pointer, the field is recognised
/// as dictionary-encoded and the returned ArrowType is `Dictionary(...)` rather
/// than the raw index type.
///
/// # Safety
/// `schema` must point to a valid ArrowSchema with valid name and format pointers.
unsafe fn field_from_c_schema(schema: &ArrowSchema) -> crate::Field {
Expand All @@ -2473,7 +2495,29 @@ unsafe fn field_from_c_schema(schema: &ArrowSchema) -> crate::Field {
};
let nullable = (schema.flags & 2) != 0;
let fmt = unsafe { std::ffi::CStr::from_ptr(schema.format).to_bytes() };
let dtype = parse_arrow_format(fmt);

let dtype = if !schema.dictionary.is_null() {
// Dictionary-encoded: format string describes the index type, the
// dictionary field describes the value type.
use crate::ffi::arrow_dtype::CategoricalIndexType;
let index_type = match fmt {
#[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))]
b"c" | b"C" => CategoricalIndexType::UInt8,
#[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))]
b"s" | b"S" => CategoricalIndexType::UInt16,
b"i" | b"I" => CategoricalIndexType::UInt32,
#[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))]
b"l" | b"L" => CategoricalIndexType::UInt64,
_ => panic!(
"Unsupported dictionary index format: {:?}",
std::str::from_utf8(fmt).unwrap_or("??")
),
};
ArrowType::Dictionary(index_type)
} else {
parse_arrow_format(fmt)
};

let metadata = unsafe { decode_arrow_metadata(schema.metadata) };
crate::Field::new(name, dtype, nullable, metadata)
}
Expand Down
Loading