diff --git a/src/enums/collections/numeric_array.rs b/src/enums/collections/numeric_array.rs index c72c238..3ca3213 100644 --- a/src/enums/collections/numeric_array.rs +++ b/src/enums/collections/numeric_array.rs @@ -16,7 +16,7 @@ use std::{ sync::Arc, }; -use crate::{Bitmask, FloatArray, IntegerArray, MaskedArray}; +use crate::{Bitmask, FloatArray, IntegerArray, MaskedArray, Vec64}; use crate::{BooleanArray, StringArray}; use crate::{ enums::{error::MinarrowError, shape_dim::ShapeDim}, @@ -535,6 +535,54 @@ impl NumericArray { } } + /// Cast this NumericArray to Float64, staying wrapped as NumericArray. + /// + /// If already Float64, returns self unchanged. Otherwise casts element + /// data to f64, preserving the null mask. Uses `Arc::try_unwrap` so that + /// if this is the sole owner of the backing Arc, the old data is consumed + /// and freed rather than cloned. + pub fn cow_into_f64(self) -> Self { + macro_rules! cast_arc { + ($arc:expr) => { + match Arc::try_unwrap($arc) { + Ok(owned) => { + let data: Vec64 = + owned.data.as_slice().iter().map(|&v| v as f64).collect(); + NumericArray::Float64(Arc::new(FloatArray::new(data, owned.null_mask))) + } + Err(shared) => { + let data: Vec64 = + shared.data.as_slice().iter().map(|&v| v as f64).collect(); + NumericArray::Float64(Arc::new(FloatArray::new( + data, + shared.null_mask.clone(), + ))) + } + } + }; + } + + match self { + NumericArray::Float64(_) => self, + NumericArray::Float32(arc) => cast_arc!(arc), + NumericArray::Int32(arc) => cast_arc!(arc), + NumericArray::Int64(arc) => cast_arc!(arc), + NumericArray::UInt32(arc) => cast_arc!(arc), + NumericArray::UInt64(arc) => cast_arc!(arc), + #[cfg(feature = "extended_numeric_types")] + NumericArray::Int8(arc) => cast_arc!(arc), + #[cfg(feature = "extended_numeric_types")] + NumericArray::Int16(arc) => cast_arc!(arc), + #[cfg(feature = "extended_numeric_types")] + NumericArray::UInt8(arc) => cast_arc!(arc), + #[cfg(feature = "extended_numeric_types")] + 
NumericArray::UInt16(arc) => cast_arc!(arc), + NumericArray::Null => { + NumericArray::Float64(Arc::new(FloatArray::new(Vec64::new(), None))) + } + } + } + /// Convert to FloatArray using From. pub fn f64(self) -> Result, MinarrowError> { match self { diff --git a/src/structs/buffer.rs b/src/structs/buffer.rs index 690b309..ef4ca33 100644 --- a/src/structs/buffer.rs +++ b/src/structs/buffer.rs @@ -200,6 +200,21 @@ impl Buffer { } } + /// Construct a zero-copy buffer as a window into a SharedBuffer. + /// + /// The buffer views the `len` elements of type T at `offset .. offset + len` + /// (end exclusive) within the shared allocation. `owner` is stored as given: callers + /// pass a clone of the SharedBuffer (a refcount bump) so the underlying memory stays alive. + /// + /// This is used by Matrix::to_table to create per-column FloatArray + /// buffers that all share the same contiguous allocation. + #[inline] + pub fn from_shared_column(owner: SharedBuffer, offset: usize, len: usize) -> Self { + Self { + storage: Storage::Shared { owner, offset, len }, + } + } + /// Construct a zero-copy buffer from an Arc-backed foreign allocation. /// /// Because all `Minarrow` types work off 64-byte alignment at the outset diff --git a/src/structs/matrix.rs b/src/structs/matrix.rs index dd6603c..9faf65c 100644 --- a/src/structs/matrix.rs +++ b/src/structs/matrix.rs @@ -4,15 +4,14 @@ //! BLAS/LAPACK compatible with built-inconversions from `Table` data. 
use std::fmt; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; use crate::enums::error::MinarrowError; use crate::enums::shape_dim::ShapeDim; +use crate::structs::buffer::Buffer; +use crate::structs::shared_buffer::SharedBuffer; use crate::traits::{concatenate::Concatenate, shape::Shape}; -use crate::{FloatArray, Vec64}; - -// Global counter for unnamed matrix instances -static UNNAMED_MATRIX_COUNTER: AtomicUsize = AtomicUsize::new(1); +use crate::{Array, Field, FieldArray, FloatArray, NumericArray, Table, Vec64}; /// # Matrix /// @@ -26,69 +25,86 @@ static UNNAMED_MATRIX_COUNTER: AtomicUsize = AtomicUsize::new(1); /// and is not part of the *`Apache Arrow`* framework**. /// /// ### Properties -/// - `n_rows`: Number of rows. +/// - `n_rows`: Logical number of rows. /// - `n_cols`: Number of columns. -/// - `data`: Flat buffer in column-major order. -/// - `name`: Optional matrix name (used for debugging, diagnostics, or pretty printing). +/// - `stride`: Physical elements per column in the buffer. Padded to 8-element +/// (64-byte) boundaries so every column starts SIMD-aligned. This is the +/// BLAS leading dimension (lda). Always `>= n_rows`. +/// - `data`: Flat buffer in column-major order with stride padding. +/// - `name`: Optional matrix name for diagnostics. /// /// ### Null handling /// - It is dense - nulls can be represented through `f64::NAN` /// - However this is not always reliable, as a single *NaN* can affect vectorised /// calculations when integrating with various frameworks. -/// -/// ### Under Development -/// ⚠️ **Unstable API and WIP: expect future development. Breaking changes will be minimised, -/// but avoid using this in production unless you are ready to wear API adjustments**. -/// Specifically, we are considering whether to make a 'logical columns' matrix for easy -/// access, but backed by a single buffer. 
This would provide the look/feel of a regular table -/// whilst keeping the implementation efficient and consistent with established frameworks, -/// at the cost of immutability. Consider this change likely. #[repr(C, align(64))] #[derive(Clone, PartialEq)] pub struct Matrix { pub n_rows: usize, pub n_cols: usize, + /// Physical column stride in elements, padded so each column is 64-byte aligned. + pub stride: usize, pub data: Vec64, - pub name: String, + pub name: Option, +} + +/// Number of f64 elements per 64-byte alignment boundary. +const ALIGN_ELEMS: usize = 64 / std::mem::size_of::(); // 8 + +/// Round up to next multiple of ALIGN_ELEMS for 64-byte column alignment. +#[inline] +const fn aligned_stride(n_rows: usize) -> usize { + (n_rows + ALIGN_ELEMS - 1) & !(ALIGN_ELEMS - 1) } impl Matrix { /// Constructs a new dense Matrix with shape and optional name. - /// Data buffer is zeroed. + /// Data buffer is zeroed. Columns are padded to 64-byte alignment. pub fn new(n_rows: usize, n_cols: usize, name: Option) -> Self { - let len = n_rows * n_cols; + let stride = aligned_stride(n_rows); + let len = stride * n_cols; let mut data = Vec64::with_capacity(len); data.0.resize(len, 0.0); - let name = name.unwrap_or_else(|| { - let id = UNNAMED_MATRIX_COUNTER.fetch_add(1, Ordering::Relaxed); - format!("UnnamedMatrix{}", id) - }); - Matrix { - n_rows, - n_cols, - data, - name, - } + Matrix { n_rows, n_cols, stride, data, name } } - /// Constructs a Matrix from a flat buffer (must be column-major order). - /// Panics if data length does not match shape. - pub fn from_flat(data: Vec64, n_rows: usize, n_cols: usize, name: Option) -> Self { + /// Constructs a Matrix from a pre-padded Vec64 buffer. + /// The buffer must already have `stride * n_cols` elements with the correct + /// stride layout. Use `from_f64_unaligned` if your data is unpadded. 
+ pub fn from_f64_aligned(data: Vec64, n_rows: usize, n_cols: usize, name: Option) -> Self { + let stride = aligned_stride(n_rows); assert_eq!( data.len(), + stride * n_cols, + "Matrix: padded buffer length does not match stride * n_cols" + ); + Matrix { n_rows, n_cols, stride, data, name } + } + + /// Constructs a Matrix from a flat column-major buffer without stride padding. + /// The data is re-laid out with 64-byte aligned column padding. + pub fn from_f64_unaligned(src: &[f64], n_rows: usize, n_cols: usize, name: Option) -> Self { + assert_eq!( + src.len(), n_rows * n_cols, "Matrix shape does not match buffer length" ); - let name = name.unwrap_or_else(|| { - let id = UNNAMED_MATRIX_COUNTER.fetch_add(1, Ordering::Relaxed); - format!("UnnamedMatrix{}", id) - }); - Matrix { - n_rows, - n_cols, - data, - name, + let stride = aligned_stride(n_rows); + if stride == n_rows { + // No padding needed: copy the borrowed slice as-is + let data = Vec64::from(src); + return Matrix { n_rows, n_cols, stride, data, name }; + } + // Re-layout with padding between columns + let mut data = Vec64::with_capacity(stride * n_cols); + data.0.resize(stride * n_cols, 0.0); + for col in 0..n_cols { + let src_start = col * n_rows; + let dst_start = col * stride; + data.as_mut_slice()[dst_start..dst_start + n_rows] + .copy_from_slice(&src[src_start..src_start + n_rows]); } + Matrix { n_rows, n_cols, stride, data, name } } /// Returns the value at (row, col) (0-based). Panics if out of bounds. @@ -96,7 +112,7 @@ impl Matrix { pub fn get(&self, row: usize, col: usize) -> f64 { debug_assert!(row < self.n_rows, "Row out of bounds"); debug_assert!(col < self.n_cols, "Col out of bounds"); - self.data[col * self.n_rows + row] + self.data[col * self.stride + row] } /// Sets the value at (row, col) (0-based). Panics if out of bounds. 
@@ -104,7 +120,7 @@ impl Matrix { pub fn set(&mut self, row: usize, col: usize, value: f64) { debug_assert!(row < self.n_rows, "Row out of bounds"); debug_assert!(col < self.n_cols, "Col out of bounds"); - self.data[col * self.n_rows + row] = value; + self.data[col * self.stride + row] = value; } /// Returns true if the matrix is empty. @@ -113,43 +129,43 @@ impl Matrix { self.n_rows == 0 || self.n_cols == 0 } - /// Returns the total number of elements. + /// Returns the logical number of elements (n_rows * n_cols), not including padding. #[inline] pub fn len(&self) -> usize { - self.data.len() + self.n_rows * self.n_cols } - /// Returns an immutable reference to the flat buffer. + /// Returns an immutable reference to the full flat buffer including padding. #[inline] pub fn as_slice(&self) -> &[f64] { &self.data } - /// Returns a mutable reference to the flat buffer. + /// Returns a mutable reference to the full flat buffer including padding. #[inline] pub fn as_mut_slice(&mut self) -> &mut [f64] { &mut self.data } - /// Returns a view of the matrix as a slice of columns. + /// Returns a view of the matrix as a slice of columns (logical rows only, no padding). pub fn columns(&self) -> Vec<&[f64]> { (0..self.n_cols) - .map(|col| &self.data[(col * self.n_rows)..((col + 1) * self.n_rows)]) + .map(|col| &self.data[(col * self.stride)..(col * self.stride + self.n_rows)]) .collect() } - /// Returns a vector of mutable slices, each corresponding to a column of the matrix. + /// Returns a vector of mutable slices, each corresponding to a column. pub fn columns_mut(&mut self) -> Vec<&mut [f64]> { let n_rows = self.n_rows; + let stride = self.stride; let n_cols = self.n_cols; let ptr = self.data.as_mut_slice().as_mut_ptr(); let mut result = Vec::with_capacity(n_cols); for col in 0..n_cols { - let start = col * n_rows; - // SAFETY: - // - Each slice is within bounds and non-overlapping, - // - We have exclusive &mut access to self. 
+ let start = col * stride; + // SAFETY: each slice is within bounds and non-overlapping, + // we have exclusive &mut access to self. unsafe { let col_ptr = ptr.add(start); let slice = std::slice::from_raw_parts_mut(col_ptr, n_rows); @@ -163,27 +179,28 @@ impl Matrix { #[inline] pub fn col(&self, col: usize) -> &[f64] { debug_assert!(col < self.n_cols, "Col out of bounds"); - &self.data[(col * self.n_rows)..((col + 1) * self.n_rows)] + &self.data[(col * self.stride)..(col * self.stride + self.n_rows)] } /// Returns a single column as a mutable slice, panics if col out of bounds. #[inline] pub fn col_mut(&mut self, col: usize) -> &mut [f64] { debug_assert!(col < self.n_cols, "Col out of bounds"); - &mut self.data[(col * self.n_rows)..((col + 1) * self.n_rows)] + let start = col * self.stride; + &mut self.data[start..start + self.n_rows] } /// Returns a single row as an owned Vec. #[inline] pub fn row(&self, row: usize) -> Vec { debug_assert!(row < self.n_rows, "Row out of bounds"); - (0..self.n_cols).map(|col| self.get(row, col)).collect() + (0..self.n_cols).map(|col| self.data[col * self.stride + row]).collect() } - /// Renames the matrix + /// Sets the matrix name. #[inline] pub fn set_name(&mut self, name: impl Into) { - self.name = name.into(); + self.name = Some(name.into()); } /// Returns the number of columns. @@ -196,6 +213,78 @@ impl Matrix { pub fn n_rows(&self) -> usize { self.n_rows } + + // ********************** BLAS/LAPACK Compatibility ************** + + /// Number of rows as i32 for BLAS parameter passing. + #[inline] + pub fn m(&self) -> i32 { + self.n_rows as i32 + } + + /// Number of columns as i32 for BLAS parameter passing. + #[inline] + pub fn n(&self) -> i32 { + self.n_cols as i32 + } + + /// Leading dimension for BLAS - equals stride, which is n_rows padded to + /// 64-byte alignment. Pass this as the `lda` parameter to all BLAS/LAPACK calls. 
+ #[inline] + pub fn lda(&self) -> i32 { + self.stride as i32 + } + + // ********************** Table conversion ********************** + + /// Convert this Matrix into a Table with zero-copy column sharing. + /// + /// The matrix data buffer is frozen into a SharedBuffer, and each column + /// becomes a FloatArray backed by a window into that shared allocation. + /// No data is copied. + /// + /// `fields` must have exactly `n_cols` entries, providing the name and + /// metadata for each column. + pub fn to_table(self, fields: Vec) -> Result { + if fields.len() != self.n_cols { + return Err(MinarrowError::ShapeError { + message: format!( + "to_table: expected {} fields for {} columns, got {}", + self.n_cols, self.n_cols, fields.len() + ), + }); + } + + let n_rows = self.n_rows; + let n_cols = self.n_cols; + let stride = self.stride; + + // Freeze the Vec64 into a SharedBuffer (zero-copy, refcounted) + // SAFETY: f64 is plain data with no drop logic + let shared = unsafe { SharedBuffer::from_vec64_typed(self.data) }; + + let mut cols = Vec::with_capacity(n_cols); + for (i, field) in fields.into_iter().enumerate() { + // Each column starts at i * stride elements, which is 64-byte aligned + let col_offset = i * stride; + let buf: Buffer = Buffer::from_shared_column(shared.clone(), col_offset, n_rows); + let float_arr = FloatArray::new(buf, None); + let array = Array::NumericArray(NumericArray::Float64(Arc::new(float_arr))); + cols.push(FieldArray::new(field, array)); + } + + Ok(Table::new(self.name.unwrap_or_default(), Some(cols))) + } + + /// Convert this Matrix into a Table using auto-generated column names + /// (col_0, col_1, ...). 
+ pub fn to_table_gen(self) -> Table { + let n_cols = self.n_cols; + let fields: Vec = (0..n_cols) + .map(|i| Field::new(format!("col_{}", i), crate::ffi::arrow_dtype::ArrowType::Float64, false, None)) + .collect(); + self.to_table(fields).unwrap() + } } impl Shape for Matrix { @@ -208,7 +297,7 @@ impl Shape for Matrix { } impl Concatenate for Matrix { - /// Concatenates two matrices vertically (row-wise stacking). + /// Concatenates two matrices vertically (i.e., row-wise stacking). /// /// # Requirements /// - Both matrices must have the same number of columns @@ -236,33 +325,30 @@ impl Concatenate for Matrix { return Ok(Matrix::new( 0, 0, - Some(format!("{}+{}", self.name, other.name)), + None, )); } let result_n_rows = self.n_rows + other.n_rows; let result_n_cols = self.n_cols; - let mut result_data = Vec64::with_capacity(result_n_rows * result_n_cols); + let result_stride = aligned_stride(result_n_rows); + let pad = result_stride - result_n_rows; + let mut result_data = Vec64::with_capacity(result_stride * result_n_cols); - // For each column, concatenate self's column with other's column - // Since data is stored column-major, each column is contiguous for col in 0..result_n_cols { - // Copy self's column - let self_col_start = col * self.n_rows; - let self_col_end = self_col_start + self.n_rows; - result_data.extend_from_slice(&self.data[self_col_start..self_col_end]); - - // Copy other's column - let other_col_start = col * other.n_rows; - let other_col_end = other_col_start + other.n_rows; - result_data.extend_from_slice(&other.data[other_col_start..other_col_end]); + result_data.extend_from_slice(self.col(col)); + result_data.extend_from_slice(other.col(col)); + if pad > 0 { + result_data.extend_from_slice(&[0.0; ALIGN_ELEMS][..pad]); + } } Ok(Matrix { n_rows: result_n_rows, n_cols: result_n_cols, + stride: result_stride, data: result_data, - name: format!("{}+{}", self.name, other.name), + name: None, }) } } @@ -272,8 +358,9 @@ impl fmt::Debug for 
Matrix { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, - "Matrix '{}': {} × {} [col-major]", - self.name, self.n_rows, self.n_cols + "Matrix{}: {} × {} [col-major]", + self.name.as_deref().map_or(String::new(), |n| format!(" '{}'", n)), + self.n_rows, self.n_cols )?; for row in 0..self.n_rows.min(6) { // Print up to 6 rows @@ -297,134 +384,130 @@ impl fmt::Debug for Matrix { } } -// From Vec> to Matrix (all cols must match length) -impl From<(Vec>, String)> for Matrix { - fn from((columns, name): (Vec>, String)) -> Self { +// From Vec> to unnamed Matrix +impl From>> for Matrix { + fn from(columns: Vec>) -> Self { let n_cols = columns.len(); let n_rows = columns.first().map(|c| c.data.len()).unwrap_or(0); + let stride = aligned_stride(n_rows); + let pad = stride - n_rows; for col in &columns { assert_eq!(col.data.len(), n_rows, "Column length mismatch"); } - let mut data = Vec64::with_capacity(n_rows * n_cols); + let mut data = Vec64::with_capacity(stride * n_cols); for col in &columns { data.extend_from_slice(&col.data); + if pad > 0 { + data.extend_from_slice(&[0.0; ALIGN_ELEMS][..pad]); + } } - Matrix { - n_rows, - n_cols, - data, - name, - } + Matrix { n_rows, n_cols, stride, data, name: None } + } +} + +// From (Vec>, String) to named Matrix +impl From<(Vec>, String)> for Matrix { + fn from((columns, name): (Vec>, String)) -> Self { + let mut mat = Matrix::from(columns); + mat.name = Some(name); + mat } } -// From &[FloatArray] to Matrix +// From &[FloatArray] to unnamed Matrix impl From<&[FloatArray]> for Matrix { fn from(columns: &[FloatArray]) -> Self { let n_cols = columns.len(); let n_rows = columns.first().map(|c| c.data.len()).unwrap_or(0); + let stride = aligned_stride(n_rows); + let pad = stride - n_rows; for col in columns { assert_eq!(col.data.len(), n_rows, "Column length mismatch"); } - let mut data = Vec64::with_capacity(n_rows * n_cols); + let mut data = Vec64::with_capacity(stride * n_cols); for col in columns { 
data.extend_from_slice(&col.data); + if pad > 0 { + data.extend_from_slice(&[0.0; ALIGN_ELEMS][..pad]); + } } - let name = { - let id = UNNAMED_MATRIX_COUNTER.fetch_add(1, Ordering::Relaxed); - format!("UnnamedMatrix{}", id) - }; - Matrix { - n_rows, - n_cols, - data, - name, + Matrix { n_rows, n_cols, stride, data, name: None } + } +} + +impl TryFrom<&Table> for Matrix { + type Error = MinarrowError; + + fn try_from(table: &Table) -> Result { + let name = if table.name.is_empty() { None } else { Some(table.name.clone()) }; + let n_cols = table.n_cols(); + let n_rows = table.n_rows; + let stride = aligned_stride(n_rows); + let pad = stride - n_rows; + + let mut data = Vec64::with_capacity(stride * n_cols); + for (col_idx, fa) in table.cols.iter().enumerate() { + let numeric = fa.array.num_ref().map_err(|_| MinarrowError::TypeError { + from: "non-numeric", + to: "Float64", + message: Some(format!("column {} is not numeric", col_idx)), + })?; + let f64_arr = numeric.clone().f64()?; + if f64_arr.data.len() != n_rows { + return Err(MinarrowError::ColumnLengthMismatch { + col: col_idx, + expected: n_rows, + found: f64_arr.data.len(), + }); + } + data.extend_from_slice(f64_arr.data.as_slice()); + if pad > 0 { + data.extend_from_slice(&[0.0; ALIGN_ELEMS][..pad]); + } } + + Ok(Matrix { n_rows, n_cols, stride, data, name }) } } -// TODO: Fix -// impl TryFrom<&Table> for Matrix { -// type Error = MinarrowError; - -// fn try_from(table: &Table) -> Result { -// let name = table.name.clone(); -// let n_cols = table.n_cols(); -// let n_rows = table.n_rows(); - -// // Collect and check columns -// let mut float_columns = Vec::with_capacity(n_cols); -// for fa in &table.cols { -// let numeric_array = fa.array.num(); -// let arr: FloatArray = numeric_array.f64()?; -// float_columns.push(arr); -// } - -// // Ensure all columns are the correct length -// for (col_idx, col) in float_columns.iter().enumerate() { -// if col.data.len() != n_rows { -// return 
Err(MinarrowError::ColumnLengthMismatch { -// col: col_idx, -// expected: n_rows, -// found: col.data.len() -// }); -// } -// } - -// // Flatten into single column-major Vec64 -// let mut data = Vec64::with_capacity(n_rows * n_cols); -// for col in &float_columns { -// data.0.extend_from_slice(&col.data); -// } - -// Ok(Matrix { n_rows, n_cols, data, name }) -// } -// } - -// From Vec> (Vec-of-cols) to Matrix (anonymous name) +impl TryFrom for Matrix { + type Error = MinarrowError; + + fn try_from(table: Table) -> Result { + Matrix::try_from(&table) + } +} + +// From &[Vec] (Vec-of-cols) to unnamed Matrix impl From<&[Vec]> for Matrix { fn from(columns: &[Vec]) -> Self { let n_cols = columns.len(); let n_rows = columns.first().map(|c| c.len()).unwrap_or(0); + let stride = aligned_stride(n_rows); + let pad = stride - n_rows; for col in columns { assert_eq!(col.len(), n_rows, "Column length mismatch"); } - let mut data = Vec64::with_capacity(n_rows * n_cols); + let mut data = Vec64::with_capacity(stride * n_cols); for col in columns { data.extend_from_slice(col); + if pad > 0 { + data.extend_from_slice(&[0.0; ALIGN_ELEMS][..pad]); + } } - let name = { - let id = UNNAMED_MATRIX_COUNTER.fetch_add(1, Ordering::Relaxed); - format!("UnnamedMatrix{}", id) - }; - Matrix { - n_rows, - n_cols, - data, - name, - } + Matrix { n_rows, n_cols, stride, data, name: None } } } -// From flat slice with shape +// From flat unpadded slice with shape - re-lays out with stride padding impl<'a> From<(&'a [f64], usize, usize, Option)> for Matrix { fn from((slice, n_rows, n_cols, name): (&'a [f64], usize, usize, Option)) -> Self { assert_eq!(slice.len(), n_rows * n_cols, "Slice shape mismatch"); - let data = Vec64::from(slice); - let name = name.unwrap_or_else(|| { - let id = UNNAMED_MATRIX_COUNTER.fetch_add(1, Ordering::Relaxed); - format!("UnnamedMatrix{}", id) - }); - Matrix { - n_rows, - n_cols, - data, - name, - } + Matrix::from_f64_unaligned(slice, n_rows, n_cols, name) } } -// 
===================== Iterators ====================== +// ********************** Iterators *********************** impl<'a> IntoIterator for &'a Matrix { type Item = &'a f64; diff --git a/src/structs/shared_buffer/mod.rs b/src/structs/shared_buffer/mod.rs index 1a2c1fc..88c2215 100644 --- a/src/structs/shared_buffer/mod.rs +++ b/src/structs/shared_buffer/mod.rs @@ -127,6 +127,26 @@ impl SharedBuffer { }, } } + /// Constructs a `SharedBuffer` from a SIMD-aligned Vec64, reinterpreting + /// the allocation as raw bytes. This is a zero-copy operation that transfers + /// ownership of the allocation to the SharedBuffer. + /// + /// Used by Matrix::to_table to freeze the matrix data buffer so that + /// per-column Buffer instances can share it via zero-copy windows. + /// + /// # Safety + /// T must be a plain data type with no drop logic. + pub unsafe fn from_vec64_typed(v: Vec64) -> Self { + let byte_len = v.len() * std::mem::size_of::(); + let byte_cap = v.0.capacity() * std::mem::size_of::(); + let ptr = v.0.as_ptr() as *mut u8; + std::mem::forget(v); + let raw_vec = unsafe { + Vec::from_raw_parts_in(ptr, byte_len, byte_cap, vec64::Vec64Alloc::default()) + }; + Self::from_vec64(Vec64(raw_vec)) + } + /// Constructs a `SharedBuffer` from an arbitrary owner (e.g. Arc<[u8]>, mmap, etc). /// /// The owner must implement `AsRef<[u8]> + Send + Sync + 'static`. diff --git a/src/structs/table.rs b/src/structs/table.rs index b211adf..b86ea21 100644 --- a/src/structs/table.rs +++ b/src/structs/table.rs @@ -234,6 +234,43 @@ impl Table { self.cols.iter().map(|fa| fa.field.name.as_str()).collect() } + /// Rename columns in place. Each pair is (old_name, new_name). + /// + /// Returns an error if any old name is not found. + /// This is metadata-only - array data is not touched. 
+ pub fn rename_columns( + &mut self, + mapping: &[(&str, &str)], + ) -> Result<(), MinarrowError> { + for &(old, _) in mapping { + if !self.cols.iter().any(|fa| fa.field.name == old) { + return Err(MinarrowError::IndexError(format!( + "rename_columns: column '{}' not found", + old + ))); + } + } + for col in &mut self.cols { + for &(old, new) in mapping { + if col.field.name == old { + let f = &col.field; + col.field = Arc::new(Field::new( + new, + f.dtype.clone(), + f.nullable, + if f.metadata.is_empty() { + None + } else { + Some(f.metadata.clone()) + }, + )); + break; + } + } + } + Ok(()) + } + /// Returns the index of a column by name. pub fn col_name_index(&self, name: &str) -> Option { self.cols.iter().position(|fa| fa.field.name == name) diff --git a/src/structs/views/collections/numeric_array_view.rs b/src/structs/views/collections/numeric_array_view.rs index 9d65f1c..f2080ce 100644 --- a/src/structs/views/collections/numeric_array_view.rs +++ b/src/structs/views/collections/numeric_array_view.rs @@ -38,7 +38,7 @@ use crate::structs::views::bitmask_view::BitmaskV; use crate::traits::concatenate::Concatenate; use crate::traits::print::MAX_PREVIEW; use crate::traits::shape::Shape; -use crate::{Array, ArrayV, FieldArray, MaskedArray, NumericArray}; +use crate::{Array, ArrayV, Bitmask, FieldArray, MaskedArray, NumericArray}; /// # NumericArrayView /// @@ -148,7 +148,13 @@ impl NumericArrayV { NumericArray::Float64(arr) => arr.get(phys_idx), NumericArray::Null => None, #[cfg(feature = "extended_numeric_types")] - _ => unreachable!("get_f64: not implemented for extended numeric types"), + NumericArray::Int8(arr) => arr.get(phys_idx).map(|v| v as f64), + #[cfg(feature = "extended_numeric_types")] + NumericArray::Int16(arr) => arr.get(phys_idx).map(|v| v as f64), + #[cfg(feature = "extended_numeric_types")] + NumericArray::UInt8(arr) => arr.get(phys_idx).map(|v| v as f64), + #[cfg(feature = "extended_numeric_types")] + NumericArray::UInt16(arr) => 
arr.get(phys_idx).map(|v| v as f64), } } @@ -168,7 +174,13 @@ impl NumericArrayV { NumericArray::Float64(arr) => unsafe { arr.get_unchecked(phys_idx) }, NumericArray::Null => None, #[cfg(feature = "extended_numeric_types")] - _ => unreachable!("get_f64_unchecked: not implemented for extended numeric types"), + NumericArray::Int8(arr) => unsafe { arr.get_unchecked(phys_idx) }.map(|v| v as f64), + #[cfg(feature = "extended_numeric_types")] + NumericArray::Int16(arr) => unsafe { arr.get_unchecked(phys_idx) }.map(|v| v as f64), + #[cfg(feature = "extended_numeric_types")] + NumericArray::UInt8(arr) => unsafe { arr.get_unchecked(phys_idx) }.map(|v| v as f64), + #[cfg(feature = "extended_numeric_types")] + NumericArray::UInt16(arr) => unsafe { arr.get_unchecked(phys_idx) }.map(|v| v as f64), } } @@ -248,6 +260,39 @@ impl NumericArrayV { pub fn set_null_count(&self, count: usize) -> Result<(), usize> { self.null_count.set(count).map_err(|_| count) } + + /// Guarantees the backing array is Float64, then returns the f64 slice, + /// null mask, and null count for this view's window. + /// + /// **If already Float64, this is a pass-through.** Otherwise the full backing + /// NumericArray is cast to Float64 via [`NumericArray::cow_into_f64`], + /// preserving the window offset and length. + /// + /// When multiple views share the same backing array, the first view to + /// call this will trigger the cast. The cast always writes a new f64 buffer: + /// if this view holds the sole Arc reference, the old array is consumed and + /// freed; if other references exist, the shared original is left untouched. + /// Subsequent views that still reference the original will cast independently + /// when they reach this call, so it is generally best avoided in such contexts: + /// every independent window view would allocate its own f64 copy. 
+ pub fn guarantee_f64(&mut self) -> (&[f64], Option<&Bitmask>, Option) { + if !matches!(&self.array, NumericArray::Float64(_)) { + // Take the old array out, leaving Null as placeholder + let old = std::mem::take(&mut self.array); + self.array = old.cow_into_f64(); + } + // Safe: the branch above guarantees Float64 at this point + let NumericArray::Float64(arr) = &self.array else { unreachable!() }; + let slice = &arr.data.as_slice()[self.offset..self.offset + self.len]; + let mask = arr.null_mask.as_ref(); + let nc = if mask.is_some() { + Some(self.null_count()) + } else { + None + }; + (slice, mask, nc) + } + } impl From for NumericArrayV { diff --git a/src/structs/views/table_view.rs b/src/structs/views/table_view.rs index 2fd44f7..a7e10d4 100644 --- a/src/structs/views/table_view.rs +++ b/src/structs/views/table_view.rs @@ -210,6 +210,50 @@ impl TableV { self.fields.iter().map(|f| f.name.as_str()).collect() } + /// Rename columns in place. Each pair is (old_name, new_name). + /// + /// Returns an error if any old name is not found. + /// This is metadata-only - array data is not touched. If the Arc + /// is the sole reference it is mutated in place, otherwise a new Arc is + /// created with the renamed field. + pub fn rename_columns( + &mut self, + mapping: &[(&str, &str)], + ) -> Result<(), MinarrowError> { + for &(old, _) in mapping { + if !self.fields.iter().any(|f| f.name == old) { + return Err(MinarrowError::IndexError(format!( + "rename_columns: column '{}' not found", + old + ))); + } + } + for field_arc in &mut self.fields { + for &(old, new) in mapping { + if field_arc.name == old { + match Arc::get_mut(field_arc) { + Some(f) => f.name = new.to_string(), + None => { + let f = field_arc.as_ref(); + *field_arc = Arc::new(Field::new( + new, + f.dtype.clone(), + f.nullable, + if f.metadata.is_empty() { + None + } else { + Some(f.metadata.clone()) + }, + )); + } + } + break; + } + } + } + Ok(()) + } + /// Returns the index of a column by name. 
#[inline] pub fn col_name_index(&self, name: &str) -> Option {