diff --git a/src/enums/array.rs b/src/enums/array.rs
index d77c006..886646d 100644
--- a/src/enums/array.rs
+++ b/src/enums/array.rs
@@ -2759,6 +2759,50 @@ impl Array {
         }
     }
 
+    /// Appends the rows `[offset..offset+len)` of `other` onto the end of `self`.
+    ///
+    /// Dispatches on the variant pair and forwards to the inner `append_range`,
+    /// which extends the data and null mask from the source range. A `Null`
+    /// paired with a `Null` is a no-op.
+    ///
+    /// # Errors
+    /// Returns `MinarrowError::TypeError` for any variant pair not handled here
+    /// (for example mismatched variants); errors raised by the inner
+    /// `append_range` are passed through unchanged.
+    pub fn concat_array_range(
+        &mut self,
+        other: &Self,
+        offset: usize,
+        len: usize,
+    ) -> Result<(), MinarrowError> {
+        match (self, other) {
+            (Self::NumericArray(dst), Self::NumericArray(src)) => {
+                dst.append_range(src, offset, len)
+            }
+            (Self::BooleanArray(dst), Self::BooleanArray(src)) => {
+                Arc::make_mut(dst).append_range(src, offset, len)
+            }
+            (Self::TextArray(dst), Self::TextArray(src)) => dst.append_range(src, offset, len),
+            #[cfg(feature = "datetime")]
+            (Self::TemporalArray(dst), Self::TemporalArray(src)) => {
+                dst.append_range(src, offset, len)
+            }
+            (Self::Null, Self::Null) => Ok(()),
+            (dst, src) => {
+                let message = format!(
+                    "Cannot append_range {:?} into {:?}",
+                    src.arrow_type(),
+                    dst.arrow_type()
+                );
+                Err(MinarrowError::TypeError {
+                    from: "Array",
+                    to: "Array",
+                    message: Some(message),
+                })
+            }
+        }
+    }
+
     /// Inserts all values (and null mask if present) from `other` into `self` at the specified index.
     ///
     /// This is an **O(n)** operation.
diff --git a/src/enums/collections/numeric_array.rs b/src/enums/collections/numeric_array.rs
index 3ca3213..3d9159e 100644
--- a/src/enums/collections/numeric_array.rs
+++ b/src/enums/collections/numeric_array.rs
@@ -167,6 +167,48 @@ impl NumericArray {
         }
     }
 
+    /// Appends the rows `[offset..offset+len)` of `other` onto the end of `self`.
+    ///
+    /// Both sides must hold the same numeric variant; the call is forwarded to
+    /// the inner array's `append_range` through `Arc::make_mut`. A `Null`
+    /// paired with a `Null` is a no-op.
+    ///
+    /// # Errors
+    /// Returns `MinarrowError::TypeError` for any other variant pair, and passes
+    /// through whatever the inner `append_range` returns.
+    pub fn append_range(
+        &mut self,
+        other: &Self,
+        offset: usize,
+        len: usize,
+    ) -> Result<(), MinarrowError> {
+        match (self, other) {
+            #[cfg(feature = "extended_numeric_types")]
+            (Self::Int8(d), Self::Int8(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            #[cfg(feature = "extended_numeric_types")]
+            (Self::Int16(d), Self::Int16(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            (Self::Int32(d), Self::Int32(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            (Self::Int64(d), Self::Int64(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            #[cfg(feature = "extended_numeric_types")]
+            (Self::UInt8(d), Self::UInt8(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            #[cfg(feature = "extended_numeric_types")]
+            (Self::UInt16(d), Self::UInt16(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            (Self::UInt32(d), Self::UInt32(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            (Self::UInt64(d), Self::UInt64(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            (Self::Float32(d), Self::Float32(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            (Self::Float64(d), Self::Float64(s)) => Arc::make_mut(d).append_range(s, offset, len),
+            (Self::Null, Self::Null) => Ok(()),
+            (dst, src) => {
+                let message = format!("Cannot append_range {:?} into {:?}", src, dst);
+                Err(MinarrowError::TypeError {
+                    from: "NumericArray",
+                    to: "NumericArray",
+                    message: Some(message),
+                })
+            }
+        }
+    }
+
     /// Inserts all values (and null mask if present) from `other` into `self` at the specified index.
     ///
     /// This is an **O(n)** operation.
diff --git a/src/enums/collections/temporal_array.rs b/src/enums/collections/temporal_array.rs
index 1b9a093..5f5f6ea 100644
--- a/src/enums/collections/temporal_array.rs
+++ b/src/enums/collections/temporal_array.rs
@@ -122,6 +122,36 @@ impl TemporalArray {
         }
     }
 
+    /// Appends the rows `[offset..offset+len)` of `other` onto the end of `self`.
+    ///
+    /// Both sides must hold the same temporal variant; `Null` paired with
+    /// `Null` is a no-op.
+    ///
+    /// # Errors
+    /// Returns `MinarrowError::TypeError` for any other variant pair, and passes
+    /// through whatever the inner `append_range` returns.
+    pub fn append_range(
+        &mut self,
+        other: &Self,
+        offset: usize,
+        len: usize,
+    ) -> Result<(), MinarrowError> {
+        match (self, other) {
+            (Self::Datetime32(a), Self::Datetime32(b)) => {
+                Arc::make_mut(a).append_range(b, offset, len)
+            }
+            (Self::Datetime64(a), Self::Datetime64(b)) => {
+                Arc::make_mut(a).append_range(b, offset, len)
+            }
+            (Self::Null, Self::Null) => Ok(()),
+            (lhs, rhs) => Err(MinarrowError::TypeError {
+                from: "TemporalArray",
+                to: "TemporalArray",
+                message: Some(format!("Cannot append_range {:?} into {:?}", rhs, lhs)),
+            }),
+        }
+    }
+
     /// Inserts all values (and null mask if present) from `other` into `self` at the specified index.
     ///
     /// This is an **O(n)** operation.
diff --git a/src/enums/collections/text_array.rs b/src/enums/collections/text_array.rs
index 44e2c14..0a2efe2 100644
--- a/src/enums/collections/text_array.rs
+++ b/src/enums/collections/text_array.rs
@@ -146,6 +146,52 @@ impl TextArray {
         }
     }
 
+    /// Appends the rows `[offset..offset+len)` of `other` onto the end of `self`.
+    ///
+    /// Both sides must hold the same text variant; `Null` paired with `Null`
+    /// is a no-op.
+    ///
+    /// # Errors
+    /// Returns `MinarrowError::TypeError` for any other variant pair, and passes
+    /// through whatever the inner `append_range` returns.
+    pub fn append_range(
+        &mut self,
+        other: &Self,
+        offset: usize,
+        len: usize,
+    ) -> Result<(), MinarrowError> {
+        match (self, other) {
+            (Self::String32(a), Self::String32(b)) => {
+                Arc::make_mut(a).append_range(b, offset, len)
+            }
+            #[cfg(feature = "large_string")]
+            (Self::String64(a), Self::String64(b)) => {
+                Arc::make_mut(a).append_range(b, offset, len)
+            }
+            #[cfg(feature = "extended_categorical")]
+            (Self::Categorical8(a), Self::Categorical8(b)) => {
+                Arc::make_mut(a).append_range(b, offset, len)
+            }
+            #[cfg(feature = "extended_categorical")]
+            (Self::Categorical16(a), Self::Categorical16(b)) => {
+                Arc::make_mut(a).append_range(b, offset, len)
+            }
+            (Self::Categorical32(a), Self::Categorical32(b)) => {
+                Arc::make_mut(a).append_range(b, offset, len)
+            }
+            #[cfg(feature = "extended_categorical")]
+            (Self::Categorical64(a), Self::Categorical64(b)) => {
+                Arc::make_mut(a).append_range(b, offset, len)
+            }
+            (Self::Null, Self::Null) => Ok(()),
+            (lhs, rhs) => Err(MinarrowError::TypeError {
+                from: "TextArray",
+                to: "TextArray",
+                message: Some(format!("Cannot append_range {:?} into {:?}", rhs, lhs)),
+            }),
+        }
+    }
+
     /// Inserts all values (and null mask if present) from `other` into `self` at the specified index.
     ///
     /// This is an **O(n)** operation.
diff --git a/src/macros.rs b/src/macros.rs
index 2b95135..89b2456 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -330,6 +330,51 @@ macro_rules! impl_masked_array {
                 }
             }
 
+            /// Appends the rows `[offset..offset+len)` of `other` onto the end of `self`.
+            ///
+            /// A zero `len` returns `Ok(())` without touching `self`. The null mask is
+            /// extended alongside the data: when only `self` has a mask the appended
+            /// rows are marked valid, and when only `other` has one a mask is created
+            /// for `self` with every existing row valid.
+            ///
+            /// # Errors
+            /// Returns `MinarrowError::IndexError` when `offset + len` overflows or
+            /// exceeds `other.len()`; `self` is not modified in that case.
+            fn append_range(&mut self, other: &Self, offset: usize, len: usize) -> Result<(), $crate::enums::error::MinarrowError> {
+                if len == 0 { return Ok(()); }
+                // `checked_add` keeps an overflowing `offset + len` on the error path
+                // rather than letting it wrap past the bounds check.
+                let end = match offset.checked_add(len) {
+                    Some(end) if end <= other.len() => end,
+                    _ => {
+                        return Err($crate::enums::error::MinarrowError::IndexError(
+                            format!("append_range: offset {} + len {} exceeds source length {}", offset, len, other.len())
+                        ));
+                    }
+                };
+                let orig_len = self.len();
+
+                self.data_mut().extend_from_slice(&other.data()[offset..end]);
+
+                match (self.null_mask_mut(), other.null_mask()) {
+                    (Some(self_mask), Some(other_mask)) => {
+                        self_mask.extend_from_bitmask_range(other_mask, offset, len);
+                    }
+                    (Some(self_mask), None) => {
+                        // Source has no mask: every appended row is valid.
+                        self_mask.resize(orig_len + len, true);
+                    }
+                    (None, Some(other_mask)) => {
+                        // Start from an all-valid mask covering the existing rows.
+                        let mut mask = Bitmask::new_set_all(orig_len, true);
+                        mask.extend_from_bitmask_range(other_mask, offset, len);
+                        self.set_null_mask(Some(mask));
+                    }
+                    (None, None) => {}
+                }
+                Ok(())
+            }
+
             /// Inserts all values (and null mask if present) from `other` into `self` at the specified index.
             ///
             /// This is an **O(n)** operation.
@@ -933,6 +961,9 @@ macro_rules!
impl_arc_masked_array {
         fn append_array(&mut self, other: &Self) {
             ::std::sync::Arc::make_mut(self).append_array(&**other)
         }
+        fn append_range(&mut self, other: &Self, offset: usize, len: usize) -> Result<(), $crate::enums::error::MinarrowError> {
+            ::std::sync::Arc::make_mut(self).append_range(&**other, offset, len)
+        }
         fn insert_rows(
             &mut self,
             index: usize,
@@ -1054,6 +1085,9 @@ macro_rules! impl_arc_masked_array {
         fn append_array(&mut self, other: &Self) {
             ::std::sync::Arc::make_mut(self).append_array(&**other)
         }
+        fn append_range(&mut self, other: &Self, offset: usize, len: usize) -> Result<(), $crate::enums::error::MinarrowError> {
+            ::std::sync::Arc::make_mut(self).append_range(&**other, offset, len)
+        }
         fn insert_rows(
             &mut self,
             index: usize,
diff --git a/src/structs/bitmask.rs b/src/structs/bitmask.rs
index 3e8a430..1233195 100644
--- a/src/structs/bitmask.rs
+++ b/src/structs/bitmask.rs
@@ -503,6 +503,45 @@ impl Bitmask {
         }
     }
 
+    /// Appends the `len` bits starting at bit `offset` of `other` to the end of `self`.
+    ///
+    /// When `offset` is a multiple of 8 the source bytes from `offset / 8` onward
+    /// are handed to `extend_from_slice` as they are. Otherwise a temporary byte
+    /// buffer is built in which every source byte is shifted down by
+    /// `offset % 8` bits and topped up from the following byte, so that the
+    /// requested range starts at bit 0 of the buffer.
+    ///
+    /// A `len` of zero returns immediately without touching `self`.
+    pub fn extend_from_bitmask_range(&mut self, other: &Bitmask, offset: usize, len: usize) {
+        if len == 0 {
+            return;
+        }
+        let bytes = other.bits.as_slice();
+        let first_byte = offset >> 3;
+        let shift = (offset & 7) as u32;
+        if shift == 0 {
+            // Already byte-aligned: no realignment needed.
+            self.extend_from_slice(&bytes[first_byte..], len);
+            return;
+        }
+
+        // One byte more than `len` bits occupy, clamped to the end of the source.
+        let wanted = ((len + 7) >> 3) + 1;
+        let stop = (first_byte + wanted).min(bytes.len());
+        let mut realigned = Vec::with_capacity(wanted);
+        realigned.extend((first_byte..stop).map(|i| {
+            let low = bytes[i] >> shift;
+            // Past the last source byte there is nothing to borrow bits from.
+            let high = if i + 1 < bytes.len() {
+                bytes[i + 1] << (8 - shift)
+            } else {
+                0
+            };
+            low | high
+        }));
+        self.extend_from_slice(&realigned, len);
+    }
+
     /// Extends the bitmask by appending `len` bits from a bit-packed `[u8]` slice.
     ///
     /// - `src`: The source byte slice (bit-packed; LSB = first bit).
diff --git a/src/structs/field_array.rs b/src/structs/field_array.rs
index 2069f24..b6d2027 100644
--- a/src/structs/field_array.rs
+++ b/src/structs/field_array.rs
@@ -255,6 +255,26 @@ impl FieldArray {
         self.refresh_null_count();
     }
 
+    /// Appends the rows `[offset..offset+len)` of `other` onto this `FieldArray`.
+    ///
+    /// The rows are appended to the inner array via `concat_array_range`; the
+    /// null count is refreshed only when that call succeeds.
+    ///
+    /// # Errors
+    /// Returns the error produced by `concat_array_range` unchanged.
+    pub fn concat_range(
+        &mut self,
+        other: &FieldArray,
+        offset: usize,
+        len: usize,
+    ) -> Result<(), MinarrowError> {
+        let appended = self.array.concat_array_range(&other.array, offset, len);
+        if appended.is_ok() {
+            self.refresh_null_count();
+        }
+        appended
+    }
+
     /// Provides mutable access to the underlying array with automatic null_count refresh.
     /// Uses copy-on-write semantics - clones array data if Arc reference count > 1.
     /// Use this for operations that may change the null count.
diff --git a/src/structs/variants/boolean.rs b/src/structs/variants/boolean.rs
index fc6730f..980fb02 100644
--- a/src/structs/variants/boolean.rs
+++ b/src/structs/variants/boolean.rs
@@ -610,6 +610,61 @@ impl MaskedArray for BooleanArray<()> {
         }
     }
 
+    /// Appends the rows `[offset..offset+len)` of `other` onto the end of `self`.
+    ///
+    /// A zero `len` returns `Ok(())` without touching `self`. Values and null
+    /// mask are both extended bit-wise from the source range: when only `self`
+    /// has a mask the appended rows are marked valid, and when only `other` has
+    /// one a mask is created for `self` with every existing row valid.
+    ///
+    /// # Errors
+    /// Returns `MinarrowError::IndexError` when `offset + len` overflows or
+    /// exceeds `other.len()`; `self` is not modified in that case.
+    fn append_range(
+        &mut self,
+        other: &Self,
+        offset: usize,
+        len: usize,
+    ) -> Result<(), MinarrowError> {
+        if len == 0 {
+            return Ok(());
+        }
+        // `checked_add` keeps an overflowing `offset + len` on the error path
+        // rather than letting it wrap past the bounds check.
+        let in_bounds = matches!(offset.checked_add(len), Some(end) if end <= other.len());
+        if !in_bounds {
+            return Err(MinarrowError::IndexError(format!(
+                "append_range: offset {} + len {} exceeds source length {}",
+                offset,
+                len,
+                other.len()
+            )));
+        }
+        let orig_len = self.len();
+
+        // The values are bit-packed, so they are appended bit-wise as well.
+        self.data.extend_from_bitmask_range(&other.data, offset, len);
+        self.len += len;
+
+        match (self.null_mask_mut(), other.null_mask()) {
+            (Some(self_mask), Some(other_mask)) => {
+                self_mask.extend_from_bitmask_range(other_mask, offset, len);
+            }
+            (Some(self_mask), None) => {
+                // Source has no mask: every appended row is valid.
+                self_mask.resize(orig_len + len, true);
+            }
+            (None, Some(other_mask)) => {
+                // Start from an all-valid mask covering the existing rows.
+                let mut mask = Bitmask::new_set_all(orig_len, true);
+                mask.extend_from_bitmask_range(other_mask, offset, len);
+                self.set_null_mask(Some(mask));
+            }
+            (None, None) => {}
+        }
+        Ok(())
+    }
+
     /// Inserts all values from `other` into `self` at the specified index.
     ///
     /// This is an O(n) operation for BooleanArray due to bit-packed data.
diff --git a/src/structs/variants/categorical.rs b/src/structs/variants/categorical.rs
index 86b6fb7..8264afe 100644
--- a/src/structs/variants/categorical.rs
+++ b/src/structs/variants/categorical.rs
@@ -798,35 +798,69 @@ impl MaskedArray for CategoricalArray {
     fn append_array(&mut self, other: &Self) {
         let orig_len = self.len();
         let other_len = other.len();
+        if other_len == 0 { return; }
 
-        if other_len == 0 {
-            return;
-        }
-
-        // Append data
         self.data_mut().extend_from_slice(other.data());
 
-        // Handle null masks
         match (self.null_mask_mut(), other.null_mask()) {
             (Some(self_mask), Some(other_mask)) => {
                 self_mask.extend_from_bitmask(other_mask);
             }
             (Some(self_mask), None) => {
-                // Mark all appended as valid.
                 self_mask.resize(orig_len + other_len, true);
             }
             (None, Some(other_mask)) => {
-                // Materialise new null mask for self, all existing valid.
-                let mut mask = Bitmask::new_set_all(orig_len + other_len, true);
-                for i in 0..other_len {
-                    mask.set(orig_len + i, other_mask.get(i));
-                }
+                let mut mask = Bitmask::new_set_all(orig_len, true);
+                mask.extend_from_bitmask(other_mask);
                 self.set_null_mask(Some(mask));
             }
-            (None, None) => {
-                // No mask in either: nothing to do.
+            (None, None) => {}
+        }
+    }
+
+    /// Appends the rows `[offset..offset+len)` of `other` onto the end of `self`.
+    ///
+    /// A zero `len` returns `Ok(())` without touching `self`. The null mask is
+    /// extended alongside the data: when only `self` has a mask the appended
+    /// rows are marked valid, and when only `other` has one a mask is created
+    /// for `self` with every existing row valid.
+    ///
+    /// # Errors
+    /// Returns `MinarrowError::IndexError` when `offset + len` overflows or
+    /// exceeds `other.len()`; `self` is not modified in that case.
+    fn append_range(&mut self, other: &Self, offset: usize, len: usize) -> Result<(), MinarrowError> {
+        if len == 0 { return Ok(()); }
+        // `checked_add` keeps an overflowing `offset + len` on the error path
+        // rather than letting it wrap past the bounds check.
+        let end = match offset.checked_add(len) {
+            Some(end) if end <= other.len() => end,
+            _ => {
+                return Err(MinarrowError::IndexError(
+                    format!("append_range: offset {} + len {} exceeds source length {}", offset, len, other.len())
+                ));
+            }
+        };
+        let orig_len = self.len();
+
+        self.data_mut().extend_from_slice(&other.data()[offset..end]);
+
+        match (self.null_mask_mut(), other.null_mask()) {
+            (Some(self_mask), Some(other_mask)) => {
+                self_mask.extend_from_bitmask_range(other_mask, offset, len);
             }
+            (Some(self_mask), None) => {
+                // Source has no mask: every appended row is valid.
+                self_mask.resize(orig_len + len, true);
+            }
+            (None, Some(other_mask)) => {
+                // Start from an all-valid mask covering the existing rows.
+                let mut mask = Bitmask::new_set_all(orig_len, true);
+                mask.extend_from_bitmask_range(other_mask, offset, len);
+                self.set_null_mask(Some(mask));
+            }
+            (None, None) => {}
         }
+        Ok(())
     }
 
     /// Inserts all values from `other` into `self` at the specified index.
diff --git a/src/structs/variants/string.rs b/src/structs/variants/string.rs
index a36e7f4..82fff5b 100644
--- a/src/structs/variants/string.rs
+++ b/src/structs/variants/string.rs
@@ -976,24 +976,17 @@ impl MaskedArray for StringArray {
     fn append_array(&mut self, other: &Self) {
         let orig_len = self.len();
         let other_len = other.len();
+        if other_len == 0 { return; }
 
-        if other_len == 0 {
-            return;
-        }
-
-        // 1. Append data
         self.data.extend_from_slice(&other.data);
 
-        let prev_last_offset = *self
-            .offsets
-            .last()
+        let prev_last_offset = *self.offsets.last()
             .expect("StringArray must have at least one offset");
         for off in other.offsets.iter().skip(1) {
             let new_offset = prev_last_offset + (*off - other.offsets[0]);
             self.offsets.push(new_offset);
         }
 
-        // 3. Null mask
         match (self.null_mask_mut(), other.null_mask()) {
             (Some(self_mask), Some(other_mask)) => {
                 self_mask.extend_from_bitmask(other_mask);
@@ -1002,16 +995,73 @@ impl MaskedArray for StringArray {
                 self_mask.resize(orig_len + other_len, true);
             }
             (None, Some(other_mask)) => {
-                let mut mask = Bitmask::new_set_all(orig_len + other_len, true);
-                for i in 0..other_len {
-                    mask.set(orig_len + i, other_mask.get(i));
-                }
+                let mut mask = Bitmask::new_set_all(orig_len, true);
+                mask.extend_from_bitmask(other_mask);
                 self.set_null_mask(Some(mask));
             }
-            (None, None) => {
-                // No mask in either: nothing to do.
+            (None, None) => {}
+        }
+    }
+
+    /// Appends the rows `[offset..offset+len)` of `other` onto the end of `self`.
+    ///
+    /// The bytes backing the selected rows are copied from `other.data`, and one
+    /// offset per appended row is pushed, rebased so that it continues from the
+    /// current last offset of `self`. The null mask is extended alongside; rows
+    /// appended from a source without a mask are marked valid.
+    ///
+    /// A zero `len` returns `Ok(())` without touching `self`.
+    ///
+    /// # Errors
+    /// Returns `MinarrowError::IndexError` when `offset + len` overflows or
+    /// exceeds `other.len()`; `self` is not modified in that case.
+    ///
+    /// # Panics
+    /// Panics if `self.offsets` is empty.
+    fn append_range(&mut self, other: &Self, offset: usize, len: usize) -> Result<(), MinarrowError> {
+        if len == 0 { return Ok(()); }
+        // `checked_add` keeps an overflowing `offset + len` on the error path
+        // rather than letting it wrap past the bounds check.
+        let end = match offset.checked_add(len) {
+            Some(end) if end <= other.len() => end,
+            _ => {
+                return Err(MinarrowError::IndexError(
+                    format!("append_range: offset {} + len {} exceeds source length {}", offset, len, other.len())
+                ));
+            }
+        };
+        let orig_len = self.len();
+
+        // Byte range in other's data buffer for rows [offset..offset+len)
+        let src_byte_start = other.offsets[offset].to_usize();
+        let src_byte_end = other.offsets[end].to_usize();
+        self.data.extend_from_slice(&other.data[src_byte_start..src_byte_end]);
+
+        // Rebase offsets relative to self's current end
+        let prev_last_offset = *self.offsets.last()
+            .expect("StringArray must have at least one offset");
+        let base = other.offsets[offset];
+        for i in 1..=len {
+            let new_offset = prev_last_offset + (other.offsets[offset + i] - base);
+            self.offsets.push(new_offset);
+        }
+
+        // Null mask
+        match (self.null_mask_mut(), other.null_mask()) {
+            (Some(self_mask), Some(other_mask)) => {
+                self_mask.extend_from_bitmask_range(other_mask, offset, len);
             }
+            (Some(self_mask), None) => {
+                self_mask.resize(orig_len + len, true);
+            }
+            (None, Some(other_mask)) => {
+                let mut mask = Bitmask::new_set_all(orig_len, true);
+                mask.extend_from_bitmask_range(other_mask, offset, len);
+                self.set_null_mask(Some(mask));
+            }
+            (None, None) => {}
         }
+        Ok(())
     }
 
     /// Inserts all values from `other` into `self` at the specified index.
diff --git a/src/structs/views/table_view.rs b/src/structs/views/table_view.rs
index a7e10d4..cb818c0 100644
--- a/src/structs/views/table_view.rs
+++ b/src/structs/views/table_view.rs
@@ -505,6 +505,7 @@ impl TableV {
                 }
 
                 let mut unique_values = Vec64::<String>::with_capacity(value_map.len());
+                unique_values.resize(value_map.len(), String::new());
                 for (val, code) in value_map {
                     unique_values[code as usize] = val;
                 }
@@ -540,6 +541,7 @@ impl TableV {
                 }
 
                 let mut unique_values = Vec64::<String>::with_capacity(value_map.len());
+                unique_values.resize(value_map.len(), String::new());
                 for (val, code) in value_map {
                     unique_values[code as usize] = val;
                 }
@@ -575,6 +577,7 @@ impl TableV {
                 }
 
                 let mut unique_values = Vec64::<String>::with_capacity(value_map.len());
+                unique_values.resize(value_map.len(), String::new());
                 for (val, code) in value_map {
                     unique_values[code as usize] = val;
                 }
@@ -610,6 +613,7 @@ impl TableV {
                 }
 
                 let mut unique_values = Vec64::<String>::with_capacity(value_map.len());
+                unique_values.resize(value_map.len(), String::new());
                 for (val, code) in value_map {
                     unique_values[code as usize] = val;
                 }
diff --git a/src/traits/masked_array.rs b/src/traits/masked_array.rs
index 5153744..db4a3d1 100644
--- a/src/traits/masked_array.rs
+++ b/src/traits/masked_array.rs
@@ -277,6 +277,22 @@ pub trait MaskedArray {
     /// is an alternative option.
     fn append_array(&mut self, other: &Self);
 
+    /// Appends rows `[offset..offset+len)` from another array into self.
+    ///
+    /// Like `append_array` but for a sub-range. Data and null masks are
+    /// extended from the source range. The destination grows via its
+    /// backing allocator.
+    ///
+    /// # Errors
+    /// Returns an error when the requested range cannot be taken from `other`;
+    /// the array implementations report `MinarrowError::IndexError` for that.
+    fn append_range(
+        &mut self,
+        other: &Self,
+        offset: usize,
+        len: usize,
+    ) -> Result<(), MinarrowError>;
+
     /// Inserts all values (and null mask if present) from `other` into `self` at the specified index.
/// /// The inserted array must be of the same concrete type and element type.