diff --git a/cold-string/README.md b/cold-string/README.md index 16969fb..aae5cbb 100644 --- a/cold-string/README.md +++ b/cold-string/README.md @@ -4,7 +4,7 @@ [![docs.rs](https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs)](https://docs.rs/cold-string) ![MSRV](https://img.shields.io/crates/msrv/cold-string?style=for-the-badge) -A 1-word (8-byte) sized representation of immutable UTF-8 strings that in-lines up to 8 bytes. Optimized for memory usage and struct packing. +A 1-word (8-byte) sized representation of immutable UTF-8 strings that in-lines up to 8 bytes. ## Overview @@ -36,22 +36,23 @@ assert_eq!(s.as_str(), "qwerty"); Packs well with other types: ```rust use cold_string::ColdString; -use std::mem::{align_of, size_of}; +use std::mem::size_of; assert_eq!(size_of::(), size_of::()); -assert_eq!(align_of::(), 1); - -assert_eq!(size_of::<(ColdString, u8)>(), size_of::() + 1); -assert_eq!(size_of::>(), size_of::() + 1); +// ColdString has a null-niche: +assert_eq!(size_of::>(), size_of::()); ``` ## How It Works ColdString is an 8-byte tagged pointer (4 bytes on 32-bit machines): + ```rust -#[repr(packed)] +use std::ptr::NonNull; + +#[repr(transparent)] pub struct ColdString { - encoded: *mut u8, + encoded: NonNull, } ``` The 8 bytes encode one of three representations indicated by the 1st byte: @@ -59,6 +60,7 @@ The 8 bytes encode one of three representations indicated by the 1st byte: least-significant 2 bits of the address are `00`. On the heap, the UTF-8 characters are preceded by the variable-length encoding of the size. The size uses 1 byte for 0 - 127, 2 bytes for 128 - 16383, etc. - `11111xxx`: xxx is the length and the remaining 0-7 bytes are UTF-8 characters. - `xxxxxxxx`: All 8 bytes are UTF-8. +The exception is if `encoded` is `usize::MAX`, the UTF-8 bytes are "\0\0\0\0\0\0\0\0". `10xxxxxx` and `11111xxx` are chosen because they cannot be valid first bytes of UTF-8. 
diff --git a/cold-string/src/lib.rs b/cold-string/src/lib.rs index 3a494a2..96bbcb2 100644 --- a/cold-string/src/lib.rs +++ b/cold-string/src/lib.rs @@ -22,7 +22,9 @@ use core::{ iter::FromIterator, mem, ops::Deref, - ptr, slice, str, + ptr, + ptr::NonNull, + slice, str, }; mod vint; @@ -42,29 +44,31 @@ const WIDTH: usize = mem::size_of::(); /// assert_eq!(s.as_str(), "qwerty"); /// ``` /// ``` -/// use core::mem; +/// use core::mem::size_of; /// use cold_string::ColdString; /// -/// assert_eq!(mem::size_of::(), mem::size_of::()); -/// assert_eq!(mem::align_of::(), 1); -/// assert_eq!(mem::size_of::<(ColdString, u8)>(), mem::size_of::() + 1); -/// assert_eq!(mem::align_of::<(ColdString, u8)>(), 1); +/// assert_eq!(size_of::(), size_of::()); +/// assert_eq!(size_of::>(), size_of::()); /// ``` -#[repr(packed)] +#[repr(transparent)] pub struct ColdString { /// The first byte of `encoded` is the "tag" and it determines the type: /// - 10xxxxxx: an encoded address for the heap. To decode, 10 is set to 00 and swapped /// with the LSB bits of the tag byte. The address is always a multiple of 4 (`HEAP_ALIGN`). /// - 11111xxx: xxx is the length in range 0..=7, followed by length UTF-8 bytes. /// - xxxxxxxx (valid UTF-8): 8 UTF-8 bytes. - encoded: *const u8, + /// The exception is if `encoded` is `usize::MAX`, the UTF-8 bytes are "\0\0\0\0\0\0\0\0". 
+ encoded: NonNull, } +static EIGHT_NUL: [u8; WIDTH] = [0u8; WIDTH]; + impl ColdString { const TAG_MASK: usize = usize::from_ne_bytes(0b11000000usize.to_le_bytes()); const INLINE_TAG: usize = usize::from_ne_bytes(0b11111000usize.to_le_bytes()); const PTR_TAG: usize = usize::from_ne_bytes(0b10000000usize.to_le_bytes()); const LEN_MASK: usize = usize::from_ne_bytes(0b111usize.to_le_bytes()); + const EIGHT_NUL_MAP: usize = usize::MAX; const ROT: u32 = if cfg!(target_endian = "little") { 0 } else { @@ -135,6 +139,18 @@ impl ColdString { } } + #[rustversion::attr(since(1.61), const)] + #[inline] + fn new_eight_nul() -> Self { + // SAFETY: EIGHT_NUL_MAP is usize::MAX, so the buffer is not all-zero + unsafe { Self::from_inline_buf(Self::EIGHT_NUL_MAP.to_ne_bytes()) } + } + + #[inline] + fn is_eight_nul(&self) -> bool { + self.addr() == Self::EIGHT_NUL_MAP + } + #[inline] const fn inline_buf(s: &str) -> [u8; WIDTH] { debug_assert!(s.len() <= WIDTH); @@ -147,10 +163,12 @@ impl ColdString { buf } + /// SAFETY: b must not be all-zero #[rustversion::attr(since(1.61), const)] #[inline] - fn from_inline_buf(b: [u8; WIDTH]) -> Self { + unsafe fn from_inline_buf(b: [u8; WIDTH]) -> Self { let encoded = ptr::null_mut::().wrapping_add(usize::from_ne_bytes(b)); + let encoded = NonNull::new_unchecked(encoded); Self { encoded } } @@ -161,10 +179,16 @@ impl ColdString { #[inline] fn new_inline(s: &str) -> Self { + if s.as_bytes() == EIGHT_NUL { + return Self::new_eight_nul(); + } let mut buf = Self::inline_buf(s); let start = Self::utf8_start(s.len()); buf[start..s.len() + start].copy_from_slice(s.as_bytes()); - Self::from_inline_buf(buf) + // SAFETY: + // it is checked at the top of the function that s is not all NUL + // and the inline tag is not 0, so shorter strings will also be not all NUL + unsafe { Self::from_inline_buf(buf) } } /// Creates a new inline [`ColdString`] from `&'static str` at compile time. @@ -190,6 +214,14 @@ impl ColdString { "Length for `new_inline_const` must be less than `core::mem::size_of::()`." 
); } + if s.len() == WIDTH { + // can't do a slice comparison in const context; SAFETY: s.len() == WIDTH, so reading [u8; WIDTH] from s stays in bounds + let bytes = unsafe { *(s.as_bytes() as *const _ as *const [u8; WIDTH]) }; + let int = usize::from_ne_bytes(bytes); + if int == 0 { + return Self::new_eight_nul(); + } + } let mut buf = Self::inline_buf(s); let start = Self::utf8_start(s.len()); let mut i = 0; @@ -197,18 +229,21 @@ impl ColdString { buf[i + start] = s.as_bytes()[i]; i += 1; } - Self::from_inline_buf(buf) + // SAFETY: + // It is checked at the top of the function that s is not all NUL, + // and the inline tag is not 0, so shorter strings will also be not all NUL. + unsafe { Self::from_inline_buf(buf) } } #[rustversion::attr(since(1.71), const)] #[inline] - unsafe fn ptr(&self) -> *const u8 { - ptr::read_unaligned(ptr::addr_of!(self.encoded)) + fn ptr(&self) -> *const u8 { + self.encoded.as_ptr() } #[inline] fn addr(&self) -> usize { - unsafe { self.ptr().addr() } + self.ptr().addr() } #[inline] @@ -230,6 +265,7 @@ impl ColdString { let layout = Layout::from_size_align(total, HEAP_ALIGN).unwrap(); unsafe { + // SAFETY: the layout size is non-zero, since the smallest VarInt is one byte let ptr = alloc(layout); if ptr.is_null() { alloc::alloc::handle_alloc_error(layout); @@ -244,6 +280,8 @@ impl ColdString { addr |= Self::PTR_TAG; addr }); + // SAFETY: encoded != 0 because Self::PTR_TAG != 0 + let encoded = NonNull::new_unchecked(encoded); Self { encoded } } } @@ -251,18 +289,17 @@ impl ColdString { #[inline] fn heap_ptr(&self) -> *const u8 { debug_assert!(!self.is_inline()); - unsafe { - self.ptr().map_addr(|mut addr| { - addr ^= Self::PTR_TAG; - let addr = addr.rotate_right(6 + Self::ROT); - debug_assert!(addr % HEAP_ALIGN == 0); - addr - }) - } + self.ptr().map_addr(|mut addr| { + addr ^= Self::PTR_TAG; + let addr = addr.rotate_right(6 + Self::ROT); + debug_assert!(addr % HEAP_ALIGN == 0); + addr + }) } #[inline] fn inline_len(&self) -> usize { + debug_assert!(!self.is_eight_nul()); let addr = self.addr(); match addr & 
Self::INLINE_TAG { Self::INLINE_TAG => (addr & Self::LEN_MASK).rotate_right(Self::ROT), @@ -288,7 +325,9 @@ impl ColdString { /// ``` #[inline] pub fn len(&self) -> usize { - if self.is_inline() { + if self.is_eight_nul() { + return WIDTH; + } else if self.is_inline() { self.inline_len() } else { unsafe { @@ -302,6 +341,9 @@ impl ColdString { #[allow(unsafe_op_in_unsafe_fn)] #[inline] unsafe fn decode_inline(&self) -> &[u8] { + if self.is_eight_nul() { + return &EIGHT_NUL; + } let len = self.inline_len(); // SAFETY: addr_of! avoids &self.ptr (which is UB due to alignment) let self_bytes_ptr = ptr::addr_of!(self.encoded) as *const u8; @@ -382,11 +424,12 @@ impl Deref for ColdString { impl Drop for ColdString { fn drop(&mut self) { if !self.is_inline() { + let ptr = self.heap_ptr(); unsafe { - let ptr = self.heap_ptr(); let (len, header) = VarInt::read(ptr); let total = header + len; let layout = Layout::from_size_align(total, HEAP_ALIGN).unwrap(); + // SAFETY: if ptr is non-null then it was allocated by alloc() in new_heap() dealloc(ptr as *mut u8, layout); } } @@ -395,13 +438,12 @@ impl Drop for ColdString { impl Clone for ColdString { fn clone(&self) -> Self { - match self.is_inline() { - true => unsafe { - Self { - encoded: self.ptr(), - } - }, - false => Self::new_heap(self.as_str()), + if self.is_inline() { + let ptr = self.ptr(); + let encoded = unsafe { NonNull::new_unchecked(ptr as *mut _) }; + Self { encoded } + } else { + Self::new_heap(self.as_str()) } } } @@ -409,7 +451,7 @@ impl Clone for ColdString { impl PartialEq for ColdString { fn eq(&self, other: &Self) -> bool { match (self.is_inline(), other.is_inline()) { - (true, true) => unsafe { self.ptr() == other.ptr() }, + (true, true) => self.ptr() == other.ptr(), (false, false) => unsafe { self.decode_heap() == other.decode_heap() }, _ => false, } @@ -605,14 +647,6 @@ mod tests { #[test] fn test_layout() { assert_eq!(mem::size_of::(), mem::size_of::()); - assert_eq!(mem::align_of::(), 1); - struct Foo { 
- _s: ColdString, - _b: u8, - } - - assert_eq!(mem::size_of::(), mem::size_of::() + 1); - assert_eq!(mem::align_of::(), 1); } #[test] @@ -640,6 +674,9 @@ mod tests { assert_eq!(s, cs); assert_eq!(cs, *s); assert_eq!(*s, cs); + let opt_s = Some(cs.clone()); + assert_eq!(opt_s, Some(ColdString::new(s))); + assert!(opt_s != None); } #[test] @@ -741,4 +778,24 @@ mod tests { } } } + + #[test] + fn ensure_zero_repr() { + assert!(str::from_utf8(&ColdString::EIGHT_NUL_MAP.to_ne_bytes()).is_err()); + } + + #[test] + fn test_const_8nul_vs_non_const() { + let nul8 = str::from_utf8(&EIGHT_NUL).unwrap(); + let const8 = ColdString::new_inline_const(nul8); + let non_const = ColdString::new(nul8); + let cloned = non_const.clone(); + assert_eq!(const8.ptr(), non_const.ptr()); + assert_eq!(const8.ptr(), cloned.ptr()); + // check that the eight-NUL sentinel (EIGHT_NUL_MAP) returns a str pointing to EIGHT_NUL + assert_eq!( + &const8.as_str().as_bytes()[0] as *const u8, + (&EIGHT_NUL) as *const u8 + ); + } } diff --git a/cold-string/tests/property.rs b/cold-string/tests/property.rs index 231edf4..7d0ed56 100644 --- a/cold-string/tests/property.rs +++ b/cold-string/tests/property.rs @@ -43,6 +43,9 @@ proptest! { if s.len() <= core::mem::size_of::() { assert_eq!(ColdString::new_inline_const(&s), cold); } + let opt_s = Some(cold.clone()); + assert_eq!(opt_s.as_ref().unwrap(), &cold); + assert_eq!(opt_s.as_ref().map(|x| x.as_str()), Some(s.as_str())); } }