Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions cold-string/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![docs.rs](https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs)](https://docs.rs/cold-string)
![MSRV](https://img.shields.io/crates/msrv/cold-string?style=for-the-badge)

A 1-word (8-byte) sized representation of immutable UTF-8 strings that in-lines up to 8 bytes. Optimized for memory usage and struct packing.
A 1-word (8-byte) sized representation of immutable UTF-8 strings that in-lines up to 8 bytes.

## Overview

Expand Down Expand Up @@ -36,29 +36,31 @@ assert_eq!(s.as_str(), "qwerty");
Packs well with other types:
```rust
use cold_string::ColdString;
use std::mem::{align_of, size_of};
use std::mem::size_of;

assert_eq!(size_of::<ColdString>(), size_of::<usize>());
assert_eq!(align_of::<ColdString>(), 1);

assert_eq!(size_of::<(ColdString, u8)>(), size_of::<usize>() + 1);
assert_eq!(size_of::<Option<ColdString>>(), size_of::<usize>() + 1);
// ColdString has a null-niche:
assert_eq!(size_of::<Option<ColdString>>(), size_of::<ColdString>());
```

## How It Works

ColdString is an 8-byte tagged pointer (4 bytes on 32-bit machines):

```rust
#[repr(packed)]
use std::ptr::NonNull;

#[repr(transparent)]
pub struct ColdString {
encoded: *mut u8,
encoded: NonNull<u8>,
}
```
The 8 bytes encode one of three representations indicated by the 1st byte:
- `10xxxxxx`: `encoded` contains a tagged heap pointer. To decode the address, clear the tag bits (`10 → 00`) and rotate so the `00` bits become the least-significant bits. The heap allocation uses [4-byte alignment](https://doc.rust-lang.org/beta/std/alloc/struct.Layout.html#method.from_size_align), guaranteeing the
least-significant 2 bits of the address are `00`. On the heap, the UTF-8 characters are preceded by the variable-length encoding of the size. The size uses 1 byte for 0 - 127, 2 bytes for 128 - 16383, etc.
- `11111xxx`: xxx is the length and the remaining 0-7 bytes are UTF-8 characters.
- `xxxxxxxx`: All 8 bytes are UTF-8.
The one exception: if `encoded` is `usize::MAX`, the string is the eight NUL bytes "\0\0\0\0\0\0\0\0".

`10xxxxxx` and `11111xxx` are chosen because they cannot be valid first bytes of UTF-8.

Expand Down
137 changes: 97 additions & 40 deletions cold-string/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ use core::{
iter::FromIterator,
mem,
ops::Deref,
ptr, slice, str,
ptr,
ptr::NonNull,
slice, str,
};

mod vint;
Expand All @@ -42,29 +44,31 @@ const WIDTH: usize = mem::size_of::<usize>();
/// assert_eq!(s.as_str(), "qwerty");
/// ```
/// ```
/// use core::mem;
/// use core::mem::size_of;
/// use cold_string::ColdString;
///
/// assert_eq!(mem::size_of::<ColdString>(), mem::size_of::<usize>());
/// assert_eq!(mem::align_of::<ColdString>(), 1);
/// assert_eq!(mem::size_of::<(ColdString, u8)>(), mem::size_of::<usize>() + 1);
/// assert_eq!(mem::align_of::<(ColdString, u8)>(), 1);
/// assert_eq!(size_of::<ColdString>(), size_of::<usize>());
/// assert_eq!(size_of::<Option<ColdString>>(), size_of::<ColdString>());
/// ```
#[repr(packed)]
#[repr(transparent)]
pub struct ColdString {
/// The first byte of `encoded` is the "tag" and it determines the type:
/// - 10xxxxxx: an encoded address for the heap. To decode, 10 is set to 00 and swapped
/// with the LSB bits of the tag byte. The address is always a multiple of 4 (`HEAP_ALIGN`).
/// - 11111xxx: xxx is the length in range 0..=7, followed by length UTF-8 bytes.
/// - xxxxxxxx (valid UTF-8): 8 UTF-8 bytes.
encoded: *const u8,
/// The one exception: if `encoded` is `usize::MAX`, the string is the eight NUL bytes "\0\0\0\0\0\0\0\0".
encoded: NonNull<u8>,
}

static EIGHT_NUL: [u8; WIDTH] = [0u8; WIDTH];

impl ColdString {
const TAG_MASK: usize = usize::from_ne_bytes(0b11000000usize.to_le_bytes());
const INLINE_TAG: usize = usize::from_ne_bytes(0b11111000usize.to_le_bytes());
const PTR_TAG: usize = usize::from_ne_bytes(0b10000000usize.to_le_bytes());
const LEN_MASK: usize = usize::from_ne_bytes(0b111usize.to_le_bytes());
const EIGHT_NUL_MAP: usize = usize::MAX;
const ROT: u32 = if cfg!(target_endian = "little") {
0
} else {
Expand Down Expand Up @@ -135,6 +139,18 @@ impl ColdString {
}
}

/// Builds the sentinel value that stands for the all-NUL string of `WIDTH` bytes.
///
/// The sentinel is `EIGHT_NUL_MAP` (`usize::MAX`) rather than the literal zero bytes,
/// because `from_inline_buf` requires a buffer that is not all-zero.
#[rustversion::attr(since(1.61), const)]
#[inline]
fn new_eight_nul() -> Self {
// SAFETY: `EIGHT_NUL_MAP` is `usize::MAX`, so the buffer passed here is not all-zero
unsafe { Self::from_inline_buf(Self::EIGHT_NUL_MAP.to_ne_bytes()) }
}

/// Returns `true` if this value is the `EIGHT_NUL_MAP` sentinel produced by `new_eight_nul`.
#[inline]
fn is_eight_nul(&self) -> bool {
self.addr() == Self::EIGHT_NUL_MAP
}

#[inline]
const fn inline_buf(s: &str) -> [u8; WIDTH] {
debug_assert!(s.len() <= WIDTH);
Expand All @@ -147,10 +163,12 @@ impl ColdString {
buf
}

/// SAFETY: b must not be all-zero
#[rustversion::attr(since(1.61), const)]
#[inline]
fn from_inline_buf(b: [u8; WIDTH]) -> Self {
unsafe fn from_inline_buf(b: [u8; WIDTH]) -> Self {
let encoded = ptr::null_mut::<u8>().wrapping_add(usize::from_ne_bytes(b));
let encoded = NonNull::new_unchecked(encoded);
Self { encoded }
}

Expand All @@ -161,10 +179,16 @@ impl ColdString {

#[inline]
fn new_inline(s: &str) -> Self {
if s.as_bytes() == EIGHT_NUL {
return Self::new_eight_nul();
}
let mut buf = Self::inline_buf(s);
let start = Self::utf8_start(s.len());
buf[start..s.len() + start].copy_from_slice(s.as_bytes());
Self::from_inline_buf(buf)
// SAFETY:
// it is checked at the top of the function that s is not all NUL,
// and the inline tag is not 0, so shorter strings will also not be all NUL
unsafe { Self::from_inline_buf(buf) }
}

/// Creates a new inline [`ColdString`] from `&'static str` at compile time.
Expand All @@ -190,25 +214,36 @@ impl ColdString {
"Length for `new_inline_const` must be less than `core::mem::size_of::<usize>()`."
);
}
if s.len() == WIDTH {
// can't do a slice comparison in const context
let bytes = unsafe { *(s.as_bytes() as *const _ as *const [u8; WIDTH]) };
let int = usize::from_ne_bytes(bytes);
if int == 0 {
return Self::new_eight_nul();
}
}
let mut buf = Self::inline_buf(s);
let start = Self::utf8_start(s.len());
let mut i = 0;
while i < s.len() {
buf[i + start] = s.as_bytes()[i];
i += 1;
}
Self::from_inline_buf(buf)
// SAFETY:
// It is checked at the top of the function that s is not all NUL,
// and the inline tag is not 0, so shorter strings will also not be all NUL.
unsafe { Self::from_inline_buf(buf) }
}

#[rustversion::attr(since(1.71), const)]
#[inline]
unsafe fn ptr(&self) -> *const u8 {
ptr::read_unaligned(ptr::addr_of!(self.encoded))
fn ptr(&self) -> *const u8 {
self.encoded.as_ptr()
}

#[inline]
fn addr(&self) -> usize {
unsafe { self.ptr().addr() }
self.ptr().addr()
}

#[inline]
Expand All @@ -230,6 +265,7 @@ impl ColdString {
let layout = Layout::from_size_align(total, HEAP_ALIGN).unwrap();

unsafe {
// SAFETY: the layout size is non-zero, since the smallest VarInt is one byte
let ptr = alloc(layout);
if ptr.is_null() {
alloc::alloc::handle_alloc_error(layout);
Expand All @@ -244,25 +280,26 @@ impl ColdString {
addr |= Self::PTR_TAG;
addr
});
// SAFETY: encoded != 0 because Self::PTR_TAG != 0
let encoded = NonNull::new_unchecked(encoded);
Self { encoded }
}
}

#[inline]
fn heap_ptr(&self) -> *const u8 {
debug_assert!(!self.is_inline());
unsafe {
self.ptr().map_addr(|mut addr| {
addr ^= Self::PTR_TAG;
let addr = addr.rotate_right(6 + Self::ROT);
debug_assert!(addr % HEAP_ALIGN == 0);
addr
})
}
self.ptr().map_addr(|mut addr| {
addr ^= Self::PTR_TAG;
let addr = addr.rotate_right(6 + Self::ROT);
debug_assert!(addr % HEAP_ALIGN == 0);
addr
})
}

#[inline]
fn inline_len(&self) -> usize {
debug_assert!(!self.is_eight_nul());
let addr = self.addr();
match addr & Self::INLINE_TAG {
Self::INLINE_TAG => (addr & Self::LEN_MASK).rotate_right(Self::ROT),
Expand All @@ -288,7 +325,9 @@ impl ColdString {
/// ```
#[inline]
pub fn len(&self) -> usize {
if self.is_inline() {
if self.is_eight_nul() {
return WIDTH;
} else if self.is_inline() {
self.inline_len()
} else {
unsafe {
Expand All @@ -302,6 +341,9 @@ impl ColdString {
#[allow(unsafe_op_in_unsafe_fn)]
#[inline]
unsafe fn decode_inline(&self) -> &[u8] {
if self.is_eight_nul() {
return &EIGHT_NUL;
}
let len = self.inline_len();
// SAFETY: addr_of! avoids &self.ptr (which is UB due to alignment)
let self_bytes_ptr = ptr::addr_of!(self.encoded) as *const u8;
Expand Down Expand Up @@ -382,11 +424,12 @@ impl Deref for ColdString {
impl Drop for ColdString {
fn drop(&mut self) {
if !self.is_inline() {
let ptr = self.heap_ptr();
unsafe {
let ptr = self.heap_ptr();
let (len, header) = VarInt::read(ptr);
let total = header + len;
let layout = Layout::from_size_align(total, HEAP_ALIGN).unwrap();
// SAFETY: if ptr is non-null then it was allocated by alloc() in new_heap()
dealloc(ptr as *mut u8, layout);
}
}
Expand All @@ -395,21 +438,20 @@ impl Drop for ColdString {

impl Clone for ColdString {
fn clone(&self) -> Self {
match self.is_inline() {
true => unsafe {
Self {
encoded: self.ptr(),
}
},
false => Self::new_heap(self.as_str()),
if self.is_inline() {
let ptr = self.ptr();
let encoded = unsafe { NonNull::new_unchecked(ptr as *mut _) };
Self { encoded }
} else {
Self::new_heap(self.as_str())
}
}
}

impl PartialEq for ColdString {
fn eq(&self, other: &Self) -> bool {
match (self.is_inline(), other.is_inline()) {
(true, true) => unsafe { self.ptr() == other.ptr() },
(true, true) => self.ptr() == other.ptr(),
(false, false) => unsafe { self.decode_heap() == other.decode_heap() },
_ => false,
}
Expand Down Expand Up @@ -605,14 +647,6 @@ mod tests {
#[test]
fn test_layout() {
assert_eq!(mem::size_of::<ColdString>(), mem::size_of::<usize>());
assert_eq!(mem::align_of::<ColdString>(), 1);
struct Foo {
_s: ColdString,
_b: u8,
}

assert_eq!(mem::size_of::<Foo>(), mem::size_of::<usize>() + 1);
assert_eq!(mem::align_of::<Foo>(), 1);
}

#[test]
Expand Down Expand Up @@ -640,6 +674,9 @@ mod tests {
assert_eq!(s, cs);
assert_eq!(cs, *s);
assert_eq!(*s, cs);
let opt_s = Some(cs.clone());
assert_eq!(opt_s, Some(ColdString::new(s)));
assert!(opt_s != None);
}

#[test]
Expand Down Expand Up @@ -741,4 +778,24 @@ mod tests {
}
}
}

#[test]
fn ensure_zero_repr() {
// The sentinel's bytes must not be valid UTF-8; presumably this guarantees the sentinel
// can never collide with the encoding of a real inline string.
assert!(str::from_utf8(&ColdString::EIGHT_NUL_MAP.to_ne_bytes()).is_err());
}

#[test]
fn test_const_8nul_vs_non_const() {
// Build the all-NUL string through the const constructor, the regular constructor, and a clone.
let nul8 = str::from_utf8(&EIGHT_NUL).unwrap();
let const8 = ColdString::new_inline_const(nul8);
let non_const = ColdString::new(nul8);
let cloned = non_const.clone();
// all three construction paths must yield the same encoded pointer
assert_eq!(const8.ptr(), non_const.ptr());
assert_eq!(const8.ptr(), cloned.ptr());
// check that the all-NUL sentinel (not a null pointer) decodes to a str pointing at EIGHT_NUL
assert_eq!(
&const8.as_str().as_bytes()[0] as *const u8,
(&EIGHT_NUL) as *const u8
);
}
}
3 changes: 3 additions & 0 deletions cold-string/tests/property.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ proptest! {
if s.len() <= core::mem::size_of::<usize>() {
assert_eq!(ColdString::new_inline_const(&s), cold);
}
let opt_s = Some(cold.clone());
assert_eq!(opt_s.as_ref().unwrap(), &cold);
assert_eq!(opt_s.as_ref().map(|x| x.as_str()), Some(s.as_str()));
}

}
Loading