From ded22f9687127d0b20879d9cc5d51ab0edac95e2 Mon Sep 17 00:00:00 2001 From: Iain McGinniss <309153+iainmcgin@users.noreply.github.com> Date: Fri, 22 May 2026 16:39:23 +0000 Subject: [PATCH 1/3] runtime: generalize string decode/JSON paths over the field type Introduce a `ProtoString` trait (blanket-implemented for any type that is Clone + PartialEq + Default + Debug + AsRef + From + From<&str>) describing the surface generated code needs of a proto `string` field's Rust representation. `String` satisfies it, and so do small-string types like SmolStr / EcoString / CompactString. Add `decode_string_to::()` as the generic counterpart to `decode_string` (decode + UTF-8 validate, then construct via `From`), and make the `proto_string` serde with-module generic: `serialize` over `AsRef` and `deserialize` over `From`, mirroring the existing `bytes` module. For the default `String` representation every conversion is the identity, so the generic path is zero-cost and generated output is unchanged. This is groundwork for a configurable `string_type` codegen knob; no field is yet emitted as anything other than `String`. --- buffa/src/json_helpers.rs | 29 +++++++++-- buffa/src/json_helpers/tests.rs | 37 ++++++++++++++ buffa/src/lib.rs | 1 + buffa/src/types.rs | 88 +++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+), 4 deletions(-) diff --git a/buffa/src/json_helpers.rs b/buffa/src/json_helpers.rs index 46a8e54..e08cb9d 100644 --- a/buffa/src/json_helpers.rs +++ b/buffa/src/json_helpers.rs @@ -337,11 +337,32 @@ pub mod proto_string { use alloc::string::ToString; use serde::{Deserializer, Serializer}; - pub fn serialize(value: &str, s: S) -> Result { - s.serialize_str(value) + /// Serialize a `string` field. + /// + /// Generic over `T: AsRef` so configurable string types + /// (`smol_str::SmolStr`, `ecow::EcoString`, ...) serialize without relying + /// on `Deref` coercion at the `#[serde(with = ...)]` call + /// site. `String` and `&str` both satisfy the bound. + pub fn serialize + ?Sized, S: Serializer>( + value: &T, + s: S, + ) -> Result { + s.serialize_str(value.as_ref()) } - pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result { + /// Deserialize a `string` field (or JSON `null` → `""`). + /// + /// Generic over the return type so that codegen's `string_type()` (which can + /// map the field to `smol_str::SmolStr`, `ecow::EcoString`, etc.) works + /// without a shim: the visitor produces `String`, the final `.into()` + /// converts. `String: From` (identity) keeps the default path + /// zero-cost. Type inference picks `T` from the field type at the serde call + /// site. Mirrors the `bytes` module's generic deserialize. + pub fn deserialize<'de, T, D>(d: D) -> Result + where + T: From, + D: Deserializer<'de>, + { struct V; impl<'de> serde::de::Visitor<'de> for V { type Value = alloc::string::String; @@ -358,7 +379,7 @@ pub mod proto_string { Ok(v) } } - d.deserialize_any(V) + d.deserialize_any(V).map(T::from) } } diff --git a/buffa/src/json_helpers/tests.rs b/buffa/src/json_helpers/tests.rs index 3889ad1..6671ffd 100644 --- a/buffa/src/json_helpers/tests.rs +++ b/buffa/src/json_helpers/tests.rs @@ -1227,6 +1227,43 @@ fn proto_string_null_is_empty() { assert_eq!(v.0, ""); } +/// A stand-in for a configurable string type (`SmolStr`/`EcoString`/...): it +/// implements just the `From`/`From<&str>`/`AsRef` surface the +/// generic `proto_string` path relies on, proving the with-module deserializes +/// into an arbitrary target type without a per-type shim. +#[derive(Clone, PartialEq, Debug, Default)] +struct MyStr(alloc::string::String); +impl From for MyStr { + fn from(s: alloc::string::String) -> Self { + MyStr(s) + } +} +impl From<&str> for MyStr { + fn from(s: &str) -> Self { + MyStr(s.into()) + } +} +impl AsRef for MyStr { + fn as_ref(&self) -> &str { + &self.0 + } +} + +#[derive(serde::Serialize, serde::Deserialize)] +struct SerdeCustomStr(#[serde(with = "proto_string")] MyStr); + +#[test] +fn proto_string_deserializes_into_custom_type() { + let recovered: SerdeCustomStr = serde_json::from_str(r#""hello""#).unwrap(); + assert_eq!(recovered.0, MyStr("hello".into())); + // null still maps to the empty value via the `From` conversion. + let empty: SerdeCustomStr = serde_json::from_str("null").unwrap(); + assert_eq!(empty.0, MyStr::default()); + // Round-trips back out, serialized via `AsRef`. + let json = serde_json::to_string(&SerdeCustomStr(MyStr("hi".into()))).unwrap(); + assert_eq!(json, r#""hi""#); +} + // ── closed_enum tests ───────────────────────────────────────────────── #[derive(serde::Serialize, serde::Deserialize)] diff --git a/buffa/src/lib.rs b/buffa/src/lib.rs index 4241434..ab018ba 100644 --- a/buffa/src/lib.rs +++ b/buffa/src/lib.rs @@ -218,6 +218,7 @@ pub use message::{DecodeOptions, Message, MessageName, RECURSION_LIMIT}; pub use message_field::{DefaultInstance, MessageField}; pub use oneof::Oneof; pub use size_cache::SizeCache; +pub use types::ProtoString; pub use unknown_fields::{UnknownField, UnknownFieldData, UnknownFields}; #[cfg(feature = "text")] diff --git a/buffa/src/types.rs b/buffa/src/types.rs index c3fd30c..35940b9 100644 --- a/buffa/src/types.rs +++ b/buffa/src/types.rs @@ -447,6 +447,66 @@ pub fn string_encoded_len(value: &str) -> usize { varint_len(len as u64) + len } +/// A Rust type usable as the in-memory representation of a proto `string` field. +/// +/// buffa generates [`String`] by default. The `string_type` knob in +/// `buffa_build` can substitute a small-string-optimized type — such as +/// [`smol_str::SmolStr`], [`ecow::EcoString`], or +/// [`compact_str::CompactString`] — for read-mostly schemas where `String`'s +/// growable buffer is unnecessary. Any type meeting these bounds qualifies, so +/// there is nothing to implement by hand: the blanket impl below covers every +/// conforming type. +/// +/// The bounds are exactly what generated code requires of a string field: +/// +/// - `Clone + PartialEq + Default + Debug` — for the `#[derive(...)]` and the +/// hand-written `Debug` impl on message structs, and for `clear()` (which +/// resets the field to [`Default`] rather than relying on a `String`-specific +/// `clear`, since the small-string types may be immutable). +/// - [`AsRef`] — the encoder borrows the field as `&str`; +/// [`encode_string`] and [`string_encoded_len`] take `&str`. +/// - [`From`] and [`From<&str>`](From) — the decode path +/// ([`decode_string_to`]) and the view→owned conversion construct the field +/// from freshly decoded text. +/// +/// For the default `String` representation, `From` is the identity, so +/// the generic path costs nothing relative to the specialized one. +pub trait ProtoString: + Clone + PartialEq + Default + core::fmt::Debug + AsRef + From + for<'a> From<&'a str> +{ +} + +impl ProtoString for T where + T: Clone + + PartialEq + + Default + + core::fmt::Debug + + AsRef + + From + + for<'a> From<&'a str> +{ +} + +/// Decode a length-delimited `string` into a configurable [`ProtoString`] type. +/// +/// This is the generic counterpart to [`decode_string`]: it reads the varint +/// length prefix, copies that many bytes, and validates UTF-8 identically, then +/// constructs the target representation via `From`. Generated code uses +/// the in-place [`merge_string`] for `String` fields (which reuses the existing +/// allocation) and this helper for every other [`ProtoString`] type. +/// +/// # Errors +/// +/// Propagates the same errors as [`decode_string`]: +/// - [`DecodeError::UnexpectedEof`] if the buffer is shorter than the declared +/// length. +/// - [`DecodeError::MessageTooLarge`] if the declared length overflows `usize`. +/// - [`DecodeError::InvalidUtf8`] if the bytes are not valid UTF-8. +#[inline] +pub fn decode_string_to(buf: &mut impl Buf) -> Result { + decode_string(buf).map(S::from) +} + /// Encode a `bytes` value as a varint length prefix followed by raw bytes /// (wire type 2). #[inline] @@ -1259,6 +1319,34 @@ mod tests { assert_eq!(decoded, ""); } + #[test] + fn test_decode_string_to_roundtrip() { + // `decode_string_to::` must decode identically to `decode_string` + // for any `ProtoString` type; `String` exercises the identity path. + for s in ["", "hello", "héllo", "世界", "a".repeat(128).as_str()] { + let mut buf = Vec::new(); + encode_string(s, &mut buf); + let decoded: String = decode_string_to(&mut buf.as_slice()).unwrap(); + assert_eq!(s, decoded); + } + } + + #[test] + fn test_decode_string_to_propagates_errors() { + // Invalid UTF-8 surfaces before the `From` conversion. + let bad: &[u8] = &[0x02, 0xFF, 0xFE]; + assert_eq!( + decode_string_to::(&mut &bad[..]), + Err(DecodeError::InvalidUtf8) + ); + // Truncated payload is reported as EOF. + let short: &[u8] = &[0x04, 0x61, 0x62]; + assert_eq!( + decode_string_to::(&mut &short[..]), + Err(DecodeError::UnexpectedEof) + ); + } + #[test] fn test_string_invalid_utf8() { // Length prefix = 2, payload = two invalid UTF-8 bytes. From 3c7d08b4be0af3241f1c08a2d15b117cf33e68d8 Mon Sep 17 00:00:00 2001 From: Iain McGinniss <309153+iainmcgin@users.noreply.github.com> Date: Fri, 22 May 2026 16:41:43 +0000 Subject: [PATCH 2/3] runtime: add smol_str / ecow / compact_str as optional string types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the three small-string crates into buffa as optional, feature-gated dependencies and provide the runtime glue codegen will need: - Re-export each crate (`::buffa::smol_str`, `::buffa::ecow`, `::buffa::compact_str`) under its feature, mirroring the `bytes` re-export, so generated code can name the type without the consumer declaring the dep. - ProtoElemJson impls for repeated/map fields, delegating to the now-generic `proto_string` with-module (one line each, no per-type shim). - arbitrary_ecow{,_opt,_vec} helpers, since ecow ships no native Arbitrary; smol_str and compact_str use their own impls via the feature plumbing. Feature plumbing keeps the crates no_std (default-features = false) and turns on their serde/arbitrary/std sub-features only when buffa's matching feature is enabled, via weak (`dep?/feature`) references. Nothing emits these types yet — that is the codegen knob in a follow-up. --- Cargo.lock | 76 +++++++++++++++++++++++++++++++++ Cargo.toml | 7 +++ buffa/Cargo.toml | 37 ++++++++++++++-- buffa/src/json_helpers.rs | 11 +++++ buffa/src/json_helpers/tests.rs | 57 +++++++++++++++++++++++++ buffa/src/lib.rs | 74 ++++++++++++++++++++++++++++++++ buffa/src/types.rs | 38 +++++++++++++++++ 7 files changed, 297 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 130c6c3..372eaac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -56,6 +56,16 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +[[package]] +name = "borsh" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfd1e3f8955a5d7de9fab72fc8373fade9fb8a703968cb200ae3dc6cf08e185a" +dependencies = [ + "bytes", + "cfg_aliases", +] + [[package]] name = "buffa" version = "0.6.0" @@ -63,10 +73,13 @@ dependencies = [ "arbitrary", "base64", "bytes", + "compact_str", + "ecow", "hashbrown 0.15.5", "once_cell", "serde", "serde_json", + "smol_str", "thiserror", ] @@ -148,12 +161,27 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cfg-if" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "ciborium" version = "0.2.2" @@ -206,6 +234,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +[[package]] +name = "compact_str" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a" +dependencies = [ + "arbitrary", + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "criterion" version = "0.5.1" @@ -284,6 +328,15 @@ dependencies = [ "syn", ] +[[package]] +name = "ecow" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78e4f79b296fbaab6ce2e22d52cb4c7f010fe0ebe7a32e34fa25885fd797bd02" +dependencies = [ + "serde", +] + [[package]] name = "either" version = "1.15.0" @@ -621,6 +674,12 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + [[package]] name = "same-file" version = "1.0.6" @@ -679,6 +738,23 @@ dependencies = [ "zmij", ] +[[package]] +name = "smol_str" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4aaa7368fcf4852a4c2dd92df0cace6a71f2091ca0a23391ce7f3a31833f1523" +dependencies = [ + "arbitrary", + "borsh", + "serde_core", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "syn" version = "2.0.114" diff --git a/Cargo.toml b/Cargo.toml index 91783dc..49c1ef1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,6 +62,13 @@ serde = { version = "1", default-features = false } serde_json = { version = "1", default-features = false, features = ["alloc"] } arbitrary = { version = "1", default-features = false, features = ["derive"] } thiserror = { version = "2", default-features = false } +# Optional `string` field representations selectable via `buffa_build`'s +# `string_type` knob. Declared `default-features = false` so they stay +# `no_std`; the relevant `serde`/`arbitrary`/`std` sub-features are turned on +# by the matching `buffa` feature (see buffa/Cargo.toml). +smol_str = { version = "0.3", default-features = false } +ecow = { version = "0.2", default-features = false } +compact_str = { version = "0.9", default-features = false } prettyplease = "0.2" proc-macro2 = "1" quote = "1" diff --git a/buffa/Cargo.toml b/buffa/Cargo.toml index ffd83da..9cc82a3 100644 --- a/buffa/Cargo.toml +++ b/buffa/Cargo.toml @@ -16,14 +16,42 @@ rustdoc-args = ["--cfg", "docsrs"] [features] default = ["std"] -std = ["bytes/std", "thiserror/std", "serde?/std", "serde_json?/std"] +std = [ + "bytes/std", + "thiserror/std", + "serde?/std", + "serde_json?/std", + "smol_str?/std", + "ecow?/std", + "compact_str?/std", +] # `json` and `text` are independently enableable. Both share the # `type_registry` module (gated `any(json, text)`) but carry per-format # entry types (`JsonAnyEntry` vs `TextAnyEntry`) — no struct-literal `#[cfg]` # needed, no `Option` placeholder fields. -json = ["dep:serde", "dep:base64", "dep:serde_json", "serde/alloc", "serde/derive", "hashbrown/serde"] -arbitrary = ["dep:arbitrary"] +json = [ + "dep:serde", + "dep:base64", + "dep:serde_json", + "serde/alloc", + "serde/derive", + "hashbrown/serde", + # Pull in serde support for whichever configurable string types are enabled. + "smol_str?/serde", + "ecow?/serde", + "compact_str?/serde", +] +arbitrary = ["dep:arbitrary", "smol_str?/arbitrary", "compact_str?/arbitrary"] text = [] +# Configurable `string` field representations. Enabling one of these lets +# `buffa_build`'s `string_type` knob map proto `string` to that type; buffa +# re-exports the crate so generated code references `::buffa::::` +# and downstream crates do not declare the dependency themselves. `ecow` has +# no native `arbitrary` impl, so buffa supplies a shim under the `arbitrary` +# feature. +smol_str = ["dep:smol_str"] +ecow = ["dep:ecow"] +compact_str = ["dep:compact_str"] [dependencies] arbitrary = { workspace = true, optional = true } @@ -34,3 +62,6 @@ once_cell = { workspace = true, features = ["alloc"] } serde = { workspace = true, optional = true } serde_json = { workspace = true, optional = true } thiserror = { workspace = true } +smol_str = { workspace = true, optional = true } +ecow = { workspace = true, optional = true } +compact_str = { workspace = true, optional = true } diff --git a/buffa/src/json_helpers.rs b/buffa/src/json_helpers.rs index e08cb9d..7252b3b 100644 --- a/buffa/src/json_helpers.rs +++ b/buffa/src/json_helpers.rs @@ -1153,6 +1153,17 @@ proto_elem_json_delegate!(bool, proto_bool); proto_elem_json_delegate!(alloc::string::String, proto_string); proto_elem_json_delegate!(alloc::vec::Vec, bytes); +// Configurable `string` field representations (codegen's `string_type()`), for +// `repeated string` / `map<_, string>`. The `proto_string` with-module is +// generic — `serialize` over `AsRef`, `deserialize` over `From` — +// so each delegate is a one-liner with no per-type shim. +#[cfg(feature = "smol_str")] +proto_elem_json_delegate!(::smol_str::SmolStr, proto_string); +#[cfg(feature = "ecow")] +proto_elem_json_delegate!(::ecow::EcoString, proto_string); +#[cfg(feature = "compact_str")] +proto_elem_json_delegate!(::compact_str::CompactString, proto_string); + // bytes::Bytes — for codegen's `use_bytes_type()` with `repeated bytes`. // Serialize: `Bytes: Deref` → `bytes::serialize(&[u8], s)`. // Deserialize: `bytes::deserialize` is generic over `T: From>`; diff --git a/buffa/src/json_helpers/tests.rs b/buffa/src/json_helpers/tests.rs index 6671ffd..b6775d2 100644 --- a/buffa/src/json_helpers/tests.rs +++ b/buffa/src/json_helpers/tests.rs @@ -1264,6 +1264,63 @@ fn proto_string_deserializes_into_custom_type() { assert_eq!(json, r#""hi""#); } +// Configurable string types: the singular `proto_string` with-module and the +// `ProtoElemJson` (repeated) path must both round-trip through proto3 JSON. +#[cfg(feature = "smol_str")] +#[test] +fn proto_string_smol_str_roundtrip() { + #[derive(serde::Serialize, serde::Deserialize)] + struct W(#[serde(with = "proto_string")] smol_str::SmolStr); + let json = serde_json::to_string(&W("hi".into())).unwrap(); + assert_eq!(json, r#""hi""#); + assert_eq!(serde_json::from_str::(&json).unwrap().0, "hi"); + assert_eq!(serde_json::from_str::("null").unwrap().0, ""); + + // repeated string via ProtoElemJson + #[derive(serde::Serialize, serde::Deserialize)] + struct R(#[serde(with = "proto_seq")] alloc::vec::Vec); + let v = R(vec!["a".into(), "b".into()]); + let json = serde_json::to_string(&v).unwrap(); + assert_eq!(json, r#"["a","b"]"#); + assert_eq!(serde_json::from_str::(&json).unwrap().0, v.0); +} + +#[cfg(feature = "ecow")] +#[test] +fn proto_string_ecow_roundtrip() { + #[derive(serde::Serialize, serde::Deserialize)] + struct W(#[serde(with = "proto_string")] ecow::EcoString); + let json = serde_json::to_string(&W("hi".into())).unwrap(); + assert_eq!(json, r#""hi""#); + assert_eq!(serde_json::from_str::(&json).unwrap().0, "hi"); + assert_eq!(serde_json::from_str::("null").unwrap().0, ""); + + #[derive(serde::Serialize, serde::Deserialize)] + struct R(#[serde(with = "proto_seq")] alloc::vec::Vec); + let v = R(vec!["a".into(), "b".into()]); + let json = serde_json::to_string(&v).unwrap(); + assert_eq!(json, r#"["a","b"]"#); + assert_eq!(serde_json::from_str::(&json).unwrap().0, v.0); +} + +#[cfg(feature = "compact_str")] +#[test] +fn proto_string_compact_str_roundtrip() { + #[derive(serde::Serialize, serde::Deserialize)] + struct W(#[serde(with = "proto_string")] compact_str::CompactString); + let json = serde_json::to_string(&W("hi".into())).unwrap(); + assert_eq!(json, r#""hi""#); + assert_eq!(serde_json::from_str::(&json).unwrap().0, "hi"); + assert_eq!(serde_json::from_str::("null").unwrap().0, ""); + + #[derive(serde::Serialize, serde::Deserialize)] + struct R(#[serde(with = "proto_seq")] alloc::vec::Vec); + let v = R(vec!["a".into(), "b".into()]); + let json = serde_json::to_string(&v).unwrap(); + assert_eq!(json, r#"["a","b"]"#); + assert_eq!(serde_json::from_str::(&json).unwrap().0, v.0); +} + // ── closed_enum tests ───────────────────────────────────────────────── #[derive(serde::Serialize, serde::Deserialize)] diff --git a/buffa/src/lib.rs b/buffa/src/lib.rs index ab018ba..4edc100 100644 --- a/buffa/src/lib.rs +++ b/buffa/src/lib.rs @@ -134,6 +134,21 @@ pub use ::bytes; #[doc(hidden)] pub use ::serde_json; +// Configurable `string` field representations. Re-exported so that code +// generated with `buffa_build`'s `string_type` knob can name +// `::buffa::smol_str::SmolStr` (etc.) without the consumer crate declaring the +// dependency — the same arrangement as `bytes` above. Each is gated on the +// matching `buffa` feature. +#[cfg(feature = "compact_str")] +#[doc(hidden)] +pub use ::compact_str; +#[cfg(feature = "ecow")] +#[doc(hidden)] +pub use ::ecow; +#[cfg(feature = "smol_str")] +#[doc(hidden)] +pub use ::smol_str; + /// Include the generated stitcher for a proto **package** from `OUT_DIR`. /// /// Codegen emits one `.mod.rs` per package which `include!`s the @@ -288,6 +303,39 @@ pub mod __private { let vv: ::alloc::vec::Vec<::alloc::vec::Vec> = ::arbitrary::Arbitrary::arbitrary(u)?; Ok(vv.into_iter().map(::bytes::Bytes::from).collect()) } + + /// `arbitrary` helpers for `ecow::EcoString` string fields. + /// + /// Unlike `smol_str::SmolStr` and `compact_str::CompactString`, `EcoString` + /// ships no `Arbitrary` impl, so codegen attaches + /// `#[arbitrary(with = ::buffa::__private::arbitrary_ecow*)]` to + /// `EcoString`-typed string fields (the same pattern used for + /// `bytes::Bytes`). The three variants cover singular, optional, and + /// repeated fields; oneof variant inner fields use `arbitrary_ecow`. + #[cfg(all(feature = "ecow", feature = "arbitrary"))] + pub fn arbitrary_ecow( + u: &mut ::arbitrary::Unstructured<'_>, + ) -> ::arbitrary::Result<::ecow::EcoString> { + let s: ::alloc::string::String = ::arbitrary::Arbitrary::arbitrary(u)?; + Ok(::ecow::EcoString::from(s)) + } + + #[cfg(all(feature = "ecow", feature = "arbitrary"))] + pub fn arbitrary_ecow_opt( + u: &mut ::arbitrary::Unstructured<'_>, + ) -> ::arbitrary::Result<::core::option::Option<::ecow::EcoString>> { + let opt: ::core::option::Option<::alloc::string::String> = + ::arbitrary::Arbitrary::arbitrary(u)?; + Ok(opt.map(::ecow::EcoString::from)) + } + + #[cfg(all(feature = "ecow", feature = "arbitrary"))] + pub fn arbitrary_ecow_vec( + u: &mut ::arbitrary::Unstructured<'_>, + ) -> ::arbitrary::Result<::alloc::vec::Vec<::ecow::EcoString>> { + let vv: ::alloc::vec::Vec<::alloc::string::String> = ::arbitrary::Arbitrary::arbitrary(u)?; + Ok(vv.into_iter().map(::ecow::EcoString::from).collect()) + } } #[cfg(all(test, feature = "arbitrary"))] @@ -343,6 +391,32 @@ mod arbitrary_tests { assert_eq!(b.slice(..).as_ref(), v.as_slice()); } } + + // The `EcoString` arbitrary shims must mirror the underlying `String` / + // `Option` / `Vec` impls, since `ecow` ships no native + // `Arbitrary`. The other two configurable string types use their own + // native impls and need no shim. + #[cfg(feature = "ecow")] + #[test] + fn arbitrary_ecow_matches_string() { + use super::__private::{arbitrary_ecow, arbitrary_ecow_opt, arbitrary_ecow_vec}; + use alloc::string::String; + + let s = arbitrary_ecow(&mut Unstructured::new(&SEED)).unwrap(); + let v: String = Arbitrary::arbitrary(&mut Unstructured::new(&SEED)).unwrap(); + assert_eq!(s.as_str(), v.as_str()); + + let so = arbitrary_ecow_opt(&mut Unstructured::new(&SEED)).unwrap(); + let vo: Option = Arbitrary::arbitrary(&mut Unstructured::new(&SEED)).unwrap(); + assert_eq!(so.as_ref().map(|x| x.as_str()), vo.as_deref()); + + let sv = arbitrary_ecow_vec(&mut Unstructured::new(&SEED)).unwrap(); + let vv: Vec = Arbitrary::arbitrary(&mut Unstructured::new(&SEED)).unwrap(); + assert_eq!(sv.len(), vv.len()); + for (a, b) in sv.iter().zip(&vv) { + assert_eq!(a.as_str(), b.as_str()); + } + } } /// Minimal fixture types for compile-checking doc examples. diff --git a/buffa/src/types.rs b/buffa/src/types.rs index 35940b9..4805aec 100644 --- a/buffa/src/types.rs +++ b/buffa/src/types.rs @@ -1347,6 +1347,44 @@ mod tests { ); } + /// Every configurable string representation must decode identically to + /// `String` via the shared `decode_string_to` / `ProtoString` path. The + /// inputs straddle the small-string-optimization inline boundary so both + /// the inline and heap representations are exercised. + #[cfg(feature = "smol_str")] + #[test] + fn test_decode_string_to_smol_str() { + for s in ["", "id", "a".repeat(64).as_str()] { + let mut buf = Vec::new(); + encode_string(s, &mut buf); + let decoded: smol_str::SmolStr = decode_string_to(&mut buf.as_slice()).unwrap(); + assert_eq!(decoded.as_str(), s); + } + } + + #[cfg(feature = "ecow")] + #[test] + fn test_decode_string_to_ecow() { + for s in ["", "id", "a".repeat(64).as_str()] { + let mut buf = Vec::new(); + encode_string(s, &mut buf); + let decoded: ecow::EcoString = decode_string_to(&mut buf.as_slice()).unwrap(); + assert_eq!(decoded.as_str(), s); + } + } + + #[cfg(feature = "compact_str")] + #[test] + fn test_decode_string_to_compact_str() { + for s in ["", "id", "a".repeat(64).as_str()] { + let mut buf = Vec::new(); + encode_string(s, &mut buf); + let decoded: compact_str::CompactString = + decode_string_to(&mut buf.as_slice()).unwrap(); + assert_eq!(decoded.as_str(), s); + } + } + #[test] fn test_string_invalid_utf8() { // Length prefix = 2, payload = two invalid UTF-8 bytes. From 45a3ff2bb7a2f26695ce215dabbc91bf42ab4892 Mon Sep 17 00:00:00 2001 From: Iain McGinniss <309153+iainmcgin@users.noreply.github.com> Date: Fri, 22 May 2026 16:49:54 +0000 Subject: [PATCH 3/3] runtime: refine ProtoString contract and JSON string deserialize Address review feedback on the string-type groundwork: - proto_string::deserialize now constructs the target type directly in the visitor (via From<&str> in visit_str), so an SSO type inlines a short JSON string without first allocating an intermediate String. The default String path is unchanged. - Add Send + Sync to the ProtoString bound so a message owning such a field stays Send + Sync; an exotic non-thread-safe string type is now rejected at the field type rather than silently infecting every containing message. - Add a compile-time assertion that String: ProtoString to freeze the default path against future supertrait changes. - Replace the broken intra-doc links to optional-dependency types with plain inline code so `cargo doc` succeeds under default features (the crate denies broken intra-doc links). - Document the three new feature flags in the crate-level table, including how they compose with json/arbitrary, and reword docs/comments so they describe the string_type knob as forthcoming rather than already shipped. - Widen the decode tests to straddle each SSO type's inline capacity. --- Cargo.toml | 9 +++--- buffa/Cargo.toml | 12 ++++---- buffa/src/json_helpers.rs | 40 +++++++++++++----------- buffa/src/lib.rs | 14 +++++++++ buffa/src/types.rs | 65 +++++++++++++++++++++++++++++---------- 5 files changed, 96 insertions(+), 44 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 49c1ef1..d908ac7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,10 +62,11 @@ serde = { version = "1", default-features = false } serde_json = { version = "1", default-features = false, features = ["alloc"] } arbitrary = { version = "1", default-features = false, features = ["derive"] } thiserror = { version = "2", default-features = false } -# Optional `string` field representations selectable via `buffa_build`'s -# `string_type` knob. Declared `default-features = false` so they stay -# `no_std`; the relevant `serde`/`arbitrary`/`std` sub-features are turned on -# by the matching `buffa` feature (see buffa/Cargo.toml). +# Optional `string` field representations that will be selectable via +# `buffa_build`'s forthcoming `string_type` knob. Declared +# `default-features = false` so they stay `no_std`; the relevant +# `serde`/`arbitrary`/`std` sub-features are turned on by the matching `buffa` +# feature (see buffa/Cargo.toml). smol_str = { version = "0.3", default-features = false } ecow = { version = "0.2", default-features = false } compact_str = { version = "0.9", default-features = false } diff --git a/buffa/Cargo.toml b/buffa/Cargo.toml index 9cc82a3..65161f8 100644 --- a/buffa/Cargo.toml +++ b/buffa/Cargo.toml @@ -43,12 +43,12 @@ json = [ ] arbitrary = ["dep:arbitrary", "smol_str?/arbitrary", "compact_str?/arbitrary"] text = [] -# Configurable `string` field representations. Enabling one of these lets -# `buffa_build`'s `string_type` knob map proto `string` to that type; buffa -# re-exports the crate so generated code references `::buffa::::` -# and downstream crates do not declare the dependency themselves. `ecow` has -# no native `arbitrary` impl, so buffa supplies a shim under the `arbitrary` -# feature. +# Configurable `string` field representations. Enabling one of these will let +# `buffa_build`'s forthcoming `string_type` knob map proto `string` to that +# type; buffa re-exports the crate so generated code references +# `::buffa::::` and downstream crates do not declare the dependency +# themselves. `ecow` has no native `arbitrary` impl, so buffa supplies a shim +# under the `arbitrary` feature. smol_str = ["dep:smol_str"] ecow = ["dep:ecow"] compact_str = ["dep:compact_str"] diff --git a/buffa/src/json_helpers.rs b/buffa/src/json_helpers.rs index 7252b3b..2d422aa 100644 --- a/buffa/src/json_helpers.rs +++ b/buffa/src/json_helpers.rs @@ -334,7 +334,6 @@ pub mod proto_bool { /// /// Use with `#[serde(with = "::buffa::json_helpers::proto_string")]`. pub mod proto_string { - use alloc::string::ToString; use serde::{Deserializer, Serializer}; /// Serialize a `string` field. @@ -352,34 +351,39 @@ pub mod proto_string { /// Deserialize a `string` field (or JSON `null` → `""`). /// - /// Generic over the return type so that codegen's `string_type()` (which can - /// map the field to `smol_str::SmolStr`, `ecow::EcoString`, etc.) works - /// without a shim: the visitor produces `String`, the final `.into()` - /// converts. `String: From` (identity) keeps the default path - /// zero-cost. Type inference picks `T` from the field type at the serde call - /// site. Mirrors the `bytes` module's generic deserialize. + /// Generic over the return type so that codegen's `string_type` knob (which + /// can map the field to `smol_str::SmolStr`, `ecow::EcoString`, etc.) works + /// without a per-type shim. The visitor constructs the target type directly: + /// `visit_str` goes through `From<&str>`, so a short string is inlined by an + /// SSO type without first allocating an intermediate `String`. `String` + /// itself satisfies both `From<&str>` and `From`, keeping the + /// default path zero-extra-cost. Type inference picks `T` from the field + /// type at the serde call site. pub fn deserialize<'de, T, D>(d: D) -> Result where - T: From, + T: for<'a> From<&'a str> + From, D: Deserializer<'de>, { - struct V; - impl<'de> serde::de::Visitor<'de> for V { - type Value = alloc::string::String; + struct V(core::marker::PhantomData); + impl<'de, T> serde::de::Visitor<'de> for V + where + T: for<'a> From<&'a str> + From, + { + type Value = T; fn expecting(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.write_str("a string or null") } - fn visit_unit(self) -> Result { - Ok(alloc::string::String::new()) + fn visit_unit(self) -> Result { + Ok(T::from("")) } - fn visit_str(self, v: &str) -> Result { - Ok(v.to_string()) + fn visit_str(self, v: &str) -> Result { + Ok(T::from(v)) } - fn visit_string(self, v: alloc::string::String) -> Result { - Ok(v) + fn visit_string(self, v: alloc::string::String) -> Result { + Ok(T::from(v)) } } - d.deserialize_any(V).map(T::from) + d.deserialize_any(V::(core::marker::PhantomData)) } } diff --git a/buffa/src/lib.rs b/buffa/src/lib.rs index 4edc100..0b9137e 100644 --- a/buffa/src/lib.rs +++ b/buffa/src/lib.rs @@ -73,6 +73,16 @@ //! | `json` | | Proto3 JSON via `serde` (the `json_helpers` and `any_registry` modules) | //! | `text` | | Textproto (human-readable) encoding and decoding | //! | `arbitrary` | | `arbitrary::Arbitrary` impls for fuzzing | +//! | `smol_str` | | Allow generated `string` fields to use `smol_str::SmolStr` (see [`ProtoString`]) | +//! | `ecow` | | Allow generated `string` fields to use `ecow::EcoString` (see [`ProtoString`]) | +//! | `compact_str` | | Allow generated `string` fields to use `compact_str::CompactString` (see [`ProtoString`]) | +//! +//! The three string-type flags compose with `json` and `arbitrary`: enabling, +//! for example, both `smol_str` and `json` turns on `smol_str/serde`, and +//! `smol_str` + `arbitrary` turns on `smol_str/arbitrary`. `ecow` has no native +//! `Arbitrary` impl, so `ecow` + `arbitrary` is served by an in-crate shim +//! instead. None of the three is selected by generated code until a build is +//! configured to use it (see [`ProtoString`]). //! //! With `default-features = false` the crate is `#![no_std]` (requires //! `alloc`). Proto3 JSON serialization still works without `std` via @@ -333,6 +343,10 @@ pub mod __private { pub fn arbitrary_ecow_vec( u: &mut ::arbitrary::Unstructured<'_>, ) -> ::arbitrary::Result<::alloc::vec::Vec<::ecow::EcoString>> { + // Materializing a `Vec` first (rather than building `EcoString`s + // element-by-element) is deliberate: it makes the byte-consumption order + // identical to the underlying `Vec` impl, which is what the + // parity test asserts. Do not "optimize" the intermediate `Vec` away. let vv: ::alloc::vec::Vec<::alloc::string::String> = ::arbitrary::Arbitrary::arbitrary(u)?; Ok(vv.into_iter().map(::ecow::EcoString::from).collect()) } diff --git a/buffa/src/types.rs b/buffa/src/types.rs index 4805aec..a4af12e 100644 --- a/buffa/src/types.rs +++ b/buffa/src/types.rs @@ -447,15 +447,16 @@ pub fn string_encoded_len(value: &str) -> usize { varint_len(len as u64) + len } -/// A Rust type usable as the in-memory representation of a proto `string` field. +/// The bound generated code places on the Rust type used for a proto `string` +/// field. You neither implement nor name this trait by hand — a blanket impl +/// covers every conforming type, and a forthcoming `string_type` knob in +/// `buffa_build` selects the concrete type at code-generation time. /// -/// buffa generates [`String`] by default. The `string_type` knob in -/// `buffa_build` can substitute a small-string-optimized type — such as -/// [`smol_str::SmolStr`], [`ecow::EcoString`], or -/// [`compact_str::CompactString`] — for read-mostly schemas where `String`'s -/// growable buffer is unnecessary. Any type meeting these bounds qualifies, so -/// there is nothing to implement by hand: the blanket impl below covers every -/// conforming type. +/// buffa generates [`String`] by default. The knob will be able to substitute a +/// small-string-optimized type — such as `smol_str::SmolStr`, +/// `ecow::EcoString`, or `compact_str::CompactString` (each behind the matching +/// `buffa` feature) — for read-mostly schemas where `String`'s growable buffer +/// is unnecessary. /// /// The bounds are exactly what generated code requires of a string field: /// @@ -463,16 +464,27 @@ pub fn string_encoded_len(value: &str) -> usize { /// hand-written `Debug` impl on message structs, and for `clear()` (which /// resets the field to [`Default`] rather than relying on a `String`-specific /// `clear`, since the small-string types may be immutable). +/// - `Send + Sync` — so a message owning such a field stays `Send + Sync`; +/// without this bound an exotic string type could silently make every +/// containing message thread-unsafe. /// - [`AsRef`] — the encoder borrows the field as `&str`; /// [`encode_string`] and [`string_encoded_len`] take `&str`. -/// - [`From`] and [`From<&str>`](From) — the decode path -/// ([`decode_string_to`]) and the view→owned conversion construct the field -/// from freshly decoded text. +/// - `From` and `From<&str>` — used by the decode path +/// ([`decode_string_to`]) and (once the knob lands) the view→owned conversion +/// to construct the field from freshly decoded text. /// -/// For the default `String` representation, `From` is the identity, so +/// For the default `String` representation every conversion is the identity, so /// the generic path costs nothing relative to the specialized one. pub trait ProtoString: - Clone + PartialEq + Default + core::fmt::Debug + AsRef + From + for<'a> From<&'a str> + Clone + + PartialEq + + Default + + core::fmt::Debug + + Send + + Sync + + AsRef + + From + + for<'a> From<&'a str> { } @@ -481,12 +493,21 @@ impl ProtoString for T where + PartialEq + Default + core::fmt::Debug + + Send + + Sync + AsRef + From + for<'a> From<&'a str> { } +// The default representation must always satisfy the bound; freeze that +// invariant against future changes to the trait's supertraits. +const _: fn() = || { + fn assert_proto_string() {} + assert_proto_string::(); +}; + /// Decode a length-delimited `string` into a configurable [`ProtoString`] type. /// /// This is the generic counterpart to [`decode_string`]: it reads the varint @@ -1354,7 +1375,11 @@ mod tests { #[cfg(feature = "smol_str")] #[test] fn test_decode_string_to_smol_str() { - for s in ["", "id", "a".repeat(64).as_str()] { + // Lengths straddle the inline caps of all three SSO types (ecow ~15, + // smol_str 23, compact_str 24 bytes on 64-bit) plus a clearly-heap case. + for n in [0usize, 13, 14, 15, 22, 23, 24, 25, 64] { + let owned = "a".repeat(n); + let s = owned.as_str(); let mut buf = Vec::new(); encode_string(s, &mut buf); let decoded: smol_str::SmolStr = decode_string_to(&mut buf.as_slice()).unwrap(); @@ -1365,7 +1390,11 @@ mod tests { #[cfg(feature = "ecow")] #[test] fn test_decode_string_to_ecow() { - for s in ["", "id", "a".repeat(64).as_str()] { + // Lengths straddle the inline caps of all three SSO types (ecow ~15, + // smol_str 23, compact_str 24 bytes on 64-bit) plus a clearly-heap case. + for n in [0usize, 13, 14, 15, 22, 23, 24, 25, 64] { + let owned = "a".repeat(n); + let s = owned.as_str(); let mut buf = Vec::new(); encode_string(s, &mut buf); let decoded: ecow::EcoString = decode_string_to(&mut buf.as_slice()).unwrap(); @@ -1376,7 +1405,11 @@ mod tests { #[cfg(feature = "compact_str")] #[test] fn test_decode_string_to_compact_str() { - for s in ["", "id", "a".repeat(64).as_str()] { + // Lengths straddle the inline caps of all three SSO types (ecow ~15, + // smol_str 23, compact_str 24 bytes on 64-bit) plus a clearly-heap case. + for n in [0usize, 13, 14, 15, 22, 23, 24, 25, 64] { + let owned = "a".repeat(n); + let s = owned.as_str(); let mut buf = Vec::new(); encode_string(s, &mut buf); let decoded: compact_str::CompactString =