From e86c4c4c08a76df480e6736ba908c3793f68688e Mon Sep 17 00:00:00 2001 From: Iain McGinniss <309153+iainmcgin@users.noreply.github.com> Date: Fri, 22 May 2026 17:07:15 +0000 Subject: [PATCH 1/4] codegen: configurable string field type via string_type knob MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a `StringRepr` knob (String default, plus SmolStr / EcoString / CompactString) selectable per proto path through buffa_build's `string_type` / `string_type_in` builder methods, mirroring the existing `use_bytes_type` machinery. Codegen: - StringRepr enum + string_fields config (ordered path-prefix rules, last match wins) + a CodeGenContext::string_repr predicate. - classify_field emits the chosen owned type for singular, optional, and repeated string fields. Map keys/values stay String, matching the bytes path. Encode/size paths are unchanged — &SmolStr/&EcoString/&CompactString deref-coerce to &str. - Decode routes String through the in-place merge_string fast path and other reprs through decode_string_to; clear() resets non-default reprs to default (the small-string types may be immutable, like bytes::Bytes); view→owned builds via From<&str>; EcoString fields get the arbitrary_ecow shim. Default String output is byte-for-byte unchanged (verified: regenerating the checked-in WKT and bootstrap types produces no diff). Tests: codegen integration assertions for each repr + the arbitrary shim, and an end-to-end buffa-test module (string_types.proto compiled with a SmolStr default and CompactString/EcoString overrides) covering binary/JSON/view round-trips, clear, and the field-type pinning across all string shapes. --- buffa-build/src/lib.rs | 47 ++++++++ buffa-codegen/src/context.rs | 18 +++ buffa-codegen/src/impl_message.rs | 86 ++++++++++++--- buffa-codegen/src/lib.rs | 58 ++++++++++ buffa-codegen/src/message.rs | 36 ++++++ buffa-codegen/src/view.rs | 61 ++++++++++- buffa-codegen/tests/codegen_integration.rs | 114 +++++++++++++++++++ buffa-test/Cargo.toml | 9 +- buffa-test/build.rs | 25 +++++ buffa-test/protos/string_types.proto | 23 ++++ buffa-test/src/lib.rs | 17 +++ buffa-test/src/tests/mod.rs | 1 + buffa-test/src/tests/string_type.rs | 121 +++++++++++++++++++++ 13 files changed, 596 insertions(+), 20 deletions(-) create mode 100644 buffa-test/protos/string_types.proto create mode 100644 buffa-test/src/tests/string_type.rs diff --git a/buffa-build/src/lib.rs b/buffa-build/src/lib.rs index 0766f94..0e75ecc 100644 --- a/buffa-build/src/lib.rs +++ b/buffa-build/src/lib.rs @@ -33,6 +33,8 @@ use buffa::Message; use buffa_codegen::generated::descriptor::FileDescriptorSet; use buffa_codegen::CodeGenConfig; +#[doc(inline)] +pub use buffa_codegen::StringRepr; /// How to produce a `FileDescriptorSet` from `.proto` files. #[derive(Debug, Clone, Default)] @@ -439,6 +441,51 @@ impl Config { self } + /// Map `string` fields to a [`StringRepr`] other than `String` for the + /// given proto path prefixes. + /// + /// Each path is a fully-qualified proto path prefix (e.g. + /// `".my.pkg.MyMessage.name"` for one field, `".my.pkg"` for a package). + /// Rules accumulate and the **last** matching rule wins, so a broad + /// `string_type` can be refined by a later `string_type_in`. The selected + /// type's `buffa` feature (`smol_str`, `ecow`, or `compact_str`) must be + /// enabled by the downstream crate. + /// + /// The wire format is unchanged; only the owned Rust type differs (view + /// types still borrow `&str`). + /// + /// # Example + /// + /// ```rust,ignore + /// use buffa_build::StringRepr; + /// buffa_build::Config::new() + /// .string_type_in(StringRepr::SmolStr, &[".my.pkg"]) + /// .files(&["proto/my_service.proto"]) + /// .includes(&["proto/"]) + /// .compile() + /// .unwrap(); + /// ``` + #[must_use] + pub fn string_type_in(mut self, repr: StringRepr, paths: &[impl AsRef]) -> Self { + self.codegen_config + .string_fields + .extend(paths.iter().map(|p| (p.as_ref().to_string(), repr))); + self + } + + /// Map every `string` field in all messages to the given [`StringRepr`]. + /// + /// Convenience for `.string_type_in(repr, &["."])`. Use + /// [`string_type_in`](Self::string_type_in) to target specific paths, or to + /// override this default for a subset (later rules win). + #[must_use] + pub fn string_type(mut self, repr: StringRepr) -> Self { + self.codegen_config + .string_fields + .push((".".to_string(), repr)); + self + } + /// Add a custom attribute to generated types (messages and enums) /// matching a proto path prefix. /// diff --git a/buffa-codegen/src/context.rs b/buffa-codegen/src/context.rs index 36b1fab..e6e318c 100644 --- a/buffa-codegen/src/context.rs +++ b/buffa-codegen/src/context.rs @@ -540,6 +540,24 @@ impl<'a> CodeGenContext<'a> { .iter() .any(|prefix| matches_proto_prefix(prefix, field_fqn)) } + + /// Resolve the [`StringRepr`](crate::StringRepr) for a `string` field at the + /// given proto path. + /// + /// `field_fqn` is the fully-qualified proto field path, e.g. + /// `".my.pkg.MyMessage.name"`. Rules in `config.string_fields` are matched + /// with the same proto-segment-aware prefix logic as + /// [`use_bytes_type`](Self::use_bytes_type); the **last** matching rule wins, + /// letting a specific override follow a broad default. Fields matching no + /// rule use [`StringRepr::String`](crate::StringRepr::String). + pub fn string_repr(&self, field_fqn: &str) -> crate::StringRepr { + self.config + .string_fields + .iter() + .rev() + .find(|(prefix, _)| matches_proto_prefix(prefix, field_fqn)) + .map_or(crate::StringRepr::default(), |(_, repr)| *repr) + } } /// Scope-local context for code generation within a message. diff --git a/buffa-codegen/src/impl_message.rs b/buffa-codegen/src/impl_message.rs index 1fc4829..b03b1c5 100644 --- a/buffa-codegen/src/impl_message.rs +++ b/buffa-codegen/src/impl_message.rs @@ -849,6 +849,20 @@ pub(crate) fn field_uses_bytes(ctx: &CodeGenContext, proto_fqn: &str, field_name ctx.use_bytes_type(&field_fqn) } +/// Resolve the [`StringRepr`](crate::StringRepr) for a `string`-typed field. +/// +/// `proto_fqn` is the fully-qualified message name (no leading dot). Matched +/// against `config.string_fields` as `".my.pkg.Msg.field"`. Returns +/// [`StringRepr::String`](crate::StringRepr::String) for fields with no rule. +pub(crate) fn field_string_repr( + ctx: &CodeGenContext, + proto_fqn: &str, + field_name: &str, +) -> crate::StringRepr { + let field_fqn = format!(".{}.{}", proto_fqn, field_name); + ctx.string_repr(&field_fqn) +} + fn scalar_clear_stmt( field: &FieldDescriptorProto, ctx: &CodeGenContext, @@ -880,7 +894,15 @@ fn scalar_clear_stmt( } match ty { - Type::TYPE_STRING => Ok(quote! { self.#ident.clear(); }), + Type::TYPE_STRING => { + // Non-default string types may be immutable (no `clear()`), so + // reset to the default value uniformly. + if field_string_repr(ctx, proto_fqn, field_name).is_default() { + Ok(quote! { self.#ident.clear(); }) + } else { + Ok(quote! { self.#ident = ::core::default::Default::default(); }) + } + } Type::TYPE_BYTES => { // bytes::Bytes is immutable (no clear()), so reassign. if use_bytes { @@ -1463,6 +1485,7 @@ fn scalar_write_to_stmt( /// /// Emits `field_number => { wire_check; self.field = Some(decoded_value); }`. /// Proto3 optional fields and proto2 optional non-message fields use this path. +#[allow(clippy::too_many_arguments)] fn explicit_presence_merge_arm( ident: &Ident, field_number: u32, @@ -1470,10 +1493,11 @@ fn explicit_presence_merge_arm( features: &ResolvedFeatures, wire_check: &TokenStream, use_bytes: bool, + string_repr: crate::StringRepr, preserve_unknown_fields: bool, ) -> TokenStream { match ty { - Type::TYPE_STRING => quote! { + Type::TYPE_STRING if string_repr.is_default() => quote! { #field_number => { #wire_check ::buffa::types::merge_string( @@ -1482,6 +1506,14 @@ fn explicit_presence_merge_arm( )?; } }, + Type::TYPE_STRING => quote! { + #field_number => { + #wire_check + self.#ident = ::core::option::Option::Some( + ::buffa::types::decode_string_to(buf)? + ); + } + }, Type::TYPE_BYTES => { if use_bytes { // bytes::Bytes is immutable — can't merge in place. Replace @@ -1560,6 +1592,7 @@ fn scalar_merge_arm( let field_number = validated_field_number(field)?; let ty = effective_type(ctx, field, features); let use_bytes = ty == Type::TYPE_BYTES && field_uses_bytes(ctx, proto_fqn, field_name); + let string_repr = field_string_repr(ctx, proto_fqn, field_name); let ident = make_field_ident(field_name); let wire_type = wire_type_token(ty); let expected_byte = wire_type_byte(ty); @@ -1575,6 +1608,7 @@ fn scalar_merge_arm( features, &wire_check, use_bytes, + string_repr, preserve_unknown_fields, )); } @@ -1589,10 +1623,21 @@ fn scalar_merge_arm( // decode_bytes which always allocate a fresh Vec/String. match ty { Type::TYPE_STRING => { - return Ok(quote! { - #field_number => { - #wire_check - ::buffa::types::merge_string(&mut self.#ident, buf)?; + return Ok(if string_repr.is_default() { + quote! { + #field_number => { + #wire_check + ::buffa::types::merge_string(&mut self.#ident, buf)?; + } + } + } else { + // Non-default string types are constructed fresh on decode; the + // in-place `merge_string` allocation reuse is `String`-only. + quote! { + #field_number => { + #wire_check + self.#ident = ::buffa::types::decode_string_to(buf)?; + } } }); } @@ -1989,7 +2034,10 @@ fn repeated_merge_arm( 2u8, ); let decode_expr = match ty { - Type::TYPE_STRING => quote! { ::buffa::types::decode_string(buf)? }, + Type::TYPE_STRING if field_string_repr(ctx, proto_fqn, field_name).is_default() => { + quote! { ::buffa::types::decode_string(buf)? } + } + Type::TYPE_STRING => quote! { ::buffa::types::decode_string_to(buf)? }, Type::TYPE_BYTES => { if use_bytes { quote! { ::buffa::types::decode_bytes_to_bytes(buf)? } @@ -2275,19 +2323,27 @@ fn oneof_merge_arm( features: &ResolvedFeatures, preserve_unknown_fields: bool, use_bytes: bool, + string_repr: crate::StringRepr, ) -> TokenStream { let wire_type = wire_type_token(ty); let wire_byte = wire_type_byte(ty); let wire_check = wire_type_check(field_number, &wire_type, wire_byte); match ty { - Type::TYPE_STRING => quote! { - #field_number => { - #wire_check - self.#field_ident = ::core::option::Option::Some( - #enum_ident::#variant_ident(::buffa::types::decode_string(buf)?) - ); + Type::TYPE_STRING => { + let decoded = if string_repr.is_default() { + quote! { ::buffa::types::decode_string(buf)? } + } else { + quote! { ::buffa::types::decode_string_to(buf)? } + }; + quote! { + #field_number => { + #wire_check + self.#field_ident = ::core::option::Option::Some( + #enum_ident::#variant_ident(#decoded) + ); + } } - }, + } Type::TYPE_BYTES => { let decoded = if use_bytes { quote! { ::buffa::types::decode_bytes_to_bytes(buf)? } @@ -2427,6 +2483,7 @@ fn generate_oneof_impls( )); let field_features = crate::features::resolve_field(ctx, field, features); let use_bytes = ty == Type::TYPE_BYTES && field_uses_bytes(ctx, proto_fqn, field_name); + let string_repr = field_string_repr(ctx, proto_fqn, field_name); merge_arm_list.push(oneof_merge_arm( &field_ident, &qualified_enum, @@ -2436,6 +2493,7 @@ fn generate_oneof_impls( &field_features, preserve_unknown_fields, use_bytes, + string_repr, )); } diff --git a/buffa-codegen/src/lib.rs b/buffa-codegen/src/lib.rs index 555ba4e..ab0ac70 100644 --- a/buffa-codegen/src/lib.rs +++ b/buffa-codegen/src/lib.rs @@ -169,6 +169,56 @@ pub enum GeneratedFileKind { Companion, } +/// The Rust type a proto `string` field maps to in generated owned structs. +/// +/// The default is [`String`](StringRepr::String). The other variants are +/// small-string-optimized types that avoid `String`'s growable buffer for +/// read-mostly schemas; each is gated behind the matching `buffa` Cargo feature +/// (`smol_str`, `ecow`, `compact_str`), and the downstream crate must enable +/// that feature so the re-exported type path (`::buffa::smol_str::SmolStr`, +/// etc.) resolves. +/// +/// Select a representation through `buffa_build`'s `string_type` / +/// `string_type_in` builder methods. The wire format is identical regardless of +/// representation — only the in-memory owned type changes; view types keep +/// borrowing `&str`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum StringRepr { + /// `::buffa::alloc::string::String` (the default). + #[default] + String, + /// `smol_str::SmolStr` — inline storage for short strings, `O(1)` clone of + /// long strings via `Arc`. Requires the `buffa/smol_str` feature. + SmolStr, + /// `ecow::EcoString` — 16-byte footprint, clone-on-write with `O(1)` clone. + /// Requires the `buffa/ecow` feature. + EcoString, + /// `compact_str::CompactString` — mutable, inline storage up to 24 bytes. + /// Requires the `buffa/compact_str` feature. + CompactString, +} + +impl StringRepr { + /// The owned Rust type path emitted for a `string` field with this + /// representation. + pub(crate) fn type_path(self, resolver: &imports::ImportResolver) -> proc_macro2::TokenStream { + use quote::quote; + match self { + StringRepr::String => resolver.string(), + StringRepr::SmolStr => quote! { ::buffa::smol_str::SmolStr }, + StringRepr::EcoString => quote! { ::buffa::ecow::EcoString }, + StringRepr::CompactString => quote! { ::buffa::compact_str::CompactString }, + } + } + + /// Whether this is the default `String` representation, which keeps the + /// `String`-specialized fast paths (in-place `merge_string`, `clear()`, + /// native `Arbitrary`) instead of the generic `ProtoString` ones. + pub(crate) fn is_default(self) -> bool { + matches!(self, StringRepr::String) + } +} + /// Configuration for code generation. #[derive(Debug, Clone)] #[non_exhaustive] @@ -227,6 +277,13 @@ pub struct CodeGenConfig { /// a specific field, or `"."` for all bytes fields). The path is matched /// as a prefix, so `"."` applies to every bytes field in every message. pub bytes_fields: Vec, + /// Ordered (proto-path-prefix, [`StringRepr`]) rules selecting the Rust type + /// for `string` fields. Later rules win, so a broad rule (e.g. `"."` → + /// `SmolStr`) can be refined by a more specific one + /// (`".my.pkg.Msg.field"` → `CompactString`). Fields matching no rule use + /// `String`. The path is matched with the same proto-segment-aware prefix + /// logic as [`bytes_fields`](Self::bytes_fields). + pub string_fields: Vec<(String, StringRepr)>, /// Honor `features.utf8_validation = NONE` by emitting `Vec` / `&[u8]` /// for such string fields instead of `String` / `&str`. /// @@ -413,6 +470,7 @@ impl Default for CodeGenConfig { generate_arbitrary: false, extern_paths: Vec::new(), bytes_fields: Vec::new(), + string_fields: Vec::new(), strict_utf8_mapping: false, allow_message_set: false, generate_text: false, diff --git a/buffa-codegen/src/message.rs b/buffa-codegen/src/message.rs index f0a9e3f..d873e77 100644 --- a/buffa-codegen/src/message.rs +++ b/buffa-codegen/src/message.rs @@ -1297,6 +1297,11 @@ struct FieldInfo { /// True when the field is proto type `bytes` AND matches the `bytes_fields` /// config — i.e. the struct field type is `bytes::Bytes` not `Vec`. use_bytes: bool, + /// The owned Rust type used for this field when it is proto type `string` + /// (singular, optional, or repeated; map keys/values are unaffected). + /// [`StringRepr::String`] for non-string fields and for string fields with + /// no matching `string_fields` rule. + string_repr: crate::StringRepr, /// Proto2 `required` (or editions `LEGACY_REQUIRED`). Required fields /// must always appear in JSON output regardless of value, matching the /// binary encoder's always-encode semantics. @@ -1357,12 +1362,24 @@ fn classify_field( quote! { #vec } }; + // Configurable owned representation for `string` fields (default `String`). + // Map keys/values are unaffected — they always use `String`, mirroring the + // bytes path where map values always stay `Vec`. + let string_repr = if field_type == Type::TYPE_STRING { + ctx.string_repr(&field_fqn) + } else { + crate::StringRepr::String + }; + let string_type = string_repr.type_path(resolver); + let mut inner_opt_type: Option = None; let rust_type = if let Some(entry) = map_entry { map_rust_type_from_entry(scope, entry, resolver)? } else if is_repeated { let elem = if field_type == Type::TYPE_BYTES { bytes_type + } else if field_type == Type::TYPE_STRING { + string_type } else { scalar_or_message_type_nested(ctx, field, current_package, nesting, features, resolver)? }; @@ -1381,6 +1398,8 @@ fn classify_field( resolve_enum_type(scope, field, resolver)? } else if field_type == Type::TYPE_BYTES { bytes_type + } else if field_type == Type::TYPE_STRING { + string_type } else { scalar_rust_type(field_type, resolver)? }; @@ -1393,6 +1412,8 @@ fn classify_field( resolve_enum_type(scope, field, resolver)? } else if field_type == Type::TYPE_BYTES { bytes_type + } else if field_type == Type::TYPE_STRING { + string_type } else { scalar_rust_type(field_type, resolver)? }; @@ -1444,6 +1465,7 @@ fn classify_field( is_optional, is_required, use_bytes, + string_repr, map_key_type, map_value_type, map_value_enum_closed, @@ -1530,6 +1552,20 @@ fn generate_field( quote! { ::buffa::__private::arbitrary_bytes } }; quote! { #[cfg_attr(feature = "arbitrary", arbitrary(with = #helper))] } + } else if ctx.config.generate_arbitrary + && info.string_repr == crate::StringRepr::EcoString + && !info.is_map + { + // `ecow::EcoString` has no native `Arbitrary` impl; smol_str and + // compact_str do, so only EcoString needs the shim. + let helper = if info.is_optional { + quote! { ::buffa::__private::arbitrary_ecow_opt } + } else if info.is_repeated { + quote! { ::buffa::__private::arbitrary_ecow_vec } + } else { + quote! { ::buffa::__private::arbitrary_ecow } + }; + quote! { #[cfg_attr(feature = "arbitrary", arbitrary(with = #helper))] } } else { quote! {} }; diff --git a/buffa-codegen/src/view.rs b/buffa-codegen/src/view.rs index be2ee97..9057047 100644 --- a/buffa-codegen/src/view.rs +++ b/buffa-codegen/src/view.rs @@ -18,7 +18,7 @@ use crate::context::{ancillary_prefix, AncillaryKind, CodeGenContext, MessageSco use crate::features::ResolvedFeatures; use crate::impl_message::{ closed_enum_decode, closed_enum_decode_with_unknown, decode_fn_token, effective_type, - effective_type_in_map_entry, field_uses_bytes, find_map_entry_fields, + effective_type_in_map_entry, field_string_repr, field_uses_bytes, find_map_entry_fields, is_explicit_presence_scalar, is_packed_type, is_real_oneof_member, is_required_field, is_supported_field_type, validated_field_number, wire_type_byte, wire_type_check, wire_type_token, @@ -1421,6 +1421,28 @@ fn build_to_owned_fields( Ok(out) } +/// Convert a string view binding to the owned `string`-field type. +/// +/// The default `String` keeps the original `binding.to_string()` (which +/// auto-derefs through a `&&str` binding), so default output is byte-identical +/// to before this knob existed. Other representations build via `From<&str>` +/// with the owned field type driving `Into` inference; `Into::into` takes its +/// argument by value and does not auto-deref, so `double_ref` sites (iterator +/// and match-ergonomics bindings that are `&&str`) get one explicit `*`. +fn str_view_to_owned( + repr: crate::StringRepr, + binding: TokenStream, + double_ref: bool, +) -> TokenStream { + if repr.is_default() { + quote! { #binding.to_string() } + } else if double_ref { + quote! { ::core::convert::Into::into(*#binding) } + } else { + quote! { ::core::convert::Into::into(#binding) } + } +} + fn singular_to_owned( scope: MessageScope<'_>, field: &FieldDescriptorProto, @@ -1436,7 +1458,17 @@ fn singular_to_owned( } = scope; if is_explicit_presence_scalar(field, ty, features) { return Ok(match ty { - Type::TYPE_STRING => quote! { self.#ident.map(|s| s.to_string()) }, + Type::TYPE_STRING => { + // Option<&str>::map. The default keeps `|s| s.to_string()`; for + // other reprs pass the `Into::into` fn directly rather than + // wrapping it in a closure (avoids clippy::redundant_closure in + // the consumer crate). Inference picks the target from the field. + if field_string_repr(ctx, proto_fqn, field_name).is_default() { + quote! { self.#ident.map(|s| s.to_string()) } + } else { + quote! { self.#ident.map(::core::convert::Into::into) } + } + } Type::TYPE_BYTES => { let conv = bytes_to_owned(ctx, proto_fqn, field_name, quote! { b }); quote! { self.#ident.map(|b| #conv) } @@ -1445,7 +1477,11 @@ fn singular_to_owned( }); } Ok(match ty { - Type::TYPE_STRING => quote! { self.#ident.to_string() }, + Type::TYPE_STRING => str_view_to_owned( + field_string_repr(ctx, proto_fqn, field_name), + quote! { self.#ident }, + false, + ), Type::TYPE_BYTES => bytes_to_owned(ctx, proto_fqn, field_name, quote! { self.#ident }), Type::TYPE_MESSAGE | Type::TYPE_GROUP => { let owned_path = resolve_owned_path(scope, field)?; @@ -1473,7 +1509,15 @@ fn repeated_to_owned( ) -> TokenStream { let MessageScope { ctx, proto_fqn, .. } = scope; match ty { - Type::TYPE_STRING => quote! { self.#ident.iter().map(|s| s.to_string()).collect() }, + Type::TYPE_STRING => { + // RepeatedView<&str>::iter() yields `&&str` (double ref). + let conv = str_view_to_owned( + field_string_repr(ctx, proto_fqn, field_name), + quote! { s }, + true, + ); + quote! { self.#ident.iter().map(|s| #conv).collect() } + } Type::TYPE_BYTES => { // Vec<&[u8]>::iter() → b: &&[u8]. bytes_to_owned handles double-ref. let conv = bytes_to_owned(ctx, proto_fqn, field_name, quote! { b }); @@ -1521,7 +1565,14 @@ fn map_to_owned_expr( fn oneof_variant_to_owned(scope: MessageScope<'_>, ty: Type, field_name: &str) -> TokenStream { let MessageScope { ctx, proto_fqn, .. } = scope; match ty { - Type::TYPE_STRING => quote! { v.to_string() }, + Type::TYPE_STRING => { + // Match ergonomics on `&ViewEnum` binds `v` as `&&str` (double ref). + str_view_to_owned( + field_string_repr(ctx, proto_fqn, field_name), + quote! { v }, + true, + ) + } // match-ergonomics on &ViewEnum → v: &&[u8]. bytes_to_owned handles it. Type::TYPE_BYTES => bytes_to_owned(ctx, proto_fqn, field_name, quote! { v }), Type::TYPE_MESSAGE | Type::TYPE_GROUP => { diff --git a/buffa-codegen/tests/codegen_integration.rs b/buffa-codegen/tests/codegen_integration.rs index 887114f..69994a5 100644 --- a/buffa-codegen/tests/codegen_integration.rs +++ b/buffa-codegen/tests/codegen_integration.rs @@ -807,6 +807,120 @@ fn inline_bytes_field_selective_mapping() { ); } +#[test] +fn inline_string_field_mapping() { + use buffa_codegen::StringRepr; + let mut config = no_views(); + config.string_fields.push((".".into(), StringRepr::SmolStr)); + let content = generate_proto( + r#" + syntax = "proto3"; + package test; + message Msg { + string name = 1; + optional string nick = 2; + repeated string tags = 3; + } + "#, + &config, + ); + assert!( + content.contains("::buffa::smol_str::SmolStr"), + "string fields should use SmolStr: {content}" + ); + // Non-optional singular decode must use the generic helper, not merge_string. + assert!( + content.contains("::buffa::types::decode_string_to"), + "decode must use decode_string_to for non-default string repr: {content}" + ); + assert!( + !content.contains("merge_string(&mut self"), + "must not call merge_string on a non-default string field: {content}" + ); +} + +#[test] +fn inline_string_field_selective_and_override() { + use buffa_codegen::StringRepr; + let mut config = no_views(); + // Broad default, then a more specific override (last match wins). + config.string_fields.push((".".into(), StringRepr::SmolStr)); + config + .string_fields + .push((".test.Msg.code".into(), StringRepr::CompactString)); + let content = generate_proto( + r#" + syntax = "proto3"; + package test; + message Msg { + string name = 1; + string code = 2; + } + "#, + &config, + ); + assert!( + content.contains("::buffa::smol_str::SmolStr"), + "broad rule should apply SmolStr to `name`: {content}" + ); + assert!( + content.contains("::buffa::compact_str::CompactString"), + "specific override should apply CompactString to `code`: {content}" + ); +} + +#[test] +fn inline_string_ecow_emits_arbitrary_shim() { + use buffa_codegen::StringRepr; + let mut config = no_views(); + config.generate_arbitrary = true; + config + .string_fields + .push((".".into(), StringRepr::EcoString)); + let content = generate_proto( + r#" + syntax = "proto3"; + package test; + message Msg { string name = 1; } + "#, + &config, + ); + assert!( + content.contains("::buffa::ecow::EcoString"), + "string field should use EcoString: {content}" + ); + // EcoString has no native Arbitrary impl — codegen must attach the shim. + assert!( + content.contains("arbitrary(with = ::buffa::__private::arbitrary_ecow"), + "EcoString field must use the arbitrary_ecow shim: {content}" + ); +} + +#[test] +fn inline_string_default_is_unchanged() { + // With no string_fields rule, output must still use String + merge_string. + let content = generate_proto( + r#" + syntax = "proto3"; + package test; + message Msg { string name = 1; } + "#, + &no_views(), + ); + assert!( + content.contains("::buffa::alloc::string::String"), + "default string repr should remain String: {content}" + ); + assert!( + content.contains("merge_string(&mut self"), + "default string field should keep the in-place merge_string fast path: {content}" + ); + assert!( + !content.contains("decode_string_to"), + "default string field must not use the generic decode helper: {content}" + ); +} + #[test] fn inline_preserve_unknown_fields_disabled() { let mut config = no_views(); diff --git a/buffa-test/Cargo.toml b/buffa-test/Cargo.toml index dcb10d8..0725dde 100644 --- a/buffa-test/Cargo.toml +++ b/buffa-test/Cargo.toml @@ -11,7 +11,14 @@ arbitrary = ["dep:arbitrary", "buffa/arbitrary"] [dependencies] arbitrary = { workspace = true, optional = true } -buffa = { workspace = true, features = ["std", "json", "text"] } +buffa = { workspace = true, features = [ + "std", + "json", + "text", + "smol_str", + "ecow", + "compact_str", +] } # `reflect`+`json` so the codegen-emitted `impl Reflectable` and the # embedded descriptor pool resolve. `json` because `basic.proto` is also # generated with `generate_text(true)` and the pool decodes the WKT diff --git a/buffa-test/build.rs b/buffa-test/build.rs index 11c2782..0300d68 100644 --- a/buffa-test/build.rs +++ b/buffa-test/build.rs @@ -181,6 +181,31 @@ fn main() { .compile() .expect("buffa_build failed for basic.proto with use_bytes_type"); + // Configurable string_type: a broad SmolStr default plus per-field + // CompactString / EcoString overrides, with generate_json + arbitrary so + // every string code path (decode/clear/view/json/arbitrary, including the + // EcoString arbitrary shim) is compiled. Map keys/values stay String. + let string_out = + std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()).join("string_variant"); + std::fs::create_dir_all(&string_out).expect("create string_variant dir"); + buffa_build::Config::new() + .files(&["protos/string_types.proto"]) + .includes(&["protos/"]) + .string_type(buffa_build::StringRepr::SmolStr) + .string_type_in( + buffa_build::StringRepr::CompactString, + &[".stringtypes.StringContexts.compact"], + ) + .string_type_in( + buffa_build::StringRepr::EcoString, + &[".stringtypes.StringContexts.eco"], + ) + .generate_json(true) + .generate_arbitrary(true) + .out_dir(string_out) + .compile() + .expect("buffa_build failed for string_types.proto with string_type"); + // Regression #88: bytes_fields + generate_arbitrary(true). // BytesContexts in basic.proto has singular, optional, repeated, and oneof // bytes fields — this compilation exercises all four shim paths. diff --git a/buffa-test/protos/string_types.proto b/buffa-test/protos/string_types.proto new file mode 100644 index 0000000..8e7d8a4 --- /dev/null +++ b/buffa-test/protos/string_types.proto @@ -0,0 +1,23 @@ +syntax = "proto3"; + +package stringtypes; + +// Exercises the configurable `string_type` knob across every string-field +// shape (singular, optional, repeated, oneof, map). The build compiles this +// with a broad SmolStr default plus per-field CompactString/EcoString +// overrides, so a single compiled module covers all three representations and +// all decode/clear/view/json/arbitrary code paths. +message StringContexts { + string singular = 1; + optional string maybe = 2; + repeated string many = 3; + // Per-field override targets (see build.rs string_type_in rules). + string compact = 4; + string eco = 5; + // Map keys/values are unaffected by string_type — always String. + map by_key = 6; + oneof choice { + string named = 7; + int32 count = 8; + } +} diff --git a/buffa-test/src/lib.rs b/buffa-test/src/lib.rs index af12771..b708305 100644 --- a/buffa-test/src/lib.rs +++ b/buffa-test/src/lib.rs @@ -242,6 +242,23 @@ pub mod basic_arbitrary_bytes { include!(concat!(env!("OUT_DIR"), "/arbitrary_bytes/basic.mod.rs")); } +// Configurable string_type: SmolStr default + CompactString/EcoString +// overrides, generate_json + arbitrary. Compiling this module exercises every +// string code path against the real crates; runtime checks live in +// `tests/string_type.rs`. +#[allow( + clippy::derivable_impls, + clippy::match_single_binding, + non_camel_case_types, + dead_code +)] +pub mod string_types { + include!(concat!( + env!("OUT_DIR"), + "/string_variant/stringtypes.mod.rs" + )); +} + // Views + preserve_unknown_fields=false: covers the else-branches in view // codegen that omit the unknown-fields view field. Compilation IS the test. #[allow( diff --git a/buffa-test/src/tests/mod.rs b/buffa-test/src/tests/mod.rs index aed6c10..22f71c6 100644 --- a/buffa-test/src/tests/mod.rs +++ b/buffa-test/src/tests/mod.rs @@ -43,6 +43,7 @@ mod nesting; mod nestpkg; mod proto2; mod proto3_semantics; +mod string_type; mod textproto; mod utf8_validation; mod view; diff --git a/buffa-test/src/tests/string_type.rs b/buffa-test/src/tests/string_type.rs new file mode 100644 index 0000000..4b1fa5d --- /dev/null +++ b/buffa-test/src/tests/string_type.rs @@ -0,0 +1,121 @@ +//! string_type(): configurable owned string representation. +//! +//! `string_types.proto` is compiled with a broad `SmolStr` default plus +//! per-field `CompactString` (`compact`) and `EcoString` (`eco`) overrides. +//! Compiling `crate::string_types` is itself most of the test — if any decode, +//! clear, view→owned, JSON, or arbitrary path emitted the wrong type these +//! would not build. The runtime checks below verify behavior and pin the field +//! types to the configured representations. + +use crate::string_types::StringContexts; +use crate::string_types::__buffa::oneof::string_contexts::Choice; +use buffa::Message; + +#[test] +fn test_string_type_field_types_are_configured() { + // Type assertions: the broad default and the per-field overrides must + // produce exactly the configured representations. These fail to compile if + // the wrong type was emitted. + let m = StringContexts::default(); + let _: &::buffa::smol_str::SmolStr = &m.singular; + let _: &::core::option::Option<::buffa::smol_str::SmolStr> = &m.maybe; + let _: &::buffa::alloc::vec::Vec<::buffa::smol_str::SmolStr> = &m.many; + let _: &::buffa::compact_str::CompactString = &m.compact; + let _: &::buffa::ecow::EcoString = &m.eco; + // Map keys/values are unaffected — always String. + let _: &std::collections::HashMap = &m.by_key; +} + +fn sample() -> StringContexts { + StringContexts { + singular: "hello".into(), + maybe: Some("nick".into()), + many: vec!["a".into(), "b".into(), "".into()], + compact: "compact-value".into(), + eco: "eco-value".into(), + by_key: [("k".to_string(), "v".to_string())].into_iter().collect(), + choice: Some(Choice::Named("chosen".into())), + ..Default::default() + } +} + +#[test] +fn test_string_type_binary_roundtrip() { + let msg = sample(); + let wire = msg.encode_to_vec(); + let decoded = StringContexts::decode(&mut wire.as_slice()).expect("decode"); + assert_eq!(decoded, msg); + assert_eq!(decoded.singular.as_str(), "hello"); + assert_eq!(decoded.compact.as_str(), "compact-value"); + assert_eq!(decoded.eco.as_str(), "eco-value"); + assert_eq!(decoded.many.len(), 3); + match &decoded.choice { + Some(Choice::Named(s)) => assert_eq!(s.as_str(), "chosen"), + other => panic!("expected Choice::Named, got {other:?}"), + } +} + +#[test] +fn test_string_type_wire_compatible_with_string() { + // Wire format must be identical to the default String representation. Build + // a message via the configured types, decode it back, and confirm the + // bytes round-trip exactly. + let msg = sample(); + let wire = msg.encode_to_vec(); + let back = StringContexts::decode(&mut wire.as_slice()).expect("decode"); + assert_eq!(back.encode_to_vec(), wire); +} + +#[test] +fn test_string_type_clear_resets_immutable_types() { + // SmolStr / EcoString are immutable (no `clear()`); clear() must reset them + // to the default value rather than calling a String-specific method. + let mut msg = sample(); + msg.clear(); + assert!(msg.singular.is_empty()); + assert!(msg.maybe.is_none()); + assert!(msg.many.is_empty()); + assert!(msg.compact.is_empty()); + assert!(msg.eco.is_empty()); + assert!(msg.choice.is_none()); +} + +#[test] +fn test_string_type_view_to_owned() { + use crate::string_types::__buffa::view::StringContextsView; + use buffa::MessageView; + let msg = sample(); + let wire = msg.encode_to_vec(); + let view = StringContextsView::decode_view(&wire).expect("decode_view"); + // Views always borrow &str regardless of the owned representation. + assert_eq!(view.singular, "hello"); + assert_eq!(view.compact, "compact-value"); + let owned: StringContexts = view.to_owned_message(); + assert_eq!(owned, msg); + // to_owned built the configured types, not String. + let _: ::buffa::smol_str::SmolStr = owned.singular.clone(); + let _: ::buffa::compact_str::CompactString = owned.compact.clone(); + let _: ::buffa::ecow::EcoString = owned.eco.clone(); +} + +#[test] +fn test_string_type_json_roundtrip() { + let msg = sample(); + let json = serde_json::to_string(&msg).expect("serialize"); + assert!(json.contains(r#""singular":"hello""#), "{json}"); + assert!(json.contains(r#""many":["a","b",""]"#), "{json}"); + assert!(json.contains(r#""named":"chosen""#), "{json}"); + let back: StringContexts = serde_json::from_str(&json).expect("deserialize"); + assert_eq!(back, msg); +} + +#[test] +fn test_string_type_json_null_is_empty() { + let json = r#"{"singular":null,"maybe":null,"many":null,"compact":null,"eco":null}"#; + let back: StringContexts = serde_json::from_str(json).expect("deserialize nulls"); + assert!(back.singular.is_empty()); + assert!(back.maybe.is_none()); + assert!(back.many.is_empty()); + assert!(back.compact.is_empty()); + assert!(back.eco.is_empty()); +} From 02b025bea397a591754b67160f6055117a0a9536 Mon Sep 17 00:00:00 2001 From: Iain McGinniss <309153+iainmcgin@users.noreply.github.com> Date: Fri, 22 May 2026 17:24:08 +0000 Subject: [PATCH 2/4] codegen: honor string_type in oneof, text format, and proto2 defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three code paths still emitted String for non-default string representations: - Oneof string variants: the variant type (and its JSON-deserialize seed type) ignored string_fields, so a configured SmolStr/EcoString/CompactString was silently dropped back to String. Fixed in oneof.rs and the custom-deserialize path in message.rs; EcoString oneof variants now also get the arbitrary shim. - Text format: read_string()?.into_owned() is String — combined with a non-default repr it failed to compile. The text decoder now converts via From for singular, optional, repeated, and oneof string fields. - proto2 [default = "..."]: the default expression was hard-coded to String::from, breaking the generated Default impl and clear() for bare (required) string fields under a non-default repr. Default String output is unchanged (verified: regenerating the WKT and bootstrap descriptor types produces no diff). Tests: enable generate_text on the string_variant build and add a text round-trip; pin the oneof payload type; add a proto2 fixture exercising [default] + string_type. Also: mark StringRepr #[non_exhaustive], document the map-stays-String contract and the last-match-wins ordering on the builder methods, and note immutability on the SmolStr/EcoString variants. --- buffa-build/src/lib.rs | 32 +++++++++++++------- buffa-codegen/src/defaults.rs | 10 ++++++- buffa-codegen/src/extension.rs | 32 ++++++++++++-------- buffa-codegen/src/impl_message.rs | 11 +++++-- buffa-codegen/src/impl_text.rs | 43 +++++++++++++++++++++------ buffa-codegen/src/lib.rs | 27 ++++++++++++----- buffa-codegen/src/message.rs | 28 +++++++++++++++-- buffa-codegen/src/oneof.rs | 20 ++++++++++++- buffa-test/build.rs | 15 ++++++++++ buffa-test/protos/string_proto2.proto | 13 ++++++++ buffa-test/src/lib.rs | 16 ++++++++++ buffa-test/src/tests/string_type.rs | 39 ++++++++++++++++++++++++ 12 files changed, 239 insertions(+), 47 deletions(-) create mode 100644 buffa-test/protos/string_proto2.proto diff --git a/buffa-build/src/lib.rs b/buffa-build/src/lib.rs index 0e75ecc..72010b9 100644 --- a/buffa-build/src/lib.rs +++ b/buffa-build/src/lib.rs @@ -442,24 +442,32 @@ impl Config { } /// Map `string` fields to a [`StringRepr`] other than `String` for the - /// given proto path prefixes. + /// given proto path prefixes. The string counterpart to + /// [`use_bytes_type_in`](Self::use_bytes_type_in). /// /// Each path is a fully-qualified proto path prefix (e.g. /// `".my.pkg.MyMessage.name"` for one field, `".my.pkg"` for a package). - /// Rules accumulate and the **last** matching rule wins, so a broad - /// `string_type` can be refined by a later `string_type_in`. The selected - /// type's `buffa` feature (`smol_str`, `ecow`, or `compact_str`) must be - /// enabled by the downstream crate. /// - /// The wire format is unchanged; only the owned Rust type differs (view - /// types still borrow `&str`). + /// Rules accumulate and the **last** matching rule wins. Order therefore + /// matters: call [`string_type`](Self::string_type) (the broad default) + /// *first*, then `string_type_in` for narrower overrides — a broad rule + /// added after a specific one will shadow it. + /// + /// The downstream crate must enable the selected type's `buffa` feature + /// (`smol_str`, `ecow`, or `compact_str`); otherwise the generated + /// `::buffa::::` references fail to resolve. + /// + /// Only the owned Rust type changes: the wire format is unchanged, view + /// types still borrow `&str`, and `map<_, string>` keys and values stay + /// `String`. /// /// # Example /// /// ```rust,ignore /// use buffa_build::StringRepr; /// buffa_build::Config::new() - /// .string_type_in(StringRepr::SmolStr, &[".my.pkg"]) + /// .string_type(StringRepr::SmolStr) // broad default first + /// .string_type_in(StringRepr::CompactString, &[".my.pkg.Msg.body"]) // narrow override /// .files(&["proto/my_service.proto"]) /// .includes(&["proto/"]) /// .compile() @@ -475,9 +483,11 @@ impl Config { /// Map every `string` field in all messages to the given [`StringRepr`]. /// - /// Convenience for `.string_type_in(repr, &["."])`. Use - /// [`string_type_in`](Self::string_type_in) to target specific paths, or to - /// override this default for a subset (later rules win). + /// Convenience for `.string_type_in(repr, &["."])`. Call this *before* any + /// [`string_type_in`](Self::string_type_in) overrides, since the last + /// matching rule wins (a `"."` rule added later shadows earlier specific + /// rules). `map<_, string>` keys and values stay `String`, and the + /// downstream crate must enable the selected type's `buffa` feature. #[must_use] pub fn string_type(mut self, repr: StringRepr) -> Self { self.codegen_config diff --git a/buffa-codegen/src/defaults.rs b/buffa-codegen/src/defaults.rs index bac972c..3feef3e 100644 --- a/buffa-codegen/src/defaults.rs +++ b/buffa-codegen/src/defaults.rs @@ -23,6 +23,7 @@ pub fn parse_default_value( current_package: &str, features: &ResolvedFeatures, nesting: usize, + string_repr: crate::StringRepr, ) -> Result, CodeGenError> { use crate::generated::descriptor::field_descriptor_proto::Type; @@ -90,8 +91,15 @@ pub fn parse_default_value( // the proto source literal is valid UTF-8 by definition. if crate::impl_message::effective_type(ctx, field, features) == Type::TYPE_BYTES { quote! { ::buffa::alloc::string::String::from(#default_str).into_bytes() } - } else { + } else if string_repr.is_default() { quote! { ::buffa::alloc::string::String::from(#default_str) } + } else { + // Non-default string types: convert via From. The + // surrounding context (Default initializer / clear assignment) + // pins the target type, so `Into::into` infers it. + quote! { + ::core::convert::Into::into(::buffa::alloc::string::String::from(#default_str)) + } } } Type::TYPE_BYTES => { diff --git a/buffa-codegen/src/extension.rs b/buffa-codegen/src/extension.rs index bcd7dbf..aef98a6 100644 --- a/buffa-codegen/src/extension.rs +++ b/buffa-codegen/src/extension.rs @@ -282,18 +282,26 @@ fn default_fn_tokens( } let value_ty = codec_value_type(ty); - let default_expr = - crate::defaults::parse_default_value(field, ctx, current_package, features, nesting)? - .ok_or_else(|| { - // default_value was non-empty but parse returned None — - // happens when field_presence ≠ Explicit (shouldn't for - // extensions, which are always explicit-presence per - // protocolbuffers/protobuf#8234) or for an unhandled type. - CodeGenError::Other(format!( - "extension `{proto_name}`: could not parse default_value `{}` for type {ty:?}", - field.default_value.as_deref().unwrap_or_default() - )) - })?; + // Extension string fields always use `String` (not remapped by the + // `string_type` knob), so their default keeps the `String` form. + let default_expr = crate::defaults::parse_default_value( + field, + ctx, + current_package, + features, + nesting, + crate::StringRepr::String, + )? + .ok_or_else(|| { + // default_value was non-empty but parse returned None — + // happens when field_presence ≠ Explicit (shouldn't for + // extensions, which are always explicit-presence per + // protocolbuffers/protobuf#8234) or for an unhandled type. + CodeGenError::Other(format!( + "extension `{proto_name}`: could not parse default_value `{}` for type {ty:?}", + field.default_value.as_deref().unwrap_or_default() + )) + })?; // String::from / vec![...] allocate → can't be const. Everything else // (integer/float/bool literals) is const-evaluable. diff --git a/buffa-codegen/src/impl_message.rs b/buffa-codegen/src/impl_message.rs index b03b1c5..98c3144 100644 --- a/buffa-codegen/src/impl_message.rs +++ b/buffa-codegen/src/impl_message.rs @@ -887,9 +887,14 @@ fn scalar_clear_stmt( // If the field has a custom default value (proto2), use it instead of // the type's zero value so that clear() matches Default::default(). - if let Some(default_expr) = - crate::defaults::parse_default_value(field, ctx, current_package, features, nesting)? - { + if let Some(default_expr) = crate::defaults::parse_default_value( + field, + ctx, + current_package, + features, + nesting, + field_string_repr(ctx, proto_fqn, field_name), + )? { return Ok(quote! { self.#ident = #default_expr; }); } diff --git a/buffa-codegen/src/impl_text.rs b/buffa-codegen/src/impl_text.rs index 082d786..f3cdc38 100644 --- a/buffa-codegen/src/impl_text.rs +++ b/buffa-codegen/src/impl_text.rs @@ -25,14 +25,26 @@ use crate::generated::descriptor::field_descriptor_proto::{Label, Type}; use crate::generated::descriptor::{DescriptorProto, FieldDescriptorProto}; use crate::idents::rust_path_to_tokens; use crate::impl_message::{ - effective_type, effective_type_in_map_entry, field_uses_bytes, find_map_entry_fields, - is_explicit_presence_scalar, is_non_default_expr, is_real_oneof_member, is_required_field, - is_supported_field_type, + effective_type, effective_type_in_map_entry, field_string_repr, field_uses_bytes, + find_map_entry_fields, is_explicit_presence_scalar, is_non_default_expr, is_real_oneof_member, + is_required_field, is_supported_field_type, }; use crate::message::{is_closed_enum, is_map_field, make_field_ident}; use crate::oneof::is_boxed_variant; use crate::CodeGenError; +/// Wrap a text-decoded owned `String` expression in the conversion to the +/// configured [`StringRepr`](crate::StringRepr). The default `String` keeps the +/// expression verbatim (byte-identical output); other reprs convert via +/// `From`. +fn text_string_into(repr: crate::StringRepr, owned: TokenStream) -> TokenStream { + if repr.is_default() { + owned + } else { + quote! { ::core::convert::Into::into(#owned) } + } +} + /// Generate `impl ::buffa::text::TextFormat for #name_ident { ... }`. /// /// Returns an empty `TokenStream` when `generate_text` is disabled, the @@ -610,7 +622,10 @@ fn scalar_merge_arm( // String: `read_string()` returns `Cow`, need `.into_owned()`. // Bytes: `read_bytes()` returns `Vec`; `bytes::Bytes: From>`. let read = match ty { - Type::TYPE_STRING => quote! { dec.read_string()?.into_owned() }, + Type::TYPE_STRING => text_string_into( + field_string_repr(ctx, proto_fqn, proto_name), + quote! { dec.read_string()?.into_owned() }, + ), Type::TYPE_BYTES if use_bytes => quote! { ::buffa::bytes::Bytes::from(dec.read_bytes()?) }, _ => read_call(ty), }; @@ -691,7 +706,11 @@ fn repeated_merge_arm( enum_read(closed, &enum_ty, &format_ident!("__d")) } Type::TYPE_STRING => { - quote! { ::core::result::Result::Ok(__d.read_string()?.into_owned()) } + let v = text_string_into( + field_string_repr(ctx, proto_fqn, proto_name), + quote! { __d.read_string()?.into_owned() }, + ); + quote! { ::core::result::Result::Ok(#v) } } Type::TYPE_BYTES if use_bytes => { quote! { ::core::result::Result::Ok(::buffa::bytes::Bytes::from(__d.read_bytes()?)) } @@ -850,11 +869,17 @@ fn oneof_merge_arms( ); } } - Type::TYPE_STRING => quote! { - self.#field_ident = ::core::option::Option::Some( - #qualified::#variant(dec.read_string()?.into_owned()) + Type::TYPE_STRING => { + let read = text_string_into( + field_string_repr(ctx, proto_fqn, proto_name), + quote! { dec.read_string()?.into_owned() }, ); - }, + quote! { + self.#field_ident = ::core::option::Option::Some( + #qualified::#variant(#read) + ); + } + } Type::TYPE_BYTES => { let read = if use_bytes { quote! { ::buffa::bytes::Bytes::from(dec.read_bytes()?) } diff --git a/buffa-codegen/src/lib.rs b/buffa-codegen/src/lib.rs index ab0ac70..c67a53d 100644 --- a/buffa-codegen/src/lib.rs +++ b/buffa-codegen/src/lib.rs @@ -181,20 +181,29 @@ pub enum GeneratedFileKind { /// Select a representation through `buffa_build`'s `string_type` / /// `string_type_in` builder methods. The wire format is identical regardless of /// representation — only the in-memory owned type changes; view types keep -/// borrowing `&str`. +/// borrowing `&str`, and `map<_, string>` / `map` keys and values +/// always stay `String`. +/// +/// Sizes below are for 64-bit targets. See the buffa README for a fuller +/// comparison of the small-string crates. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +#[non_exhaustive] pub enum StringRepr { - /// `::buffa::alloc::string::String` (the default). + /// `::buffa::alloc::string::String` — 24-byte struct, growable and mutable + /// (the default). #[default] String, - /// `smol_str::SmolStr` — inline storage for short strings, `O(1)` clone of - /// long strings via `Arc`. Requires the `buffa/smol_str` feature. + /// `smol_str::SmolStr` — 24-byte struct, inlines up to 23 bytes, `O(1)` + /// clone of long strings via `Arc`. **Immutable** (assign a new value + /// to mutate). Requires the `buffa/smol_str` feature. SmolStr, - /// `ecow::EcoString` — 16-byte footprint, clone-on-write with `O(1)` clone. + /// `ecow::EcoString` — 16-byte struct, inlines up to 15 bytes, clone-on-write + /// with `O(1)` clone. **Immutable** (assign a new value to mutate). /// Requires the `buffa/ecow` feature. EcoString, - /// `compact_str::CompactString` — mutable, inline storage up to 24 bytes. - /// Requires the `buffa/compact_str` feature. + /// `compact_str::CompactString` — 24-byte struct, inlines up to 24 bytes, + /// mutable (a drop-in `String` replacement). Requires the + /// `buffa/compact_str` feature. CompactString, } @@ -283,6 +292,10 @@ pub struct CodeGenConfig { /// (`".my.pkg.Msg.field"` → `CompactString`). Fields matching no rule use /// `String`. The path is matched with the same proto-segment-aware prefix /// logic as [`bytes_fields`](Self::bytes_fields). + /// + /// Applies to singular, optional, and repeated `string` fields and oneof + /// `string` variants. Map keys and values always stay `String`, mirroring + /// the bytes path (where map values always stay `Vec`). pub string_fields: Vec<(String, StringRepr)>, /// Honor `features.utf8_validation = NONE` by emitting `Vec` / `&[u8]` /// for such string fields instead of `String` / `&str`. diff --git a/buffa-codegen/src/message.rs b/buffa-codegen/src/message.rs index d873e77..7ea0296 100644 --- a/buffa-codegen/src/message.rs +++ b/buffa-codegen/src/message.rs @@ -544,8 +544,15 @@ fn generate_message_with_nesting( // Check if any non-optional field has a custom default value, which // requires a hand-written `impl Default` instead of `#[derive(Default)]`. - let custom_default_impl = - generate_custom_default(ctx, msg, &name_ident, current_package, features, nesting)?; + let custom_default_impl = generate_custom_default( + ctx, + msg, + &name_ident, + current_package, + proto_fqn, + features, + nesting, + )?; let derive_default = if custom_default_impl.is_some() { quote! {} } else { @@ -1234,10 +1241,17 @@ fn custom_deser_oneof_group( // return type, which pins the generic T in json_helpers::bytes:: // deserialize to Bytes (vs the Vec default). No downstream // shim needed — the helper is generic over T: From>. + let variant_string_repr = if field_type == Type::TYPE_STRING { + crate::impl_message::field_string_repr(ctx, proto_fqn, proto_name) + } else { + crate::StringRepr::String + }; let variant_type = if field_type == Type::TYPE_BYTES && crate::impl_message::field_uses_bytes(ctx, proto_fqn, proto_name) { quote! { ::buffa::bytes::Bytes } + } else if field_type == Type::TYPE_STRING && !variant_string_repr.is_default() { + variant_string_repr.type_path(resolver) } else { scalar_or_message_type_nested(ctx, field, current_package, nesting, features, resolver)? }; @@ -2145,6 +2159,7 @@ fn generate_custom_default( msg: &DescriptorProto, name_ident: &Ident, current_package: &str, + proto_fqn: &str, features: &ResolvedFeatures, nesting: usize, ) -> Result, CodeGenError> { @@ -2209,7 +2224,14 @@ fn generate_custom_default( continue; } - if let Some(expr) = parse_default_value(field, ctx, current_package, features, nesting)? { + if let Some(expr) = parse_default_value( + field, + ctx, + current_package, + features, + nesting, + crate::impl_message::field_string_repr(ctx, proto_fqn, field_name), + )? { field_inits.push(quote! { #field_ident: #expr, }); } else { field_inits.push(quote! { #field_ident: ::core::default::Default::default(), }); diff --git a/buffa-codegen/src/oneof.rs b/buffa-codegen/src/oneof.rs index 7cc89e7..3169397 100644 --- a/buffa-codegen/src/oneof.rs +++ b/buffa-codegen/src/oneof.rs @@ -7,7 +7,7 @@ use quote::{format_ident, quote}; use crate::context::CodeGenContext; use crate::features::ResolvedFeatures; -use crate::impl_message::field_uses_bytes; +use crate::impl_message::{field_string_repr, field_uses_bytes}; use crate::message::scalar_or_message_type_nested; use crate::CodeGenError; @@ -65,6 +65,9 @@ struct VariantInfo { custom_attrs: TokenStream, /// Used to emit `#[arbitrary(with = ...)]` alongside `derive(Arbitrary)`. use_bytes: bool, + /// Owned string representation for a `string` variant (default `String`). + /// Drives both the variant type and the EcoString arbitrary shim. + string_repr: crate::StringRepr, } #[allow(clippy::too_many_arguments)] @@ -115,8 +118,16 @@ fn collect_variant_info( // msg_nesting, so the enum body sits at `nesting + 3`. let use_bytes = field_type == Type::TYPE_BYTES && field_uses_bytes(ctx, proto_fqn, proto_name); + // Configurable owned string representation for a `string` variant. + let string_repr = if field_type == Type::TYPE_STRING { + field_string_repr(ctx, proto_fqn, proto_name) + } else { + crate::StringRepr::String + }; let rust_type = if use_bytes { quote! { ::buffa::bytes::Bytes } + } else if field_type == Type::TYPE_STRING && !string_repr.is_default() { + string_repr.type_path(resolver) } else { scalar_or_message_type_nested( ctx, @@ -139,6 +150,7 @@ fn collect_variant_info( is_null_value: is_null_value_field(field), custom_attrs, use_bytes, + string_repr, }) }) .collect() @@ -195,6 +207,12 @@ pub fn generate_oneof_enum( // enum variants — the attribute must be on the inner field. let arbitrary_field_attr = if ctx.config.generate_arbitrary && v.use_bytes { quote! { #[cfg_attr(feature = "arbitrary", arbitrary(with = ::buffa::__private::arbitrary_bytes))] } + } else if ctx.config.generate_arbitrary + && v.string_repr == crate::StringRepr::EcoString + { + // EcoString has no native Arbitrary impl (unlike SmolStr / + // CompactString), so the derived enum impl needs the shim. + quote! { #[cfg_attr(feature = "arbitrary", arbitrary(with = ::buffa::__private::arbitrary_ecow))] } } else { quote! {} }; diff --git a/buffa-test/build.rs b/buffa-test/build.rs index 0300d68..4d7a171 100644 --- a/buffa-test/build.rs +++ b/buffa-test/build.rs @@ -202,10 +202,25 @@ fn main() { ) .generate_json(true) .generate_arbitrary(true) + .generate_text(true) .out_dir(string_out) .compile() .expect("buffa_build failed for string_types.proto with string_type"); + // proto2 `[default = "..."]` + string_type: a required string field is a + // bare type, so its Default impl and clear() must build the literal via the + // configured repr's From, not String::from. + let string_p2_out = + std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()).join("string_proto2_variant"); + std::fs::create_dir_all(&string_p2_out).expect("create string_proto2_variant dir"); + buffa_build::Config::new() + .files(&["protos/string_proto2.proto"]) + .includes(&["protos/"]) + .string_type(buffa_build::StringRepr::SmolStr) + .out_dir(string_p2_out) + .compile() + .expect("buffa_build failed for string_proto2.proto with string_type"); + // Regression #88: bytes_fields + generate_arbitrary(true). // BytesContexts in basic.proto has singular, optional, repeated, and oneof // bytes fields — this compilation exercises all four shim paths. diff --git a/buffa-test/protos/string_proto2.proto b/buffa-test/protos/string_proto2.proto new file mode 100644 index 0000000..fdda9f5 --- /dev/null +++ b/buffa-test/protos/string_proto2.proto @@ -0,0 +1,13 @@ +syntax = "proto2"; + +package stringproto2; + +// Exercises proto2 `[default = "..."]` combined with the `string_type` knob. +// A `required` string field is a bare (non-Option) type, so its generated +// `Default` impl and `clear()` both materialize the literal via the configured +// representation's `From` rather than `String::from`. Compiling this +// module is the regression guard for that path. +message Defaults { + required string name = 1 [default = "anonymous"]; + optional int32 n = 2; +} diff --git a/buffa-test/src/lib.rs b/buffa-test/src/lib.rs index b708305..23b2911 100644 --- a/buffa-test/src/lib.rs +++ b/buffa-test/src/lib.rs @@ -259,6 +259,22 @@ pub mod string_types { )); } +// proto2 `[default = "..."]` + string_type. Compiling this verifies the +// generated Default impl and clear() build the literal via the configured +// repr's From. +#[allow( + clippy::derivable_impls, + clippy::match_single_binding, + non_camel_case_types, + dead_code +)] +pub mod string_proto2 { + include!(concat!( + env!("OUT_DIR"), + "/string_proto2_variant/stringproto2.mod.rs" + )); +} + // Views + preserve_unknown_fields=false: covers the else-branches in view // codegen that omit the unknown-fields view field. Compilation IS the test. #[allow( diff --git a/buffa-test/src/tests/string_type.rs b/buffa-test/src/tests/string_type.rs index 4b1fa5d..a5bb650 100644 --- a/buffa-test/src/tests/string_type.rs +++ b/buffa-test/src/tests/string_type.rs @@ -24,6 +24,11 @@ fn test_string_type_field_types_are_configured() { let _: &::buffa::ecow::EcoString = &m.eco; // Map keys/values are unaffected — always String. let _: &std::collections::HashMap = &m.by_key; + // The oneof string variant payload must also honor the configured repr. + let _: ::buffa::smol_str::SmolStr = match Choice::Named("x".into()) { + Choice::Named(s) => s, + Choice::Count(_) => unreachable!(), + }; } fn sample() -> StringContexts { @@ -109,6 +114,40 @@ fn test_string_type_json_roundtrip() { assert_eq!(back, msg); } +#[test] +fn test_string_type_text_roundtrip() { + // generate_text(true) is enabled for this module; the text decoder builds + // each configured string type via From rather than a bare String. + let msg = sample(); + let text = buffa::text::encode_to_string(&msg); + let back: StringContexts = buffa::text::decode_from_str(&text).expect("parse text"); + assert_eq!(back, msg); + assert_eq!(back.singular.as_str(), "hello"); + assert_eq!(back.eco.as_str(), "eco-value"); + match &back.choice { + Some(Choice::Named(s)) => assert_eq!(s.as_str(), "chosen"), + other => panic!("expected Choice::Named, got {other:?}"), + } +} + +#[test] +fn test_string_type_proto2_default() { + // proto2 `[default = "anonymous"]` on a required (bare) string field, with + // string_type(SmolStr): both Default and clear() must yield the literal as + // a SmolStr, not a String. + use crate::string_proto2::Defaults; + let d = Defaults::default(); + assert_eq!(d.name.as_str(), "anonymous"); + let _: ::buffa::smol_str::SmolStr = d.name.clone(); + + let mut m = Defaults { + name: "custom".into(), + ..Default::default() + }; + m.clear(); + assert_eq!(m.name.as_str(), "anonymous", "clear() restores the default"); +} + #[test] fn test_string_type_json_null_is_empty() { let json = r#"{"singular":null,"maybe":null,"many":null,"compact":null,"eco":null}"#; From 3043b2bf9fa755b04cb80f8a27645b0fe0a12ea9 Mon Sep 17 00:00:00 2001 From: Iain McGinniss <309153+iainmcgin@users.noreply.github.com> Date: Fri, 22 May 2026 17:36:09 +0000 Subject: [PATCH 3/4] test: match CI rustfmt import ordering in string_type tests --- buffa-test/src/tests/string_type.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buffa-test/src/tests/string_type.rs b/buffa-test/src/tests/string_type.rs index a5bb650..bec8f20 100644 --- a/buffa-test/src/tests/string_type.rs +++ b/buffa-test/src/tests/string_type.rs @@ -7,8 +7,8 @@ //! would not build. The runtime checks below verify behavior and pin the field //! types to the configured representations. -use crate::string_types::StringContexts; use crate::string_types::__buffa::oneof::string_contexts::Choice; +use crate::string_types::StringContexts; use buffa::Message; #[test] From ce14a654dd1db9ff3df40173228afcb09099d494 Mon Sep 17 00:00:00 2001 From: Iain McGinniss <309153+iainmcgin@users.noreply.github.com> Date: Fri, 22 May 2026 22:28:06 +0000 Subject: [PATCH 4/4] deps: cap smol_str below 0.3.4 to keep MSRV 1.85 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit smol_str 0.3.4 raised its MSRV to 1.89, above buffa's 1.85, so enabling the `smol_str` string-type feature broke `cargo check` on the MSRV toolchain (msrv-check CI). Pin to `>=0.3, <0.3.4` (resolves to 0.3.2, which declares no MSRV) — it keeps the `serde` and `arbitrary` features the codegen relies on. Re-pin Cargo.lock accordingly. Relax the cap when buffa's MSRV reaches 1.89. --- Cargo.lock | 6 +++--- Cargo.toml | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 372eaac..0acfaa9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -740,13 +740,13 @@ dependencies = [ [[package]] name = "smol_str" -version = "0.3.6" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4aaa7368fcf4852a4c2dd92df0cace6a71f2091ca0a23391ce7f3a31833f1523" +checksum = "9676b89cd56310a87b93dec47b11af744f34d5fc9f367b829474eec0a891350d" dependencies = [ "arbitrary", "borsh", - "serde_core", + "serde", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index d908ac7..bc71ad8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -67,7 +67,11 @@ thiserror = { version = "2", default-features = false } # `default-features = false` so they stay `no_std`; the relevant # `serde`/`arbitrary`/`std` sub-features are turned on by the matching `buffa` # feature (see buffa/Cargo.toml). -smol_str = { version = "0.3", default-features = false } +# +# `smol_str` is capped below 0.3.4: that release raised its MSRV to 1.89, above +# buffa's 1.85. 0.3.2 (no declared MSRV) keeps the `serde`/`arbitrary` features +# we need. Relax the cap when buffa's MSRV reaches 1.89. +smol_str = { version = ">=0.3, <0.3.4", default-features = false } ecow = { version = "0.2", default-features = false } compact_str = { version = "0.9", default-features = false } prettyplease = "0.2"