From 5cdfa5e6e2f8158f66dbee5046176f7896eb2326 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Sat, 6 Jun 2026 16:08:09 -0700 Subject: [PATCH 01/35] refactor: restore deterministic pii redaction plugin Signed-off-by: Alex Fournier --- crates/cli/tests/coverage/plugins_tests.rs | 32 + crates/core/src/plugin.rs | 1 + crates/core/src/plugins/mod.rs | 1 + .../src/plugins/pii_redaction/component.rs | 1278 +++++++++++++++++ .../core/src/plugins/pii_redaction/local.rs | 52 + crates/core/src/plugins/pii_redaction/mod.rs | 14 + .../plugins/pii_redaction/component_tests.rs | 744 ++++++++++ docs/index.yml | 4 + docs/pii-redaction-plugin/about.mdx | 90 ++ docs/pii-redaction-plugin/configuration.mdx | 211 +++ 10 files changed, 2427 insertions(+) create mode 100644 crates/core/src/plugins/pii_redaction/component.rs create mode 100644 crates/core/src/plugins/pii_redaction/local.rs create mode 100644 crates/core/src/plugins/pii_redaction/mod.rs create mode 100644 crates/core/tests/unit/plugins/pii_redaction/component_tests.rs create mode 100644 docs/pii-redaction-plugin/about.mdx create mode 100644 docs/pii-redaction-plugin/configuration.mdx diff --git a/crates/cli/tests/coverage/plugins_tests.rs b/crates/cli/tests/coverage/plugins_tests.rs index 0451fa14..8368b38e 100644 --- a/crates/cli/tests/coverage/plugins_tests.rs +++ b/crates/cli/tests/coverage/plugins_tests.rs @@ -224,6 +224,38 @@ fn typed_editor_model_contains_nemo_guardrails_options() { ); } +#[test] +fn typed_editor_model_contains_pii_redaction_options() { + let schema = PiiRedactionConfig::editor_schema(); + assert!(!schema.fields.iter().any(|field| field.name == "version")); + assert_eq!( + schema.field("mode").unwrap().enum_values, + &["builtin", "local_model"] + ); + assert_eq!(schema.field("codec").unwrap().kind, EditorFieldKind::Enum); + assert_eq!( + schema.field("tool_output").unwrap().kind, + EditorFieldKind::Boolean + ); + + let builtin = schema.field("builtin").unwrap().schema().unwrap(); + 
assert_eq!(builtin.field("action").unwrap().kind, EditorFieldKind::Enum); + assert_eq!( + builtin.field("target_paths").unwrap().kind, + EditorFieldKind::Json + ); + assert_eq!( + builtin.field("replacement").unwrap().kind, + EditorFieldKind::String + ); + + let local = schema.field("local").unwrap().schema().unwrap(); + assert_eq!( + local.field("backend").unwrap().kind, + EditorFieldKind::String + ); +} + #[test] fn plugin_menu_uses_setup_theme_markers() { let theme = ColorfulTheme::default(); diff --git a/crates/core/src/plugin.rs b/crates/core/src/plugin.rs index e3d90715..4f0980f4 100644 --- a/crates/core/src/plugin.rs +++ b/crates/core/src/plugin.rs @@ -765,6 +765,7 @@ pub fn ensure_builtin_plugins_registered() -> Result<()> { let register_builtins = || { crate::observability::plugin_component::register_observability_component()?; crate::plugins::nemo_guardrails::component::register_nemo_guardrails_component()?; + crate::plugins::pii_redaction::component::register_pii_redaction_component()?; crate::plugins::pricing::register_pricing_component() }; match BUILTIN_PLUGIN_REGISTRATION.get_or_init(register_builtins) { diff --git a/crates/core/src/plugins/mod.rs b/crates/core/src/plugins/mod.rs index d6cef9c1..44546e8a 100644 --- a/crates/core/src/plugins/mod.rs +++ b/crates/core/src/plugins/mod.rs @@ -4,4 +4,5 @@ //! First-party plugin implementations for NeMo Relay Core. pub mod nemo_guardrails; +pub mod pii_redaction; pub mod pricing; diff --git a/crates/core/src/plugins/pii_redaction/component.rs b/crates/core/src/plugins/pii_redaction/component.rs new file mode 100644 index 00000000..145adaf0 --- /dev/null +++ b/crates/core/src/plugins/pii_redaction/component.rs @@ -0,0 +1,1278 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! PII redaction plugin component contract. 
+
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::Arc;
+
+use regex::Regex;
+use serde::de::DeserializeOwned;
+use serde::{Deserialize, Serialize};
+use serde_json::{Map, Value as Json};
+use sha2::{Digest, Sha256};
+
+use crate::api::llm::LlmRequest;
+use crate::api::runtime::{LlmSanitizeRequestFn, LlmSanitizeResponseFn, ToolSanitizeFn};
+use crate::codec::anthropic::AnthropicMessagesCodec;
+use crate::codec::openai_chat::OpenAIChatCodec;
+use crate::codec::openai_responses::OpenAIResponsesCodec;
+use crate::codec::request::{ContentPart, MessageContent};
+use crate::codec::response::{AnnotatedLlmResponse, FinishReason, ResponseToolCall};
+use crate::codec::traits::{LlmCodec, LlmResponseCodec};
+use crate::plugin::{
+    ConfigDiagnostic, ConfigPolicy, DiagnosticLevel, Plugin, PluginComponentSpec, PluginError,
+    PluginRegistrationContext, Result as PluginResult, UnsupportedBehavior, deregister_plugin,
+    register_plugin,
+};
+
+#[path = "local.rs"]
+mod local;
+use local::register_local_backend;
+pub use local::{clear_local_backend_provider, register_local_backend_provider};
+
+/// The plugin kind reserved for the built-in privacy component.
+pub const PII_REDACTION_PLUGIN_KIND: &str = "pii_redaction";
+
+/// Top-level PII redaction component wrapper.
+///
+/// Pairs the component-local [`PiiRedactionConfig`] with an activation flag and
+/// converts into a generic [`PluginComponentSpec`] via `From`.
+#[derive(Debug, Clone)]
+pub struct ComponentSpec {
+    /// Whether the component should be activated.
+    pub enabled: bool,
+    /// Component-local PII redaction config.
+    pub config: PiiRedactionConfig,
+}
+
+impl ComponentSpec {
+    /// Creates an enabled PII redaction component spec.
+    pub fn new(config: PiiRedactionConfig) -> Self {
+        Self {
+            enabled: true,
+            config,
+        }
+    }
+}
+
+impl From<ComponentSpec> for PluginComponentSpec {
+    /// Serializes the typed config into the JSON object carried by the generic
+    /// component spec, tagged with [`PII_REDACTION_PLUGIN_KIND`].
+    ///
+    /// # Panics
+    ///
+    /// Panics if the config fails to serialize or does not serialize to a JSON
+    /// object; both are treated as invariant violations of `PiiRedactionConfig`.
+    fn from(value: ComponentSpec) -> Self {
+        let Json::Object(config) = serde_json::to_value(value.config)
+            .expect("PII redaction config should serialize to an object")
+        else {
+            unreachable!("PII redaction config must serialize to an object");
+        };
+
+        PluginComponentSpec {
+            kind: PII_REDACTION_PLUGIN_KIND.to_string(),
+            enabled: value.enabled,
+            config,
+        }
+    }
+}
+
+/// Canonical config document for the PII redaction component.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
+pub struct PiiRedactionConfig {
+    /// PII redaction config schema version.
+    #[serde(default = "default_pii_redaction_config_version")]
+    pub version: u32,
+    /// Backend mode: `builtin` or `local_model`.
+    #[serde(default = "default_mode")]
+    #[cfg_attr(feature = "schema", schemars(schema_with = "mode_schema"))]
+    pub mode: String,
+    /// Whether to sanitize managed LLM request payloads.
+    #[serde(default = "default_true")]
+    pub input: bool,
+    /// Whether to sanitize managed LLM response payloads.
+    #[serde(default = "default_true")]
+    pub output: bool,
+    /// Whether to sanitize managed tool request payloads.
+    #[serde(default = "default_true")]
+    pub tool_input: bool,
+    /// Whether to sanitize managed tool response payloads.
+    #[serde(default = "default_true")]
+    pub tool_output: bool,
+    /// Guardrail priority. Lower values run earlier.
+    #[serde(default = "default_priority")]
+    pub priority: i32,
+    /// Provider request/response codec for LLM-managed surfaces.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    #[cfg_attr(feature = "schema", schemars(schema_with = "codec_schema"))]
+    pub codec: Option<String>,
+    /// Built-in backend settings used when `mode = "builtin"`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub builtin: Option<BuiltinBackendConfig>,
+    /// Local-backend settings used when `mode = "local_model"`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub local: Option<LocalBackendConfig>,
+    /// Component-local unsupported-config policy.
+    #[serde(default)]
+    pub policy: ConfigPolicy,
+}
+
+impl Default for PiiRedactionConfig {
+    /// Mirrors the serde field defaults: every surface enabled, no codec and no
+    /// backend sections.
+    fn default() -> Self {
+        Self {
+            version: default_pii_redaction_config_version(),
+            mode: default_mode(),
+            input: true,
+            output: true,
+            tool_input: true,
+            tool_output: true,
+            priority: default_priority(),
+            codec: None,
+            builtin: None,
+            local: None,
+            policy: ConfigPolicy::default(),
+        }
+    }
+}
+
+/// Built-in redaction backend settings.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
+pub struct BuiltinBackendConfig {
+    /// Action applied to matching string leaves.
+    #[serde(default = "default_builtin_action")]
+    #[cfg_attr(feature = "schema", schemars(schema_with = "builtin_action_schema"))]
+    pub action: String,
+    /// Exact JSON-pointer paths to sanitize. Empty means every string leaf.
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub target_paths: Vec<String>,
+    /// Regex pattern used when `action = "regex_replace"`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub pattern: Option<String>,
+    /// Replacement text used when `action = "regex_replace"`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub replacement: Option<String>,
+}
+
+impl Default for BuiltinBackendConfig {
+    /// Mirrors the serde field defaults.
+    ///
+    /// Written by hand rather than derived: a derived `Default` would leave
+    /// `action` as the empty string, which is not one of the supported actions
+    /// and disagrees with what deserializing an empty `builtin` section yields.
+    fn default() -> Self {
+        Self {
+            action: default_builtin_action(),
+            target_paths: Vec::new(),
+            pattern: None,
+            replacement: None,
+        }
+    }
+}
+
+/// Local-backend settings for a future in-process local-model runtime.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
+pub struct LocalBackendConfig {
+    /// Optional local-model backend identifier.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub backend: Option<String>,
+}
+
+// Editor model for the top-level config. `version` is deliberately not listed.
+crate::editor_config! {
+    impl PiiRedactionConfig {
+        mode => {
+            label: "mode",
+            kind: Enum,
+            values: ["builtin", "local_model"],
+        },
+        input => { label: "input", kind: Boolean },
+        output => { label: "output", kind: Boolean },
+        tool_input => { label: "tool_input", kind: Boolean },
+        tool_output => { label: "tool_output", kind: Boolean },
+        priority => { label: "priority", kind: Integer },
+        codec => {
+            label: "codec",
+            kind: Enum,
+            values: ["openai_chat", "openai_responses", "anthropic_messages"],
+            optional: true,
+        },
+        builtin => {
+            label: "builtin",
+            kind: Section,
+            optional: true,
+            nested: BuiltinBackendConfig,
+            default: BuiltinBackendConfig,
+        },
+        local => {
+            label: "local",
+            kind: Section,
+            optional: true,
+            nested: LocalBackendConfig,
+            default: LocalBackendConfig,
+        },
+        policy => {
+            label: "policy",
+            kind: Section,
+            nested: ConfigPolicy,
+            default: ConfigPolicy,
+        },
+    }
+}
+
+// Editor model for the `builtin` section.
+crate::editor_config! {
+    impl BuiltinBackendConfig {
+        action => {
+            label: "action",
+            kind: Enum,
+            values: ["remove", "regex_replace", "hash"],
+        },
+        target_paths => { label: "target_paths", kind: Json },
+        pattern => { label: "pattern", kind: String, optional: true },
+        replacement => { label: "replacement", kind: String, optional: true },
+    }
+}
+
+// Editor model for the `local` section.
+crate::editor_config! {
+    impl LocalBackendConfig {
+        backend => { label: "backend", kind: String, optional: true },
+    }
+}
+
+/// Plugin implementation registered under [`PII_REDACTION_PLUGIN_KIND`].
+struct PiiRedactionPlugin;
+
+impl Plugin for PiiRedactionPlugin {
+    fn plugin_kind(&self) -> &str {
+        PII_REDACTION_PLUGIN_KIND
+    }
+
+    fn allows_multiple_components(&self) -> bool {
+        false
+    }
+
+    fn validate(&self, plugin_config: &Map<String, Json>) -> Vec<ConfigDiagnostic> {
+        validate_pii_redaction_plugin_config(plugin_config)
+    }
+
+    fn register<'a>(
+        &'a self,
+        plugin_config: &Map<String, Json>,
+        ctx: &'a mut PluginRegistrationContext,
+    ) -> Pin<Box<dyn Future<Output = PluginResult<()>> + Send + 'a>> {
+        // Parse before building the future so the future does not have to
+        // borrow `plugin_config`, which carries no `'a` lifetime.
+        let parsed = parse_pii_redaction_config(plugin_config);
+        Box::pin(async move {
+            let config = parsed?;
+            register_pii_redaction_backend(config, ctx)
+        })
+    }
+}
+
+/// Registers the `pii_redaction` component kind in the plugin registry.
+///
+/// A `RegistrationFailed` error whose message contains "already registered" is
+/// treated as success, so calling this more than once is harmless.
+///
+/// # Errors
+///
+/// Returns any other error reported by `register_plugin`.
+pub fn register_pii_redaction_component() -> PluginResult<()> {
+    match register_plugin(Arc::new(PiiRedactionPlugin)) {
+        Ok(()) => Ok(()),
+        Err(PluginError::RegistrationFailed(message)) if message.contains("already registered") => {
+            Ok(())
+        }
+        Err(err) => Err(err),
+    }
+}
+
+/// Deregisters the `pii_redaction` component kind from the plugin registry.
+///
+/// Returns whatever `deregister_plugin` reports for this kind.
+pub fn deregister_pii_redaction_component() -> bool {
+    deregister_plugin(PII_REDACTION_PLUGIN_KIND)
+}
+
+/// Returns the JSON Schema for the PII redaction component configuration.
+#[cfg(feature = "schema")]
+pub fn pii_redaction_config_schema() -> serde_json::Value {
+    serde_json::to_value(schemars::schema_for!(PiiRedactionConfig))
+        .expect("PII redaction config schema should serialize")
+}
+
+/// Schema for `mode`: a string enum defaulting to `builtin`.
+#[cfg(feature = "schema")]
+fn mode_schema(generator: &mut schemars::r#gen::SchemaGenerator) -> schemars::schema::Schema {
+    string_enum_schema(generator, &["builtin", "local_model"], Some("builtin"))
+}
+
+/// Schema for `builtin.action`: a string enum defaulting to `remove`.
+#[cfg(feature = "schema")]
+fn builtin_action_schema(
+    generator: &mut schemars::r#gen::SchemaGenerator,
+) -> schemars::schema::Schema {
+    string_enum_schema(
+        generator,
+        &["remove", "regex_replace", "hash"],
+        Some("remove"),
+    )
+}
+
+/// Schema for `codec`: a string enum with no default.
+#[cfg(feature = "schema")]
+fn codec_schema(generator: &mut schemars::r#gen::SchemaGenerator) -> schemars::schema::Schema {
+    string_enum_schema(
+        generator,
+        &["openai_chat", "openai_responses", "anthropic_messages"],
+        None,
+    )
+}
+
+/// Builds a string schema restricted to `values`, optionally with a default.
+#[cfg(feature = "schema")]
+fn string_enum_schema(
+    generator: &mut schemars::r#gen::SchemaGenerator,
+    values: &[&str],
+    default: Option<&str>,
+) -> schemars::schema::Schema {
+    let mut schema: schemars::schema::SchemaObject =
+        <String as schemars::JsonSchema>::json_schema(generator).into();
+    schema.enum_values = Some(
+        values
+            .iter()
+            .map(|value| Json::String((*value).into()))
+            .collect(),
+    );
+    if let Some(default) = default {
+        schema.metadata().default = Some(Json::String(default.into()));
+    }
+    schema.into()
+}
+
+/// Dispatches registration to the backend selected by `config.mode`.
+///
+/// # Errors
+///
+/// Returns `InvalidConfig` for any mode other than `builtin` or `local_model`,
+/// or whatever the selected backend's registration returns.
+fn register_pii_redaction_backend(
+    config: PiiRedactionConfig,
+    ctx: &mut PluginRegistrationContext,
+) -> PluginResult<()> {
+    match config.mode.as_str() {
+        "builtin" => register_builtin_backend(config, ctx),
+        "local_model" => register_local_backend(config, ctx),
+        other => Err(PluginError::InvalidConfig(format!(
+            "unsupported PII redaction mode '{other}'"
+        ))),
+    }
+}
+
+/// Deserializes the raw component config object into a [`PiiRedactionConfig`].
+///
+/// # Errors
+///
+/// Returns `InvalidConfig` carrying the serde error text.
+fn parse_pii_redaction_config(
+    plugin_config: &Map<String, Json>,
+) -> PluginResult<PiiRedactionConfig> {
+    serde_json::from_value(Json::Object(plugin_config.clone())).map_err(|err| {
+        PluginError::InvalidConfig(format!("invalid PII redaction plugin config: {err}"))
+    })
+}
+
+/// Produces every diagnostic for a raw component config.
+///
+/// A config that does not deserialize yields a single error diagnostic. Otherwise
+/// unknown fields and unsupported values are reported at the level chosen by the
+/// config's own `policy`.
+fn validate_pii_redaction_plugin_config(
+    plugin_config: &Map<String, Json>,
+) -> Vec<ConfigDiagnostic> {
+    let config = match parse_pii_redaction_config(plugin_config) {
+        Ok(config) => config,
+        Err(err) => {
+            return vec![ConfigDiagnostic {
+                level: DiagnosticLevel::Error,
+                code: "pii_redaction.invalid_plugin_config".to_string(),
+                component: Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+                field: None,
+                message: err.to_string(),
+            }];
+        }
+    };
+
+    let mut diagnostics = vec![];
+
+    validate_unknown_fields(
+        &mut diagnostics,
+        &config.policy,
+        Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+        plugin_config,
+        &[
+            "version",
+            "mode",
+            "input",
+            "output",
+            "tool_input",
+            "tool_output",
+            "priority",
+            "codec",
+            "builtin",
+            "local",
+            "policy",
+        ],
+    );
+    validate_policy_fields(&mut diagnostics, &config.policy, plugin_config);
+    validate_section_fields(
+        &mut diagnostics,
+        &config.policy,
+        plugin_config,
+        "builtin",
+        &["action", "target_paths", "pattern", "replacement"],
+    );
+    validate_section_fields(
+        &mut diagnostics,
+        &config.policy,
+        plugin_config,
+        "local",
+        &["backend"],
+    );
+    validate_mode(&mut diagnostics, &config.policy, &config);
+    validate_surface_selection(&mut diagnostics, &config.policy, &config);
+    validate_codec_requirements(&mut diagnostics, &config.policy, &config);
+    validate_builtin_mode_requirements(&mut diagnostics, &config.policy, plugin_config, &config);
+    validate_builtin_action_requirements(&mut diagnostics, &config.policy, &config);
+    validate_local_mode_requirements(&mut diagnostics, &config.policy, plugin_config, &config);
+
+    diagnostics
+}
+
+/// Reports a `mode` that is neither `builtin` nor `local_model`.
+fn validate_mode(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    policy: &ConfigPolicy,
+    config: &PiiRedactionConfig,
+) {
+    if matches!(config.mode.as_str(), "builtin" | "local_model") {
+        return;
+    }
+
+    push_policy_diag(
+        diagnostics,
+        policy.unsupported_value,
+        "pii_redaction.unsupported_value",
+        Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+        Some("mode".to_string()),
+        "mode must be 'builtin' or 'local_model'".to_string(),
+    );
+}
+
+/// Reports a config in which all four redaction surfaces are disabled.
+fn validate_surface_selection(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    policy: &ConfigPolicy,
+    config: &PiiRedactionConfig,
+) {
+    if config.input || config.output || config.tool_input || config.tool_output {
+        return;
+    }
+
+    push_policy_diag(
+        diagnostics,
+        policy.unsupported_value,
+        "pii_redaction.unsupported_value",
+        Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+        None,
+        "at least one redaction surface must be enabled".to_string(),
+    );
+}
+
+/// Reports a `local` section supplied while `mode` is not `local_model`.
+fn validate_local_mode_requirements(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    policy: &ConfigPolicy,
+    plugin_config: &Map<String, Json>,
+    config: &PiiRedactionConfig,
+) {
+    if config.mode == "local_model" {
+        return;
+    }
+    if !plugin_config.contains_key("local") {
+        return;
+    }
+
+    push_policy_diag(
+        diagnostics,
+        policy.unsupported_value,
+        "pii_redaction.unsupported_value",
+        Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+        Some("local".to_string()),
+        "`local` settings are valid only when mode = 'local_model'".to_string(),
+    );
+}
+
+/// Reports a missing `builtin` section in `builtin` mode, or a `builtin`
+/// section supplied in any other mode.
+fn validate_builtin_mode_requirements(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    policy: &ConfigPolicy,
+    plugin_config: &Map<String, Json>,
+    config: &PiiRedactionConfig,
+) {
+    if config.mode == "builtin" {
+        if plugin_config.contains_key("builtin") {
+            return;
+        }
+        push_policy_diag(
+            diagnostics,
+            policy.unsupported_value,
+            "pii_redaction.unsupported_value",
+            Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+            Some("builtin".to_string()),
+            "`builtin` settings are required when mode = 'builtin'".to_string(),
+        );
+        return;
+    }
+    if !plugin_config.contains_key("builtin") {
+        return;
+    }
+
+    push_policy_diag(
+        diagnostics,
+        policy.unsupported_value,
+        "pii_redaction.unsupported_value",
+        Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+        Some("builtin".to_string()),
+        "`builtin` settings are valid only when mode = 'builtin'".to_string(),
+    );
+}
+
+/// Checks `builtin.action` and, for `regex_replace`, `builtin.pattern`.
+///
+/// The pattern must be present and must compile. Compiling it here surfaces a
+/// malformed regex as a diagnostic instead of only as a registration failure in
+/// `CompiledBuiltinBackend::new`.
+fn validate_builtin_action_requirements(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    policy: &ConfigPolicy,
+    config: &PiiRedactionConfig,
+) {
+    let Some(builtin) = config.builtin.as_ref() else {
+        return;
+    };
+
+    if !matches!(builtin.action.as_str(), "remove" | "regex_replace" | "hash") {
+        push_policy_diag(
+            diagnostics,
+            policy.unsupported_value,
+            "pii_redaction.unsupported_value",
+            Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+            Some("builtin.action".to_string()),
+            "builtin.action must be 'remove', 'regex_replace', or 'hash'".to_string(),
+        );
+    }
+
+    if builtin.action != "regex_replace" {
+        return;
+    }
+
+    match builtin.pattern.as_deref() {
+        None => push_policy_diag(
+            diagnostics,
+            policy.unsupported_value,
+            "pii_redaction.unsupported_value",
+            Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+            Some("builtin.pattern".to_string()),
+            "builtin.pattern is required when builtin.action = 'regex_replace'".to_string(),
+        ),
+        Some(pattern_text) => {
+            if let Err(err) = Regex::new(pattern_text) {
+                push_policy_diag(
+                    diagnostics,
+                    policy.unsupported_value,
+                    "pii_redaction.unsupported_value",
+                    Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+                    Some("builtin.pattern".to_string()),
+                    format!("invalid builtin.pattern regex '{pattern_text}': {err}"),
+                );
+            }
+        }
+    }
+}
+
+/// Requires a supported `codec` whenever an LLM surface (`input` or `output`)
+/// is enabled.
+fn validate_codec_requirements(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    policy: &ConfigPolicy,
+    config: &PiiRedactionConfig,
+) {
+    let llm_surface_enabled = config.input || config.output;
+    if !llm_surface_enabled {
+        return;
+    }
+
+    let Some(codec) = config.codec.as_deref() else {
+        push_policy_diag(
+            diagnostics,
+            policy.unsupported_value,
+            "pii_redaction.unsupported_value",
+            Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+            Some("codec".to_string()),
+            "codec is required when any LLM surface is enabled".to_string(),
+        );
+        return;
+    };
+
+    if !matches!(
+        codec,
+        "openai_chat" | "openai_responses" | "anthropic_messages"
+    ) {
+        push_policy_diag(
+            diagnostics,
+            policy.unsupported_value,
+            "pii_redaction.unsupported_value",
+            Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+            Some("codec".to_string()),
+            "codec must be 'openai_chat', 'openai_responses', or 'anthropic_messages'".to_string(),
+        );
+    }
+}
+
+/// Compiles the built-in backend and registers one sanitize guardrail per
+/// enabled surface, all at `config.priority`.
+///
+/// # Errors
+///
+/// Returns `InvalidConfig` when the `builtin` section is missing or cannot be
+/// compiled, or any error returned by the registration context.
+fn register_builtin_backend(
+    config: PiiRedactionConfig,
+    ctx: &mut PluginRegistrationContext,
+) -> PluginResult<()> {
+    let builtin = config.builtin.clone().ok_or_else(|| {
+        PluginError::InvalidConfig("built-in PII redaction config is missing".to_string())
+    })?;
+    let compiled = CompiledBuiltinBackend::new(builtin, config.codec.clone())?;
+
+    if config.tool_input {
+        let sanitizer = tool_sanitize_callback(compiled.clone());
+        ctx.register_tool_sanitize_request_guardrail("tool_input", config.priority, sanitizer)?;
+    }
+    if config.tool_output {
+        let sanitizer = tool_sanitize_callback(compiled.clone());
+        ctx.register_tool_sanitize_response_guardrail("tool_output", config.priority, sanitizer)?;
+    }
+    if config.input {
+        let sanitizer = llm_sanitize_request_callback(compiled.clone());
+        ctx.register_llm_sanitize_request_guardrail("input", config.priority, sanitizer)?;
+    }
+    if config.output {
+        let sanitizer = llm_sanitize_response_callback(compiled);
+        ctx.register_llm_sanitize_response_guardrail("output", config.priority, sanitizer)?;
+    }
+
+    Ok(())
+}
+
+/// Built-in backend with its action parsed, regex compiled and codec
+/// instantiated. Shared state sits behind `Arc`, so clones are cheap.
+#[derive(Clone)]
+struct CompiledBuiltinBackend {
+    action: BuiltinAction,
+    target_paths: Arc<Vec<String>>,
+    codec: Option<Arc<dyn BuiltinRequestResponseCodec>>,
+    codec_name: Option<BuiltinCodecName>,
+}
+
+/// Parsed form of `builtin.action`.
+#[derive(Clone)]
+enum BuiltinAction {
+    Remove,
+    Hash,
+    RegexReplace {
+        pattern: Arc<Regex>,
+        replacement: Arc<String>,
+    },
+}
+
+/// Parsed form of the `codec` setting.
+#[derive(Clone, Copy)]
+enum BuiltinCodecName {
+    OpenAIChat,
+    OpenAIResponses,
+    AnthropicMessages,
+}
+
+/// Object-safe bundle of the request and response codec traits.
+trait BuiltinRequestResponseCodec: LlmCodec + LlmResponseCodec + Send + Sync {}
+
+impl<T> BuiltinRequestResponseCodec for T where T: LlmCodec + LlmResponseCodec + Send + Sync {}
+
+impl CompiledBuiltinBackend {
+    /// Compiles the built-in settings.
+    ///
+    /// `replacement` defaults to `[REDACTED]` for `regex_replace`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidConfig` for an unsupported action, a missing or
+    /// malformed `regex_replace` pattern, or an unsupported codec name.
+    fn new(config: BuiltinBackendConfig, codec_name: Option<String>) -> PluginResult<Self> {
+        let action = match config.action.as_str() {
+            "remove" => BuiltinAction::Remove,
+            "hash" => BuiltinAction::Hash,
+            "regex_replace" => {
+                let pattern_text = config.pattern.ok_or_else(|| {
+                    PluginError::InvalidConfig(
+                        "builtin.pattern is required when builtin.action = 'regex_replace'"
+                            .to_string(),
+                    )
+                })?;
+                let pattern = Regex::new(&pattern_text).map_err(|err| {
+                    PluginError::InvalidConfig(format!(
+                        "invalid builtin.pattern regex '{pattern_text}': {err}"
+                    ))
+                })?;
+                BuiltinAction::RegexReplace {
+                    pattern: Arc::new(pattern),
+                    replacement: Arc::new(
+                        config
+                            .replacement
+                            .unwrap_or_else(|| "[REDACTED]".to_string()),
+                    ),
+                }
+            }
+            other => {
+                return Err(PluginError::InvalidConfig(format!(
+                    "unsupported builtin.action '{other}'"
+                )));
+            }
+        };
+
+        Ok(Self {
+            action,
+            target_paths: Arc::new(config.target_paths),
+            codec_name: codec_name.as_deref().and_then(BuiltinCodecName::parse),
+            codec: codec_name
+                .as_deref()
+                .map(instantiate_builtin_codec)
+                .transpose()?,
+        })
+    }
+
+    /// Sanitizes a whole JSON document, returning `Json::Null` when the root
+    /// value itself is removed.
+    fn sanitize_json_preorder_dfs(&self, value: Json) -> Json {
+        self.sanitize_json_preorder_dfs_at_path(value, &mut Vec::new())
+            .unwrap_or(Json::Null)
+    }
+
+    /// Recursive worker: returns `None` when `value` must be dropped from its
+    /// parent.
+    ///
+    /// Removed object members are omitted. Removed array items become
+    /// `Json::Null` so the remaining indices keep their positions.
+    fn sanitize_json_preorder_dfs_at_path(
+        &self,
+        value: Json,
+        path_segments: &mut Vec<String>,
+    ) -> Option<Json> {
+        match value {
+            Json::String(text) => {
+                if self.matches_current_preorder_path(path_segments) {
+                    self.sanitize_string_value(text)
+                } else {
+                    Some(Json::String(text))
+                }
+            }
+            Json::Array(items) => Some(Json::Array(
+                items
+                    .into_iter()
+                    .enumerate()
+                    .map(|(index, item)| {
+                        path_segments.push(index.to_string());
+                        let sanitized = self
+                            .sanitize_json_preorder_dfs_at_path(item, path_segments)
+                            .unwrap_or(Json::Null);
+                        path_segments.pop();
+                        sanitized
+                    })
+                    .collect(),
+            )),
+            Json::Object(map) => Some(Json::Object(
+                map.into_iter()
+                    .filter_map(|(key, value)| {
+                        path_segments.push(escape_json_pointer_segment(&key));
+                        let sanitized =
+                            self.sanitize_json_preorder_dfs_at_path(value, path_segments);
+                        path_segments.pop();
+                        sanitized.map(|sanitized| (key, sanitized))
+                    })
+                    .collect(),
+            )),
+            other => {
+                // NOTE(review): with an empty `target_paths` every leaf matches,
+                // so `remove` also drops numbers, booleans and nulls, although
+                // the `target_paths` docs speak of string leaves only. Confirm
+                // this is intended.
+                if self.matches_current_preorder_path(path_segments)
+                    && matches!(self.action, BuiltinAction::Remove)
+                {
+                    None
+                } else {
+                    Some(other)
+                }
+            }
+        }
+    }
+
+    /// Returns `true` when no target paths are configured, or when the current
+    /// JSON pointer equals one of them exactly.
+    fn matches_current_preorder_path(&self, path_segments: &[String]) -> bool {
+        if self.target_paths.is_empty() {
+            return true;
+        }
+        let current_path = render_json_pointer_path(path_segments);
+        self.target_paths.iter().any(|path| path == &current_path)
+    }
+
+    /// Applies the configured action to one string leaf; `None` means remove.
+    fn sanitize_string_value(&self, text: String) -> Option<Json> {
+        match &self.action {
+            BuiltinAction::Remove => None,
+            BuiltinAction::Hash => Some(Json::String(hex_sha256(&text))),
+            BuiltinAction::RegexReplace {
+                pattern,
+                replacement,
+            } => Some(Json::String(
+                pattern
+                    .replace_all(&text, replacement.as_str())
+                    .into_owned(),
+            )),
+        }
+    }
+
+    /// Decodes the request with the codec, sanitizes the decoded form and
+    /// re-encodes it. Returns `None` when no codec is configured or any step
+    /// fails, so the caller can fall back to raw-payload sanitization.
+    fn sanitize_request_with_codec(&self, request: &LlmRequest) -> Option<LlmRequest> {
+        let codec = self.codec.as_ref()?;
+        let annotated = codec.decode(request).ok()?;
+        let sanitized_annotated = sanitize_serializable_with_backend(self, annotated).ok()?;
+        codec.encode(&sanitized_annotated, request).ok()
+    }
+
+    /// Decodes the response with the codec, sanitizes the decoded form and
+    /// overlays it onto the raw payload. Returns `None` when no codec is
+    /// configured or any step fails.
+    fn sanitize_response_with_codec(&self, payload: Json) -> Option<Json> {
+        let codec = self.codec.as_ref()?;
+        let codec_name = self.codec_name?;
+        let annotated = codec.decode_response(&payload).ok()?;
+        let sanitized_annotated = sanitize_serializable_with_backend(self, annotated).ok()?;
+        Some(codec_name.overlay_response_payload(payload, &sanitized_annotated))
+    }
+}
+
+/// Builds the tool guardrail callback: sanitizes the raw payload and ignores
+/// the tool name.
+fn tool_sanitize_callback(backend: CompiledBuiltinBackend) -> ToolSanitizeFn {
+    Arc::new(move |_name: &str, payload: Json| backend.sanitize_json_preorder_dfs(payload))
+}
+
+/// Builds the LLM request guardrail callback: codec-aware sanitization first,
+/// raw `request.content` sanitization as the fallback.
+fn llm_sanitize_request_callback(backend: CompiledBuiltinBackend) -> LlmSanitizeRequestFn {
+    Arc::new(move |mut request: LlmRequest| {
+        if let Some(encoded) = backend.sanitize_request_with_codec(&request) {
+            return encoded;
+        }
+        request.content = backend.sanitize_json_preorder_dfs(request.content);
+        request
+    })
+}
+
+/// Builds the LLM response guardrail callback.
+///
+/// With no target paths the raw payload is sanitized directly. Otherwise the
+/// codec pass runs first (when it succeeds) and the raw pass runs on its result.
+fn llm_sanitize_response_callback(backend: CompiledBuiltinBackend) -> LlmSanitizeResponseFn {
+    Arc::new(move |payload: Json| {
+        if backend.target_paths.is_empty() {
+            return backend.sanitize_json_preorder_dfs(payload);
+        }
+
+        let payload = backend
+            .sanitize_response_with_codec(payload.clone())
+            .unwrap_or(payload);
+        backend.sanitize_json_preorder_dfs(payload)
+    })
+}
+
+/// Renders already-escaped segments as a JSON pointer (`/a/0/b`); the root is
+/// the empty string.
+fn render_json_pointer_path(path_segments: &[String]) -> String {
+    if path_segments.is_empty() {
+        return String::new();
+    }
+    let mut rendered = String::new();
+    for segment in path_segments {
+        rendered.push('/');
+        rendered.push_str(segment);
+    }
+    rendered
+}
+
+/// Escapes one JSON pointer segment: `~` becomes `~0`, then `/` becomes `~1`.
+/// The order matters, otherwise the `~` introduced by `~1` would be re-escaped.
+fn escape_json_pointer_segment(segment: &str) -> String {
+    segment.replace('~', "~0").replace('/', "~1")
+}
+
+/// Returns the lowercase hex SHA-256 digest of `text`.
+fn hex_sha256(text: &str) -> String {
+    let digest = Sha256::digest(text.as_bytes());
+    let mut output = String::with_capacity(digest.len() * 2);
+    for byte in digest {
+        use std::fmt::Write as _;
+        let _ = write!(&mut output, "{byte:02x}");
+    }
+    output
+}
+
+/// Instantiates the codec implementation for a configured codec name.
+///
+/// # Errors
+///
+/// Returns `InvalidConfig` for an unsupported name.
+fn instantiate_builtin_codec(
+    codec_name: &str,
+) -> PluginResult<Arc<dyn BuiltinRequestResponseCodec>> {
+    let codec: Arc<dyn BuiltinRequestResponseCodec> = match codec_name {
+        "openai_chat" => Arc::new(OpenAIChatCodec),
+        "openai_responses" => Arc::new(OpenAIResponsesCodec),
+        "anthropic_messages" => Arc::new(AnthropicMessagesCodec),
+        other => {
+            return Err(PluginError::InvalidConfig(format!(
+                "unsupported codec '{other}'"
+            )));
+        }
+    };
+    Ok(codec)
+}
+
+impl BuiltinCodecName {
+    /// Parses a configured codec name; unknown names yield `None`.
+    fn parse(value: &str) -> Option<Self> {
+        match value {
+            "openai_chat" => Some(Self::OpenAIChat),
+            "openai_responses" => Some(Self::OpenAIResponses),
+            "anthropic_messages" => Some(Self::AnthropicMessages),
+            _ => None,
+        }
+    }
+
+    /// Writes the sanitized annotated response back into the provider-shaped
+    /// raw payload.
+    fn overlay_response_payload(self, payload: Json, annotated: &AnnotatedLlmResponse) -> Json {
+        match self {
+            Self::OpenAIChat => overlay_openai_chat_response(payload, annotated),
+            Self::OpenAIResponses => overlay_openai_responses_response(payload, annotated),
+            Self::AnthropicMessages => overlay_anthropic_response(payload, annotated),
+        }
+    }
+}
+
+/// Overlays `id`, `model` and the first choice's `finish_reason`, message
+/// `content` and `tool_calls`. Payloads without the expected shape are returned
+/// with whatever was overlaid before the shape ran out.
+fn overlay_openai_chat_response(mut payload: Json, annotated: &AnnotatedLlmResponse) -> Json {
+    let Some(root) = payload.as_object_mut() else {
+        return payload;
+    };
+    set_optional_string_field(root, "id", annotated.id.as_deref());
+    set_optional_string_field(root, "model", annotated.model.as_deref());
+
+    let Some(choice) = root
+        .get_mut("choices")
+        .and_then(Json::as_array_mut)
+        .and_then(|choices| choices.first_mut())
+        .and_then(Json::as_object_mut)
+    else {
+        return payload;
+    };
+
+    set_optional_string_field(
+        choice,
+        "finish_reason",
+        annotated
+            .finish_reason
+            .as_ref()
+            .map(openai_chat_finish_reason),
+    );
+
+    let Some(message) = choice.get_mut("message").and_then(Json::as_object_mut) else {
+        return payload;
+    };
+    set_optional_string_field(
+        message,
+        "content",
+        annotated_message_text(annotated.message.as_ref()).as_deref(),
+    );
+    overlay_openai_chat_tool_calls(message, annotated.tool_calls.as_deref());
+    payload
+}
+
+/// Overlays `id`, `model`, `status` and the `output` items of a Responses
+/// payload.
+fn overlay_openai_responses_response(mut payload: Json, annotated: &AnnotatedLlmResponse) -> Json {
+    let Some(root) = payload.as_object_mut() else {
+        return payload;
+    };
+    set_optional_string_field(root, "id", annotated.id.as_deref());
+    set_optional_string_field(root, "model", annotated.model.as_deref());
+    set_optional_string_field(
+        root,
+        "status",
+        annotated
+            .finish_reason
+            .as_ref()
+            .map(openai_responses_status),
+    );
+
+    if let Some(items) = root.get_mut("output").and_then(Json::as_array_mut) {
+        overlay_output_text_blocks(items, annotated_message_text(annotated.message.as_ref()));
+        overlay_openai_responses_tool_calls(items, annotated.tool_calls.as_deref());
+    }
+    payload
+}
+
+/// Overlays `id`, `model`, `stop_reason` and the `content` blocks of an
+/// Anthropic Messages payload.
+fn overlay_anthropic_response(mut payload: Json, annotated: &AnnotatedLlmResponse) -> Json {
+    let Some(root) = payload.as_object_mut() else {
+        return payload;
+    };
+    set_optional_string_field(root, "id", annotated.id.as_deref());
+    set_optional_string_field(root, "model", annotated.model.as_deref());
+    set_optional_string_field(
+        root,
+        "stop_reason",
+        annotated.finish_reason.as_ref().map(anthropic_stop_reason),
+    );
+
+    if let Some(blocks) = root.get_mut("content").and_then(Json::as_array_mut) {
+        overlay_anthropic_text_blocks(blocks, annotated_message_text(annotated.message.as_ref()));
+        overlay_anthropic_tool_calls(blocks, annotated.tool_calls.as_deref());
+    }
+    payload
+}
+
+/// Rewrites existing `tool_calls` entries pairwise with the sanitized calls, or
+/// removes the field when the sanitized response has none. Arguments are
+/// written as a JSON-encoded string.
+fn overlay_openai_chat_tool_calls(
+    message: &mut Map<String, Json>,
+    tool_calls: Option<&[ResponseToolCall]>,
+) {
+    let Some(raw_calls) = message.get_mut("tool_calls").and_then(Json::as_array_mut) else {
+        return;
+    };
+    let Some(tool_calls) = tool_calls else {
+        message.remove("tool_calls");
+        return;
+    };
+
+    for (raw_call, sanitized_call) in raw_calls.iter_mut().zip(tool_calls.iter()) {
+        let Some(raw_call) = raw_call.as_object_mut() else {
+            continue;
+        };
+        set_optional_string_field(raw_call, "id", Some(sanitized_call.id.as_str()));
+        let Some(function) = raw_call.get_mut("function").and_then(Json::as_object_mut) else {
+            continue;
+        };
+        set_optional_string_field(function, "name", Some(sanitized_call.name.as_str()));
+        set_optional_string_field(
+            function,
+            "arguments",
+            Some(json_string(&sanitized_call.arguments).as_str()),
+        );
+    }
+}
+
+/// Rewrites `function_call` output items, in order, with the sanitized calls.
+/// Stops when the sanitized calls run out.
+fn overlay_openai_responses_tool_calls(
+    items: &mut [Json],
+    tool_calls: Option<&[ResponseToolCall]>,
+) {
+    let Some(tool_calls) = tool_calls else {
+        return;
+    };
+
+    let mut sanitized_calls = tool_calls.iter();
+    for item in items {
+        let Some(item_type) = item.get("type").and_then(Json::as_str) else {
+            continue;
+        };
+        if item_type != "function_call" {
+            continue;
+        }
+        let Some(raw_call) = item.as_object_mut() else {
+            continue;
+        };
+        let Some(sanitized_call) = sanitized_calls.next() else {
+            break;
+        };
+        set_optional_string_field(raw_call, "call_id", Some(sanitized_call.id.as_str()));
+        set_optional_string_field(raw_call, "name", Some(sanitized_call.name.as_str()));
+        set_optional_string_field(
+            raw_call,
+            "arguments",
+            Some(json_string(&sanitized_call.arguments).as_str()),
+        );
+    }
+}
+
+/// Rewrites `tool_use` content blocks, in order, with the sanitized calls.
+/// Arguments are written as structured JSON under `input`.
+fn overlay_anthropic_tool_calls(blocks: &mut [Json], tool_calls: Option<&[ResponseToolCall]>) {
+    let Some(tool_calls) = tool_calls else {
+        return;
+    };
+
+    let mut sanitized_calls = tool_calls.iter();
+    for block in blocks {
+        let Some(block_type) = block.get("type").and_then(Json::as_str) else {
+            continue;
+        };
+        if block_type != "tool_use" {
+            continue;
+        }
+        let Some(raw_call) = block.as_object_mut() else {
+            continue;
+        };
+        let Some(sanitized_call) = sanitized_calls.next() else {
+            break;
+        };
+        set_optional_string_field(raw_call, "id", Some(sanitized_call.id.as_str()));
+        set_optional_string_field(raw_call, "name", Some(sanitized_call.name.as_str()));
+        raw_call.insert("input".into(), sanitized_call.arguments.clone());
+    }
+}
+
+/// Writes the sanitized message text into the `output_text` blocks of every
+/// `message` output item.
+///
+/// With no sanitized text, `text` is removed from every `output_text` block.
+/// Otherwise the text is split on `'\n'` into at most as many pieces as the
+/// item has `output_text` blocks, so the last block keeps all remaining lines
+/// and multi-line text is never truncated. Blocks left without a piece lose
+/// their `text`.
+fn overlay_output_text_blocks(items: &mut [Json], message_text: Option<String>) {
+    let text_items = items.iter_mut().filter_map(|item| {
+        (item.get("type").and_then(Json::as_str) == Some("message"))
+            .then_some(item.get_mut("content"))
+            .flatten()
+            .and_then(Json::as_array_mut)
+    });
+    let Some(text) = message_text else {
+        for content in text_items {
+            for block in content.iter_mut() {
+                if block.get("type").and_then(Json::as_str) == Some("output_text") {
+                    if let Some(block) = block.as_object_mut() {
+                        block.remove("text");
+                    }
+                }
+            }
+        }
+        return;
+    };
+
+    for content in text_items {
+        let block_count = content
+            .iter()
+            .filter(|block| block.get("type").and_then(Json::as_str) == Some("output_text"))
+            .count();
+        // `splitn` caps the number of pieces at the number of blocks; the final
+        // piece carries the unsplit remainder.
+        let mut parts = text.splitn(block_count, '\n');
+        for block in content.iter_mut() {
+            if block.get("type").and_then(Json::as_str) != Some("output_text") {
+                continue;
+            }
+            let Some(block) = block.as_object_mut() else {
+                continue;
+            };
+            set_optional_string_field(block, "text", parts.next());
+        }
+    }
+}
+
+/// Writes the sanitized message text into the `text` content blocks.
+///
+/// The text is split on `'\n'` into at most as many pieces as there are `text`
+/// blocks, so the last block keeps all remaining lines and multi-line text is
+/// never truncated. Blocks left without a piece, or all blocks when there is no
+/// sanitized text, lose their `text`.
+fn overlay_anthropic_text_blocks(blocks: &mut [Json], message_text: Option<String>) {
+    let block_count = blocks
+        .iter()
+        .filter(|block| block.get("type").and_then(Json::as_str) == Some("text"))
+        .count();
+    let mut parts = message_text
+        .as_deref()
+        .map(|text| text.splitn(block_count, '\n'));
+
+    for block in blocks {
+        if block.get("type").and_then(Json::as_str) != Some("text") {
+            continue;
+        }
+        let Some(block) = block.as_object_mut() else {
+            continue;
+        };
+        let part = parts.as_mut().and_then(|parts| parts.next());
+        set_optional_string_field(block, "text", part);
+    }
+}
+
+/// Flattens message content to text: plain text as-is, text parts joined with
+/// `'\n'` (image parts skipped). `None` when there is no message or no text
+/// part.
+fn annotated_message_text(message: Option<&MessageContent>) -> Option<String> {
+    match message? {
+        MessageContent::Text(text) => Some(text.clone()),
+        MessageContent::Parts(parts) => {
+            let text_parts: Vec<&str> = parts
+                .iter()
+                .filter_map(|part| match part {
+                    ContentPart::Text { text } => Some(text.as_str()),
+                    ContentPart::ImageUrl { .. } => None,
+                })
+                .collect();
+            (!text_parts.is_empty()).then(|| text_parts.join("\n"))
+        }
+    }
+}
+
+/// Sets `key` to a JSON string, or removes `key` when `value` is `None`.
+fn set_optional_string_field(object: &mut Map<String, Json>, key: &str, value: Option<&str>) {
+    match value {
+        Some(value) => {
+            object.insert(key.to_string(), Json::String(value.to_string()));
+        }
+        None => {
+            object.remove(key);
+        }
+    }
+}
+
+/// Serializes a JSON value to its compact string form, or `"null"` on failure.
+fn json_string(value: &Json) -> String {
+    serde_json::to_string(value).unwrap_or_else(|_| "null".to_string())
+}
+
+/// Maps a finish reason to the OpenAI Chat `finish_reason` string.
+fn openai_chat_finish_reason(reason: &FinishReason) -> &str {
+    match reason {
+        FinishReason::Complete => "stop",
+        FinishReason::Length => "length",
+        FinishReason::ToolUse => "tool_calls",
+        FinishReason::ContentFilter => "content_filter",
+        FinishReason::Unknown(other) => other.as_str(),
+    }
+}
+
+/// Maps a finish reason to the OpenAI Responses `status` string.
+fn openai_responses_status(reason: &FinishReason) -> &str {
+    match reason {
+        FinishReason::Complete => "completed",
+        FinishReason::Length | FinishReason::ContentFilter => "incomplete",
+        FinishReason::ToolUse => "completed",
+        FinishReason::Unknown(other) => other.as_str(),
+    }
+}
+
+/// Maps a finish reason to the Anthropic `stop_reason` string.
+fn anthropic_stop_reason(reason: &FinishReason) -> &str {
+    match reason {
+        FinishReason::Complete => "end_turn",
+        FinishReason::Length => "max_tokens",
+        FinishReason::ToolUse => "tool_use",
+        FinishReason::ContentFilter => "refusal",
+        FinishReason::Unknown(other) => other.as_str(),
+    }
+}
+
+/// Round-trips `value` through JSON, sanitizing the JSON form in between.
+///
+/// # Errors
+///
+/// Returns `Internal` when `value` cannot be serialized, or when the sanitized
+/// JSON no longer deserializes into `T` (for example after a required field was
+/// removed).
+fn sanitize_serializable_with_backend<T>(
+    backend: &CompiledBuiltinBackend,
+    value: T,
+) -> PluginResult<T>
+where
+    T: Serialize + DeserializeOwned,
+{
+    let value = serde_json::to_value(value).map_err(|err| {
+        PluginError::Internal(format!(
+            "failed to serialize value for PII redaction: {err}"
+        ))
+    })?;
+    serde_json::from_value(backend.sanitize_json_preorder_dfs(value)).map_err(|err| {
+        PluginError::Internal(format!(
+            "failed to deserialize sanitized value for PII redaction: {err}"
+        ))
+    })
+}
+
+/// Reports every key of `plugin_config` that is not listed in `supported`.
+fn validate_unknown_fields(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    policy: &ConfigPolicy,
+    component: Option<String>,
+    plugin_config: &Map<String, Json>,
+    supported: &[&str],
+) {
+    for field in plugin_config.keys() {
+        if supported
+            .iter()
+            .any(|supported_field| supported_field == field)
+        {
+            continue;
+        }
+        push_policy_diag(
+            diagnostics,
+            policy.unknown_field,
+            "pii_redaction.unknown_field",
+            component.clone(),
+            Some(field.clone()),
+            format!("unknown field '{field}'"),
+        );
+    }
+}
+
+/// Validates the keys of the `policy` section.
+fn validate_policy_fields(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    policy: &ConfigPolicy,
+    plugin_config: &Map<String, Json>,
+) {
+    validate_section_fields(
+        diagnostics,
+        policy,
+        plugin_config,
+        "policy",
+        &["unknown_field", "unsupported_value"],
+    );
+}
+
+/// Validates one nested section: it must be an object, and each of its keys
+/// must be listed in `supported`. An absent section is accepted.
+fn validate_section_fields(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    policy: &ConfigPolicy,
+    plugin_config: &Map<String, Json>,
+    section_name: &str,
+    supported: &[&str],
+) {
+    let Some(value) = plugin_config.get(section_name) else {
+        return;
+    };
+
+    let Json::Object(section) = value else {
+        push_policy_diag(
+            diagnostics,
+            policy.unsupported_value,
+            "pii_redaction.unsupported_value",
+            Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+            Some(section_name.to_string()),
+            format!("'{section_name}' must be an object"),
+        );
+        return;
+    };
+
+    for field in section.keys() {
+        if supported
+            .iter()
+            .any(|supported_field| supported_field == field)
+        {
+            continue;
+        }
+        push_policy_diag(
+            diagnostics,
+            policy.unknown_field,
+            "pii_redaction.unknown_field",
+            Some(PII_REDACTION_PLUGIN_KIND.to_string()),
+            Some(format!("{section_name}.{field}")),
+            format!("unknown field '{section_name}.{field}'"),
+        );
+    }
+}
+
+fn push_policy_diag(
+    diagnostics: &mut Vec<ConfigDiagnostic>,
+    behavior: UnsupportedBehavior,
+    code:
&str, + component: Option, + field: Option, + message: String, +) { + let level = match behavior { + UnsupportedBehavior::Ignore => return, + UnsupportedBehavior::Warn => DiagnosticLevel::Warning, + UnsupportedBehavior::Error => DiagnosticLevel::Error, + }; + + diagnostics.push(ConfigDiagnostic { + level, + code: code.to_string(), + component, + field, + message, + }); +} + +fn default_pii_redaction_config_version() -> u32 { + 1 +} + +fn default_mode() -> String { + "builtin".to_string() +} + +fn default_builtin_action() -> String { + "remove".to_string() +} + +fn default_true() -> bool { + true +} + +fn default_priority() -> i32 { + 100 +} + +#[cfg(test)] +#[path = "../../../tests/unit/plugins/pii_redaction/component_tests.rs"] +mod tests; diff --git a/crates/core/src/plugins/pii_redaction/local.rs b/crates/core/src/plugins/pii_redaction/local.rs new file mode 100644 index 00000000..bc244899 --- /dev/null +++ b/crates/core/src/plugins/pii_redaction/local.rs @@ -0,0 +1,52 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0
+
+use std::sync::{Arc, LazyLock, Mutex, MutexGuard};
+
+use crate::plugin::{PluginError, PluginRegistrationContext, Result as PluginResult};
+
+use super::PiiRedactionConfig;
+
+/// Callback that registers a local-model backend for a given plugin config.
+type LocalBackendProvider = Arc<
+    dyn Fn(PiiRedactionConfig, &mut PluginRegistrationContext) -> PluginResult<()> + Send + Sync,
+>;
+
+/// Process-wide slot holding the currently installed provider, if any.
+static LOCAL_BACKEND_PROVIDER: LazyLock<Mutex<Option<LocalBackendProvider>>> =
+    LazyLock::new(|| Mutex::new(None));
+
+/// Locks the provider slot.
+///
+/// # Errors
+///
+/// Returns `PluginError::Internal` when the lock is poisoned.
+fn local_backend_provider_guard() -> PluginResult<MutexGuard<'static, Option<LocalBackendProvider>>>
+{
+    LOCAL_BACKEND_PROVIDER.lock().map_err(|e| {
+        PluginError::Internal(format!(
+            "PII redaction local backend provider lock poisoned: {e}"
+        ))
+    })
+}
+
+/// Installs `provider`, replacing any previously registered one.
+#[doc(hidden)]
+pub fn register_local_backend_provider(provider: LocalBackendProvider) -> PluginResult<()> {
+    let mut guard = local_backend_provider_guard()?;
+    *guard = Some(provider);
+    Ok(())
+}
+
+/// Removes the registered provider, if any.
+#[doc(hidden)]
+pub fn clear_local_backend_provider() -> PluginResult<()> {
+    let mut guard = local_backend_provider_guard()?;
+    *guard = None;
+    Ok(())
+}
+
+/// Delegates registration to the installed provider.
+///
+/// # Errors
+///
+/// Returns `PluginError::RegistrationFailed` when no provider is installed,
+/// or whatever error the provider itself returns.
+pub(super) fn register_local_backend(
+    config: PiiRedactionConfig,
+    ctx: &mut PluginRegistrationContext,
+) -> PluginResult<()> {
+    // Clone the `Arc` out of the slot so the guard is released at the end of
+    // this statement and the provider runs without the lock held.
+    let provider = local_backend_provider_guard()?.clone();
+
+    match provider {
+        Some(provider) => provider(config, ctx),
+        None => Err(PluginError::RegistrationFailed(
+            "PII redaction local-model backend is unavailable in this runtime".to_string(),
+        )),
+    }
+}
diff --git a/crates/core/src/plugins/pii_redaction/mod.rs b/crates/core/src/plugins/pii_redaction/mod.rs
new file mode 100644
index 00000000..826dc9f7
--- /dev/null
+++ b/crates/core/src/plugins/pii_redaction/mod.rs
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! PII redaction plugin integrations for NeMo Relay Core.
+ +#[cfg(test)] +use std::sync::Mutex; + +#[cfg(test)] +pub(crate) fn test_mutex() -> &'static Mutex<()> { + crate::shared_runtime::runtime_owner_test_mutex() +} + +pub mod component; diff --git a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs new file mode 100644 index 00000000..663d9709 --- /dev/null +++ b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs @@ -0,0 +1,744 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Unit tests for the PII redaction plugin component contract. +#![allow(clippy::await_holding_lock)] + +use super::*; +use crate::api::event::Event; +use crate::api::llm::{ + LlmCallExecuteParams, LlmCallParams, LlmRequest, llm_call, llm_call_execute, +}; +use crate::api::runtime::{ + LlmExecutionNextFn, NemoRelayContextState, create_scope_stack, global_context, + set_thread_scope_stack, +}; +use crate::api::subscriber::{deregister_subscriber, register_subscriber}; +use crate::api::tool::{ToolCallEndParams, ToolCallParams, tool_call, tool_call_end}; +use crate::codec::openai_chat::OpenAIChatCodec; +use crate::codec::openai_responses::OpenAIResponsesCodec; +use crate::codec::traits::LlmResponseCodec; +use crate::plugin::{ + PluginComponentSpec, PluginConfig, PluginRegistrationContext, clear_plugin_configuration, + ensure_builtin_plugins_registered, initialize_plugins, list_plugin_kinds, + validate_plugin_config, +}; +use serde_json::json; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::atomic::{AtomicBool, Ordering}; + +fn component(config: Json) -> PluginComponentSpec { + let Json::Object(config) = config else { + panic!("component config must be an object"); + }; + PluginComponentSpec { + kind: PII_REDACTION_PLUGIN_KIND.to_string(), + enabled: true, + config, + } +} + +fn plugin_config(config: Json) -> PluginConfig { + PluginConfig { + 
version: 1, + components: vec![component(config)], + policy: Default::default(), + } +} + +fn reset_runtime() { + let _ = clear_plugin_configuration(); + crate::plugins::pii_redaction::component::clear_local_backend_provider().unwrap(); + crate::shared_runtime::reset_runtime_owner_for_tests(); + let context = global_context(); + *context.write().unwrap() = NemoRelayContextState::new(); +} + +fn setup_isolated_thread() { + let stack = create_scope_stack(); + set_thread_scope_stack(stack); +} + +fn capture_events(name: &str) -> Arc>> { + let events = Arc::new(Mutex::new(Vec::new())); + let sink = Arc::clone(&events); + register_subscriber( + name, + Arc::new(move |event| sink.lock().unwrap().push(event.clone())), + ) + .unwrap(); + events +} + +fn captured_events_snapshot(events: &Arc>>) -> Vec { + crate::api::subscriber::flush_subscribers().unwrap(); + events.lock().unwrap().clone() +} + +fn noop_openai_chat_exec_fn(response: Json) -> LlmExecutionNextFn { + Arc::new(move |_req| { + let response = response.clone(); + Box::pin(async move { Ok(response) }) + }) +} + +#[test] +fn builtin_registry_includes_pii_redaction_component() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + ensure_builtin_plugins_registered().unwrap(); + + let plugin_kinds = list_plugin_kinds(); + assert!( + plugin_kinds + .iter() + .any(|kind| kind == PII_REDACTION_PLUGIN_KIND) + ); +} + +#[test] +fn validate_rejects_config_with_no_enabled_surfaces() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "builtin": { + "action": "remove" + }, + "input": false, + "output": false, + "tool_input": false, + "tool_output": false, + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.code == "pii_redaction.unsupported_value" + && diag + .message + .contains("at least one redaction surface 
must be enabled") + })); +} + +#[test] +fn validate_rejects_local_section_outside_local_mode() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "builtin": { + "action": "remove" + }, + "local": { + "backend": "future-local-model" + } + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("local") && diag.message.contains("mode = 'local_model'") + })); +} + +#[test] +fn validate_rejects_builtin_mode_without_builtin_section() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin" + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("builtin") + && diag.message.contains("required when mode = 'builtin'") + })); +} + +#[test] +fn validate_rejects_llm_surfaces_without_codec() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin", + "builtin": { + "action": "remove" + }, + "input": true, + "output": false, + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("codec") + && diag + .message + .contains("codec is required when any LLM surface is enabled") + })); +} + +#[test] +fn validate_rejects_regex_replace_without_pattern() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin", + "builtin": { + "action": "regex_replace" + } + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("builtin.pattern") + && diag + .message + .contains("required when builtin.action = 'regex_replace'") + })); +} + +#[test] 
+fn local_backend_provider_is_invoked_for_local_model_mode() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let called = Arc::new(AtomicBool::new(false)); + let called_inner = Arc::clone(&called); + register_local_backend_provider(Arc::new( + move |config, _ctx: &mut PluginRegistrationContext| { + called_inner.store(true, Ordering::SeqCst); + assert_eq!(config.mode, "local_model"); + Ok(()) + }, + )) + .unwrap(); + + let plugin = PiiRedactionPlugin; + let mut ctx = PluginRegistrationContext::with_namespace("test::"); + let config = json!({ + "mode": "local_model", + "tool_input": true, + }); + let Json::Object(config) = config else { + panic!("component config must be object"); + }; + + futures::executor::block_on(plugin.register(&config, &mut ctx)).unwrap(); + + assert!(called.load(Ordering::SeqCst)); +} + +#[test] +fn builtin_backend_sanitizes_tool_start_and_end_payloads_with_preorder_targets() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": true, + "builtin": { + "action": "regex_replace", + "pattern": "sk-[A-Za-z0-9_-]+", + "replacement": "[REDACTED]", + "target_paths": ["/api_key", "/nested/token", "/result/secret"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-tool-events"); + let handle = tool_call( + ToolCallParams::builder() + .name("search") + .args(json!({ + "api_key": "sk-abc123", + "nested": { + "token": "sk-secret", + "note": "leave me" + } + })) + .build(), + ) + .unwrap(); + tool_call_end( + ToolCallEndParams::builder() + .handle(&handle) + .result(json!({ + "result": { + "secret": "sk-final", + "public": "ok" + } + })) + .build(), + ) + .unwrap(); + + let captured_events = 
captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 2); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "api_key": "[REDACTED]", + "nested": { + "token": "[REDACTED]", + "note": "leave me" + } + })) + ); + assert_eq!( + captured_events[1].output(), + Some(&json!({ + "result": { + "secret": "[REDACTED]", + "public": "ok" + } + })) + ); + + deregister_subscriber("pii-redaction-tool-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_remove_deletes_object_fields_and_nulls_array_or_root_targets() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": true, + "builtin": { + "action": "remove", + "target_paths": ["/secret", "/nested/remove_me", "/items/1", "/result/token"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-remove-events"); + let handle = tool_call( + ToolCallParams::builder() + .name("search") + .args(json!({ + "secret": "abc", + "nested": { + "keep": "yes", + "remove_me": "gone" + }, + "items": ["a", "b", "c"] + })) + .build(), + ) + .unwrap(); + tool_call_end( + ToolCallEndParams::builder() + .handle(&handle) + .result(json!({ + "result": { + "token": "drop-me", + "public": "ok" + } + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 2); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "nested": { + "keep": "yes" + }, + "items": ["a", null, "c"] + })) + ); + assert_eq!( + captured_events[1].output(), + Some(&json!({ + "result": { + "public": "ok" + } + })) + ); + + deregister_subscriber("pii-redaction-remove-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn 
builtin_backend_sanitizes_llm_start_payload_via_codec_and_reencodes_provider_shape() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": true, + "output": false, + "tool_input": false, + "tool_output": false, + "builtin": { + "action": "regex_replace", + "pattern": "sk-[A-Za-z0-9_-]+", + "replacement": "[REDACTED]", + "target_paths": ["/messages/0/content", "/messages/1/content"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-llm-events"); + let request = LlmRequest { + headers: serde_json::Map::new(), + content: json!({ + "model": "gpt-4o-mini", + "messages": [ + {"role": "system", "content": "sk-system-secret"}, + {"role": "user", "content": "sk-user-secret"} + ], + "temperature": 0.2 + }), + }; + + let _handle = llm_call( + LlmCallParams::builder() + .name("openai") + .request(&request) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "headers": {}, + "content": { + "model": "gpt-4o-mini", + "messages": [ + {"role": "system", "content": "[REDACTED]"}, + {"role": "user", "content": "[REDACTED]"} + ], + "temperature": 0.2 + } + })) + ); + + deregister_subscriber("pii-redaction-llm-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[tokio::test] +async fn builtin_backend_sanitizes_llm_end_payload_and_response_codec_decodes_sanitized_output() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": true, + "tool_input": false, + "tool_output": false, + "builtin": { + "action": "regex_replace", + 
"pattern": "sk-[A-Za-z0-9_-]+", + "replacement": "[REDACTED]", + "target_paths": ["/choices/0/message/content"] + } + }))) + .await + .unwrap(); + + let events = capture_events("pii-redaction-llm-end-events"); + let request = LlmRequest { + headers: serde_json::Map::new(), + content: json!({ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "hello"} + ] + }), + }; + let response = json!({ + "id": "chatcmpl-123", + "model": "gpt-4o-mini", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "sk-response-secret" + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 3, + "completion_tokens": 2, + "total_tokens": 5 + } + }); + let response_codec: Arc = Arc::new(OpenAIChatCodec); + + let result = llm_call_execute( + LlmCallExecuteParams::builder() + .name("openai") + .request(request) + .func(noop_openai_chat_exec_fn(response.clone())) + .response_codec(response_codec) + .build(), + ) + .await + .unwrap(); + + assert_eq!(result, response); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 2); + assert_eq!( + captured_events[1].output(), + Some(&json!({ + "id": "chatcmpl-123", + "model": "gpt-4o-mini", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "[REDACTED]" + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 3, + "completion_tokens": 2, + "total_tokens": 5 + } + })) + ); + + let annotated = captured_events[1] + .annotated_response() + .expect("annotated_response should be present"); + assert_eq!(annotated.response_text(), Some("[REDACTED]")); + assert_eq!(annotated.model.as_deref(), Some("gpt-4o-mini")); + + deregister_subscriber("pii-redaction-llm-end-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[tokio::test] +async fn builtin_backend_sanitizes_openai_chat_response_from_normalized_message_path() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); 
+ reset_runtime(); + setup_isolated_thread(); + + initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": true, + "tool_input": false, + "tool_output": false, + "builtin": { + "action": "regex_replace", + "pattern": "sk-[A-Za-z0-9_-]+", + "replacement": "[REDACTED]", + "target_paths": ["/message"] + } + }))) + .await + .unwrap(); + + let events = capture_events("pii-redaction-openai-chat-normalized-response"); + let response_codec: Arc = Arc::new(OpenAIChatCodec); + + let _ = llm_call_execute( + LlmCallExecuteParams::builder() + .name("openai") + .request(LlmRequest { + headers: serde_json::Map::new(), + content: json!({"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "hello"}]}), + }) + .func(noop_openai_chat_exec_fn(json!({ + "id": "chatcmpl-123", + "model": "gpt-4o-mini", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "sk-chat-secret"}, + "finish_reason": "stop" + } + ] + }))) + .response_codec(response_codec) + .build(), + ) + .await + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!( + captured_events[1].output().unwrap()["choices"][0]["message"]["content"], + json!("[REDACTED]") + ); + assert_eq!( + captured_events[1] + .annotated_response() + .and_then(|response| response.response_text()), + Some("[REDACTED]") + ); + + deregister_subscriber("pii-redaction-openai-chat-normalized-response").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[tokio::test] +async fn builtin_backend_sanitizes_anthropic_response_from_normalized_message_path() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "anthropic_messages", + "input": false, + "output": true, + "tool_input": false, + "tool_output": false, + "builtin": { + "action": "regex_replace", + "pattern": 
"sk-[A-Za-z0-9_-]+", + "replacement": "[REDACTED]", + "target_paths": ["/message"] + } + }))) + .await + .unwrap(); + + let events = capture_events("pii-redaction-anthropic-normalized-response"); + let response_codec: Arc = + Arc::new(crate::codec::anthropic::AnthropicMessagesCodec); + + let _ = llm_call_execute( + LlmCallExecuteParams::builder() + .name("anthropic") + .request(LlmRequest { + headers: serde_json::Map::new(), + content: json!({"model": "claude-sonnet-4-20250514", "messages": [{"role": "user", "content": "hello"}]}), + }) + .func(noop_openai_chat_exec_fn(json!({ + "id": "msg_123", + "model": "claude-sonnet-4-20250514", + "role": "assistant", + "type": "message", + "content": [{"type": "text", "text": "sk-anthropic-secret"}], + "stop_reason": "end_turn" + }))) + .response_codec(response_codec) + .build(), + ) + .await + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!( + captured_events[1].output().unwrap()["content"][0]["text"], + json!("[REDACTED]") + ); + assert_eq!( + captured_events[1] + .annotated_response() + .and_then(|response| response.response_text()), + Some("[REDACTED]") + ); + + deregister_subscriber("pii-redaction-anthropic-normalized-response").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[tokio::test] +async fn builtin_backend_sanitizes_openai_responses_response_from_normalized_message_path() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_responses", + "input": false, + "output": true, + "tool_input": false, + "tool_output": false, + "builtin": { + "action": "regex_replace", + "pattern": "sk-[A-Za-z0-9_-]+", + "replacement": "[REDACTED]", + "target_paths": ["/message"] + } + }))) + .await + .unwrap(); + + let events = capture_events("pii-redaction-openai-responses-normalized-response"); + let response_codec: Arc = 
Arc::new(OpenAIResponsesCodec); + + let _ = llm_call_execute( + LlmCallExecuteParams::builder() + .name("openai") + .request(LlmRequest { + headers: serde_json::Map::new(), + content: json!({"model": "gpt-4.1-mini", "input": "hello"}), + }) + .func(noop_openai_chat_exec_fn(json!({ + "id": "resp_123", + "model": "gpt-4.1-mini", + "status": "completed", + "output": [ + { + "type": "message", + "content": [ + {"type": "output_text", "text": "sk-responses-secret"} + ] + } + ] + }))) + .response_codec(response_codec) + .build(), + ) + .await + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!( + captured_events[1].output().unwrap()["output"][0]["content"][0]["text"], + json!("[REDACTED]") + ); + assert_eq!( + captured_events[1] + .annotated_response() + .and_then(|response| response.response_text()), + Some("[REDACTED]") + ); + + deregister_subscriber("pii-redaction-openai-responses-normalized-response").unwrap(); + clear_plugin_configuration().unwrap(); +} diff --git a/docs/index.yml b/docs/index.yml index ed921473..bcdcaba3 100644 --- a/docs/index.yml +++ b/docs/index.yml @@ -34,6 +34,10 @@ navigation: title: "NeMo Guardrails Plugin" slug: nemo-guardrails-plugin title-source: frontmatter + - folder: ./pii-redaction-plugin + title: "PII Redaction Plugin" + slug: pii-redaction-plugin + title-source: frontmatter - folder: ./integrate-into-frameworks title: "Integrate into Frameworks" slug: integrate-into-frameworks diff --git a/docs/pii-redaction-plugin/about.mdx b/docs/pii-redaction-plugin/about.mdx new file mode 100644 index 00000000..4044ab89 --- /dev/null +++ b/docs/pii-redaction-plugin/about.mdx @@ -0,0 +1,90 @@ +--- +title: "PII Redaction Plugin" +sidebar-title: "About" +description: "" +position: 1 +--- +{/* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+SPDX-License-Identifier: Apache-2.0 */} + +Use the PII redaction plugin when you want first-party redaction, hashing, or +pattern-based replacement around managed NeMo Relay LLM and tool observability +surfaces through the shared plugin system. + +The built-in plugin component has kind `pii_redaction` and is available as a +first-party NeMo Relay plugin. + +The plugin is designed around backend modes: + +- `builtin` + - Uses a native Rust backend for deterministic payload sanitization. +- `local_model` + - Reserves a future local-model backend lane for more stochastic detection behavior. + +## Use This Plugin When + +Start here when you need to: + +- Remove sensitive fields from emitted tool or LLM payloads. +- Replace sensitive text with a deterministic marker such as `[REDACTED]`. +- Hash matching values before observability exporters or subscribers receive + them. +- Keep privacy behavior inside the same plugin config surface used by other + first-party NeMo Relay components. + +## Current Scope + +The built-in plugin currently exposes four managed sanitize surfaces: + +- `input` +- `output` +- `tool_input` +- `tool_output` + +The current built-in backend supports three actions: + +- `remove` +- `regex_replace` +- `hash` + +The current backend boundary is intentional: + +- Managed tool surfaces are sanitized as JSON payloads with exact JSON-pointer + targeting. +- Managed LLM surfaces use the selected built-in codec so redaction can target + normalized Relay request and response shapes such as `/messages/0/content` + and `/message`. + +## Observability Boundary + +This plugin installs sanitize guardrails, not execution intercepts. + +That means: + +- The plugin changes emitted observability payloads. +- The real provider request and response values remain unchanged. +- Subscribers and exporters receive sanitized payloads after the plugin runs. 
+
+For managed LLM request payloads, codec decode and re-encode can canonicalize
+the emitted provider-shaped start event. For example, an OpenAI Responses
+request may be recorded in the codec's canonical `input` array form instead of
+the original shorthand request shape.
+
+## Current Non-Goals
+
+The current first-party scope does not turn NeMo Relay into a full local-model
+runtime.
+
+In particular:
+
+- `local_model` is an extension point, not yet a complete backend
+  implementation.
+- The plugin does not mutate the real callback arguments or return values.
+- The plugin does not add a subtree or prefix selector language beyond exact
+  JSON-pointer matching.
+
+## Pages
+
+- [PII Redaction Configuration](/pii-redaction-plugin/configuration)
+  documents the built-in component shape, action semantics, supported codecs,
+  and example configs.
diff --git a/docs/pii-redaction-plugin/configuration.mdx b/docs/pii-redaction-plugin/configuration.mdx
new file mode 100644
index 00000000..35a8ab49
--- /dev/null
+++ b/docs/pii-redaction-plugin/configuration.mdx
@@ -0,0 +1,211 @@
+---
+title: "PII Redaction Configuration"
+sidebar-title: "Configuration"
+description: ""
+position: 2
+---
+{/* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0 */}
+
+Use this page when you want to configure the built-in PII redaction plugin
+component. The component kind is `pii_redaction`.
+
+For plugin file discovery, precedence, merge behavior, editor controls, and
+gateway conflict rules, see
+[Plugin Configuration Files](/build-plugins/plugin-configuration-files).
+
+
+NeMo Relay plugin configuration uses the generic plugin document shape, so
+field names stay `snake_case` in every binding and in `plugins.toml`.
+
+
+
+## Component Shape
+
+The top-level PII redaction object contains:
+
+| Field | Purpose |
+|---|---|
+| `version` | PII redaction config schema version. Defaults to `1`.
|
+| `mode` | Backend mode. Current values are `builtin` and `local_model`. |
+| `input` | Enables managed LLM request sanitization. |
+| `output` | Enables managed LLM response sanitization. |
+| `tool_input` | Enables managed tool-argument sanitization before execution. |
+| `tool_output` | Enables managed tool-result sanitization after execution. |
+| `priority` | Guardrail priority. Lower values run earlier. |
+| `codec` | Managed LLM provider codec. Required when `input` or `output` is enabled. |
+| `builtin` | Built-in backend settings used when `mode = "builtin"`. |
+| `local` | Local-backend settings used when `mode = "local_model"`. |
+| `policy` | Component-local handling for unknown fields and unsupported values. |
+
+At least one managed redaction surface must be enabled.
+
+## Backend Support
+
+| Area | `builtin` | `local_model` |
+|---|---|---|
+| Built-in component kind and config validation | Supported | Supported |
+| Managed LLM `input` | Supported | Extension point only |
+| Managed LLM `output` | Supported | Extension point only |
+| Managed `tool_input` | Supported | Extension point only |
+| Managed `tool_output` | Supported | Extension point only |
+| Built-in actions | `remove`, `regex_replace`, `hash` | N/A |
+| Codec support | `openai_chat`, `openai_responses`, `anthropic_messages` | Runtime-specific future implementation |
+| Runtime availability | Any runtime that includes the built-in core plugin | Runtimes that install a local backend provider |
+
+## Builtin Mode
+
+Use `builtin` mode when NeMo Relay should sanitize emitted observability
+payloads with a deterministic first-party backend.
+
+### Requirements
+
+To use `mode = "builtin"`:
+
+- `builtin` settings are required.
+- `codec` is required when `input` or `output` is enabled.
+- `builtin.action` must be `remove`, `regex_replace`, or `hash`.
+- `builtin.pattern` is required when `builtin.action = "regex_replace"`.
+ +### `plugins.toml` Example + +You can write this config directly in `plugins.toml`, or create and edit it +through the CLI with `nemo-relay plugins edit`. For plugin file discovery, +precedence, merge behavior, and editor controls, see +[Plugin Configuration Files](/build-plugins/plugin-configuration-files). + +```toml +version = 1 + +[[components]] +kind = "pii_redaction" +enabled = true + +[components.config] +version = 1 +mode = "builtin" +codec = "openai_chat" +input = true +output = true +tool_input = true +tool_output = true + +[components.config.builtin] +action = "regex_replace" +pattern = "sk-[A-Za-z0-9_-]+" +replacement = "[REDACTED]" +target_paths = [ + "/messages/0/content", + "/message", + "/api_key", + "/result/secret", +] + +[components.config.policy] +unknown_component = "warn" +unknown_field = "warn" +unsupported_value = "error" +``` + +This example configures the built-in backend for: + +- LLM request redaction from the normalized request path + `/messages/0/content` +- LLM response redaction from the normalized response path `/message` +- tool argument redaction at `/api_key` +- tool result redaction at `/result/secret` + +### CLI Editor Support + +The NeMo Relay CLI plugin editor now exposes `pii_redaction` directly through +`nemo-relay plugins edit`. + +Use the editor when you want to: + +- toggle the component on or off +- choose `builtin` or `local_model` +- set the LLM `codec` +- edit `builtin` action settings such as `action`, `target_paths`, + `pattern`, and `replacement` +- edit `local.backend` for a runtime-provided future local-model backend + +The editor preserves unknown fields when it rewrites an existing +`pii_redaction` component, so future or runtime-specific settings are not +discarded by the interactive edit flow. + +## Builtin Settings + +The `builtin` section contains: + +| Field | Purpose | +|---|---| +| `action` | Sanitization action. Current values are `remove`, `regex_replace`, and `hash`. 
| +| `target_paths` | Exact JSON-pointer paths to sanitize. Empty means every matching string leaf. | +| `pattern` | Regex pattern used when `action = "regex_replace"`. | +| `replacement` | Replacement text used when `action = "regex_replace"`. Defaults to `[REDACTED]`. | + +## Action Semantics + +### `remove` + +`remove` is structural. + +When a target matches: + +- object fields are removed +- array elements become `null` +- targeted scalar or root values become `null` + +### `regex_replace` + +`regex_replace` applies the configured regex to matching string leaves and +replaces matches with the configured `replacement`. + +### `hash` + +`hash` replaces matching string leaves with their SHA-256 hex digest. + +## Path Semantics + +`target_paths` are exact JSON-pointer matches. + +The plugin uses different payload boundaries for tools and LLMs: + +- Tools use JSON-native payloads. Paths point into the emitted tool args or + tool result shape directly. +- LLMs use the selected built-in codec. Prefer normalized Relay paths such as: + - `/messages/0/content` for request message content + - `/message` for the normalized assistant response text + +The current implementation also preserves provider-shaped response-path +compatibility for the supported codecs, but normalized LLM paths are the +recommended contract for new configuration. + +If `target_paths` is empty, the built-in backend sanitizes every matching +string leaf in the selected payload boundary. + +## Observability Semantics + +The built-in plugin uses sanitize guardrails. + +That means: + +- the real provider response value is unchanged +- the emitted NeMo Relay start or end event payload is sanitized +- `annotated_response` is populated from the sanitized end-event payload when a + response codec is provided + +## Local Model Mode + +`local_model` is reserved for a future in-process local-model backend. 
+ +### Current Status + +In this PR: + +- the plugin contract accepts `mode = "local_model"` +- the `local` section currently supports `backend` +- actual behavior depends on a runtime-installed local backend provider + +Without a provider, runtimes report the local backend as unavailable during +plugin initialization. From dba8ad41e91f271778fd3ef6668bdcc751aa1eca Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Sat, 6 Jun 2026 20:27:28 -0700 Subject: [PATCH 02/35] feat: extend deterministic pii redaction actions Signed-off-by: Alex Fournier --- crates/cli/tests/coverage/plugins_tests.rs | 32 +++ .../src/plugins/pii_redaction/component.rs | 250 ++++++++++++++++-- .../plugins/pii_redaction/component_tests.rs | 176 ++++++++++++ docs/pii-redaction-plugin/about.mdx | 3 + docs/pii-redaction-plugin/configuration.mdx | 51 +++- 5 files changed, 486 insertions(+), 26 deletions(-) diff --git a/crates/cli/tests/coverage/plugins_tests.rs b/crates/cli/tests/coverage/plugins_tests.rs index 8368b38e..ab39e0ef 100644 --- a/crates/cli/tests/coverage/plugins_tests.rs +++ b/crates/cli/tests/coverage/plugins_tests.rs @@ -244,16 +244,48 @@ fn typed_editor_model_contains_pii_redaction_options() { builtin.field("target_paths").unwrap().kind, EditorFieldKind::Json ); + assert_eq!( + builtin.field("detector").unwrap().kind, + EditorFieldKind::Enum + ); assert_eq!( builtin.field("replacement").unwrap().kind, EditorFieldKind::String ); + assert_eq!( + builtin.field("mask_char").unwrap().kind, + EditorFieldKind::String + ); + assert_eq!( + builtin.field("unmasked_prefix").unwrap().kind, + EditorFieldKind::Integer + ); + assert_eq!( + builtin.field("unmasked_suffix").unwrap().kind, + EditorFieldKind::Integer + ); let local = schema.field("local").unwrap().schema().unwrap(); assert_eq!( local.field("backend").unwrap().kind, EditorFieldKind::String ); + assert_eq!( + local.field("model_id").unwrap().kind, + EditorFieldKind::String + ); + assert_eq!( + 
local.field("detector_profile").unwrap().kind, + EditorFieldKind::String + ); + assert_eq!( + local.field("allow_network").unwrap().kind, + EditorFieldKind::Boolean + ); + assert_eq!( + local.field("max_latency_ms").unwrap().kind, + EditorFieldKind::Integer + ); } #[test] diff --git a/crates/core/src/plugins/pii_redaction/component.rs b/crates/core/src/plugins/pii_redaction/component.rs index 145adaf0..57f71d66 100644 --- a/crates/core/src/plugins/pii_redaction/component.rs +++ b/crates/core/src/plugins/pii_redaction/component.rs @@ -143,9 +143,21 @@ pub struct BuiltinBackendConfig { /// Regex pattern used when `action = "regex_replace"`. #[serde(default, skip_serializing_if = "Option::is_none")] pub pattern: Option, + /// Built-in detector preset used when you do not want to write a regex. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub detector: Option, /// Replacement text used when `action = "regex_replace"`. #[serde(default, skip_serializing_if = "Option::is_none")] pub replacement: Option, + /// Masking token used when `action = "mask"`. Defaults to `*`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub mask_char: Option, + /// Number of leading characters to keep when `action = "mask"`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub unmasked_prefix: Option, + /// Number of trailing characters to keep when `action = "mask"`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub unmasked_suffix: Option, } /// Local-backend settings for a future in-process local-model runtime. @@ -155,6 +167,18 @@ pub struct LocalBackendConfig { /// Optional local-model backend identifier. #[serde(default, skip_serializing_if = "Option::is_none")] pub backend: Option, + /// Optional model identifier reserved for future local-model runtimes. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub model_id: Option, + /// Optional detector profile reserved for future local-model runtimes. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub detector_profile: Option, + /// Whether a future local-model backend may use network calls. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub allow_network: Option, + /// Target latency budget hint for a future local-model backend. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_latency_ms: Option, } crate::editor_config! { @@ -203,17 +227,30 @@ crate::editor_config! { action => { label: "action", kind: Enum, - values: ["remove", "regex_replace", "hash"], + values: ["remove", "regex_replace", "hash", "mask"], }, target_paths => { label: "target_paths", kind: Json }, pattern => { label: "pattern", kind: String, optional: true }, + detector => { + label: "detector", + kind: Enum, + values: ["email", "phone", "api_key", "ip_address", "url"], + optional: true, + }, replacement => { label: "replacement", kind: String, optional: true }, + mask_char => { label: "mask_char", kind: String, optional: true }, + unmasked_prefix => { label: "unmasked_prefix", kind: Integer, optional: true }, + unmasked_suffix => { label: "unmasked_suffix", kind: Integer, optional: true }, } } crate::editor_config! 
{ impl LocalBackendConfig { backend => { label: "backend", kind: String, optional: true }, + model_id => { label: "model_id", kind: String, optional: true }, + detector_profile => { label: "detector_profile", kind: String, optional: true }, + allow_network => { label: "allow_network", kind: Boolean, optional: true }, + max_latency_ms => { label: "max_latency_ms", kind: Integer, optional: true }, } } @@ -279,7 +316,7 @@ fn builtin_action_schema( ) -> schemars::schema::Schema { string_enum_schema( generator, - &["remove", "regex_replace", "hash"], + &["remove", "regex_replace", "hash", "mask"], Some("remove"), ) } @@ -377,14 +414,29 @@ fn validate_pii_redaction_plugin_config( &config.policy, plugin_config, "builtin", - &["action", "target_paths", "pattern", "replacement"], + &[ + "action", + "target_paths", + "pattern", + "detector", + "replacement", + "mask_char", + "unmasked_prefix", + "unmasked_suffix", + ], ); validate_section_fields( &mut diagnostics, &config.policy, plugin_config, "local", - &["backend"], + &[ + "backend", + "model_id", + "detector_profile", + "allow_network", + "max_latency_ms", + ], ); validate_mode(&mut diagnostics, &config.policy, &config); validate_surface_selection(&mut diagnostics, &config.policy, &config); @@ -500,25 +552,73 @@ fn validate_builtin_action_requirements( return; }; - if !matches!(builtin.action.as_str(), "remove" | "regex_replace" | "hash") { + if !matches!( + builtin.action.as_str(), + "remove" | "regex_replace" | "hash" | "mask" + ) { push_policy_diag( diagnostics, policy.unsupported_value, "pii_redaction.unsupported_value", Some(PII_REDACTION_PLUGIN_KIND.to_string()), Some("builtin.action".to_string()), - "builtin.action must be 'remove', 'regex_replace', or 'hash'".to_string(), + "builtin.action must be 'remove', 'regex_replace', 'hash', or 'mask'".to_string(), ); } - if builtin.action == "regex_replace" && builtin.pattern.is_none() { + if builtin.action == "regex_replace" && builtin.pattern.is_none() && 
builtin.detector.is_none() + { push_policy_diag( diagnostics, policy.unsupported_value, "pii_redaction.unsupported_value", Some(PII_REDACTION_PLUGIN_KIND.to_string()), Some("builtin.pattern".to_string()), - "builtin.pattern is required when builtin.action = 'regex_replace'".to_string(), + "builtin.pattern or builtin.detector is required when builtin.action = 'regex_replace'" + .to_string(), + ); + } + + if builtin.pattern.is_some() && builtin.detector.is_some() { + push_policy_diag( + diagnostics, + policy.unsupported_value, + "pii_redaction.unsupported_value", + Some(PII_REDACTION_PLUGIN_KIND.to_string()), + Some("builtin.detector".to_string()), + "builtin.pattern and builtin.detector cannot both be set".to_string(), + ); + } + + if builtin + .detector + .as_deref() + .is_some_and(|detector| detector_regex_pattern(detector).is_none()) + { + push_policy_diag( + diagnostics, + policy.unsupported_value, + "pii_redaction.unsupported_value", + Some(PII_REDACTION_PLUGIN_KIND.to_string()), + Some("builtin.detector".to_string()), + "builtin.detector must be 'email', 'phone', 'api_key', 'ip_address', or 'url'" + .to_string(), + ); + } + + if builtin.action == "mask" + && builtin + .mask_char + .as_deref() + .is_some_and(|mask_char| mask_char.is_empty()) + { + push_policy_diag( + diagnostics, + policy.unsupported_value, + "pii_redaction.unsupported_value", + Some(PII_REDACTION_PLUGIN_KIND.to_string()), + Some("builtin.mask_char".to_string()), + "builtin.mask_char must not be empty when builtin.action = 'mask'".to_string(), ); } } @@ -600,7 +700,15 @@ struct CompiledBuiltinBackend { #[derive(Clone)] enum BuiltinAction { Remove, - Hash, + Hash { + matcher: Option>, + }, + Mask { + matcher: Option>, + mask_char: Arc, + unmasked_prefix: usize, + unmasked_suffix: usize, + }, RegexReplace { pattern: Arc, replacement: Arc, @@ -620,23 +728,24 @@ impl BuiltinRequestResponseCodec for T where T: LlmCodec + LlmResponseCodec + impl CompiledBuiltinBackend { fn new(config: 
BuiltinBackendConfig, codec_name: Option) -> PluginResult { + let matcher = compile_builtin_matcher(config.pattern.clone(), config.detector.clone())?; let action = match config.action.as_str() { "remove" => BuiltinAction::Remove, - "hash" => BuiltinAction::Hash, + "hash" => BuiltinAction::Hash { matcher }, + "mask" => BuiltinAction::Mask { + matcher, + mask_char: Arc::new(config.mask_char.unwrap_or_else(|| "*".to_string())), + unmasked_prefix: config.unmasked_prefix.unwrap_or(0), + unmasked_suffix: config.unmasked_suffix.unwrap_or(0), + }, "regex_replace" => { - let pattern_text = config.pattern.ok_or_else(|| { + let pattern = matcher.ok_or_else(|| { PluginError::InvalidConfig( - "builtin.pattern is required when builtin.action = 'regex_replace'" - .to_string(), + "builtin.pattern or builtin.detector is required when builtin.action = 'regex_replace'".to_string(), ) })?; - let pattern = Regex::new(&pattern_text).map_err(|err| { - PluginError::InvalidConfig(format!( - "invalid builtin.pattern regex '{pattern_text}': {err}" - )) - })?; BuiltinAction::RegexReplace { - pattern: Arc::new(pattern), + pattern, replacement: Arc::new( config .replacement @@ -728,7 +837,45 @@ impl CompiledBuiltinBackend { fn sanitize_string_value(&self, text: String) -> Option { match &self.action { BuiltinAction::Remove => None, - BuiltinAction::Hash => Some(Json::String(hex_sha256(&text))), + BuiltinAction::Hash { matcher } => Some(Json::String(match matcher { + Some(matcher) => matcher + .replace_all(&text, |captures: ®ex::Captures<'_>| { + hex_sha256( + captures + .get(0) + .map(|capture| capture.as_str()) + .unwrap_or(""), + ) + }) + .into_owned(), + None => hex_sha256(&text), + })), + BuiltinAction::Mask { + matcher, + mask_char, + unmasked_prefix, + unmasked_suffix, + } => Some(Json::String(match matcher { + Some(matcher) => matcher + .replace_all(&text, |captures: ®ex::Captures<'_>| { + mask_text( + captures + .get(0) + .map(|capture| capture.as_str()) + .unwrap_or(""), + 
mask_char.as_str(), + *unmasked_prefix, + *unmasked_suffix, + ) + }) + .into_owned(), + None => mask_text( + &text, + mask_char.as_str(), + *unmasked_prefix, + *unmasked_suffix, + ), + })), BuiltinAction::RegexReplace { pattern, replacement, @@ -809,6 +956,69 @@ fn hex_sha256(text: &str) -> String { output } +fn mask_text( + text: &str, + mask_char: &str, + unmasked_prefix: usize, + unmasked_suffix: usize, +) -> String { + let chars: Vec = text.chars().collect(); + let len = chars.len(); + if len <= unmasked_prefix + unmasked_suffix { + return text.to_string(); + } + + let mut output = String::new(); + for ch in chars.iter().take(unmasked_prefix) { + output.push(*ch); + } + for _ in 0..(len - unmasked_prefix - unmasked_suffix) { + output.push_str(mask_char); + } + for ch in chars.iter().skip(len - unmasked_suffix) { + output.push(*ch); + } + output +} + +fn compile_builtin_matcher( + pattern: Option, + detector: Option, +) -> PluginResult>> { + let pattern_text = match (pattern, detector) { + (Some(pattern), None) => Some(pattern), + (None, Some(detector)) => detector_regex_pattern(&detector).map(str::to_string), + (None, None) => None, + (Some(_), Some(_)) => { + return Err(PluginError::InvalidConfig( + "builtin.pattern and builtin.detector cannot both be set".to_string(), + )); + } + }; + + let Some(pattern_text) = pattern_text else { + return Ok(None); + }; + + let pattern = Regex::new(&pattern_text).map_err(|err| { + PluginError::InvalidConfig(format!( + "invalid builtin matcher regex '{pattern_text}': {err}" + )) + })?; + Ok(Some(Arc::new(pattern))) +} + +fn detector_regex_pattern(detector: &str) -> Option<&'static str> { + match detector { + "email" => Some(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), + "phone" => Some(r"\+?[0-9][0-9()\-\s]{6,}[0-9]"), + "api_key" => Some(r"(?:sk|rk|pk|ak)-[A-Za-z0-9_-]{8,}"), + "ip_address" => Some(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"), + "url" => Some(r"https?://[^\s]+"), + _ => None, + } +} + fn instantiate_builtin_codec( 
codec_name: &str, ) -> PluginResult> { diff --git a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs index 663d9709..0bea53cc 100644 --- a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs +++ b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs @@ -201,6 +201,64 @@ fn validate_rejects_regex_replace_without_pattern() { })); } +#[test] +fn validate_rejects_mask_with_empty_mask_char() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin", + "builtin": { + "action": "mask", + "mask_char": "" + } + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("builtin.mask_char") + && diag.message.contains("must not be empty") + })); +} + +#[test] +fn validate_rejects_builtin_detector_and_pattern_together() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin", + "builtin": { + "action": "mask", + "pattern": "secret", + "detector": "email" + } + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("builtin.detector") + && diag.message.contains("cannot both be set") + })); +} + +#[test] +fn validate_rejects_unknown_builtin_detector() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin", + "builtin": { + "action": "mask", + "detector": "ssn-ish" + } + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("builtin.detector") + && diag.message.contains("must be 'email'") + })); +} + #[test] fn local_backend_provider_is_invoked_for_local_model_mode() { let _guard = 
crate::plugins::pii_redaction::test_mutex().lock().unwrap(); @@ -379,6 +437,124 @@ fn builtin_remove_deletes_object_fields_and_nulls_array_or_root_targets() { clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_mask_preserves_configured_prefix_and_suffix() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": true, + "builtin": { + "action": "mask", + "mask_char": "*", + "unmasked_prefix": 2, + "unmasked_suffix": 2, + "target_paths": ["/account", "/result/token"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-mask-events"); + let handle = tool_call( + ToolCallParams::builder() + .name("lookup") + .args(json!({ + "account": "abcdef1234", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + tool_call_end( + ToolCallEndParams::builder() + .handle(&handle) + .result(json!({ + "result": { + "token": "9876543210", + "public": "ok" + } + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 2); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "account": "ab******34", + "keep": "unchanged" + })) + ); + assert_eq!( + captured_events[1].output(), + Some(&json!({ + "result": { + "token": "98******10", + "public": "ok" + } + })) + ); + + deregister_subscriber("pii-redaction-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_detector_masks_only_matching_substrings() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": 
false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "email", + "mask_char": "*", + "target_paths": ["/message"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-detector-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "message": "Email alice@example.com or bob@example.com", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "message": "Email ***************** or ***************", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-detector-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_backend_sanitizes_llm_start_payload_via_codec_and_reencodes_provider_shape() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); diff --git a/docs/pii-redaction-plugin/about.mdx b/docs/pii-redaction-plugin/about.mdx index 4044ab89..a66943bf 100644 --- a/docs/pii-redaction-plugin/about.mdx +++ b/docs/pii-redaction-plugin/about.mdx @@ -31,6 +31,8 @@ Start here when you need to: them. - Keep privacy behavior inside the same plugin config surface used by other first-party NeMo Relay components. +- Use built-in detector presets for common values such as emails, phone + numbers, URLs, API keys, and IP addresses without writing custom regexes. 
## Current Scope @@ -46,6 +48,7 @@ The current built-in backend supports three actions: - `remove` - `regex_replace` - `hash` +- `mask` The current backend boundary is intentional: diff --git a/docs/pii-redaction-plugin/configuration.mdx b/docs/pii-redaction-plugin/configuration.mdx index 35a8ab49..7495db32 100644 --- a/docs/pii-redaction-plugin/configuration.mdx +++ b/docs/pii-redaction-plugin/configuration.mdx @@ -49,7 +49,7 @@ At least one managed redaction surface must be enabled. | Managed LLM `output` | Supported | Extension point only in this PR | | Managed `tool_input` | Supported | Extension point only in this PR | | Managed `tool_output` | Supported | Extension point only in this PR | -| Built-in actions | `remove`, `regex_replace`, `hash` | N/A | +| Built-in actions | `remove`, `regex_replace`, `hash`, `mask` | N/A | | Codec support | `openai_chat`, `openai_responses`, `anthropic_messages` | Runtime-specific future implementation | | Runtime availability | Any runtime that includes the built-in core plugin | Runtimes that install a local backend provider | @@ -64,8 +64,8 @@ To use `mode = "builtin"`: - `builtin` settings are required. - `codec` is required when `input` or `output` is enabled. -- `builtin.action` must be `remove`, `regex_replace`, or `hash`. -- `builtin.pattern` is required when `builtin.action = "regex_replace"`. +- `builtin.action` must be `remove`, `regex_replace`, `hash`, or `mask`. +- `builtin.pattern` or `builtin.detector` is required when `builtin.action = "regex_replace"`. 
### `plugins.toml` Example @@ -126,7 +126,7 @@ Use the editor when you want to: - choose `builtin` or `local_model` - set the LLM `codec` - edit `builtin` action settings such as `action`, `target_paths`, - `pattern`, and `replacement` + `pattern`, `detector`, `replacement`, and masking fields - edit `local.backend` for a runtime-provided future local-model backend The editor preserves unknown fields when it rewrites an existing @@ -139,10 +139,14 @@ The `builtin` section contains: | Field | Purpose | |---|---| -| `action` | Sanitization action. Current values are `remove`, `regex_replace`, and `hash`. | +| `action` | Sanitization action. Current values are `remove`, `regex_replace`, `hash`, and `mask`. | | `target_paths` | Exact JSON-pointer paths to sanitize. Empty means every matching string leaf. | | `pattern` | Regex pattern used when `action = "regex_replace"`. | +| `detector` | Optional built-in matcher preset. Current values are `email`, `phone`, `api_key`, `ip_address`, and `url`. | | `replacement` | Replacement text used when `action = "regex_replace"`. Defaults to `[REDACTED]`. | +| `mask_char` | Masking token used when `action = "mask"`. Defaults to `*`. | +| `unmasked_prefix` | Leading character count to keep when `action = "mask"`. Defaults to `0`. | +| `unmasked_suffix` | Trailing character count to keep when `action = "mask"`. Defaults to `0`. | ## Action Semantics @@ -161,10 +165,28 @@ When a target matches: `regex_replace` applies the configured regex to matching string leaves and replaces matches with the configured `replacement`. +If you set `detector` instead of `pattern`, the built-in backend uses the +detector's stock matcher regex. + ### `hash` `hash` replaces matching string leaves with their SHA-256 hex digest. +When `pattern` or `detector` is set, `hash` only replaces the matching +substring instead of hashing the entire string leaf. 
+ +### `mask` + +`mask` replaces the middle portion of each matching string leaf with the +configured `mask_char`. + +Use `unmasked_prefix` and `unmasked_suffix` when you want to preserve a small +leading or trailing segment for correlation or debugging, such as the last four +characters of a token. + +When `pattern` or `detector` is set, `mask` only masks matching substrings +inside the string leaf. + ## Path Semantics `target_paths` are exact JSON-pointer matches. @@ -181,6 +203,18 @@ The current implementation also preserves provider-shaped response-path compatibility for the supported codecs, but normalized LLM paths are the recommended contract for new configuration. +## Detector Presets + +The built-in detector presets are intended for common structured values: + +- `email` +- `phone` +- `api_key` +- `ip_address` +- `url` + +They are deterministic regex-backed helpers, not model inference. + If `target_paths` is empty, the built-in backend sanitizes every matching string leaf in the selected payload boundary. 
@@ -204,7 +238,12 @@ That means: In this PR: - the plugin contract accepts `mode = "local_model"` -- the `local` section currently supports `backend` +- the `local` section currently supports: + - `backend` + - `model_id` + - `detector_profile` + - `allow_network` + - `max_latency_ms` - actual behavior depends on a runtime-installed local backend provider Without a provider, runtimes report the local backend as unavailable during From 24e1016564a316238d104ef754fb68a47e8dfe36 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Sun, 7 Jun 2026 08:47:07 -0700 Subject: [PATCH 03/35] feat: add detector-aware mask defaults Signed-off-by: Alex Fournier --- .../src/plugins/pii_redaction/component.rs | 215 +++++++++++++++--- .../plugins/pii_redaction/component_tests.rs | 143 +++++++++++- docs/pii-redaction-plugin/configuration.mdx | 13 +- 3 files changed, 336 insertions(+), 35 deletions(-) diff --git a/crates/core/src/plugins/pii_redaction/component.rs b/crates/core/src/plugins/pii_redaction/component.rs index 57f71d66..a960a6d2 100644 --- a/crates/core/src/plugins/pii_redaction/component.rs +++ b/crates/core/src/plugins/pii_redaction/component.rs @@ -705,9 +705,7 @@ enum BuiltinAction { }, Mask { matcher: Option>, - mask_char: Arc, - unmasked_prefix: usize, - unmasked_suffix: usize, + strategy: BuiltinMaskStrategy, }, RegexReplace { pattern: Arc, @@ -715,6 +713,28 @@ enum BuiltinAction { }, } +#[derive(Clone)] +enum BuiltinMaskStrategy { + Generic { + mask_char: Arc, + unmasked_prefix: usize, + unmasked_suffix: usize, + }, + DetectorDefault { + detector: BuiltinDetector, + mask_char: Arc, + }, +} + +#[derive(Clone, Copy)] +enum BuiltinDetector { + Email, + Phone, + ApiKey, + IpAddress, + Url, +} + #[derive(Clone, Copy)] enum BuiltinCodecName { OpenAIChat, @@ -728,15 +748,18 @@ impl BuiltinRequestResponseCodec for T where T: LlmCodec + LlmResponseCodec + impl CompiledBuiltinBackend { fn new(config: BuiltinBackendConfig, codec_name: Option) -> PluginResult { - let matcher = 
compile_builtin_matcher(config.pattern.clone(), config.detector.clone())?; + let detector = config + .detector + .as_deref() + .map(BuiltinDetector::parse) + .transpose()?; + let matcher = compile_builtin_matcher(config.pattern.clone(), detector)?; let action = match config.action.as_str() { "remove" => BuiltinAction::Remove, "hash" => BuiltinAction::Hash { matcher }, "mask" => BuiltinAction::Mask { matcher, - mask_char: Arc::new(config.mask_char.unwrap_or_else(|| "*".to_string())), - unmasked_prefix: config.unmasked_prefix.unwrap_or(0), - unmasked_suffix: config.unmasked_suffix.unwrap_or(0), + strategy: build_mask_strategy(&config, detector), }, "regex_replace" => { let pattern = matcher.ok_or_else(|| { @@ -850,31 +873,19 @@ impl CompiledBuiltinBackend { .into_owned(), None => hex_sha256(&text), })), - BuiltinAction::Mask { - matcher, - mask_char, - unmasked_prefix, - unmasked_suffix, - } => Some(Json::String(match matcher { + BuiltinAction::Mask { matcher, strategy } => Some(Json::String(match matcher { Some(matcher) => matcher .replace_all(&text, |captures: &regex::Captures<'_>| { - mask_text( + mask_with_strategy( captures .get(0) .map(|capture| capture.as_str()) .unwrap_or(""), - mask_char.as_str(), - *unmasked_prefix, - *unmasked_suffix, + strategy, ) }) .into_owned(), - None => mask_text( - &text, - mask_char.as_str(), - *unmasked_prefix, - *unmasked_suffix, - ), + None => mask_with_strategy(&text, strategy), })), BuiltinAction::RegexReplace { pattern, @@ -981,13 +992,47 @@ fn mask_text( output } +fn build_mask_strategy( + config: &BuiltinBackendConfig, + detector: Option<BuiltinDetector>, +) -> BuiltinMaskStrategy { + let mask_char = Arc::new(config.mask_char.clone().unwrap_or_else(|| "*".to_string())); + match detector { + Some(detector) if config.unmasked_prefix.is_none() && config.unmasked_suffix.is_none() => { + BuiltinMaskStrategy::DetectorDefault { + detector, + mask_char, + } + } + _ => BuiltinMaskStrategy::Generic { + mask_char, + unmasked_prefix:
config.unmasked_prefix.unwrap_or(0), + unmasked_suffix: config.unmasked_suffix.unwrap_or(0), + }, + } +} + +fn mask_with_strategy(text: &str, strategy: &BuiltinMaskStrategy) -> String { + match strategy { + BuiltinMaskStrategy::Generic { + mask_char, + unmasked_prefix, + unmasked_suffix, + } => mask_text(text, mask_char.as_str(), *unmasked_prefix, *unmasked_suffix), + BuiltinMaskStrategy::DetectorDefault { + detector, + mask_char, + } => detector.default_mask(text, mask_char.as_str()), + } +} + fn compile_builtin_matcher( pattern: Option, - detector: Option, + detector: Option, ) -> PluginResult>> { let pattern_text = match (pattern, detector) { (Some(pattern), None) => Some(pattern), - (None, Some(detector)) => detector_regex_pattern(&detector).map(str::to_string), + (None, Some(detector)) => Some(detector.regex_pattern().to_string()), (None, None) => None, (Some(_), Some(_)) => { return Err(PluginError::InvalidConfig( @@ -1009,14 +1054,120 @@ fn compile_builtin_matcher( } fn detector_regex_pattern(detector: &str) -> Option<&'static str> { - match detector { - "email" => Some(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), - "phone" => Some(r"\+?[0-9][0-9()\-\s]{6,}[0-9]"), - "api_key" => Some(r"(?:sk|rk|pk|ak)-[A-Za-z0-9_-]{8,}"), - "ip_address" => Some(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"), - "url" => Some(r"https?://[^\s]+"), - _ => None, + BuiltinDetector::parse(detector) + .ok() + .map(BuiltinDetector::regex_pattern) +} + +impl BuiltinDetector { + fn parse(value: &str) -> PluginResult { + match value { + "email" => Ok(Self::Email), + "phone" => Ok(Self::Phone), + "api_key" => Ok(Self::ApiKey), + "ip_address" => Ok(Self::IpAddress), + "url" => Ok(Self::Url), + other => Err(PluginError::InvalidConfig(format!( + "unsupported builtin.detector '{other}'" + ))), + } + } + + fn regex_pattern(self) -> &'static str { + match self { + Self::Email => r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", + Self::Phone => r"\+?[0-9][0-9()\-\s]{6,}[0-9]", + Self::ApiKey => 
r"(?:sk|rk|pk|ak)-[A-Za-z0-9_-]{8,}", + Self::IpAddress => r"\b(?:\d{1,3}\.){3}\d{1,3}\b", + Self::Url => r"https?://[^\s]+", + } } + + fn default_mask(self, text: &str, mask_char: &str) -> String { + match self { + Self::Email => mask_email(text, mask_char), + Self::Phone => mask_phone(text, mask_char), + Self::ApiKey => mask_api_key(text, mask_char), + Self::IpAddress => mask_ip_address(text, mask_char), + Self::Url => mask_url(text, mask_char), + } + } +} + +fn mask_email(text: &str, mask_char: &str) -> String { + let Some((local, domain)) = text.split_once('@') else { + return mask_text(text, mask_char, 0, 0); + }; + + let local_chars: Vec = local.chars().collect(); + if local_chars.len() <= 1 { + return text.to_string(); + } + + let mut output = String::new(); + output.push(local_chars[0]); + for _ in 1..local_chars.len() { + output.push_str(mask_char); + } + output.push('@'); + output.push_str(domain); + output +} + +fn mask_phone(text: &str, mask_char: &str) -> String { + let total_digits = text.chars().filter(|ch| ch.is_ascii_digit()).count(); + if total_digits <= 4 { + return text.to_string(); + } + + let mut masked_digits_remaining = total_digits - 4; + let mut output = String::with_capacity(text.len()); + for ch in text.chars() { + if ch.is_ascii_digit() { + if masked_digits_remaining > 0 { + output.push_str(mask_char); + masked_digits_remaining -= 1; + } else { + output.push(ch); + } + } else { + output.push(ch); + } + } + output +} + +fn mask_api_key(text: &str, mask_char: &str) -> String { + let prefix = text.find('-').map_or(0, |idx| idx + 1); + mask_text(text, mask_char, prefix, 4) +} + +fn mask_ip_address(text: &str, mask_char: &str) -> String { + let mut octets = text.split('.').collect::>(); + if octets.len() != 4 { + return mask_text(text, mask_char, 0, 0); + } + + for octet in octets.iter_mut().take(3) { + *octet = "***"; + } + octets.join(".") +} + +fn mask_url(text: &str, mask_char: &str) -> String { + let Some(scheme_idx) = text.find("://") 
else { + return mask_text(text, mask_char, 0, 0); + }; + let prefix_end = scheme_idx + 3; + let remainder = &text[prefix_end..]; + let Some(path_idx) = remainder.find('/') else { + return text.to_string(); + }; + + let mut output = String::with_capacity(text.len()); + output.push_str(&text[..prefix_end + path_idx + 1]); + output.push_str(mask_char); + output } fn instantiate_builtin_codec( diff --git a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs index 0bea53cc..9d4bf93b 100644 --- a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs +++ b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs @@ -546,7 +546,7 @@ fn builtin_mask_with_detector_masks_only_matching_substrings() { assert_eq!( captured_events[0].input(), Some(&json!({ - "message": "Email ***************** or ***************", + "message": "Email a****@example.com or b**@example.com", "keep": "unchanged" })) ); @@ -555,6 +555,147 @@ fn builtin_mask_with_detector_masks_only_matching_substrings() { clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_mask_with_email_detector_preserves_domain_by_default() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "email", + "target_paths": ["/contact"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-email-default-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "contact": "alice@example.com", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + 
assert_eq!( + captured_events[0].input(), + Some(&json!({ + "contact": "a****@example.com", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-email-default-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_phone_detector_preserves_last_four_digits_by_default() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "phone", + "target_paths": ["/phone"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-phone-default-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "phone": "+1 (555) 123-4567", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "phone": "+* (***) ***-4567", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-phone-default-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_api_key_detector_preserves_prefix_and_last_four_by_default() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "api_key", + "target_paths": ["/api_key"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-api-key-default-mask-events"); + let 
_handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "api_key": "sk-abcdef123456", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "api_key": "sk-********3456", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-api-key-default-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_backend_sanitizes_llm_start_payload_via_codec_and_reencodes_provider_shape() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); diff --git a/docs/pii-redaction-plugin/configuration.mdx b/docs/pii-redaction-plugin/configuration.mdx index 7495db32..dcbe4cad 100644 --- a/docs/pii-redaction-plugin/configuration.mdx +++ b/docs/pii-redaction-plugin/configuration.mdx @@ -145,8 +145,8 @@ The `builtin` section contains: | `detector` | Optional built-in matcher preset. Current values are `email`, `phone`, `api_key`, `ip_address`, and `url`. | | `replacement` | Replacement text used when `action = "regex_replace"`. Defaults to `[REDACTED]`. | | `mask_char` | Masking token used when `action = "mask"`. Defaults to `*`. | -| `unmasked_prefix` | Leading character count to keep when `action = "mask"`. Defaults to `0`. | -| `unmasked_suffix` | Trailing character count to keep when `action = "mask"`. Defaults to `0`. | +| `unmasked_prefix` | Leading character count to keep when `action = "mask"`. Defaults to `0`, unless a detector-specific masking preset is active. | +| `unmasked_suffix` | Trailing character count to keep when `action = "mask"`. Defaults to `0`, unless a detector-specific masking preset is active. | ## Action Semantics @@ -187,6 +187,15 @@ characters of a token. When `pattern` or `detector` is set, `mask` only masks matching substrings inside the string leaf. 
+When `detector` is set and you do not specify `unmasked_prefix` or +`unmasked_suffix`, the built-in backend applies detector-aware defaults: + +- `email`: preserves the domain and the first local-part character +- `phone`: preserves the last four digits while keeping separators intact +- `api_key`: preserves the vendor-style prefix such as `sk-` and the last four characters +- `ip_address`: preserves the last octet +- `url`: preserves the scheme and host, then collapses the path/query tail + ## Path Semantics `target_paths` are exact JSON-pointer matches. From 6f6fbd27f0025ab51755904c7ea55ebe7d0a1a54 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Mon, 8 Jun 2026 07:39:32 -0700 Subject: [PATCH 04/35] test: expand pii redaction edge case coverage Signed-off-by: Alex Fournier --- .../plugins/pii_redaction/component_tests.rs | 620 ++++++++++++++++++ 1 file changed, 620 insertions(+) diff --git a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs index 9d4bf93b..45f7e435 100644 --- a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs +++ b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs @@ -696,6 +696,626 @@ fn builtin_mask_with_api_key_detector_preserves_prefix_and_last_four_by_default( clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_mask_with_detector_uses_explicit_prefix_suffix_over_defaults() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "email", + "unmasked_prefix": 2, + "unmasked_suffix": 2, + "target_paths": ["/contact"] + } + })))) + .unwrap(); + + let events = 
capture_events("pii-redaction-detector-explicit-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "contact": "alice@example.com", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "contact": "al*************om", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-detector-explicit-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_ip_address_detector_preserves_last_octet_by_default() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "ip_address", + "target_paths": ["/ip"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-ip-default-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "ip": "192.168.10.42", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "ip": "***.***.***.42", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-ip-default-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_url_detector_preserves_scheme_and_host_by_default() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + 
"mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "url", + "target_paths": ["/url"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-url-default-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "url": "https://example.com/path?q=1", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "url": "https://example.com/*", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-url-default-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_hash_with_detector_hashes_only_matching_substrings() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "hash", + "detector": "email", + "target_paths": ["/message"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-detector-hash-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "message": "Email alice@example.com please", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "message": format!( + "Email {} please", + hex_sha256("alice@example.com") + ), + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-detector-hash-events").unwrap(); + 
clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_short_detector_match_leaves_value_unchanged() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "email", + "target_paths": ["/contact"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-short-detector-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "contact": "a@example.com", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "contact": "a@example.com", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-short-detector-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_empty_target_paths_sanitizes_all_matching_string_leaves() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "email" + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-empty-target-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "primary": "alice@example.com", + "nested": { + "secondary": "bob@example.com", + "note": "no pii here" + }, + "items": ["carol@example.com", "safe text"] + })) + 
.build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "primary": "a****@example.com", + "nested": { + "secondary": "b**@example.com", + "note": "no pii here" + }, + "items": ["c****@example.com", "safe text"] + })) + ); + + deregister_subscriber("pii-redaction-empty-target-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_malformed_ip_or_url_detector_input_leaves_value_unchanged() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "ip_address", + "target_paths": ["/ip", "/url"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-malformed-detector-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "ip": "not-an-ip", + "url": "mailto:alice@example.com", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "ip": "not-an-ip", + "url": "mailto:alice@example.com", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-malformed-detector-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[tokio::test] +async fn builtin_mask_with_detector_sanitizes_llm_response_from_normalized_message_path() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": 
"openai_chat", + "input": false, + "output": true, + "tool_input": false, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "email", + "target_paths": ["/message"] + } + }))) + .await + .unwrap(); + + let events = capture_events("pii-redaction-detector-llm-response-events"); + let response_codec: Arc = Arc::new(OpenAIChatCodec); + + let _ = llm_call_execute( + LlmCallExecuteParams::builder() + .name("openai") + .request(LlmRequest { + headers: serde_json::Map::new(), + content: json!({"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "hello"}]}), + }) + .func(noop_openai_chat_exec_fn(json!({ + "id": "chatcmpl-123", + "model": "gpt-4o-mini", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "Reach me at alice@example.com"}, + "finish_reason": "stop" + } + ] + }))) + .response_codec(response_codec) + .build(), + ) + .await + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!( + captured_events[1].output().unwrap()["choices"][0]["message"]["content"], + json!("Reach me at a****@example.com") + ); + assert_eq!( + captured_events[1] + .annotated_response() + .and_then(|response| response.response_text()), + Some("Reach me at a****@example.com") + ); + + deregister_subscriber("pii-redaction-detector-llm-response-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_hash_with_detector_hashes_multiple_matches_in_one_string() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "hash", + "detector": "email", + "target_paths": ["/message"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-multi-detector-hash-events"); + let 
_handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "message": "alice@example.com and bob@example.com", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "message": format!( + "{} and {}", + hex_sha256("alice@example.com"), + hex_sha256("bob@example.com") + ), + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-multi-detector-hash-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_empty_target_paths_handles_arrays_and_multiple_detector_types() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "url" + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-array-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "items": [ + "https://example.com/a", + "safe text", + {"nested": "http://nvidia.com/private/path"}, + 42 + ], + "keep": "mailto:alice@example.com" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "items": [ + "https://example.com/*", + "safe text", + {"nested": "http://nvidia.com/*"}, + 42 + ], + "keep": "mailto:alice@example.com" + })) + ); + + deregister_subscriber("pii-redaction-array-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_detector_sanitizes_tool_output_payloads() { + let _guard = 
crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": false, + "tool_output": true, + "builtin": { + "action": "mask", + "detector": "email", + "target_paths": ["/result/contact"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-tool-output-mask-events"); + let handle = tool_call( + ToolCallParams::builder() + .name("lookup") + .args(json!({"query": "alice"})) + .build(), + ) + .unwrap(); + tool_call_end( + ToolCallEndParams::builder() + .handle(&handle) + .result(json!({ + "result": { + "contact": "alice@example.com", + "public": "ok" + } + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 2); + assert_eq!( + captured_events[1].output(), + Some(&json!({ + "result": { + "contact": "a****@example.com", + "public": "ok" + } + })) + ); + + deregister_subscriber("pii-redaction-tool-output-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_phone_detector_ignores_non_matching_digit_shapes() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "phone", + "target_paths": ["/value"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-phone-false-positive-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "value": "Order 12345 is ready", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events 
= captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "value": "Order 12345 is ready", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-phone-false-positive-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_backend_sanitizes_llm_start_payload_via_codec_and_reencodes_provider_shape() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); From 88fb1e5bf936392d8219d0948dbf976ab1daf562 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Mon, 8 Jun 2026 08:06:47 -0700 Subject: [PATCH 05/35] feat: add deterministic secret detector presets Signed-off-by: Alex Fournier --- crates/cli/tests/coverage/plugins_tests.rs | 14 + .../src/plugins/pii_redaction/component.rs | 154 ++------ .../src/plugins/pii_redaction/detectors.rs | 341 ++++++++++++++++++ .../plugins/pii_redaction/component_tests.rs | 269 +++++++++++++- docs/pii-redaction-plugin/configuration.mdx | 29 +- 5 files changed, 673 insertions(+), 134 deletions(-) create mode 100644 crates/core/src/plugins/pii_redaction/detectors.rs diff --git a/crates/cli/tests/coverage/plugins_tests.rs b/crates/cli/tests/coverage/plugins_tests.rs index ab39e0ef..b69ceed7 100644 --- a/crates/cli/tests/coverage/plugins_tests.rs +++ b/crates/cli/tests/coverage/plugins_tests.rs @@ -248,6 +248,20 @@ fn typed_editor_model_contains_pii_redaction_options() { builtin.field("detector").unwrap().kind, EditorFieldKind::Enum ); + assert!( + builtin + .field("detector") + .unwrap() + .enum_values + .contains(&"jwt") + ); + assert!( + builtin + .field("detector") + .unwrap() + .enum_values + .contains(&"aws_access_key_id") + ); assert_eq!( builtin.field("replacement").unwrap().kind, EditorFieldKind::String diff --git a/crates/core/src/plugins/pii_redaction/component.rs b/crates/core/src/plugins/pii_redaction/component.rs index a960a6d2..3f825208 100644 --- 
a/crates/core/src/plugins/pii_redaction/component.rs +++ b/crates/core/src/plugins/pii_redaction/component.rs @@ -32,6 +32,10 @@ mod local; use local::register_local_backend; pub use local::{clear_local_backend_provider, register_local_backend_provider}; +#[path = "detectors.rs"] +mod detectors; +use detectors::{BuiltinDetector, detector_regex_pattern, supported_detector_summary}; + /// The plugin kind reserved for the built-in privacy component. pub const PII_REDACTION_PLUGIN_KIND: &str = "pii_redaction"; @@ -234,7 +238,22 @@ crate::editor_config! { detector => { label: "detector", kind: Enum, - values: ["email", "phone", "api_key", "ip_address", "url"], + values: [ + "email", + "phone", + "api_key", + "ip_address", + "ipv6", + "url", + "uuid", + "bearer_token", + "jwt", + "credit_card", + "aws_access_key_id", + "aws_secret_access_key", + "gcp_api_key", + "azure_storage_account_key", + ], optional: true, }, replacement => { label: "replacement", kind: String, optional: true }, @@ -601,8 +620,10 @@ fn validate_builtin_action_requirements( "pii_redaction.unsupported_value", Some(PII_REDACTION_PLUGIN_KIND.to_string()), Some("builtin.detector".to_string()), - "builtin.detector must be 'email', 'phone', 'api_key', 'ip_address', or 'url'" - .to_string(), + format!( + "builtin.detector must be one of the supported built-in detector presets ({})", + supported_detector_summary() + ), ); } @@ -726,15 +747,6 @@ enum BuiltinMaskStrategy { }, } -#[derive(Clone, Copy)] -enum BuiltinDetector { - Email, - Phone, - ApiKey, - IpAddress, - Url, -} - #[derive(Clone, Copy)] enum BuiltinCodecName { OpenAIChat, @@ -1052,124 +1064,6 @@ fn compile_builtin_matcher( })?; Ok(Some(Arc::new(pattern))) } - -fn detector_regex_pattern(detector: &str) -> Option<&'static str> { - BuiltinDetector::parse(detector) - .ok() - .map(BuiltinDetector::regex_pattern) -} - -impl BuiltinDetector { - fn parse(value: &str) -> PluginResult { - match value { - "email" => Ok(Self::Email), - "phone" => 
Ok(Self::Phone), - "api_key" => Ok(Self::ApiKey), - "ip_address" => Ok(Self::IpAddress), - "url" => Ok(Self::Url), - other => Err(PluginError::InvalidConfig(format!( - "unsupported builtin.detector '{other}'" - ))), - } - } - - fn regex_pattern(self) -> &'static str { - match self { - Self::Email => r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", - Self::Phone => r"\+?[0-9][0-9()\-\s]{6,}[0-9]", - Self::ApiKey => r"(?:sk|rk|pk|ak)-[A-Za-z0-9_-]{8,}", - Self::IpAddress => r"\b(?:\d{1,3}\.){3}\d{1,3}\b", - Self::Url => r"https?://[^\s]+", - } - } - - fn default_mask(self, text: &str, mask_char: &str) -> String { - match self { - Self::Email => mask_email(text, mask_char), - Self::Phone => mask_phone(text, mask_char), - Self::ApiKey => mask_api_key(text, mask_char), - Self::IpAddress => mask_ip_address(text, mask_char), - Self::Url => mask_url(text, mask_char), - } - } -} - -fn mask_email(text: &str, mask_char: &str) -> String { - let Some((local, domain)) = text.split_once('@') else { - return mask_text(text, mask_char, 0, 0); - }; - - let local_chars: Vec = local.chars().collect(); - if local_chars.len() <= 1 { - return text.to_string(); - } - - let mut output = String::new(); - output.push(local_chars[0]); - for _ in 1..local_chars.len() { - output.push_str(mask_char); - } - output.push('@'); - output.push_str(domain); - output -} - -fn mask_phone(text: &str, mask_char: &str) -> String { - let total_digits = text.chars().filter(|ch| ch.is_ascii_digit()).count(); - if total_digits <= 4 { - return text.to_string(); - } - - let mut masked_digits_remaining = total_digits - 4; - let mut output = String::with_capacity(text.len()); - for ch in text.chars() { - if ch.is_ascii_digit() { - if masked_digits_remaining > 0 { - output.push_str(mask_char); - masked_digits_remaining -= 1; - } else { - output.push(ch); - } - } else { - output.push(ch); - } - } - output -} - -fn mask_api_key(text: &str, mask_char: &str) -> String { - let prefix = text.find('-').map_or(0, |idx| 
idx + 1); - mask_text(text, mask_char, prefix, 4) -} - -fn mask_ip_address(text: &str, mask_char: &str) -> String { - let mut octets = text.split('.').collect::>(); - if octets.len() != 4 { - return mask_text(text, mask_char, 0, 0); - } - - for octet in octets.iter_mut().take(3) { - *octet = "***"; - } - octets.join(".") -} - -fn mask_url(text: &str, mask_char: &str) -> String { - let Some(scheme_idx) = text.find("://") else { - return mask_text(text, mask_char, 0, 0); - }; - let prefix_end = scheme_idx + 3; - let remainder = &text[prefix_end..]; - let Some(path_idx) = remainder.find('/') else { - return text.to_string(); - }; - - let mut output = String::with_capacity(text.len()); - output.push_str(&text[..prefix_end + path_idx + 1]); - output.push_str(mask_char); - output -} - fn instantiate_builtin_codec( codec_name: &str, ) -> PluginResult> { diff --git a/crates/core/src/plugins/pii_redaction/detectors.rs b/crates/core/src/plugins/pii_redaction/detectors.rs new file mode 100644 index 00000000..90f9350f --- /dev/null +++ b/crates/core/src/plugins/pii_redaction/detectors.rs @@ -0,0 +1,341 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use crate::plugin::PluginError; + +use super::mask_text; + +#[derive(Clone, Copy, PartialEq, Eq)] +pub(super) enum BuiltinDetector { + Email, + Phone, + ApiKey, + IpAddress, + Ipv6, + Url, + Uuid, + BearerToken, + Jwt, + CreditCard, + AwsAccessKeyId, + AwsSecretAccessKey, + GcpApiKey, + AzureStorageAccountKey, +} + +#[derive(Clone, Copy, PartialEq, Eq)] +enum BuiltinDetectorCategory { + CommonPii, + StructuredSecret, + CloudCredential, +} + +#[derive(Clone, Copy)] +struct BuiltinDetectorSpec { + detector: BuiltinDetector, + name: &'static str, + category: BuiltinDetectorCategory, + regex_pattern: &'static str, +} + +const BUILTIN_DETECTOR_SPECS: &[BuiltinDetectorSpec] = &[ + BuiltinDetectorSpec { + detector: BuiltinDetector::Email, + name: "email", + category: BuiltinDetectorCategory::CommonPii, + regex_pattern: r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::Phone, + name: "phone", + category: BuiltinDetectorCategory::CommonPii, + regex_pattern: r"\+?[0-9][0-9()\-\s]{6,}[0-9]", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::ApiKey, + name: "api_key", + category: BuiltinDetectorCategory::StructuredSecret, + regex_pattern: r"(?:sk|rk|pk|ak)-[A-Za-z0-9_-]{8,}", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::IpAddress, + name: "ip_address", + category: BuiltinDetectorCategory::CommonPii, + regex_pattern: r"\b(?:\d{1,3}\.){3}\d{1,3}\b", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::Ipv6, + name: "ipv6", + category: BuiltinDetectorCategory::CommonPii, + regex_pattern: r"\b(?:[A-Fa-f0-9]{1,4}:){2,7}[A-Fa-f0-9]{1,4}\b", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::Url, + name: "url", + category: BuiltinDetectorCategory::CommonPii, + regex_pattern: r"https?://[^\s]+", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::Uuid, + name: "uuid", + category: BuiltinDetectorCategory::StructuredSecret, + regex_pattern: 
r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-8][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\b", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::BearerToken, + name: "bearer_token", + category: BuiltinDetectorCategory::StructuredSecret, + regex_pattern: r"(?i)\bBearer\s+[A-Za-z0-9._~+/\-]+=*\b", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::Jwt, + name: "jwt", + category: BuiltinDetectorCategory::StructuredSecret, + regex_pattern: r"\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::CreditCard, + name: "credit_card", + category: BuiltinDetectorCategory::StructuredSecret, + regex_pattern: r"\b(?:\d[ -]?){13,19}\b", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::AwsAccessKeyId, + name: "aws_access_key_id", + category: BuiltinDetectorCategory::CloudCredential, + regex_pattern: r"\b(?:A3T[A-Z0-9]|AKIA|ASIA|ABIA|ACCA|AGPA|AIDA|AIPA|ANPA|ANVA|APKA|AROA|AUSA)[A-Z0-9]{16}\b", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::AwsSecretAccessKey, + name: "aws_secret_access_key", + category: BuiltinDetectorCategory::CloudCredential, + regex_pattern: r"\b[A-Za-z0-9/+=]{40}\b", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::GcpApiKey, + name: "gcp_api_key", + category: BuiltinDetectorCategory::CloudCredential, + regex_pattern: r"\bAIza[0-9A-Za-z\-_]{35}\b", + }, + BuiltinDetectorSpec { + detector: BuiltinDetector::AzureStorageAccountKey, + name: "azure_storage_account_key", + category: BuiltinDetectorCategory::CloudCredential, + regex_pattern: r"\b[A-Za-z0-9+/]{86}==\b", + }, +]; + +impl BuiltinDetector { + pub(super) fn parse(value: &str) -> Result { + BUILTIN_DETECTOR_SPECS + .iter() + .find(|spec| spec.name == value) + .map(|spec| spec.detector) + .ok_or_else(|| { + PluginError::InvalidConfig(format!("unsupported builtin.detector '{value}'")) + }) + } + + fn spec(self) -> &'static BuiltinDetectorSpec { + BUILTIN_DETECTOR_SPECS + .iter() + .find(|spec| 
spec.detector == self) + .expect("every builtin detector must have a metadata spec") + } + + pub(super) fn regex_pattern(self) -> &'static str { + self.spec().regex_pattern + } + + pub(super) fn default_mask(self, text: &str, mask_char: &str) -> String { + match self { + Self::Email => mask_email(text, mask_char), + Self::Phone => mask_phone(text, mask_char), + Self::ApiKey => mask_api_key(text, mask_char), + Self::IpAddress => mask_ip_address(text, mask_char), + Self::Ipv6 => mask_ipv6(text, mask_char), + Self::Url => mask_url(text, mask_char), + Self::Uuid => mask_text(text, mask_char, 0, 4), + Self::BearerToken => mask_bearer_token(text, mask_char), + Self::Jwt => mask_jwt(text, mask_char), + Self::CreditCard => mask_credit_card(text, mask_char), + Self::AwsAccessKeyId => mask_text(text, mask_char, 4, 4), + Self::AwsSecretAccessKey => mask_text(text, mask_char, 0, 4), + Self::GcpApiKey => mask_text(text, mask_char, 6, 4), + Self::AzureStorageAccountKey => mask_text(text, mask_char, 0, 4), + } + } +} + +pub(super) fn detector_regex_pattern(detector: &str) -> Option<&'static str> { + BuiltinDetector::parse(detector) + .ok() + .map(BuiltinDetector::regex_pattern) +} + +fn supported_detector_names_for_category( + category: BuiltinDetectorCategory, +) -> impl Iterator { + BUILTIN_DETECTOR_SPECS + .iter() + .filter(move |spec| spec.category == category) + .map(|spec| spec.name) +} + +pub(super) fn supported_detector_summary() -> String { + let common = supported_detector_names_for_category(BuiltinDetectorCategory::CommonPii) + .collect::>() + .join(", "); + let structured = + supported_detector_names_for_category(BuiltinDetectorCategory::StructuredSecret) + .collect::>() + .join(", "); + let cloud = supported_detector_names_for_category(BuiltinDetectorCategory::CloudCredential) + .collect::>() + .join(", "); + format!("common PII: {common}; structured secrets: {structured}; cloud credentials: {cloud}") +} + +fn mask_email(text: &str, mask_char: &str) -> String { + let 
Some((local, domain)) = text.split_once('@') else { + return mask_text(text, mask_char, 0, 0); + }; + + let local_chars: Vec = local.chars().collect(); + if local_chars.len() <= 1 { + return text.to_string(); + } + + let mut output = String::new(); + output.push(local_chars[0]); + for _ in 1..local_chars.len() { + output.push_str(mask_char); + } + output.push('@'); + output.push_str(domain); + output +} + +fn mask_phone(text: &str, mask_char: &str) -> String { + let total_digits = text.chars().filter(|ch| ch.is_ascii_digit()).count(); + if total_digits <= 4 { + return text.to_string(); + } + + let mut masked_digits_remaining = total_digits - 4; + let mut output = String::with_capacity(text.len()); + for ch in text.chars() { + if ch.is_ascii_digit() { + if masked_digits_remaining > 0 { + output.push_str(mask_char); + masked_digits_remaining -= 1; + } else { + output.push(ch); + } + } else { + output.push(ch); + } + } + output +} + +fn mask_api_key(text: &str, mask_char: &str) -> String { + let prefix = text.find('-').map_or(0, |idx| idx + 1); + mask_text(text, mask_char, prefix, 4) +} + +fn mask_ip_address(text: &str, mask_char: &str) -> String { + let mut octets = text.split('.').collect::>(); + if octets.len() != 4 { + return mask_text(text, mask_char, 0, 0); + } + + for octet in octets.iter_mut().take(3) { + *octet = "***"; + } + octets.join(".") +} + +fn mask_ipv6(text: &str, mask_char: &str) -> String { + let mut segments = text.split(':').collect::>(); + if segments.len() < 3 { + return mask_text(text, mask_char, 0, 0); + } + + let visible_tail_start = segments.len().saturating_sub(1); + for segment in segments.iter_mut().take(visible_tail_start) { + if !segment.is_empty() { + *segment = "****"; + } + } + segments.join(":") +} + +fn mask_url(text: &str, mask_char: &str) -> String { + let Some(scheme_idx) = text.find("://") else { + return mask_text(text, mask_char, 0, 0); + }; + let prefix_end = scheme_idx + 3; + let remainder = &text[prefix_end..]; + let 
Some(path_idx) = remainder.find('/') else { + return text.to_string(); + }; + + let mut output = String::with_capacity(text.len()); + output.push_str(&text[..prefix_end + path_idx + 1]); + output.push_str(mask_char); + output +} + +fn mask_bearer_token(text: &str, mask_char: &str) -> String { + let Some((scheme, token)) = text.split_once(char::is_whitespace) else { + return mask_text(text, mask_char, 0, 4); + }; + let trimmed = token.trim_start(); + if trimmed.is_empty() { + return text.to_string(); + } + + let mut output = String::new(); + output.push_str(scheme); + output.push(' '); + output.push_str(&mask_text(trimmed, mask_char, 0, 4)); + output +} + +fn mask_jwt(text: &str, mask_char: &str) -> String { + let parts = text.split('.').collect::>(); + if parts.len() != 3 { + return mask_text(text, mask_char, 0, 6); + } + + format!( + "{}.{}.{}", + parts[0], + mask_text(parts[1], mask_char, 0, 0), + mask_text(parts[2], mask_char, 0, 6) + ) +} + +fn mask_credit_card(text: &str, mask_char: &str) -> String { + let total_digits = text.chars().filter(|ch| ch.is_ascii_digit()).count(); + if total_digits <= 4 { + return text.to_string(); + } + + let mut masked_digits_remaining = total_digits - 4; + let mut output = String::with_capacity(text.len()); + for ch in text.chars() { + if ch.is_ascii_digit() { + if masked_digits_remaining > 0 { + output.push_str(mask_char); + masked_digits_remaining -= 1; + } else { + output.push(ch); + } + } else { + output.push(ch); + } + } + output +} diff --git a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs index 45f7e435..10b63d76 100644 --- a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs +++ b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs @@ -255,7 +255,7 @@ fn validate_rejects_unknown_builtin_detector() { assert!(report.diagnostics.iter().any(|diag| { diag.field.as_deref() == Some("builtin.detector") - && 
diag.message.contains("must be 'email'") + && diag.message.contains("supported built-in detector presets") })); } @@ -839,6 +839,273 @@ fn builtin_mask_with_url_detector_preserves_scheme_and_host_by_default() { clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_mask_with_ipv6_detector_preserves_last_segment_by_default() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "ipv6", + "target_paths": ["/ip"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-ipv6-default-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "ip": "****:****:****:****:****:****:****:7334", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-ipv6-default-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_bearer_token_detector_preserves_scheme_and_last_four() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "bearer_token", + "target_paths": ["/auth"] + } + })))) + .unwrap(); + + let events = 
capture_events("pii-redaction-bearer-default-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "auth": "Bearer token-value-1234", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "auth": "Bearer ************1234", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-bearer-default-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_credit_card_detector_preserves_last_four_digits() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "credit_card", + "target_paths": ["/card"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-credit-card-default-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "card": "4111 1111 1111 1234", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "card": "**** **** **** 1234", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-credit-card-default-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_jwt_detector_preserves_header_and_signature_tail() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + let jwt = 
"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.cGF5bG9hZA.signaturetail"; + let expected_jwt = { + let parts = jwt.split('.').collect::>(); + format!( + "{}.{}.{}", + parts[0], + mask_text(parts[1], "*", 0, 0), + mask_text(parts[2], "*", 0, 6) + ) + }; + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "jwt", + "target_paths": ["/token"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-jwt-default-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "token": jwt, + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "token": expected_jwt, + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-jwt-default-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + +#[test] +fn builtin_mask_with_cloud_key_detectors_preserves_expected_segments() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "aws_access_key_id", + "target_paths": ["/key"] + } + })))) + .unwrap(); + let events = capture_events("pii-redaction-aws-access-key-mask-events"); + let aws_access_key = "AKIAIOSFODNN7EXAMPLE"; + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({"key": aws_access_key})) + .build(), + ) + .unwrap(); + assert_eq!( + captured_events_snapshot(&events)[0].input(), + 
Some(&json!({"key": mask_text(aws_access_key, "*", 4, 4)})) + ); + deregister_subscriber("pii-redaction-aws-access-key-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); + + reset_runtime(); + setup_isolated_thread(); + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "gcp_api_key", + "target_paths": ["/key"] + } + })))) + .unwrap(); + let events = capture_events("pii-redaction-gcp-key-mask-events"); + let gcp_key = format!("AIza{}", "A".repeat(35)); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({"key": gcp_key})) + .build(), + ) + .unwrap(); + assert_eq!( + captured_events_snapshot(&events)[0].input(), + Some(&json!({"key": mask_text(&gcp_key, "*", 6, 4)})) + ); + deregister_subscriber("pii-redaction-gcp-key-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_hash_with_detector_hashes_only_matching_substrings() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); diff --git a/docs/pii-redaction-plugin/configuration.mdx b/docs/pii-redaction-plugin/configuration.mdx index dcbe4cad..2b14ee2d 100644 --- a/docs/pii-redaction-plugin/configuration.mdx +++ b/docs/pii-redaction-plugin/configuration.mdx @@ -142,7 +142,7 @@ The `builtin` section contains: | `action` | Sanitization action. Current values are `remove`, `regex_replace`, `hash`, and `mask`. | | `target_paths` | Exact JSON-pointer paths to sanitize. Empty means every matching string leaf. | | `pattern` | Regex pattern used when `action = "regex_replace"`. | -| `detector` | Optional built-in matcher preset. Current values are `email`, `phone`, `api_key`, `ip_address`, and `url`. | +| `detector` | Optional built-in matcher preset. 
Current values are `email`, `phone`, `api_key`, `ip_address`, `ipv6`, `url`, `uuid`, `bearer_token`, `jwt`, `credit_card`, `aws_access_key_id`, `aws_secret_access_key`, `gcp_api_key`, and `azure_storage_account_key`. | | `replacement` | Replacement text used when `action = "regex_replace"`. Defaults to `[REDACTED]`. | | `mask_char` | Masking token used when `action = "mask"`. Defaults to `*`. | | `unmasked_prefix` | Leading character count to keep when `action = "mask"`. Defaults to `0`, unless a detector-specific masking preset is active. | @@ -194,7 +194,16 @@ When `detector` is set and you do not specify `unmasked_prefix` or - `phone`: preserves the last four digits while keeping separators intact - `api_key`: preserves the vendor-style prefix such as `sk-` and the last four characters - `ip_address`: preserves the last octet +- `ipv6`: preserves the last segment - `url`: preserves the scheme and host, then collapses the path/query tail +- `uuid`: preserves the last four characters +- `bearer_token`: preserves the auth scheme and the last four characters +- `jwt`: preserves the header segment and the tail of the signature +- `credit_card`: preserves the last four digits while keeping separators intact +- `aws_access_key_id`: preserves the provider prefix and the last four characters +- `aws_secret_access_key`: preserves the last four characters +- `gcp_api_key`: preserves the `AIza`-style prefix and the last four characters +- `azure_storage_account_key`: preserves the last four characters ## Path Semantics @@ -214,14 +223,28 @@ recommended contract for new configuration. ## Detector Presets -The built-in detector presets are intended for common structured values: +The built-in detector presets are grouped into three deterministic families. 
+Common PII: - `email` - `phone` -- `api_key` - `ip_address` +- `ipv6` - `url` +Structured secrets: +- `api_key` +- `uuid` +- `bearer_token` +- `jwt` +- `credit_card` + +Cloud credentials: +- `aws_access_key_id` +- `aws_secret_access_key` +- `gcp_api_key` +- `azure_storage_account_key` + They are deterministic regex-backed helpers, not model inference. If `target_paths` is empty, the built-in backend sanitizes every matching From 95f4f764be68feb94b57e1f9e3936548e5adf143 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Mon, 8 Jun 2026 11:06:37 -0700 Subject: [PATCH 06/35] refactor: split pii redaction backend modules Signed-off-by: Alex Fournier --- crates/cli/tests/coverage/plugins_tests.rs | 7 + .../core/src/plugins/pii_redaction/builtin.rs | 432 ++++++++++ .../src/plugins/pii_redaction/component.rs | 747 +----------------- .../src/plugins/pii_redaction/detectors.rs | 2 +- .../core/src/plugins/pii_redaction/local.rs | 2 +- crates/core/src/plugins/pii_redaction/mod.rs | 4 + .../core/src/plugins/pii_redaction/overlay.rs | 324 ++++++++ .../plugins/pii_redaction/component_tests.rs | 130 +++ docs/pii-redaction-plugin/about.mdx | 30 +- docs/pii-redaction-plugin/configuration.mdx | 66 +- 10 files changed, 1010 insertions(+), 734 deletions(-) create mode 100644 crates/core/src/plugins/pii_redaction/builtin.rs create mode 100644 crates/core/src/plugins/pii_redaction/overlay.rs diff --git a/crates/cli/tests/coverage/plugins_tests.rs b/crates/cli/tests/coverage/plugins_tests.rs index b69ceed7..f02223b0 100644 --- a/crates/cli/tests/coverage/plugins_tests.rs +++ b/crates/cli/tests/coverage/plugins_tests.rs @@ -240,6 +240,13 @@ fn typed_editor_model_contains_pii_redaction_options() { let builtin = schema.field("builtin").unwrap().schema().unwrap(); assert_eq!(builtin.field("action").unwrap().kind, EditorFieldKind::Enum); + assert!( + builtin + .field("action") + .unwrap() + .enum_values + .contains(&"redact") + ); assert_eq!( builtin.field("target_paths").unwrap().kind, 
EditorFieldKind::Json diff --git a/crates/core/src/plugins/pii_redaction/builtin.rs b/crates/core/src/plugins/pii_redaction/builtin.rs new file mode 100644 index 00000000..e8776ef4 --- /dev/null +++ b/crates/core/src/plugins/pii_redaction/builtin.rs @@ -0,0 +1,432 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::Arc; + +use regex::Regex; +use serde::Serialize; +use serde::de::DeserializeOwned; +use serde_json::Value as Json; +use sha2::{Digest, Sha256}; + +use crate::api::llm::LlmRequest; +use crate::api::runtime::{LlmSanitizeRequestFn, LlmSanitizeResponseFn, ToolSanitizeFn}; +use crate::codec::anthropic::AnthropicMessagesCodec; +use crate::codec::openai_chat::OpenAIChatCodec; +use crate::codec::openai_responses::OpenAIResponsesCodec; +use crate::codec::traits::{LlmCodec, LlmResponseCodec}; +use crate::plugin::{PluginError, Result as PluginResult}; + +use super::component::BuiltinBackendConfig; +use super::detectors::BuiltinDetector; +use super::overlay::BuiltinCodecName; + +#[derive(Clone)] +pub(super) struct CompiledBuiltinBackend { + action: BuiltinAction, + target_paths: Arc>, + codec: Option>, + codec_name: Option, +} + +#[derive(Clone)] +enum BuiltinAction { + Remove, + Hash { + matcher: Option>, + }, + Mask { + matcher: Option>, + strategy: BuiltinMaskStrategy, + }, + Redact { + matcher: Arc, + replacement: Arc, + }, + RegexReplace { + pattern: Arc, + replacement: Arc, + }, +} + +#[derive(Clone)] +enum BuiltinMaskStrategy { + Generic { + mask_char: Arc, + unmasked_prefix: usize, + unmasked_suffix: usize, + }, + DetectorDefault { + detector: BuiltinDetector, + mask_char: Arc, + }, +} + +trait BuiltinRequestResponseCodec: LlmCodec + LlmResponseCodec + Send + Sync {} + +impl BuiltinRequestResponseCodec for T where T: LlmCodec + LlmResponseCodec + Send + Sync {} + +impl CompiledBuiltinBackend { + pub(super) fn new( + config: BuiltinBackendConfig, + 
codec_name: Option, + ) -> PluginResult { + let detector = config + .detector + .as_deref() + .map(BuiltinDetector::parse) + .transpose()?; + let matcher = compile_builtin_matcher(config.pattern.clone(), detector)?; + let action = match config.action.as_str() { + "remove" => BuiltinAction::Remove, + "hash" => BuiltinAction::Hash { matcher }, + "mask" => BuiltinAction::Mask { + matcher, + strategy: build_mask_strategy(&config, detector), + }, + "redact" | "regex_replace" => { + let pattern = matcher.ok_or_else(|| { + PluginError::InvalidConfig( + "builtin.pattern or builtin.detector is required when builtin.action = 'regex_replace' or 'redact'".to_string(), + ) + })?; + let replacement = Arc::new( + config + .replacement + .unwrap_or_else(|| "[REDACTED]".to_string()), + ); + if config.action == "redact" { + BuiltinAction::Redact { + matcher: pattern, + replacement, + } + } else { + BuiltinAction::RegexReplace { + pattern, + replacement, + } + } + } + other => { + return Err(PluginError::InvalidConfig(format!( + "unsupported builtin.action '{other}'" + ))); + } + }; + + Ok(Self { + action, + target_paths: Arc::new(config.target_paths), + codec_name: codec_name.as_deref().and_then(BuiltinCodecName::parse), + codec: codec_name + .as_deref() + .map(instantiate_builtin_codec) + .transpose()?, + }) + } + + fn sanitize_json_preorder_dfs(&self, value: Json) -> Json { + self.sanitize_json_preorder_dfs_at_path(value, &mut Vec::new()) + .unwrap_or(Json::Null) + } + + fn sanitize_json_preorder_dfs_at_path( + &self, + value: Json, + path_segments: &mut Vec, + ) -> Option { + match value { + Json::String(text) => { + if self.matches_current_preorder_path(path_segments) { + self.sanitize_string_value(text) + } else { + Some(Json::String(text)) + } + } + Json::Array(items) => Some(Json::Array( + items + .into_iter() + .enumerate() + .map(|(index, item)| { + path_segments.push(index.to_string()); + let sanitized = self + .sanitize_json_preorder_dfs_at_path(item, path_segments) + 
.unwrap_or(Json::Null); + path_segments.pop(); + sanitized + }) + .collect(), + )), + Json::Object(map) => Some(Json::Object( + map.into_iter() + .filter_map(|(key, value)| { + path_segments.push(escape_json_pointer_segment(&key)); + let sanitized = + self.sanitize_json_preorder_dfs_at_path(value, path_segments); + path_segments.pop(); + sanitized.map(|sanitized| (key, sanitized)) + }) + .collect(), + )), + other => { + if self.matches_current_preorder_path(path_segments) + && matches!(self.action, BuiltinAction::Remove) + { + None + } else { + Some(other) + } + } + } + } + + fn matches_current_preorder_path(&self, path_segments: &[String]) -> bool { + if self.target_paths.is_empty() { + return true; + } + let current_path = render_json_pointer_path(path_segments); + self.target_paths.iter().any(|path| path == ¤t_path) + } + + fn sanitize_string_value(&self, text: String) -> Option { + match &self.action { + BuiltinAction::Remove => None, + BuiltinAction::Hash { matcher } => Some(Json::String(match matcher { + Some(matcher) => matcher + .replace_all(&text, |captures: ®ex::Captures<'_>| { + hex_sha256( + captures + .get(0) + .map(|capture| capture.as_str()) + .unwrap_or(""), + ) + }) + .into_owned(), + None => hex_sha256(&text), + })), + BuiltinAction::Mask { matcher, strategy } => Some(Json::String(match matcher { + Some(matcher) => matcher + .replace_all(&text, |captures: ®ex::Captures<'_>| { + mask_with_strategy( + captures + .get(0) + .map(|capture| capture.as_str()) + .unwrap_or(""), + strategy, + ) + }) + .into_owned(), + None => mask_with_strategy(&text, strategy), + })), + BuiltinAction::Redact { + matcher, + replacement, + } => Some(Json::String( + matcher + .replace_all(&text, replacement.as_str()) + .into_owned(), + )), + BuiltinAction::RegexReplace { + pattern, + replacement, + } => Some(Json::String( + pattern + .replace_all(&text, replacement.as_str()) + .into_owned(), + )), + } + } + + fn sanitize_request_with_codec(&self, request: &LlmRequest) -> 
Option { + let codec = self.codec.as_ref()?; + let annotated = codec.decode(request).ok()?; + let sanitized_annotated = sanitize_serializable_with_backend(self, annotated).ok()?; + codec.encode(&sanitized_annotated, request).ok() + } + + fn sanitize_response_with_codec(&self, payload: Json) -> Option { + let codec = self.codec.as_ref()?; + let codec_name = self.codec_name?; + let annotated = codec.decode_response(&payload).ok()?; + let sanitized_annotated = sanitize_serializable_with_backend(self, annotated).ok()?; + Some(codec_name.overlay_response_payload(payload, &sanitized_annotated)) + } +} + +pub(super) fn tool_sanitize_callback(backend: CompiledBuiltinBackend) -> ToolSanitizeFn { + Arc::new(move |_name: &str, payload: Json| backend.sanitize_json_preorder_dfs(payload)) +} + +pub(super) fn llm_sanitize_request_callback( + backend: CompiledBuiltinBackend, +) -> LlmSanitizeRequestFn { + Arc::new(move |mut request: LlmRequest| { + if let Some(encoded) = backend.sanitize_request_with_codec(&request) { + return encoded; + } + request.content = backend.sanitize_json_preorder_dfs(request.content); + request + }) +} + +pub(super) fn llm_sanitize_response_callback( + backend: CompiledBuiltinBackend, +) -> LlmSanitizeResponseFn { + Arc::new(move |payload: Json| { + if backend.target_paths.is_empty() { + return backend.sanitize_json_preorder_dfs(payload); + } + + let payload = backend + .sanitize_response_with_codec(payload.clone()) + .unwrap_or(payload); + backend.sanitize_json_preorder_dfs(payload) + }) +} + +fn render_json_pointer_path(path_segments: &[String]) -> String { + if path_segments.is_empty() { + return String::new(); + } + let mut rendered = String::new(); + for segment in path_segments { + rendered.push('/'); + rendered.push_str(segment); + } + rendered +} + +fn escape_json_pointer_segment(segment: &str) -> String { + segment.replace('~', "~0").replace('/', "~1") +} + +pub(crate) fn hex_sha256(text: &str) -> String { + let digest = 
Sha256::digest(text.as_bytes()); + let mut output = String::with_capacity(digest.len() * 2); + for byte in digest { + use std::fmt::Write as _; + let _ = write!(&mut output, "{byte:02x}"); + } + output +} + +pub(crate) fn mask_text( + text: &str, + mask_char: &str, + unmasked_prefix: usize, + unmasked_suffix: usize, +) -> String { + let chars: Vec = text.chars().collect(); + let len = chars.len(); + if len <= unmasked_prefix + unmasked_suffix { + return text.to_string(); + } + + let mut output = String::new(); + for ch in chars.iter().take(unmasked_prefix) { + output.push(*ch); + } + for _ in 0..(len - unmasked_prefix - unmasked_suffix) { + output.push_str(mask_char); + } + for ch in chars.iter().skip(len - unmasked_suffix) { + output.push(*ch); + } + output +} + +fn build_mask_strategy( + config: &BuiltinBackendConfig, + detector: Option, +) -> BuiltinMaskStrategy { + let mask_char = Arc::new(config.mask_char.clone().unwrap_or_else(|| "*".to_string())); + match detector { + Some(detector) if config.unmasked_prefix.is_none() && config.unmasked_suffix.is_none() => { + BuiltinMaskStrategy::DetectorDefault { + detector, + mask_char, + } + } + _ => BuiltinMaskStrategy::Generic { + mask_char, + unmasked_prefix: config.unmasked_prefix.unwrap_or(0), + unmasked_suffix: config.unmasked_suffix.unwrap_or(0), + }, + } +} + +fn mask_with_strategy(text: &str, strategy: &BuiltinMaskStrategy) -> String { + match strategy { + BuiltinMaskStrategy::Generic { + mask_char, + unmasked_prefix, + unmasked_suffix, + } => mask_text(text, mask_char.as_str(), *unmasked_prefix, *unmasked_suffix), + BuiltinMaskStrategy::DetectorDefault { + detector, + mask_char, + } => detector.default_mask(text, mask_char.as_str()), + } +} + +fn compile_builtin_matcher( + pattern: Option, + detector: Option, +) -> PluginResult>> { + let pattern_text = match (pattern, detector) { + (Some(pattern), None) => Some(pattern), + (None, Some(detector)) => Some(detector.regex_pattern().to_string()), + (None, None) => 
None, + (Some(_), Some(_)) => { + return Err(PluginError::InvalidConfig( + "builtin.pattern and builtin.detector cannot both be set".to_string(), + )); + } + }; + + let Some(pattern_text) = pattern_text else { + return Ok(None); + }; + + let pattern = Regex::new(&pattern_text).map_err(|err| { + PluginError::InvalidConfig(format!( + "invalid builtin matcher regex '{pattern_text}': {err}" + )) + })?; + Ok(Some(Arc::new(pattern))) +} + +fn instantiate_builtin_codec( + codec_name: &str, +) -> PluginResult> { + let codec: Arc = match codec_name { + "openai_chat" => Arc::new(OpenAIChatCodec), + "openai_responses" => Arc::new(OpenAIResponsesCodec), + "anthropic_messages" => Arc::new(AnthropicMessagesCodec), + other => { + return Err(PluginError::InvalidConfig(format!( + "unsupported codec '{other}'" + ))); + } + }; + Ok(codec) +} + +fn sanitize_serializable_with_backend( + backend: &CompiledBuiltinBackend, + value: T, +) -> PluginResult +where + T: Serialize + DeserializeOwned, +{ + let value = serde_json::to_value(value).map_err(|err| { + PluginError::Internal(format!( + "failed to serialize value for PII redaction: {err}" + )) + })?; + serde_json::from_value(backend.sanitize_json_preorder_dfs(value)).map_err(|err| { + PluginError::Internal(format!( + "failed to deserialize sanitized value for PII redaction: {err}" + )) + }) +} diff --git a/crates/core/src/plugins/pii_redaction/component.rs b/crates/core/src/plugins/pii_redaction/component.rs index 3f825208..5240379f 100644 --- a/crates/core/src/plugins/pii_redaction/component.rs +++ b/crates/core/src/plugins/pii_redaction/component.rs @@ -7,34 +7,23 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; -use regex::Regex; -use serde::de::DeserializeOwned; -use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value as Json}; -use sha2::{Digest, Sha256}; - -use crate::api::llm::LlmRequest; -use crate::api::runtime::{LlmSanitizeRequestFn, LlmSanitizeResponseFn, ToolSanitizeFn}; -use 
crate::codec::anthropic::AnthropicMessagesCodec; -use crate::codec::openai_chat::OpenAIChatCodec; -use crate::codec::openai_responses::OpenAIResponsesCodec; -use crate::codec::request::{ContentPart, MessageContent}; -use crate::codec::response::{AnnotatedLlmResponse, FinishReason, ResponseToolCall}; -use crate::codec::traits::{LlmCodec, LlmResponseCodec}; use crate::plugin::{ ConfigDiagnostic, ConfigPolicy, DiagnosticLevel, Plugin, PluginComponentSpec, PluginError, PluginRegistrationContext, Result as PluginResult, UnsupportedBehavior, deregister_plugin, register_plugin, }; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value as Json}; -#[path = "local.rs"] -mod local; -use local::register_local_backend; -pub use local::{clear_local_backend_provider, register_local_backend_provider}; - -#[path = "detectors.rs"] -mod detectors; -use detectors::{BuiltinDetector, detector_regex_pattern, supported_detector_summary}; +use super::builtin::{ + CompiledBuiltinBackend, llm_sanitize_request_callback, llm_sanitize_response_callback, + tool_sanitize_callback, +}; +#[cfg(test)] +pub(crate) use super::builtin::{hex_sha256, mask_text}; +use super::detectors::{detector_regex_pattern, supported_detector_summary}; +use super::local::register_local_backend; +pub use super::local::{clear_local_backend_provider, register_local_backend_provider}; /// The plugin kind reserved for the built-in privacy component. pub const PII_REDACTION_PLUGIN_KIND: &str = "pii_redaction"; @@ -144,13 +133,13 @@ pub struct BuiltinBackendConfig { /// Exact JSON-pointer paths to sanitize. Empty means every string leaf. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub target_paths: Vec, - /// Regex pattern used when `action = "regex_replace"`. + /// Regex pattern used when `action = "regex_replace"` or `action = "redact"`. #[serde(default, skip_serializing_if = "Option::is_none")] pub pattern: Option, /// Built-in detector preset used when you do not want to write a regex. 
#[serde(default, skip_serializing_if = "Option::is_none")] pub detector: Option, - /// Replacement text used when `action = "regex_replace"`. + /// Replacement text used when `action = "regex_replace"` or `action = "redact"`. #[serde(default, skip_serializing_if = "Option::is_none")] pub replacement: Option, /// Masking token used when `action = "mask"`. Defaults to `*`. @@ -231,7 +220,7 @@ crate::editor_config! { action => { label: "action", kind: Enum, - values: ["remove", "regex_replace", "hash", "mask"], + values: ["remove", "redact", "regex_replace", "hash", "mask"], }, target_paths => { label: "target_paths", kind: Json }, pattern => { label: "pattern", kind: String, optional: true }, @@ -335,7 +324,7 @@ fn builtin_action_schema( ) -> schemars::schema::Schema { string_enum_schema( generator, - &["remove", "regex_replace", "hash", "mask"], + &["remove", "redact", "regex_replace", "hash", "mask"], Some("remove"), ) } @@ -573,7 +562,7 @@ fn validate_builtin_action_requirements( if !matches!( builtin.action.as_str(), - "remove" | "regex_replace" | "hash" | "mask" + "remove" | "redact" | "regex_replace" | "hash" | "mask" ) { push_policy_diag( diagnostics, @@ -581,11 +570,14 @@ fn validate_builtin_action_requirements( "pii_redaction.unsupported_value", Some(PII_REDACTION_PLUGIN_KIND.to_string()), Some("builtin.action".to_string()), - "builtin.action must be 'remove', 'regex_replace', 'hash', or 'mask'".to_string(), + "builtin.action must be 'remove', 'redact', 'regex_replace', 'hash', or 'mask'" + .to_string(), ); } - if builtin.action == "regex_replace" && builtin.pattern.is_none() && builtin.detector.is_none() + if matches!(builtin.action.as_str(), "regex_replace" | "redact") + && builtin.pattern.is_none() + && builtin.detector.is_none() { push_policy_diag( diagnostics, @@ -593,7 +585,7 @@ fn validate_builtin_action_requirements( "pii_redaction.unsupported_value", Some(PII_REDACTION_PLUGIN_KIND.to_string()), Some("builtin.pattern".to_string()), - "builtin.pattern 
or builtin.detector is required when builtin.action = 'regex_replace'" + "builtin.pattern or builtin.detector is required when builtin.action = 'regex_replace' or 'redact'" .to_string(), ); } @@ -710,701 +702,6 @@ fn register_builtin_backend( Ok(()) } -#[derive(Clone)] -struct CompiledBuiltinBackend { - action: BuiltinAction, - target_paths: Arc>, - codec: Option>, - codec_name: Option, -} - -#[derive(Clone)] -enum BuiltinAction { - Remove, - Hash { - matcher: Option>, - }, - Mask { - matcher: Option>, - strategy: BuiltinMaskStrategy, - }, - RegexReplace { - pattern: Arc, - replacement: Arc, - }, -} - -#[derive(Clone)] -enum BuiltinMaskStrategy { - Generic { - mask_char: Arc, - unmasked_prefix: usize, - unmasked_suffix: usize, - }, - DetectorDefault { - detector: BuiltinDetector, - mask_char: Arc, - }, -} - -#[derive(Clone, Copy)] -enum BuiltinCodecName { - OpenAIChat, - OpenAIResponses, - AnthropicMessages, -} - -trait BuiltinRequestResponseCodec: LlmCodec + LlmResponseCodec + Send + Sync {} - -impl BuiltinRequestResponseCodec for T where T: LlmCodec + LlmResponseCodec + Send + Sync {} - -impl CompiledBuiltinBackend { - fn new(config: BuiltinBackendConfig, codec_name: Option) -> PluginResult { - let detector = config - .detector - .as_deref() - .map(BuiltinDetector::parse) - .transpose()?; - let matcher = compile_builtin_matcher(config.pattern.clone(), detector)?; - let action = match config.action.as_str() { - "remove" => BuiltinAction::Remove, - "hash" => BuiltinAction::Hash { matcher }, - "mask" => BuiltinAction::Mask { - matcher, - strategy: build_mask_strategy(&config, detector), - }, - "regex_replace" => { - let pattern = matcher.ok_or_else(|| { - PluginError::InvalidConfig( - "builtin.pattern or builtin.detector is required when builtin.action = 'regex_replace'".to_string(), - ) - })?; - BuiltinAction::RegexReplace { - pattern, - replacement: Arc::new( - config - .replacement - .unwrap_or_else(|| "[REDACTED]".to_string()), - ), - } - } - other => { - return 
Err(PluginError::InvalidConfig(format!( - "unsupported builtin.action '{other}'" - ))); - } - }; - - Ok(Self { - action, - target_paths: Arc::new(config.target_paths), - codec_name: codec_name.as_deref().and_then(BuiltinCodecName::parse), - codec: codec_name - .as_deref() - .map(instantiate_builtin_codec) - .transpose()?, - }) - } - - fn sanitize_json_preorder_dfs(&self, value: Json) -> Json { - self.sanitize_json_preorder_dfs_at_path(value, &mut Vec::new()) - .unwrap_or(Json::Null) - } - - fn sanitize_json_preorder_dfs_at_path( - &self, - value: Json, - path_segments: &mut Vec, - ) -> Option { - match value { - Json::String(text) => { - if self.matches_current_preorder_path(path_segments) { - self.sanitize_string_value(text) - } else { - Some(Json::String(text)) - } - } - Json::Array(items) => Some(Json::Array( - items - .into_iter() - .enumerate() - .map(|(index, item)| { - path_segments.push(index.to_string()); - let sanitized = self - .sanitize_json_preorder_dfs_at_path(item, path_segments) - .unwrap_or(Json::Null); - path_segments.pop(); - sanitized - }) - .collect(), - )), - Json::Object(map) => Some(Json::Object( - map.into_iter() - .filter_map(|(key, value)| { - path_segments.push(escape_json_pointer_segment(&key)); - let sanitized = - self.sanitize_json_preorder_dfs_at_path(value, path_segments); - path_segments.pop(); - sanitized.map(|sanitized| (key, sanitized)) - }) - .collect(), - )), - other => { - if self.matches_current_preorder_path(path_segments) - && matches!(self.action, BuiltinAction::Remove) - { - None - } else { - Some(other) - } - } - } - } - - fn matches_current_preorder_path(&self, path_segments: &[String]) -> bool { - if self.target_paths.is_empty() { - return true; - } - let current_path = render_json_pointer_path(path_segments); - self.target_paths.iter().any(|path| path == ¤t_path) - } - - fn sanitize_string_value(&self, text: String) -> Option { - match &self.action { - BuiltinAction::Remove => None, - BuiltinAction::Hash { matcher } 
=> Some(Json::String(match matcher { - Some(matcher) => matcher - .replace_all(&text, |captures: ®ex::Captures<'_>| { - hex_sha256( - captures - .get(0) - .map(|capture| capture.as_str()) - .unwrap_or(""), - ) - }) - .into_owned(), - None => hex_sha256(&text), - })), - BuiltinAction::Mask { matcher, strategy } => Some(Json::String(match matcher { - Some(matcher) => matcher - .replace_all(&text, |captures: ®ex::Captures<'_>| { - mask_with_strategy( - captures - .get(0) - .map(|capture| capture.as_str()) - .unwrap_or(""), - strategy, - ) - }) - .into_owned(), - None => mask_with_strategy(&text, strategy), - })), - BuiltinAction::RegexReplace { - pattern, - replacement, - } => Some(Json::String( - pattern - .replace_all(&text, replacement.as_str()) - .into_owned(), - )), - } - } - - fn sanitize_request_with_codec(&self, request: &LlmRequest) -> Option { - let codec = self.codec.as_ref()?; - let annotated = codec.decode(request).ok()?; - let sanitized_annotated = sanitize_serializable_with_backend(self, annotated).ok()?; - codec.encode(&sanitized_annotated, request).ok() - } - - fn sanitize_response_with_codec(&self, payload: Json) -> Option { - let codec = self.codec.as_ref()?; - let codec_name = self.codec_name?; - let annotated = codec.decode_response(&payload).ok()?; - let sanitized_annotated = sanitize_serializable_with_backend(self, annotated).ok()?; - Some(codec_name.overlay_response_payload(payload, &sanitized_annotated)) - } -} - -fn tool_sanitize_callback(backend: CompiledBuiltinBackend) -> ToolSanitizeFn { - Arc::new(move |_name: &str, payload: Json| backend.sanitize_json_preorder_dfs(payload)) -} - -fn llm_sanitize_request_callback(backend: CompiledBuiltinBackend) -> LlmSanitizeRequestFn { - Arc::new(move |mut request: LlmRequest| { - if let Some(encoded) = backend.sanitize_request_with_codec(&request) { - return encoded; - } - request.content = backend.sanitize_json_preorder_dfs(request.content); - request - }) -} - -fn 
llm_sanitize_response_callback(backend: CompiledBuiltinBackend) -> LlmSanitizeResponseFn { - Arc::new(move |payload: Json| { - if backend.target_paths.is_empty() { - return backend.sanitize_json_preorder_dfs(payload); - } - - let payload = backend - .sanitize_response_with_codec(payload.clone()) - .unwrap_or(payload); - backend.sanitize_json_preorder_dfs(payload) - }) -} - -fn render_json_pointer_path(path_segments: &[String]) -> String { - if path_segments.is_empty() { - return String::new(); - } - let mut rendered = String::new(); - for segment in path_segments { - rendered.push('/'); - rendered.push_str(segment); - } - rendered -} - -fn escape_json_pointer_segment(segment: &str) -> String { - segment.replace('~', "~0").replace('/', "~1") -} - -fn hex_sha256(text: &str) -> String { - let digest = Sha256::digest(text.as_bytes()); - let mut output = String::with_capacity(digest.len() * 2); - for byte in digest { - use std::fmt::Write as _; - let _ = write!(&mut output, "{byte:02x}"); - } - output -} - -fn mask_text( - text: &str, - mask_char: &str, - unmasked_prefix: usize, - unmasked_suffix: usize, -) -> String { - let chars: Vec = text.chars().collect(); - let len = chars.len(); - if len <= unmasked_prefix + unmasked_suffix { - return text.to_string(); - } - - let mut output = String::new(); - for ch in chars.iter().take(unmasked_prefix) { - output.push(*ch); - } - for _ in 0..(len - unmasked_prefix - unmasked_suffix) { - output.push_str(mask_char); - } - for ch in chars.iter().skip(len - unmasked_suffix) { - output.push(*ch); - } - output -} - -fn build_mask_strategy( - config: &BuiltinBackendConfig, - detector: Option, -) -> BuiltinMaskStrategy { - let mask_char = Arc::new(config.mask_char.clone().unwrap_or_else(|| "*".to_string())); - match detector { - Some(detector) if config.unmasked_prefix.is_none() && config.unmasked_suffix.is_none() => { - BuiltinMaskStrategy::DetectorDefault { - detector, - mask_char, - } - } - _ => BuiltinMaskStrategy::Generic { - 
mask_char, - unmasked_prefix: config.unmasked_prefix.unwrap_or(0), - unmasked_suffix: config.unmasked_suffix.unwrap_or(0), - }, - } -} - -fn mask_with_strategy(text: &str, strategy: &BuiltinMaskStrategy) -> String { - match strategy { - BuiltinMaskStrategy::Generic { - mask_char, - unmasked_prefix, - unmasked_suffix, - } => mask_text(text, mask_char.as_str(), *unmasked_prefix, *unmasked_suffix), - BuiltinMaskStrategy::DetectorDefault { - detector, - mask_char, - } => detector.default_mask(text, mask_char.as_str()), - } -} - -fn compile_builtin_matcher( - pattern: Option, - detector: Option, -) -> PluginResult>> { - let pattern_text = match (pattern, detector) { - (Some(pattern), None) => Some(pattern), - (None, Some(detector)) => Some(detector.regex_pattern().to_string()), - (None, None) => None, - (Some(_), Some(_)) => { - return Err(PluginError::InvalidConfig( - "builtin.pattern and builtin.detector cannot both be set".to_string(), - )); - } - }; - - let Some(pattern_text) = pattern_text else { - return Ok(None); - }; - - let pattern = Regex::new(&pattern_text).map_err(|err| { - PluginError::InvalidConfig(format!( - "invalid builtin matcher regex '{pattern_text}': {err}" - )) - })?; - Ok(Some(Arc::new(pattern))) -} -fn instantiate_builtin_codec( - codec_name: &str, -) -> PluginResult> { - let codec: Arc = match codec_name { - "openai_chat" => Arc::new(OpenAIChatCodec), - "openai_responses" => Arc::new(OpenAIResponsesCodec), - "anthropic_messages" => Arc::new(AnthropicMessagesCodec), - other => { - return Err(PluginError::InvalidConfig(format!( - "unsupported codec '{other}'" - ))); - } - }; - Ok(codec) -} - -impl BuiltinCodecName { - fn parse(value: &str) -> Option { - match value { - "openai_chat" => Some(Self::OpenAIChat), - "openai_responses" => Some(Self::OpenAIResponses), - "anthropic_messages" => Some(Self::AnthropicMessages), - _ => None, - } - } - - fn overlay_response_payload(self, payload: Json, annotated: &AnnotatedLlmResponse) -> Json { - match self { 
- Self::OpenAIChat => overlay_openai_chat_response(payload, annotated), - Self::OpenAIResponses => overlay_openai_responses_response(payload, annotated), - Self::AnthropicMessages => overlay_anthropic_response(payload, annotated), - } - } -} - -fn overlay_openai_chat_response(mut payload: Json, annotated: &AnnotatedLlmResponse) -> Json { - let Some(root) = payload.as_object_mut() else { - return payload; - }; - set_optional_string_field(root, "id", annotated.id.as_deref()); - set_optional_string_field(root, "model", annotated.model.as_deref()); - - let Some(choice) = root - .get_mut("choices") - .and_then(Json::as_array_mut) - .and_then(|choices| choices.first_mut()) - .and_then(Json::as_object_mut) - else { - return payload; - }; - - set_optional_string_field( - choice, - "finish_reason", - annotated - .finish_reason - .as_ref() - .map(openai_chat_finish_reason), - ); - - let Some(message) = choice.get_mut("message").and_then(Json::as_object_mut) else { - return payload; - }; - set_optional_string_field( - message, - "content", - annotated_message_text(annotated.message.as_ref()).as_deref(), - ); - overlay_openai_chat_tool_calls(message, annotated.tool_calls.as_deref()); - payload -} - -fn overlay_openai_responses_response(mut payload: Json, annotated: &AnnotatedLlmResponse) -> Json { - let Some(root) = payload.as_object_mut() else { - return payload; - }; - set_optional_string_field(root, "id", annotated.id.as_deref()); - set_optional_string_field(root, "model", annotated.model.as_deref()); - set_optional_string_field( - root, - "status", - annotated - .finish_reason - .as_ref() - .map(openai_responses_status), - ); - - if let Some(items) = root.get_mut("output").and_then(Json::as_array_mut) { - overlay_output_text_blocks(items, annotated_message_text(annotated.message.as_ref())); - overlay_openai_responses_tool_calls(items, annotated.tool_calls.as_deref()); - } - payload -} - -fn overlay_anthropic_response(mut payload: Json, annotated: &AnnotatedLlmResponse) -> 
Json { - let Some(root) = payload.as_object_mut() else { - return payload; - }; - set_optional_string_field(root, "id", annotated.id.as_deref()); - set_optional_string_field(root, "model", annotated.model.as_deref()); - set_optional_string_field( - root, - "stop_reason", - annotated.finish_reason.as_ref().map(anthropic_stop_reason), - ); - - if let Some(blocks) = root.get_mut("content").and_then(Json::as_array_mut) { - overlay_anthropic_text_blocks(blocks, annotated_message_text(annotated.message.as_ref())); - overlay_anthropic_tool_calls(blocks, annotated.tool_calls.as_deref()); - } - payload -} - -fn overlay_openai_chat_tool_calls( - message: &mut Map, - tool_calls: Option<&[ResponseToolCall]>, -) { - let Some(raw_calls) = message.get_mut("tool_calls").and_then(Json::as_array_mut) else { - return; - }; - let Some(tool_calls) = tool_calls else { - message.remove("tool_calls"); - return; - }; - - for (raw_call, sanitized_call) in raw_calls.iter_mut().zip(tool_calls.iter()) { - let Some(raw_call) = raw_call.as_object_mut() else { - continue; - }; - set_optional_string_field(raw_call, "id", Some(sanitized_call.id.as_str())); - let Some(function) = raw_call.get_mut("function").and_then(Json::as_object_mut) else { - continue; - }; - set_optional_string_field(function, "name", Some(sanitized_call.name.as_str())); - set_optional_string_field( - function, - "arguments", - Some(json_string(&sanitized_call.arguments).as_str()), - ); - } -} - -fn overlay_openai_responses_tool_calls( - items: &mut [Json], - tool_calls: Option<&[ResponseToolCall]>, -) { - let Some(tool_calls) = tool_calls else { - return; - }; - - let mut sanitized_calls = tool_calls.iter(); - for item in items { - let Some(item_type) = item.get("type").and_then(Json::as_str) else { - continue; - }; - if item_type != "function_call" { - continue; - } - let Some(raw_call) = item.as_object_mut() else { - continue; - }; - let Some(sanitized_call) = sanitized_calls.next() else { - break; - }; - 
set_optional_string_field(raw_call, "call_id", Some(sanitized_call.id.as_str())); - set_optional_string_field(raw_call, "name", Some(sanitized_call.name.as_str())); - set_optional_string_field( - raw_call, - "arguments", - Some(json_string(&sanitized_call.arguments).as_str()), - ); - } -} - -fn overlay_anthropic_tool_calls(blocks: &mut [Json], tool_calls: Option<&[ResponseToolCall]>) { - let Some(tool_calls) = tool_calls else { - return; - }; - - let mut sanitized_calls = tool_calls.iter(); - for block in blocks { - let Some(block_type) = block.get("type").and_then(Json::as_str) else { - continue; - }; - if block_type != "tool_use" { - continue; - } - let Some(raw_call) = block.as_object_mut() else { - continue; - }; - let Some(sanitized_call) = sanitized_calls.next() else { - break; - }; - set_optional_string_field(raw_call, "id", Some(sanitized_call.id.as_str())); - set_optional_string_field(raw_call, "name", Some(sanitized_call.name.as_str())); - raw_call.insert("input".into(), sanitized_call.arguments.clone()); - } -} - -fn overlay_output_text_blocks(items: &mut [Json], message_text: Option) { - let text_items = items.iter_mut().filter_map(|item| { - (item.get("type").and_then(Json::as_str) == Some("message")) - .then_some(item.get_mut("content")) - .flatten() - .and_then(Json::as_array_mut) - }); - let Some(text) = message_text else { - for content in text_items { - for block in content.iter_mut() { - if block.get("type").and_then(Json::as_str) == Some("output_text") { - if let Some(block) = block.as_object_mut() { - block.remove("text"); - } - } - } - } - return; - }; - - let parts: Vec<&str> = text.split('\n').collect(); - for content in text_items { - let mut text_blocks = content.iter_mut().filter_map(|block| { - (block.get("type").and_then(Json::as_str) == Some("output_text")) - .then_some(block.as_object_mut()) - .flatten() - }); - for (index, block) in text_blocks.by_ref().enumerate() { - let part = parts - .get(index) - .copied() - .or_else(|| (index 
== 0).then_some(text.as_str())); - set_optional_string_field(block, "text", part); - } - } -} - -fn overlay_anthropic_text_blocks(blocks: &mut [Json], message_text: Option) { - let parts = message_text - .as_deref() - .map(|text| text.split('\n').collect::>()); - let mut text_block_index = 0usize; - - for block in blocks { - if block.get("type").and_then(Json::as_str) != Some("text") { - continue; - } - let Some(block) = block.as_object_mut() else { - continue; - }; - let part = parts - .as_ref() - .and_then(|parts| parts.get(text_block_index).copied()) - .or_else(|| { - (text_block_index == 0) - .then(|| message_text.as_deref()) - .flatten() - }); - set_optional_string_field(block, "text", part); - text_block_index += 1; - } -} - -fn annotated_message_text(message: Option<&MessageContent>) -> Option { - match message? { - MessageContent::Text(text) => Some(text.clone()), - MessageContent::Parts(parts) => { - let text_parts: Vec<&str> = parts - .iter() - .filter_map(|part| match part { - ContentPart::Text { text } => Some(text.as_str()), - ContentPart::ImageUrl { .. 
} => None, - }) - .collect(); - (!text_parts.is_empty()).then(|| text_parts.join("\n")) - } - } -} - -fn set_optional_string_field(object: &mut Map, key: &str, value: Option<&str>) { - match value { - Some(value) => { - object.insert(key.to_string(), Json::String(value.to_string())); - } - None => { - object.remove(key); - } - } -} - -fn json_string(value: &Json) -> String { - serde_json::to_string(value).unwrap_or_else(|_| "null".to_string()) -} - -fn openai_chat_finish_reason(reason: &FinishReason) -> &str { - match reason { - FinishReason::Complete => "stop", - FinishReason::Length => "length", - FinishReason::ToolUse => "tool_calls", - FinishReason::ContentFilter => "content_filter", - FinishReason::Unknown(other) => other.as_str(), - } -} - -fn openai_responses_status(reason: &FinishReason) -> &str { - match reason { - FinishReason::Complete => "completed", - FinishReason::Length | FinishReason::ContentFilter => "incomplete", - FinishReason::ToolUse => "completed", - FinishReason::Unknown(other) => other.as_str(), - } -} - -fn anthropic_stop_reason(reason: &FinishReason) -> &str { - match reason { - FinishReason::Complete => "end_turn", - FinishReason::Length => "max_tokens", - FinishReason::ToolUse => "tool_use", - FinishReason::ContentFilter => "refusal", - FinishReason::Unknown(other) => other.as_str(), - } -} - -fn sanitize_serializable_with_backend( - backend: &CompiledBuiltinBackend, - value: T, -) -> PluginResult -where - T: Serialize + DeserializeOwned, -{ - let value = serde_json::to_value(value).map_err(|err| { - PluginError::Internal(format!( - "failed to serialize value for PII redaction: {err}" - )) - })?; - serde_json::from_value(backend.sanitize_json_preorder_dfs(value)).map_err(|err| { - PluginError::Internal(format!( - "failed to deserialize sanitized value for PII redaction: {err}" - )) - }) -} - fn validate_unknown_fields( diagnostics: &mut Vec, policy: &ConfigPolicy, diff --git a/crates/core/src/plugins/pii_redaction/detectors.rs 
b/crates/core/src/plugins/pii_redaction/detectors.rs index 90f9350f..ef72f0bf 100644 --- a/crates/core/src/plugins/pii_redaction/detectors.rs +++ b/crates/core/src/plugins/pii_redaction/detectors.rs @@ -3,7 +3,7 @@ use crate::plugin::PluginError; -use super::mask_text; +use super::builtin::mask_text; #[derive(Clone, Copy, PartialEq, Eq)] pub(super) enum BuiltinDetector { diff --git a/crates/core/src/plugins/pii_redaction/local.rs b/crates/core/src/plugins/pii_redaction/local.rs index bc244899..f80aa528 100644 --- a/crates/core/src/plugins/pii_redaction/local.rs +++ b/crates/core/src/plugins/pii_redaction/local.rs @@ -5,7 +5,7 @@ use std::sync::{Arc, LazyLock, Mutex, MutexGuard}; use crate::plugin::{PluginError, PluginRegistrationContext, Result as PluginResult}; -use super::PiiRedactionConfig; +use super::component::PiiRedactionConfig; type LocalBackendProvider = Arc< dyn Fn(PiiRedactionConfig, &mut PluginRegistrationContext) -> PluginResult<()> + Send + Sync, diff --git a/crates/core/src/plugins/pii_redaction/mod.rs b/crates/core/src/plugins/pii_redaction/mod.rs index 826dc9f7..00830517 100644 --- a/crates/core/src/plugins/pii_redaction/mod.rs +++ b/crates/core/src/plugins/pii_redaction/mod.rs @@ -11,4 +11,8 @@ pub(crate) fn test_mutex() -> &'static Mutex<()> { crate::shared_runtime::runtime_owner_test_mutex() } +pub(crate) mod builtin; pub mod component; +pub(crate) mod detectors; +pub(crate) mod local; +pub(crate) mod overlay; diff --git a/crates/core/src/plugins/pii_redaction/overlay.rs b/crates/core/src/plugins/pii_redaction/overlay.rs new file mode 100644 index 00000000..938bbd99 --- /dev/null +++ b/crates/core/src/plugins/pii_redaction/overlay.rs @@ -0,0 +1,324 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use serde_json::{Map, Value as Json}; + +use crate::codec::request::{ContentPart, MessageContent}; +use crate::codec::response::{AnnotatedLlmResponse, FinishReason, ResponseToolCall}; + +#[derive(Clone, Copy)] +pub(crate) enum BuiltinCodecName { + OpenAIChat, + OpenAIResponses, + AnthropicMessages, +} + +impl BuiltinCodecName { + pub(crate) fn parse(value: &str) -> Option { + match value { + "openai_chat" => Some(Self::OpenAIChat), + "openai_responses" => Some(Self::OpenAIResponses), + "anthropic_messages" => Some(Self::AnthropicMessages), + _ => None, + } + } + + pub(crate) fn overlay_response_payload( + self, + payload: Json, + annotated: &AnnotatedLlmResponse, + ) -> Json { + match self { + Self::OpenAIChat => overlay_openai_chat_response(payload, annotated), + Self::OpenAIResponses => overlay_openai_responses_response(payload, annotated), + Self::AnthropicMessages => overlay_anthropic_response(payload, annotated), + } + } +} + +fn overlay_openai_chat_response(mut payload: Json, annotated: &AnnotatedLlmResponse) -> Json { + let Some(root) = payload.as_object_mut() else { + return payload; + }; + set_optional_string_field(root, "id", annotated.id.as_deref()); + set_optional_string_field(root, "model", annotated.model.as_deref()); + + let Some(choice) = root + .get_mut("choices") + .and_then(Json::as_array_mut) + .and_then(|choices| choices.first_mut()) + .and_then(Json::as_object_mut) + else { + return payload; + }; + + set_optional_string_field( + choice, + "finish_reason", + annotated + .finish_reason + .as_ref() + .map(openai_chat_finish_reason), + ); + + let Some(message) = choice.get_mut("message").and_then(Json::as_object_mut) else { + return payload; + }; + set_optional_string_field( + message, + "content", + annotated_message_text(annotated.message.as_ref()).as_deref(), + ); + overlay_openai_chat_tool_calls(message, annotated.tool_calls.as_deref()); + payload +} + +fn overlay_openai_responses_response(mut 
payload: Json, annotated: &AnnotatedLlmResponse) -> Json { + let Some(root) = payload.as_object_mut() else { + return payload; + }; + set_optional_string_field(root, "id", annotated.id.as_deref()); + set_optional_string_field(root, "model", annotated.model.as_deref()); + set_optional_string_field( + root, + "status", + annotated + .finish_reason + .as_ref() + .map(openai_responses_status), + ); + + if let Some(items) = root.get_mut("output").and_then(Json::as_array_mut) { + overlay_output_text_blocks(items, annotated_message_text(annotated.message.as_ref())); + overlay_openai_responses_tool_calls(items, annotated.tool_calls.as_deref()); + } + payload +} + +fn overlay_anthropic_response(mut payload: Json, annotated: &AnnotatedLlmResponse) -> Json { + let Some(root) = payload.as_object_mut() else { + return payload; + }; + set_optional_string_field(root, "id", annotated.id.as_deref()); + set_optional_string_field(root, "model", annotated.model.as_deref()); + set_optional_string_field( + root, + "stop_reason", + annotated.finish_reason.as_ref().map(anthropic_stop_reason), + ); + + if let Some(blocks) = root.get_mut("content").and_then(Json::as_array_mut) { + overlay_anthropic_text_blocks(blocks, annotated_message_text(annotated.message.as_ref())); + overlay_anthropic_tool_calls(blocks, annotated.tool_calls.as_deref()); + } + payload +} + +fn overlay_openai_chat_tool_calls( + message: &mut Map, + tool_calls: Option<&[ResponseToolCall]>, +) { + let Some(raw_calls) = message.get_mut("tool_calls").and_then(Json::as_array_mut) else { + return; + }; + let Some(tool_calls) = tool_calls else { + message.remove("tool_calls"); + return; + }; + + for (raw_call, sanitized_call) in raw_calls.iter_mut().zip(tool_calls.iter()) { + let Some(raw_call) = raw_call.as_object_mut() else { + continue; + }; + set_optional_string_field(raw_call, "id", Some(sanitized_call.id.as_str())); + let Some(function) = raw_call.get_mut("function").and_then(Json::as_object_mut) else { + continue; + }; + 
set_optional_string_field(function, "name", Some(sanitized_call.name.as_str())); + set_optional_string_field( + function, + "arguments", + Some(json_string(&sanitized_call.arguments).as_str()), + ); + } +} + +fn overlay_openai_responses_tool_calls( + items: &mut [Json], + tool_calls: Option<&[ResponseToolCall]>, +) { + let Some(tool_calls) = tool_calls else { + return; + }; + + let mut sanitized_calls = tool_calls.iter(); + for item in items { + let Some(item_type) = item.get("type").and_then(Json::as_str) else { + continue; + }; + if item_type != "function_call" { + continue; + } + let Some(raw_call) = item.as_object_mut() else { + continue; + }; + let Some(sanitized_call) = sanitized_calls.next() else { + break; + }; + set_optional_string_field(raw_call, "call_id", Some(sanitized_call.id.as_str())); + set_optional_string_field(raw_call, "name", Some(sanitized_call.name.as_str())); + set_optional_string_field( + raw_call, + "arguments", + Some(json_string(&sanitized_call.arguments).as_str()), + ); + } +} + +fn overlay_anthropic_tool_calls(blocks: &mut [Json], tool_calls: Option<&[ResponseToolCall]>) { + let Some(tool_calls) = tool_calls else { + return; + }; + + let mut sanitized_calls = tool_calls.iter(); + for block in blocks { + let Some(block_type) = block.get("type").and_then(Json::as_str) else { + continue; + }; + if block_type != "tool_use" { + continue; + } + let Some(raw_call) = block.as_object_mut() else { + continue; + }; + let Some(sanitized_call) = sanitized_calls.next() else { + break; + }; + set_optional_string_field(raw_call, "id", Some(sanitized_call.id.as_str())); + set_optional_string_field(raw_call, "name", Some(sanitized_call.name.as_str())); + raw_call.insert("input".into(), sanitized_call.arguments.clone()); + } +} + +fn overlay_output_text_blocks(items: &mut [Json], message_text: Option) { + let text_items = items.iter_mut().filter_map(|item| { + (item.get("type").and_then(Json::as_str) == Some("message")) + 
.then_some(item.get_mut("content")) + .flatten() + .and_then(Json::as_array_mut) + }); + let Some(text) = message_text else { + for content in text_items { + for block in content.iter_mut() { + if block.get("type").and_then(Json::as_str) == Some("output_text") + && let Some(block) = block.as_object_mut() + { + block.remove("text"); + } + } + } + return; + }; + + let parts: Vec<&str> = text.split('\n').collect(); + for content in text_items { + let mut text_blocks = content.iter_mut().filter_map(|block| { + (block.get("type").and_then(Json::as_str) == Some("output_text")) + .then_some(block.as_object_mut()) + .flatten() + }); + for (index, block) in text_blocks.by_ref().enumerate() { + let part = parts + .get(index) + .copied() + .or_else(|| (index == 0).then_some(text.as_str())); + set_optional_string_field(block, "text", part); + } + } +} + +fn overlay_anthropic_text_blocks(blocks: &mut [Json], message_text: Option) { + let parts = message_text + .as_deref() + .map(|text| text.split('\n').collect::>()); + let mut text_block_index = 0usize; + + for block in blocks { + if block.get("type").and_then(Json::as_str) != Some("text") { + continue; + } + let Some(block) = block.as_object_mut() else { + continue; + }; + let part = parts + .as_ref() + .and_then(|parts| parts.get(text_block_index).copied()) + .or_else(|| { + (text_block_index == 0) + .then_some(message_text.as_deref()) + .flatten() + }); + set_optional_string_field(block, "text", part); + text_block_index += 1; + } +} + +fn annotated_message_text(message: Option<&MessageContent>) -> Option { + match message? { + MessageContent::Text(text) => Some(text.clone()), + MessageContent::Parts(parts) => { + let text_parts: Vec<&str> = parts + .iter() + .filter_map(|part| match part { + ContentPart::Text { text } => Some(text.as_str()), + ContentPart::ImageUrl { .. 
} => None, + }) + .collect(); + (!text_parts.is_empty()).then(|| text_parts.join("\n")) + } + } +} + +fn set_optional_string_field(object: &mut Map, key: &str, value: Option<&str>) { + match value { + Some(value) => { + object.insert(key.to_string(), Json::String(value.to_string())); + } + None => { + object.remove(key); + } + } +} + +fn json_string(value: &Json) -> String { + serde_json::to_string(value).unwrap_or_else(|_| "null".to_string()) +} + +fn openai_chat_finish_reason(reason: &FinishReason) -> &str { + match reason { + FinishReason::Complete => "stop", + FinishReason::Length => "length", + FinishReason::ToolUse => "tool_calls", + FinishReason::ContentFilter => "content_filter", + FinishReason::Unknown(other) => other.as_str(), + } +} + +fn openai_responses_status(reason: &FinishReason) -> &str { + match reason { + FinishReason::Complete => "completed", + FinishReason::Length | FinishReason::ContentFilter => "incomplete", + FinishReason::ToolUse => "completed", + FinishReason::Unknown(other) => other.as_str(), + } +} + +fn anthropic_stop_reason(reason: &FinishReason) -> &str { + match reason { + FinishReason::Complete => "end_turn", + FinishReason::Length => "max_tokens", + FinishReason::ToolUse => "tool_use", + FinishReason::ContentFilter => "refusal", + FinishReason::Unknown(other) => other.as_str(), + } +} diff --git a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs index 10b63d76..b2775af4 100644 --- a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs +++ b/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs @@ -437,6 +437,71 @@ fn builtin_remove_deletes_object_fields_and_nulls_array_or_root_targets() { clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_redact_replaces_matching_tool_payload_substrings_with_default_token() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + 
setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "tool_input": true, + "tool_output": true, + "input": false, + "output": false, + "builtin": { + "action": "redact", + "detector": "bearer_token", + "target_paths": [] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-redact-tool-events"); + let secret = "Bearer sk-demo-secret-123456"; + let handle = tool_call( + ToolCallParams::builder() + .name("redact_tool") + .args(json!({ + "auth": secret, + "message": format!("primary auth={secret}") + })) + .build(), + ) + .unwrap(); + tool_call_end( + ToolCallEndParams::builder() + .handle(&handle) + .result(json!({ + "result": secret, + "nested": {"token": secret} + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!( + captured_events[0].input().unwrap()["auth"], + json!("[REDACTED]") + ); + assert_eq!( + captured_events[0].input().unwrap()["message"], + json!("primary auth=[REDACTED]") + ); + assert_eq!( + captured_events[1].output().unwrap()["result"], + json!("[REDACTED]") + ); + assert_eq!( + captured_events[1].output().unwrap()["nested"]["token"], + json!("[REDACTED]") + ); + + deregister_subscriber("pii-redaction-redact-tool-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_mask_preserves_configured_prefix_and_suffix() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); @@ -1815,6 +1880,71 @@ async fn builtin_backend_sanitizes_openai_chat_response_from_normalized_message_ clear_plugin_configuration().unwrap(); } +#[tokio::test] +async fn builtin_redact_sanitizes_openai_chat_response_from_detector_path() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": true, + 
"tool_input": false, + "tool_output": false, + "builtin": { + "action": "redact", + "detector": "email", + "target_paths": ["/message"] + } + }))) + .await + .unwrap(); + + let events = capture_events("pii-redaction-openai-chat-redact-response"); + let response_codec: Arc = Arc::new(OpenAIChatCodec); + + let _ = llm_call_execute( + LlmCallExecuteParams::builder() + .name("openai") + .request(LlmRequest { + headers: serde_json::Map::new(), + content: json!({"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "hello"}]}), + }) + .func(noop_openai_chat_exec_fn(json!({ + "id": "chatcmpl-redact-123", + "model": "gpt-4o-mini", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "alice@example.com"}, + "finish_reason": "stop" + } + ] + }))) + .response_codec(response_codec) + .build(), + ) + .await + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!( + captured_events[1].output().unwrap()["choices"][0]["message"]["content"], + json!("[REDACTED]") + ); + assert_eq!( + captured_events[1] + .annotated_response() + .and_then(|response| response.response_text()), + Some("[REDACTED]") + ); + + deregister_subscriber("pii-redaction-openai-chat-redact-response").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[tokio::test] async fn builtin_backend_sanitizes_anthropic_response_from_normalized_message_path() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); diff --git a/docs/pii-redaction-plugin/about.mdx b/docs/pii-redaction-plugin/about.mdx index a66943bf..f08328c0 100644 --- a/docs/pii-redaction-plugin/about.mdx +++ b/docs/pii-redaction-plugin/about.mdx @@ -34,6 +34,33 @@ Start here when you need to: - Use built-in detector presets for common values such as emails, phone numbers, URLs, API keys, and IP addresses without writing custom regexes. +## Plugin Versus Middleware + +`pii_redaction` is built on top of NeMo Relay's existing sanitize-guardrail +middleware. 
+ +That means this plugin does **not** introduce a separate runtime mechanism. +Instead, it packages a common privacy policy behind a first-party config +surface. + +Use the plugin when you want: + +- a reusable privacy policy that many applications or teams can share +- declarative config through `plugins.toml` or `nemo-relay plugins edit` +- built-in actions, detector presets, and codec-aware LLM handling +- a supported, documented NeMo Relay surface instead of hand-registered callbacks + +Use raw sanitize-guardrail middleware when you want: + +- custom redaction logic authored directly in application code +- dynamic behavior based on runtime state, external lookups, or one-off heuristics +- experiments that are too app-specific to become a first-party plugin contract + +So the distinction is: + +- middleware is the **mechanism** +- `pii_redaction` is the **packaged policy** + ## Current Scope The built-in plugin currently exposes four managed sanitize surfaces: @@ -43,9 +70,10 @@ The built-in plugin currently exposes four managed sanitize surfaces: - `tool_input` - `tool_output` -The current built-in backend supports three actions: +The current built-in backend supports five actions: - `remove` +- `redact` - `regex_replace` - `hash` - `mask` diff --git a/docs/pii-redaction-plugin/configuration.mdx b/docs/pii-redaction-plugin/configuration.mdx index 2b14ee2d..9fb6b6d7 100644 --- a/docs/pii-redaction-plugin/configuration.mdx +++ b/docs/pii-redaction-plugin/configuration.mdx @@ -20,6 +20,21 @@ field names stay `snake_case` in every binding and in `plugins.toml`. +## Relation to Raw Middleware + +This plugin uses the same sanitize-guardrail middleware family documented in +[Middleware](/about-nemo-relay/concepts/middleware). 
+ +The difference is the layer of abstraction: + +- raw middleware asks you to register sanitize callbacks directly in code +- `pii_redaction` gives you a first-party, config-driven privacy contract on + top of those same runtime hooks + +Choose `pii_redaction` when you want a reusable built-in policy surface. +Choose raw middleware when you need bespoke callback logic that does not fit +the plugin contract. + ## Component Shape The top-level PII redaction object contains: @@ -49,7 +64,7 @@ At least one managed redaction surface must be enabled. | Managed LLM `output` | Supported | Extension point only in this PR | | Managed `tool_input` | Supported | Extension point only in this PR | | Managed `tool_output` | Supported | Extension point only in this PR | -| Built-in actions | `remove`, `regex_replace`, `hash`, `mask` | N/A | +| Built-in actions | `remove`, `redact`, `regex_replace`, `hash`, `mask` | N/A | | Codec support | `openai_chat`, `openai_responses`, `anthropic_messages` | Runtime-specific future implementation | | Runtime availability | Any runtime that includes the built-in core plugin | Runtimes that install a local backend provider | @@ -58,14 +73,18 @@ At least one managed redaction surface must be enabled. Use `builtin` mode when NeMo Relay should sanitize emitted observability payloads with a deterministic first-party backend. +This is the recommended mode when the privacy behavior is common enough to be +described declaratively with built-in actions, detector presets, exact target +paths, and supported codecs. + ### Requirements To use `mode = "builtin"`: - `builtin` settings are required. - `codec` is required when `input` or `output` is enabled. -- `builtin.action` must be `remove`, `regex_replace`, `hash`, or `mask`. -- `builtin.pattern` or `builtin.detector` is required when `builtin.action = "regex_replace"`. +- `builtin.action` must be `remove`, `redact`, `regex_replace`, `hash`, or `mask`. 
+- `builtin.pattern` or `builtin.detector` is required when `builtin.action = "regex_replace"` or `builtin.action = "redact"`. ### `plugins.toml` Example @@ -133,17 +152,20 @@ The editor preserves unknown fields when it rewrites an existing `pii_redaction` component, so future or runtime-specific settings are not discarded by the interactive edit flow. +If you find yourself needing callback code instead of editor/config fields, it +is a sign that raw middleware may be the better fit for that specific policy. + ## Builtin Settings The `builtin` section contains: | Field | Purpose | |---|---| -| `action` | Sanitization action. Current values are `remove`, `regex_replace`, `hash`, and `mask`. | +| `action` | Sanitization action. Current values are `remove`, `redact`, `regex_replace`, `hash`, and `mask`. | | `target_paths` | Exact JSON-pointer paths to sanitize. Empty means every matching string leaf. | -| `pattern` | Regex pattern used when `action = "regex_replace"`. | +| `pattern` | Regex pattern used when `action = "regex_replace"` or `action = "redact"`. | | `detector` | Optional built-in matcher preset. Current values are `email`, `phone`, `api_key`, `ip_address`, `ipv6`, `url`, `uuid`, `bearer_token`, `jwt`, `credit_card`, `aws_access_key_id`, `aws_secret_access_key`, `gcp_api_key`, and `azure_storage_account_key`. | -| `replacement` | Replacement text used when `action = "regex_replace"`. Defaults to `[REDACTED]`. | +| `replacement` | Replacement text used when `action = "regex_replace"` or `action = "redact"`. Defaults to `[REDACTED]`. | | `mask_char` | Masking token used when `action = "mask"`. Defaults to `*`. | | `unmasked_prefix` | Leading character count to keep when `action = "mask"`. Defaults to `0`, unless a detector-specific masking preset is active. | | `unmasked_suffix` | Trailing character count to keep when `action = "mask"`. Defaults to `0`, unless a detector-specific masking preset is active. 
| @@ -168,6 +190,20 @@ replaces matches with the configured `replacement`. If you set `detector` instead of `pattern`, the built-in backend uses the detector's stock matcher regex. +### `redact` + +`redact` is the deterministic whole-match replacement lane. + +It uses the same `pattern` or `detector` matcher flow as `regex_replace`, but +defaults the replacement token to `[REDACTED]` and is intended for cases where +you do not want to preserve any matched secret characters. + +Use `redact` when you want: + +- credential-style secrets fully replaced +- a consistent redaction token across detectors +- clearer policy intent than a custom `regex_replace` + ### `hash` `hash` replaces matching string leaves with their SHA-256 hex digest. @@ -221,6 +257,24 @@ The current implementation also preserves provider-shaped response-path compatibility for the supported codecs, but normalized LLM paths are the recommended contract for new configuration. +## Choosing Between This Plugin and Middleware + +Use this plugin when: + +- the privacy behavior should be reusable across applications +- config-driven enablement matters more than hand-written callbacks +- you want built-in detectors and action semantics +- you want a documented first-party NeMo Relay privacy surface + +Use raw middleware when: + +- the policy depends on application-specific runtime state +- the sanitization logic is too custom for the plugin contract +- you need to prototype or experiment before standardizing behavior + +The runtime effect is still sanitize-guardrail middleware in both cases. The +plugin simply gives you a standardized policy layer on top. + ## Detector Presets The built-in detector presets are grouped into three deterministic families. 
From f7a627d4bf86a3bb2577e3d13d240bf0d39aa82d Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Mon, 8 Jun 2026 12:25:26 -0700 Subject: [PATCH 07/35] refactor: move pii redaction into dedicated crate Signed-off-by: Alex Fournier --- ATTRIBUTIONS-Rust.md | 209 ++++++++++++++++++ Cargo.lock | 32 ++- Cargo.toml | 2 + crates/cli/Cargo.toml | 1 + crates/cli/src/doctor.rs | 9 + crates/cli/src/plugins.rs | 4 + crates/cli/src/plugins/config_io.rs | 4 + crates/cli/src/plugins/editor_model.rs | 99 +++++++++ crates/cli/src/server.rs | 4 + crates/cli/tests/coverage/plugins_tests.rs | 1 + crates/core/src/plugin.rs | 1 - crates/core/src/plugins/mod.rs | 1 - crates/core/src/plugins/pii_redaction/mod.rs | 18 -- crates/ffi/Cargo.toml | 1 + crates/ffi/src/api/plugin.rs | 14 ++ crates/node/Cargo.toml | 1 + crates/node/src/api/mod.rs | 3 + crates/pii-redaction/Cargo.toml | 30 +++ crates/pii-redaction/README.md | 8 + .../src}/builtin.rs | 14 +- .../src}/component.rs | 10 +- .../src}/detectors.rs | 2 +- crates/pii-redaction/src/lib.rs | 63 ++++++ .../src}/local.rs | 2 +- .../src}/overlay.rs | 4 +- .../tests/unit}/component_tests.rs | 1 + crates/python/Cargo.toml | 1 + crates/python/src/lib.rs | 6 + crates/wasm/Cargo.toml | 1 + crates/wasm/src/api/mod.rs | 9 + 30 files changed, 518 insertions(+), 37 deletions(-) delete mode 100644 crates/core/src/plugins/pii_redaction/mod.rs create mode 100644 crates/pii-redaction/Cargo.toml create mode 100644 crates/pii-redaction/README.md rename crates/{core/src/plugins/pii_redaction => pii-redaction/src}/builtin.rs (97%) rename crates/{core/src/plugins/pii_redaction => pii-redaction/src}/component.rs (99%) rename crates/{core/src/plugins/pii_redaction => pii-redaction/src}/detectors.rs (99%) create mode 100644 crates/pii-redaction/src/lib.rs rename crates/{core/src/plugins/pii_redaction => pii-redaction/src}/local.rs (94%) rename crates/{core/src/plugins/pii_redaction => pii-redaction/src}/overlay.rs (98%) rename 
crates/{core/tests/unit/plugins/pii_redaction => pii-redaction/tests/unit}/component_tests.rs (99%) diff --git a/ATTRIBUTIONS-Rust.md b/ATTRIBUTIONS-Rust.md index 9ccd2236..6e064658 100644 --- a/ATTRIBUTIONS-Rust.md +++ b/ATTRIBUTIONS-Rust.md @@ -32413,6 +32413,215 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ``` +## sha2 - 0.10.9 +**Repository URL**: https://github.com/RustCrypto/hashes +**License Type(s)**: Apache-2.0 +### License: https://spdx.org/licenses/Apache-2.0.html +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +``` + ## sha2 - 0.11.0 **Repository URL**: https://github.com/RustCrypto/hashes **License Type(s)**: Apache-2.0 diff --git a/Cargo.lock b/Cargo.lock index 90ae7588..63316c79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1371,7 +1371,7 @@ dependencies = [ "serde", "serde_json", "serde_json_canonicalizer", - "sha2", + "sha2 0.11.0", "tdigest", "thiserror 2.0.18", "tokio", @@ -1395,6 +1395,7 @@ dependencies = [ "http-body-util", "nemo-relay", "nemo-relay-adaptive", + "nemo-relay-pii-redaction", "opentelemetry", "opentelemetry_sdk", "reqwest", @@ -1421,6 +1422,7 @@ dependencies = [ "libc", "nemo-relay", "nemo-relay-adaptive", + "nemo-relay-pii-redaction", "serde_json", "tokio", "tokio-stream", @@ -1437,6 +1439,7 @@ dependencies = [ "napi-derive", "nemo-relay", "nemo-relay-adaptive", + "nemo-relay-pii-redaction", "serde", "serde_json", "tokio", @@ -1444,6 +1447,20 @@ dependencies = [ "uuid", ] +[[package]] +name = "nemo-relay-pii-redaction" +version = "0.4.0" +dependencies = [ + "futures", + "nemo-relay", + "regex", + "schemars", + "serde", + "serde_json", + "sha2 0.10.9", + "tokio", +] + [[package]] name = "nemo-relay-python" version = "0.4.0" @@ -1451,6 +1468,7 @@ dependencies = [ "chrono", "nemo-relay", "nemo-relay-adaptive", + "nemo-relay-pii-redaction", "pyo3", "pyo3-async-runtimes", "pythonize", @@ -1469,6 +1487,7 @@ dependencies = [ "js-sys", "nemo-relay", "nemo-relay-adaptive", + 
"nemo-relay-pii-redaction", "send_wrapper", "serde", "serde-wasm-bindgen", @@ -2446,6 +2465,17 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + [[package]] name = "sha2" version = "0.11.0" diff --git a/Cargo.toml b/Cargo.toml index 8c3cdfb5..c1bd5ba9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "crates/core", "crates/adaptive", + "crates/pii-redaction", "crates/cli", # Language Bindings "crates/python", @@ -24,6 +25,7 @@ repository = "https://github.com/NVIDIA/NeMo-Relay" [workspace.dependencies] nemo-relay = { version = "0.4.0", path = "crates/core", default-features = false } nemo-relay-adaptive = { version = "0.4.0", path = "crates/adaptive" } +nemo-relay-pii-redaction = { version = "0.4.0", path = "crates/pii-redaction" } nemo-relay-ffi = { version = "0.4.0", path = "crates/ffi" } nemo-relay-cli = { version = "0.4.0", path = "crates/cli" } opentelemetry = { version = "0.31", default-features = false } diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 16e13b15..6d6538c1 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -27,6 +27,7 @@ atof-streaming = ["nemo-relay/atof-streaming"] [dependencies] nemo-relay = { workspace = true, features = ["guardrails-remote", "object-store", "openinference"] } nemo-relay-adaptive = { workspace = true, features = ["redis-backend"] } +nemo-relay-pii-redaction.workspace = true async-stream = "0.3" axum = "0.8" bytes = "1" diff --git a/crates/cli/src/doctor.rs b/crates/cli/src/doctor.rs index 51d6250b..8cf4ffca 100644 --- a/crates/cli/src/doctor.rs +++ b/crates/cli/src/doctor.rs @@ -19,6 +19,7 @@ use 
nemo_relay::codec::pricing::{PricingCatalog, PricingConfig, PricingSourceCon use nemo_relay::observability::plugin_component::OBSERVABILITY_PLUGIN_KIND; use nemo_relay::plugin::{DiagnosticLevel, PluginConfig, validate_plugin_config}; use nemo_relay_adaptive::plugin_component::register_adaptive_component; +use nemo_relay_pii_redaction::component::register_pii_redaction_component; use serde::Serialize; use serde_json::{Value, json}; use tokio::time::timeout; @@ -607,6 +608,14 @@ async fn collect_observability(gateway: &GatewayConfig) -> Vec { }); return checks; } + if let Err(error) = register_pii_redaction_component() { + checks.push(Check { + name: "PII redaction plugin", + status: Status::Fail, + details: format!("registration failed: {error}"), + }); + return checks; + } let report = validate_plugin_config(&plugin_config); if report.diagnostics.is_empty() { checks.push(Check { diff --git a/crates/cli/src/plugins.rs b/crates/cli/src/plugins.rs index f6f44800..30ced81b 100644 --- a/crates/cli/src/plugins.rs +++ b/crates/cli/src/plugins.rs @@ -280,6 +280,10 @@ fn edit_component_field( edit_config_field(theme, &mut state.config, field)?; state.mark_config_touched(); } + EditableComponent::PiiRedaction(state) => { + edit_config_field(theme, &mut state.config, field)?; + state.mark_config_touched(); + } } Ok(()) } diff --git a/crates/cli/src/plugins/config_io.rs b/crates/cli/src/plugins/config_io.rs index 98b35b26..1928d47d 100644 --- a/crates/cli/src/plugins/config_io.rs +++ b/crates/cli/src/plugins/config_io.rs @@ -8,6 +8,7 @@ use std::path::{Path, PathBuf}; use console::style; use nemo_relay::plugin::{ConfigPolicy, PluginConfig, validate_plugin_config}; use nemo_relay_adaptive::plugin_component::register_adaptive_component; +use nemo_relay_pii_redaction::component::register_pii_redaction_component; use serde_json::{Map, Value}; use crate::config::{ @@ -119,6 +120,9 @@ pub(crate) fn validate_config(config: &PluginConfig) -> Result<(), CliError> { 
register_adaptive_component().map_err(|error| { CliError::Config(format!("adaptive plugin registration failed: {error}")) })?; + register_pii_redaction_component().map_err(|error| { + CliError::Config(format!("PII redaction plugin registration failed: {error}")) + })?; let report = validate_plugin_config(config); if report.has_errors() { let messages = report diff --git a/crates/cli/src/plugins/editor_model.rs b/crates/cli/src/plugins/editor_model.rs index 3bcbf1cd..3adea5da 100644 --- a/crates/cli/src/plugins/editor_model.rs +++ b/crates/cli/src/plugins/editor_model.rs @@ -13,6 +13,7 @@ use nemo_relay::plugins::nemo_guardrails::component::{ }; use nemo_relay_adaptive::AdaptiveConfig; use nemo_relay_adaptive::plugin_component::ADAPTIVE_PLUGIN_KIND; +use nemo_relay_pii_redaction::component::{PII_REDACTION_PLUGIN_KIND, PiiRedactionConfig}; use serde::Serialize; use serde::de::DeserializeOwned; use serde_json::{Map, Value, json}; @@ -36,6 +37,7 @@ pub(super) enum EditableComponent { Observability(Box>), Adaptive(Box>), NemoGuardrails(Box>), + PiiRedaction(Box>), } impl EditableComponent { @@ -44,6 +46,7 @@ impl EditableComponent { Self::Observability(_) => "Observability", Self::Adaptive(_) => "Adaptive", Self::NemoGuardrails(_) => "NeMo Guardrails", + Self::PiiRedaction(_) => "PII Redaction", } } @@ -52,6 +55,7 @@ impl EditableComponent { Self::Observability(_) => ObservabilityConfig::editor_schema().fields, Self::Adaptive(_) => AdaptiveConfig::editor_schema().fields, Self::NemoGuardrails(_) => NeMoGuardrailsConfig::editor_schema().fields, + Self::PiiRedaction(_) => PiiRedactionConfig::editor_schema().fields, } } @@ -60,6 +64,7 @@ impl EditableComponent { Self::Observability(state) => state.enabled, Self::Adaptive(state) => state.enabled, Self::NemoGuardrails(state) => state.enabled, + Self::PiiRedaction(state) => state.enabled, } } @@ -68,6 +73,7 @@ impl EditableComponent { Self::Observability(state) => state.toggle_enabled(), Self::Adaptive(state) => 
state.toggle_enabled(), Self::NemoGuardrails(state) => state.toggle_enabled(), + Self::PiiRedaction(state) => state.toggle_enabled(), } } @@ -76,6 +82,7 @@ impl EditableComponent { Self::Observability(state) => state.set_enabled(enabled), Self::Adaptive(state) => state.set_enabled(enabled), Self::NemoGuardrails(state) => state.set_enabled(enabled), + Self::PiiRedaction(state) => state.set_enabled(enabled), } } @@ -84,6 +91,7 @@ impl EditableComponent { Self::Observability(state) => state.reset_enabled(), Self::Adaptive(state) => state.reset_enabled(), Self::NemoGuardrails(state) => state.reset_enabled(), + Self::PiiRedaction(state) => state.reset_enabled(), } } @@ -92,6 +100,7 @@ impl EditableComponent { Self::Observability(state) => observability_summary(state), Self::Adaptive(state) => adaptive_summary(state), Self::NemoGuardrails(state) => nemo_guardrails_summary(state), + Self::PiiRedaction(state) => pii_redaction_summary(state), } } @@ -102,6 +111,9 @@ impl EditableComponent { Self::NemoGuardrails(state) => { config_field_configured(&state.config, field).unwrap_or(false) } + Self::PiiRedaction(state) => { + config_field_configured(&state.config, field).unwrap_or(false) + } } } @@ -119,6 +131,10 @@ impl EditableComponent { reset_config_field(&mut state.config, field)?; state.mark_config_touched(); } + Self::PiiRedaction(state) => { + reset_config_field(&mut state.config, field)?; + state.mark_config_touched(); + } } Ok(()) } @@ -128,6 +144,7 @@ impl EditableComponent { Self::Observability(state) => store_observability_state(config, state), Self::Adaptive(state) => store_adaptive_state(config, state), Self::NemoGuardrails(state) => store_nemo_guardrails_state(config, state), + Self::PiiRedaction(state) => store_pii_redaction_state(config, state), } } } @@ -151,6 +168,7 @@ pub(super) fn editable_components( EditableComponent::Observability(Box::new(component_observability_state(config)?)), EditableComponent::Adaptive(Box::new(component_adaptive_state(config)?)), 
EditableComponent::NemoGuardrails(Box::new(component_nemo_guardrails_state(config)?)), + EditableComponent::PiiRedaction(Box::new(component_pii_redaction_state(config)?)), ]) } @@ -334,6 +352,12 @@ pub(super) fn component_nemo_guardrails_state( component_editor_state(config, NEMO_GUARDRAILS_PLUGIN_KIND, false) } +pub(super) fn component_pii_redaction_state( + config: &PluginConfig, +) -> Result, CliError> { + component_editor_state(config, PII_REDACTION_PLUGIN_KIND, false) +} + pub(super) fn store_observability_state( config: &mut PluginConfig, state: &ComponentEditorState, @@ -382,6 +406,22 @@ pub(super) fn store_nemo_guardrails_state( Ok(()) } +pub(super) fn store_pii_redaction_state( + config: &mut PluginConfig, + state: &ComponentEditorState, +) -> Result<(), CliError> { + if state.should_store(state.config_touched || pii_redaction_configured(&state.config)) { + store_component_editor_config( + config, + PII_REDACTION_PLUGIN_KIND, + state.enabled, + pii_redaction_config_map(&state.config)?, + merge_pii_redaction_editor_config, + ); + } + Ok(()) +} + fn store_component_editor_config( config: &mut PluginConfig, kind: &str, @@ -713,6 +753,23 @@ pub(super) fn nemo_guardrails_config_map( } } +pub(super) fn pii_redaction_config_map( + config: &PiiRedactionConfig, +) -> Result, CliError> { + let value = serde_json::to_value(config).map_err(serde_error)?; + match value { + Value::Object(mut map) => { + if is_version_one(map.get("version")) { + map.remove("version"); + } + Ok(map) + } + _ => Err(CliError::Config( + "pii_redaction config must serialize to an object".into(), + )), + } +} + pub(super) fn merge_observability_editor_config( existing: &mut Map, edited: Map, @@ -755,6 +812,21 @@ pub(super) fn merge_nemo_guardrails_editor_config( ); } +pub(super) fn merge_pii_redaction_editor_config( + existing: &mut Map, + edited: Map, +) { + if is_version_one(existing.get("version")) { + existing.remove("version"); + } + merge_known_editor_object( + existing, + edited, + 
&nested_editor_keys(PiiRedactionConfig::editor_schema()), + PiiRedactionConfig::editor_schema(), + ); +} + fn is_version_one(value: Option<&Value>) -> bool { value.and_then(Value::as_u64) == Some(1) } @@ -911,3 +983,30 @@ pub(super) fn nemo_guardrails_summary( } ) } + +pub(super) fn pii_redaction_configured(config: &PiiRedactionConfig) -> bool { + PiiRedactionConfig::editor_schema() + .fields + .iter() + .filter(|field| field.name != POLICY_SECTION) + .any(|field| config_field_configured(config, *field).unwrap_or(false)) +} + +pub(super) fn pii_redaction_summary(state: &ComponentEditorState) -> String { + let configured_fields = PiiRedactionConfig::editor_schema() + .fields + .iter() + .filter(|field| field.name != POLICY_SECTION) + .filter(|field| config_field_configured(&state.config, **field).unwrap_or(false)) + .map(|field| field.label) + .collect::>(); + format!( + "component {}, fields {}", + if state.enabled { "enabled" } else { "disabled" }, + if configured_fields.is_empty() { + "none".into() + } else { + configured_fields.join(", ") + } + ) +} diff --git a/crates/cli/src/server.rs b/crates/cli/src/server.rs index fef92e1d..ba19dc15 100644 --- a/crates/cli/src/server.rs +++ b/crates/cli/src/server.rs @@ -9,6 +9,7 @@ use axum::routing::{get, post}; use axum::{Json, Router}; use nemo_relay::plugin::{PluginConfig, clear_plugin_configuration, initialize_plugins}; use nemo_relay_adaptive::plugin_component::register_adaptive_component; +use nemo_relay_pii_redaction::component::register_pii_redaction_component; use reqwest::Client; use serde_json::Value; use tokio::net::TcpListener; @@ -157,6 +158,9 @@ impl PluginActivation { register_adaptive_component().map_err(|error| { CliError::Config(format!("adaptive plugin registration failed: {error}")) })?; + register_pii_redaction_component().map_err(|error| { + CliError::Config(format!("PII redaction plugin registration failed: {error}")) + })?; let plugin_config: PluginConfig = serde_json::from_value(config) 
.map_err(|error| CliError::Config(format!("invalid plugin config: {error}")))?; initialize_plugins(plugin_config) diff --git a/crates/cli/tests/coverage/plugins_tests.rs b/crates/cli/tests/coverage/plugins_tests.rs index f02223b0..1c3fd373 100644 --- a/crates/cli/tests/coverage/plugins_tests.rs +++ b/crates/cli/tests/coverage/plugins_tests.rs @@ -11,6 +11,7 @@ use nemo_relay::plugins::nemo_guardrails::component::{ }; use nemo_relay_adaptive::AdaptiveConfig; use nemo_relay_adaptive::plugin_component::ADAPTIVE_PLUGIN_KIND; +use nemo_relay_pii_redaction::component::PiiRedactionConfig; fn adaptive_component_config(agent_id: &str) -> serde_json::Map { json!({ diff --git a/crates/core/src/plugin.rs b/crates/core/src/plugin.rs index 4f0980f4..e3d90715 100644 --- a/crates/core/src/plugin.rs +++ b/crates/core/src/plugin.rs @@ -765,7 +765,6 @@ pub fn ensure_builtin_plugins_registered() -> Result<()> { let register_builtins = || { crate::observability::plugin_component::register_observability_component()?; crate::plugins::nemo_guardrails::component::register_nemo_guardrails_component()?; - crate::plugins::pii_redaction::component::register_pii_redaction_component()?; crate::plugins::pricing::register_pricing_component() }; match BUILTIN_PLUGIN_REGISTRATION.get_or_init(register_builtins) { diff --git a/crates/core/src/plugins/mod.rs b/crates/core/src/plugins/mod.rs index 44546e8a..d6cef9c1 100644 --- a/crates/core/src/plugins/mod.rs +++ b/crates/core/src/plugins/mod.rs @@ -4,5 +4,4 @@ //! First-party plugin implementations for NeMo Relay Core. pub mod nemo_guardrails; -pub mod pii_redaction; pub mod pricing; diff --git a/crates/core/src/plugins/pii_redaction/mod.rs b/crates/core/src/plugins/pii_redaction/mod.rs deleted file mode 100644 index 00830517..00000000 --- a/crates/core/src/plugins/pii_redaction/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -//! PII redaction plugin integrations for NeMo Relay Core. - -#[cfg(test)] -use std::sync::Mutex; - -#[cfg(test)] -pub(crate) fn test_mutex() -> &'static Mutex<()> { - crate::shared_runtime::runtime_owner_test_mutex() -} - -pub(crate) mod builtin; -pub mod component; -pub(crate) mod detectors; -pub(crate) mod local; -pub(crate) mod overlay; diff --git a/crates/ffi/Cargo.toml b/crates/ffi/Cargo.toml index 4a019f86..f729563b 100644 --- a/crates/ffi/Cargo.toml +++ b/crates/ffi/Cargo.toml @@ -19,6 +19,7 @@ crate-type = ["cdylib", "staticlib", "rlib"] [dependencies] nemo-relay = { workspace = true, features = ["atof-streaming", "otel", "openinference"] } nemo-relay-adaptive = { workspace = true, features = ["redis-backend"] } +nemo-relay-pii-redaction.workspace = true chrono = "0.4" libc = "0.2" serde_json = "1" diff --git a/crates/ffi/src/api/plugin.rs b/crates/ffi/src/api/plugin.rs index ad795e49..f8dd4752 100644 --- a/crates/ffi/src/api/plugin.rs +++ b/crates/ffi/src/api/plugin.rs @@ -17,6 +17,7 @@ use super::{ wrap_tool_conditional_fn, wrap_tool_exec_intercept_fn, wrap_tool_request_intercept_fn, wrap_tool_sanitize_fn, }; +use nemo_relay_pii_redaction::component::register_pii_redaction_component; struct FfiHostedPluginUserData { ptr: *mut libc::c_void, @@ -126,6 +127,10 @@ fn ensure_adaptive_component_registered() -> std::result::Result<(), NemoRelaySt register_adaptive_component().map_err(|err| status_from_plugin_error(&err)) } +fn ensure_pii_redaction_component_registered() -> std::result::Result<(), NemoRelayStatus> { + register_pii_redaction_component().map_err(|err| status_from_plugin_error(&err)) +} + /// Validate a generic plugin config document and return the diagnostics report as JSON. 
/// /// # Safety @@ -143,6 +148,9 @@ pub unsafe extern "C" fn nemo_relay_validate_plugin_config( if let Err(status) = ensure_adaptive_component_registered() { return status; } + if let Err(status) = ensure_pii_redaction_component_registered() { + return status; + } let config_value = match c_str_to_json(config_json) { Some(value) => value, None => return NemoRelayStatus::InvalidJson, @@ -182,6 +190,9 @@ pub unsafe extern "C" fn nemo_relay_initialize_plugins( if let Err(status) = ensure_adaptive_component_registered() { return status; } + if let Err(status) = ensure_pii_redaction_component_registered() { + return status; + } let config_value = match c_str_to_json(config_json) { Some(value) => value, None => return NemoRelayStatus::InvalidJson, @@ -258,6 +269,9 @@ pub unsafe extern "C" fn nemo_relay_list_plugin_kinds_json( if let Err(status) = ensure_adaptive_component_registered() { return status; } + if let Err(status) = ensure_pii_redaction_component_registered() { + return status; + } let kinds_json = match serde_json::to_value(list_plugin_kinds()) { Ok(value) => value, Err(err) => { diff --git a/crates/node/Cargo.toml b/crates/node/Cargo.toml index 82ec92ec..139ecf98 100644 --- a/crates/node/Cargo.toml +++ b/crates/node/Cargo.toml @@ -20,6 +20,7 @@ test = false [dependencies] nemo-relay = { workspace = true, features = ["atof-streaming", "otel", "openinference"] } nemo-relay-adaptive = { workspace = true, features = ["redis-backend"] } +nemo-relay-pii-redaction.workspace = true chrono = "0.4" napi = { version = "2", features = ["napi6", "async", "serde-json", "tokio_rt"] } napi-derive = "2" diff --git a/crates/node/src/api/mod.rs b/crates/node/src/api/mod.rs index 5d3134c9..bb85a41b 100644 --- a/crates/node/src/api/mod.rs +++ b/crates/node/src/api/mod.rs @@ -50,6 +50,7 @@ use nemo_relay::plugin::{ }; use nemo_relay::shared_runtime::initialize_shared_runtime_binding; use nemo_relay_adaptive::plugin_component::register_adaptive_component; +use 
nemo_relay_pii_redaction::component::register_pii_redaction_component; use crate::callable; use crate::convert::{ @@ -66,6 +67,8 @@ fn init() { .expect("node runtime ownership initialization should succeed"); register_adaptive_component() .expect("node adaptive plugin component registration should succeed"); + register_pii_redaction_component() + .expect("node pii redaction plugin component registration should succeed"); } fn parse_string_map( diff --git a/crates/pii-redaction/Cargo.toml b/crates/pii-redaction/Cargo.toml new file mode 100644 index 00000000..235f7266 --- /dev/null +++ b/crates/pii-redaction/Cargo.toml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "nemo-relay-pii-redaction" +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +description = "First-party deterministic and model-backed PII redaction plugin surfaces for NeMo Relay." +readme = "README.md" + +[lints] +workspace = true + +[features] +default = [] +schema = ["dep:schemars"] + +[dependencies] +nemo-relay = { workspace = true, features = ["schema"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +regex = "1" +sha2 = "0.10" +schemars = { version = "0.8", optional = true } + +[dev-dependencies] +futures = "0.3" +tokio = { version = "1", features = ["rt", "macros", "sync", "test-util", "rt-multi-thread", "time"] } diff --git a/crates/pii-redaction/README.md b/crates/pii-redaction/README.md new file mode 100644 index 00000000..9457f38b --- /dev/null +++ b/crates/pii-redaction/README.md @@ -0,0 +1,8 @@ + + +# NeMo Relay PII Redaction + +First-party PII redaction plugin crate for NeMo Relay. 
diff --git a/crates/core/src/plugins/pii_redaction/builtin.rs b/crates/pii-redaction/src/builtin.rs similarity index 97% rename from crates/core/src/plugins/pii_redaction/builtin.rs rename to crates/pii-redaction/src/builtin.rs index e8776ef4..94259235 100644 --- a/crates/core/src/plugins/pii_redaction/builtin.rs +++ b/crates/pii-redaction/src/builtin.rs @@ -9,13 +9,13 @@ use serde::de::DeserializeOwned; use serde_json::Value as Json; use sha2::{Digest, Sha256}; -use crate::api::llm::LlmRequest; -use crate::api::runtime::{LlmSanitizeRequestFn, LlmSanitizeResponseFn, ToolSanitizeFn}; -use crate::codec::anthropic::AnthropicMessagesCodec; -use crate::codec::openai_chat::OpenAIChatCodec; -use crate::codec::openai_responses::OpenAIResponsesCodec; -use crate::codec::traits::{LlmCodec, LlmResponseCodec}; -use crate::plugin::{PluginError, Result as PluginResult}; +use nemo_relay::api::llm::LlmRequest; +use nemo_relay::api::runtime::{LlmSanitizeRequestFn, LlmSanitizeResponseFn, ToolSanitizeFn}; +use nemo_relay::codec::anthropic::AnthropicMessagesCodec; +use nemo_relay::codec::openai_chat::OpenAIChatCodec; +use nemo_relay::codec::openai_responses::OpenAIResponsesCodec; +use nemo_relay::codec::traits::{LlmCodec, LlmResponseCodec}; +use nemo_relay::plugin::{PluginError, Result as PluginResult}; use super::component::BuiltinBackendConfig; use super::detectors::BuiltinDetector; diff --git a/crates/core/src/plugins/pii_redaction/component.rs b/crates/pii-redaction/src/component.rs similarity index 99% rename from crates/core/src/plugins/pii_redaction/component.rs rename to crates/pii-redaction/src/component.rs index 5240379f..2b4da01c 100644 --- a/crates/core/src/plugins/pii_redaction/component.rs +++ b/crates/pii-redaction/src/component.rs @@ -7,7 +7,7 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; -use crate::plugin::{ +use nemo_relay::plugin::{ ConfigDiagnostic, ConfigPolicy, DiagnosticLevel, Plugin, PluginComponentSpec, PluginError, 
PluginRegistrationContext, Result as PluginResult, UnsupportedBehavior, deregister_plugin, register_plugin, @@ -174,7 +174,7 @@ pub struct LocalBackendConfig { pub max_latency_ms: Option, } -crate::editor_config! { +nemo_relay::editor_config! { impl PiiRedactionConfig { mode => { label: "mode", @@ -215,7 +215,7 @@ crate::editor_config! { } } -crate::editor_config! { +nemo_relay::editor_config! { impl BuiltinBackendConfig { action => { label: "action", @@ -252,7 +252,7 @@ crate::editor_config! { } } -crate::editor_config! { +nemo_relay::editor_config! { impl LocalBackendConfig { backend => { label: "backend", kind: String, optional: true }, model_id => { label: "model_id", kind: String, optional: true }, @@ -826,5 +826,5 @@ fn default_priority() -> i32 { } #[cfg(test)] -#[path = "../../../tests/unit/plugins/pii_redaction/component_tests.rs"] +#[path = "../tests/unit/component_tests.rs"] mod tests; diff --git a/crates/core/src/plugins/pii_redaction/detectors.rs b/crates/pii-redaction/src/detectors.rs similarity index 99% rename from crates/core/src/plugins/pii_redaction/detectors.rs rename to crates/pii-redaction/src/detectors.rs index ef72f0bf..baba4900 100644 --- a/crates/core/src/plugins/pii_redaction/detectors.rs +++ b/crates/pii-redaction/src/detectors.rs @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -use crate::plugin::PluginError; +use nemo_relay::plugin::PluginError; use super::builtin::mask_text; diff --git a/crates/pii-redaction/src/lib.rs b/crates/pii-redaction/src/lib.rs new file mode 100644 index 00000000..b9acb28f --- /dev/null +++ b/crates/pii-redaction/src/lib.rs @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![deny(rustdoc::broken_intra_doc_links, rustdoc::private_intra_doc_links)] + +//! 
First-party PII redaction plugin crate for NeMo Relay. + +#[cfg(test)] +use std::sync::Mutex; + +pub(crate) mod builtin; +pub mod component; +pub(crate) mod detectors; +pub(crate) mod local; +pub(crate) mod overlay; + +#[cfg(test)] +pub(crate) fn test_mutex() -> &'static Mutex<()> { + static TEST_MUTEX: Mutex<()> = Mutex::new(()); + &TEST_MUTEX +} + +#[cfg(test)] +#[allow(missing_docs)] +pub mod api { + pub use nemo_relay::api::*; +} + +#[cfg(test)] +#[allow(missing_docs)] +pub mod codec { + pub use nemo_relay::codec::*; +} + +#[cfg(test)] +#[allow(missing_docs)] +pub mod plugin { + pub use nemo_relay::plugin::*; + + pub fn ensure_builtin_plugins_registered() -> Result<()> { + nemo_relay::plugin::ensure_builtin_plugins_registered()?; + crate::component::register_pii_redaction_component() + } +} + +#[cfg(test)] +#[allow(missing_docs)] +pub mod plugins { + pub mod pii_redaction { + pub use crate::component; + + #[cfg(test)] + pub fn test_mutex() -> &'static std::sync::Mutex<()> { + crate::test_mutex() + } + } +} + +#[cfg(test)] +#[allow(missing_docs)] +pub mod shared_runtime { + pub fn reset_runtime_owner_for_tests() {} +} diff --git a/crates/core/src/plugins/pii_redaction/local.rs b/crates/pii-redaction/src/local.rs similarity index 94% rename from crates/core/src/plugins/pii_redaction/local.rs rename to crates/pii-redaction/src/local.rs index f80aa528..fe763a83 100644 --- a/crates/core/src/plugins/pii_redaction/local.rs +++ b/crates/pii-redaction/src/local.rs @@ -3,7 +3,7 @@ use std::sync::{Arc, LazyLock, Mutex, MutexGuard}; -use crate::plugin::{PluginError, PluginRegistrationContext, Result as PluginResult}; +use nemo_relay::plugin::{PluginError, PluginRegistrationContext, Result as PluginResult}; use super::component::PiiRedactionConfig; diff --git a/crates/core/src/plugins/pii_redaction/overlay.rs b/crates/pii-redaction/src/overlay.rs similarity index 98% rename from crates/core/src/plugins/pii_redaction/overlay.rs rename to crates/pii-redaction/src/overlay.rs 
index 938bbd99..a1e96720 100644 --- a/crates/core/src/plugins/pii_redaction/overlay.rs +++ b/crates/pii-redaction/src/overlay.rs @@ -3,8 +3,8 @@ use serde_json::{Map, Value as Json}; -use crate::codec::request::{ContentPart, MessageContent}; -use crate::codec::response::{AnnotatedLlmResponse, FinishReason, ResponseToolCall}; +use nemo_relay::codec::request::{ContentPart, MessageContent}; +use nemo_relay::codec::response::{AnnotatedLlmResponse, FinishReason, ResponseToolCall}; #[derive(Clone, Copy)] pub(crate) enum BuiltinCodecName { diff --git a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs b/crates/pii-redaction/tests/unit/component_tests.rs similarity index 99% rename from crates/core/tests/unit/plugins/pii_redaction/component_tests.rs rename to crates/pii-redaction/tests/unit/component_tests.rs index b2775af4..c78cb1b8 100644 --- a/crates/core/tests/unit/plugins/pii_redaction/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -53,6 +53,7 @@ fn reset_runtime() { crate::shared_runtime::reset_runtime_owner_for_tests(); let context = global_context(); *context.write().unwrap() = NemoRelayContextState::new(); + register_pii_redaction_component().unwrap(); } fn setup_isolated_thread() { diff --git a/crates/python/Cargo.toml b/crates/python/Cargo.toml index 4861d109..1b7655c6 100644 --- a/crates/python/Cargo.toml +++ b/crates/python/Cargo.toml @@ -20,6 +20,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] nemo-relay = { workspace = true, features = ["atof-streaming", "otel", "openinference"] } nemo-relay-adaptive = { workspace = true, features = ["redis-backend"] } +nemo-relay-pii-redaction.workspace = true pyo3 = { version = "0.28.2", features = ["abi3", "abi3-py311", "experimental-inspect", "macros"] } pyo3-async-runtimes = { version = "0.28.0", features = ["tokio-runtime"] } pythonize = "0.28.0" diff --git a/crates/python/src/lib.rs b/crates/python/src/lib.rs index 0c7c1998..ca103568 100644 --- 
a/crates/python/src/lib.rs +++ b/crates/python/src/lib.rs @@ -22,6 +22,7 @@ //! - `convert` — JSON ↔ Python conversion utilities use nemo_relay::shared_runtime::initialize_shared_runtime_binding; use nemo_relay_adaptive::plugin_component::register_adaptive_component; +use nemo_relay_pii_redaction::component::register_pii_redaction_component; use pyo3::prelude::*; use pyo3::types::PyModule; @@ -53,6 +54,11 @@ fn _native(m: &Bound<'_, PyModule>) -> PyResult<()> { "failed to register adaptive plugin component: {e}" )) })?; + register_pii_redaction_component().map_err(|e| { + pyo3::exceptions::PyRuntimeError::new_err(format!( + "failed to register PII redaction plugin component: {e}" + )) + })?; py_types::register(m)?; py_api::register(m)?; py_plugin::register(m)?; diff --git a/crates/wasm/Cargo.toml b/crates/wasm/Cargo.toml index 35012361..55862707 100644 --- a/crates/wasm/Cargo.toml +++ b/crates/wasm/Cargo.toml @@ -20,6 +20,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] nemo-relay = { workspace = true, features = ["otel", "openinference"] } nemo-relay-adaptive.workspace = true +nemo-relay-pii-redaction.workspace = true chrono = "0.4" wasm-bindgen = "0.2" wasm-bindgen-futures = "0.4" diff --git a/crates/wasm/src/api/mod.rs b/crates/wasm/src/api/mod.rs index 3aa06f6c..c9511e2c 100644 --- a/crates/wasm/src/api/mod.rs +++ b/crates/wasm/src/api/mod.rs @@ -61,6 +61,7 @@ use nemo_relay::plugin::{ validate_plugin_config as validate_plugin_config_impl, }; use nemo_relay_adaptive::plugin_component::register_adaptive_component; +use nemo_relay_pii_redaction::component::register_pii_redaction_component; use crate::callable; use crate::convert::{ @@ -2198,12 +2199,17 @@ fn ensure_adaptive_component_registered() -> Result<(), JsValue> { register_adaptive_component().map_err(to_js_err) } +fn ensure_pii_redaction_component_registered() -> Result<(), JsValue> { + register_pii_redaction_component().map_err(to_js_err) +} + /// Validate a plugin config document and return a 
structured diagnostics report. #[wasm_bindgen(js_name = "validatePluginConfig", unchecked_return_type = "Json")] pub fn validate_plugin_config( #[wasm_bindgen(unchecked_param_type = "Json")] config: JsValue, ) -> Result { ensure_adaptive_component_registered()?; + ensure_pii_redaction_component_registered()?; let config: PluginConfig = serde_wasm_bindgen::from_value(config)?; serde_wasm_bindgen::to_value(&validate_plugin_config_impl(&config)) .map_err(|e| JsValue::from_str(&e.to_string())) @@ -2849,6 +2855,7 @@ pub fn register_plugin( #[wasm_bindgen(unchecked_param_type = "(...args: any[]) => any")] register: Function, ) -> Result<(), JsValue> { ensure_adaptive_component_registered()?; + ensure_pii_redaction_component_registered()?; register_plugin_impl(Arc::new(WasmPlugin { plugin_kind, validate: validate.map(send_wrapper::SendWrapper::new), @@ -2875,6 +2882,7 @@ pub async fn initialize_plugins( #[wasm_bindgen(unchecked_param_type = "Json")] config: JsValue, ) -> Result { ensure_adaptive_component_registered()?; + ensure_pii_redaction_component_registered()?; let config: PluginConfig = serde_wasm_bindgen::from_value(config)?; let report = initialize_plugins_impl(config).await.map_err(to_js_err)?; serde_wasm_bindgen::to_value(&report).map_err(|e| JsValue::from_str(&e.to_string())) @@ -2897,6 +2905,7 @@ pub fn active_plugin_report() -> Result { /// List the plugin kinds currently registered with the runtime. 
pub fn list_plugin_kinds() -> Result { ensure_adaptive_component_registered()?; + ensure_pii_redaction_component_registered()?; serde_wasm_bindgen::to_value(&list_plugin_kinds_impl()) .map_err(|e| JsValue::from_str(&e.to_string())) } From b98f67f317d0cc122eec5fc12c04bea82f7c82b3 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Mon, 8 Jun 2026 13:33:39 -0700 Subject: [PATCH 08/35] fix: address pii redaction review feedback Signed-off-by: Alex Fournier --- crates/cli/tests/coverage/plugins_tests.rs | 78 +++++++++++++ crates/pii-redaction/src/builtin.rs | 16 ++- crates/pii-redaction/src/detectors.rs | 14 ++- crates/pii-redaction/src/local.rs | 3 +- crates/pii-redaction/src/overlay.rs | 103 +++++++++++++++--- .../tests/unit/component_tests.rs | 94 +++++++++++++++- docs/pii-redaction-plugin/configuration.mdx | 6 +- 7 files changed, 282 insertions(+), 32 deletions(-) diff --git a/crates/cli/tests/coverage/plugins_tests.rs b/crates/cli/tests/coverage/plugins_tests.rs index 1c3fd373..657a85eb 100644 --- a/crates/cli/tests/coverage/plugins_tests.rs +++ b/crates/cli/tests/coverage/plugins_tests.rs @@ -814,6 +814,84 @@ fn editor_save_preserves_unknown_nemo_guardrails_fields_and_sections() { assert_eq!(request_defaults["rails"]["future_rails"], json!("preserve")); } +#[test] +fn editor_save_preserves_unknown_pii_redaction_fields_and_prunes_version() { + let mut config = PluginConfig { + components: vec![PluginComponentSpec { + kind: "pii_redaction".to_string(), + enabled: true, + config: json!({ + "version": 1, + "future_top_level": "preserve", + "mode": "builtin", + "codec": "openai_chat", + "builtin": { + "action": "mask", + "detector": "email", + "target_paths": ["/message"], + "future_builtin": "preserve" + }, + "local": { + "future_local": "preserve" + } + }) + .as_object() + .unwrap() + .clone(), + }], + ..PluginConfig::default() + }; + + let mut pii_redaction = component_pii_redaction_state(&config).unwrap(); + let schema = PiiRedactionConfig::editor_schema(); + let 
builtin = schema.field("builtin").unwrap(); + + set_struct_field(&mut pii_redaction.config, "mode", json!("builtin")).unwrap(); + set_struct_field(&mut pii_redaction.config, "codec", json!("openai_chat")).unwrap(); + set_section_field( + &mut pii_redaction.config, + builtin, + "action", + json!("redact"), + ) + .unwrap(); + set_section_field( + &mut pii_redaction.config, + builtin, + "detector", + json!("bearer_token"), + ) + .unwrap(); + set_section_field( + &mut pii_redaction.config, + builtin, + "replacement", + json!("[REDACTED]"), + ) + .unwrap(); + + pii_redaction.set_enabled(false); + store_pii_redaction_state(&mut config, &pii_redaction).unwrap(); + + let component = config + .components + .iter() + .find(|component| component.kind == "pii_redaction") + .unwrap(); + assert!(!component.enabled); + assert!(!component.config.contains_key("version")); + assert_eq!( + component.config.get("future_top_level"), + Some(&json!("preserve")) + ); + let builtin = component.config["builtin"].as_object().unwrap(); + assert_eq!(builtin.get("action"), Some(&json!("redact"))); + assert_eq!(builtin.get("detector"), Some(&json!("bearer_token"))); + assert_eq!(builtin.get("future_builtin"), Some(&json!("preserve"))); + let local = component.config["local"].as_object().unwrap(); + assert_eq!(local.get("future_local"), Some(&json!("preserve"))); +} + #[test] fn adaptive_config_field_reset_handles_optional_and_default_fields() { let mut adaptive = AdaptiveConfig { diff --git a/crates/pii-redaction/src/builtin.rs b/crates/pii-redaction/src/builtin.rs index 94259235..4d5068ae 100644 --- a/crates/pii-redaction/src/builtin.rs +++ b/crates/pii-redaction/src/builtin.rs @@ -135,6 +135,12 @@ impl CompiledBuiltinBackend { value: Json, path_segments: &mut Vec, ) -> Option { + if self.matches_current_preorder_path(path_segments) + && matches!(self.action, BuiltinAction::Remove) + { + return None; + } + match value { Json::String(text) => { if 
self.matches_current_preorder_path(path_segments) { @@ -168,15 +174,7 @@ impl CompiledBuiltinBackend { }) .collect(), )), - other => { - if self.matches_current_preorder_path(path_segments) - && matches!(self.action, BuiltinAction::Remove) - { - None - } else { - Some(other) - } - } + other => Some(other), } } diff --git a/crates/pii-redaction/src/detectors.rs b/crates/pii-redaction/src/detectors.rs index baba4900..28043926 100644 --- a/crates/pii-redaction/src/detectors.rs +++ b/crates/pii-redaction/src/detectors.rs @@ -245,19 +245,25 @@ fn mask_api_key(text: &str, mask_char: &str) -> String { } fn mask_ip_address(text: &str, mask_char: &str) -> String { - let mut octets = text.split('.').collect::>(); + let mut octets = text + .split('.') + .map(std::borrow::ToOwned::to_owned) + .collect::>(); if octets.len() != 4 { return mask_text(text, mask_char, 0, 0); } for octet in octets.iter_mut().take(3) { - *octet = "***"; + *octet = mask_char.repeat(3); } octets.join(".") } fn mask_ipv6(text: &str, mask_char: &str) -> String { - let mut segments = text.split(':').collect::>(); + let mut segments = text + .split(':') + .map(std::borrow::ToOwned::to_owned) + .collect::>(); if segments.len() < 3 { return mask_text(text, mask_char, 0, 0); } @@ -265,7 +271,7 @@ fn mask_ipv6(text: &str, mask_char: &str) -> String { let visible_tail_start = segments.len().saturating_sub(1); for segment in segments.iter_mut().take(visible_tail_start) { if !segment.is_empty() { - *segment = "****"; + *segment = mask_char.repeat(4); } } segments.join(":") diff --git a/crates/pii-redaction/src/local.rs b/crates/pii-redaction/src/local.rs index fe763a83..12fbadb8 100644 --- a/crates/pii-redaction/src/local.rs +++ b/crates/pii-redaction/src/local.rs @@ -7,7 +7,8 @@ use nemo_relay::plugin::{PluginError, PluginRegistrationContext, Result as Plugi use super::component::PiiRedactionConfig; -type LocalBackendProvider = Arc< +#[doc(hidden)] +pub type LocalBackendProvider = Arc< dyn Fn(PiiRedactionConfig, 
&mut PluginRegistrationContext) -> PluginResult<()> + Send + Sync, >; diff --git a/crates/pii-redaction/src/overlay.rs b/crates/pii-redaction/src/overlay.rs index a1e96720..0de0911c 100644 --- a/crates/pii-redaction/src/overlay.rs +++ b/crates/pii-redaction/src/overlay.rs @@ -126,6 +126,8 @@ fn overlay_openai_chat_tool_calls( return; }; + raw_calls.truncate(tool_calls.len()); + for (raw_call, sanitized_call) in raw_calls.iter_mut().zip(tool_calls.iter()) { let Some(raw_call) = raw_call.as_object_mut() else { continue; @@ -144,26 +146,27 @@ fn overlay_openai_chat_tool_calls( } fn overlay_openai_responses_tool_calls( - items: &mut [Json], + items: &mut Vec, tool_calls: Option<&[ResponseToolCall]>, ) { let Some(tool_calls) = tool_calls else { + items.retain(|item| item.get("type").and_then(Json::as_str) != Some("function_call")); return; }; let mut sanitized_calls = tool_calls.iter(); - for item in items { + items.retain_mut(|item| { let Some(item_type) = item.get("type").and_then(Json::as_str) else { - continue; + return true; }; if item_type != "function_call" { - continue; + return true; } let Some(raw_call) = item.as_object_mut() else { - continue; + return true; }; let Some(sanitized_call) = sanitized_calls.next() else { - break; + return false; }; set_optional_string_field(raw_call, "call_id", Some(sanitized_call.id.as_str())); set_optional_string_field(raw_call, "name", Some(sanitized_call.name.as_str())); @@ -172,32 +175,35 @@ fn overlay_openai_responses_tool_calls( "arguments", Some(json_string(&sanitized_call.arguments).as_str()), ); - } + true + }); } -fn overlay_anthropic_tool_calls(blocks: &mut [Json], tool_calls: Option<&[ResponseToolCall]>) { +fn overlay_anthropic_tool_calls(blocks: &mut Vec, tool_calls: Option<&[ResponseToolCall]>) { let Some(tool_calls) = tool_calls else { + blocks.retain(|block| block.get("type").and_then(Json::as_str) != Some("tool_use")); return; }; let mut sanitized_calls = tool_calls.iter(); - for block in blocks { + 
blocks.retain_mut(|block| { let Some(block_type) = block.get("type").and_then(Json::as_str) else { - continue; + return true; }; if block_type != "tool_use" { - continue; + return true; } let Some(raw_call) = block.as_object_mut() else { - continue; + return true; }; let Some(sanitized_call) = sanitized_calls.next() else { - break; + return false; }; set_optional_string_field(raw_call, "id", Some(sanitized_call.id.as_str())); set_optional_string_field(raw_call, "name", Some(sanitized_call.name.as_str())); raw_call.insert("input".into(), sanitized_call.arguments.clone()); - } + true + }); } fn overlay_output_text_blocks(items: &mut [Json], message_text: Option) { @@ -322,3 +328,72 @@ fn anthropic_stop_reason(reason: &FinishReason) -> &str { FinishReason::Unknown(other) => other.as_str(), } } + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + fn tool_call(id: &str, name: &str, arguments: Json) -> ResponseToolCall { + ResponseToolCall { + id: id.to_string(), + name: name.to_string(), + arguments, + } + } + + #[test] + fn openai_chat_overlay_truncates_extra_raw_tool_calls() { + let mut message = json!({ + "tool_calls": [ + {"id": "call_1", "function": {"name": "one", "arguments": "{\"secret\":\"raw-1\"}"}}, + {"id": "call_2", "function": {"name": "two", "arguments": "{\"secret\":\"raw-2\"}"}} + ] + }) + .as_object() + .unwrap() + .clone(); + + overlay_openai_chat_tool_calls( + &mut message, + Some(&[tool_call("call_1", "one", json!({"secret": "[REDACTED]"}))]), + ); + + let calls = message["tool_calls"].as_array().unwrap(); + assert_eq!(calls.len(), 1); + assert_eq!( + calls[0]["function"]["arguments"], + json!("{\"secret\":\"[REDACTED]\"}") + ); + } + + #[test] + fn openai_responses_overlay_removes_extra_function_calls() { + let mut items = vec![ + json!({"type": "message", "content": [{"type": "output_text", "text": "ok"}]}), + json!({"type": "function_call", "call_id": "call_1", "name": "one", "arguments": "{\"secret\":\"raw-1\"}"}), + 
json!({"type": "function_call", "call_id": "call_2", "name": "two", "arguments": "{\"secret\":\"raw-2\"}"}), + ]; + + overlay_openai_responses_tool_calls( + &mut items, + Some(&[tool_call("call_1", "one", json!({"secret": "[REDACTED]"}))]), + ); + + assert_eq!(items.len(), 2); + assert_eq!(items[1]["type"], json!("function_call")); + assert_eq!(items[1]["arguments"], json!("{\"secret\":\"[REDACTED]\"}")); + } + + #[test] + fn anthropic_overlay_removes_tool_use_blocks_when_no_sanitized_calls_exist() { + let mut blocks = vec![ + json!({"type": "text", "text": "hello"}), + json!({"type": "tool_use", "id": "call_1", "name": "one", "input": {"secret": "raw-1"}}), + ]; + + overlay_anthropic_tool_calls(&mut blocks, None); + + assert_eq!(blocks, vec![json!({"type": "text", "text": "hello"})]); + } +} diff --git a/crates/pii-redaction/tests/unit/component_tests.rs b/crates/pii-redaction/tests/unit/component_tests.rs index c78cb1b8..8e9f0528 100644 --- a/crates/pii-redaction/tests/unit/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -438,6 +438,54 @@ fn builtin_remove_deletes_object_fields_and_nulls_array_or_root_targets() { clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_remove_deletes_targeted_object_and_array_container_fields() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "remove", + "target_paths": ["/nested", "/items"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-remove-container-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("search") + .args(json!({ + "nested": { + "keep": "yes", + "remove_me": "gone" + }, + "items": ["a", "b", "c"], + "public": "ok" + })) + .build(), + ) + .unwrap(); + + let 
captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "public": "ok" + })) + ); + + deregister_subscriber("pii-redaction-remove-container-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_redact_replaces_matching_tool_payload_substrings_with_default_token() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); @@ -1021,11 +1069,12 @@ fn builtin_mask_with_credit_card_detector_preserves_last_four_digits() { .unwrap(); let events = capture_events("pii-redaction-credit-card-default-mask-events"); + let credit_card = ["4111", "1111", "1111", "1234"].join(" "); let _handle = tool_call( ToolCallParams::builder() .name("notify") .args(json!({ - "card": "4111 1111 1111 1234", + "card": credit_card, "keep": "unchanged" })) .build(), @@ -1046,6 +1095,49 @@ fn builtin_mask_with_credit_card_detector_preserves_last_four_digits() { clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_mask_with_ip_detector_honors_custom_mask_char() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "ip_address", + "mask_char": "#", + "target_paths": ["/ip"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-ip-custom-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "ip": "10.20.30.40" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input().unwrap()["ip"], + json!("###.###.###.40") + ); + + 
deregister_subscriber("pii-redaction-ip-custom-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_mask_with_jwt_detector_preserves_header_and_signature_tail() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); diff --git a/docs/pii-redaction-plugin/configuration.mdx b/docs/pii-redaction-plugin/configuration.mdx index 9fb6b6d7..96abba34 100644 --- a/docs/pii-redaction-plugin/configuration.mdx +++ b/docs/pii-redaction-plugin/configuration.mdx @@ -66,9 +66,9 @@ At least one managed redaction surface must be enabled. | Managed `tool_output` | Supported | Extension point only in this PR | | Built-in actions | `remove`, `redact`, `regex_replace`, `hash`, `mask` | N/A | | Codec support | `openai_chat`, `openai_responses`, `anthropic_messages` | Runtime-specific future implementation | -| Runtime availability | Any runtime that includes the built-in core plugin | Runtimes that install a local backend provider | +| Runtime availability | Any runtime that includes the `nemo-relay-pii-redaction` plugin crate | Runtimes that install a local backend provider | -## Builtin Mode +## Built-In Mode Use `builtin` mode when NeMo Relay should sanitize emitted observability payloads with a deterministic first-party backend. @@ -155,7 +155,7 @@ discarded by the interactive edit flow. If you find yourself needing callback code instead of editor/config fields, it is a sign that raw middleware may be the better fit for that specific policy. 
-## Builtin Settings +## Built-In Settings The `builtin` section contains: From 30859c109fe47c5a5866c20f42a985ed7324fcfe Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Mon, 8 Jun 2026 13:43:52 -0700 Subject: [PATCH 09/35] docs: fix attribution markdown formatting Signed-off-by: Alex Fournier --- ATTRIBUTIONS-Rust.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ATTRIBUTIONS-Rust.md b/ATTRIBUTIONS-Rust.md index 6e064658..5e812dd4 100644 --- a/ATTRIBUTIONS-Rust.md +++ b/ATTRIBUTIONS-Rust.md @@ -32414,10 +32414,13 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ``` ## sha2 - 0.10.9 + **Repository URL**: https://github.com/RustCrypto/hashes **License Type(s)**: Apache-2.0 + ### License: https://spdx.org/licenses/Apache-2.0.html -``` + +```text Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ From 11502554350aad1a8d052660fd2b57db09c28e26 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Mon, 8 Jun 2026 13:58:11 -0700 Subject: [PATCH 10/35] docs: regenerate rust attributions Signed-off-by: Alex Fournier --- ATTRIBUTIONS-Rust.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ATTRIBUTIONS-Rust.md b/ATTRIBUTIONS-Rust.md index 5e812dd4..6e064658 100644 --- a/ATTRIBUTIONS-Rust.md +++ b/ATTRIBUTIONS-Rust.md @@ -32414,13 +32414,10 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ``` ## sha2 - 0.10.9 - **Repository URL**: https://github.com/RustCrypto/hashes **License Type(s)**: Apache-2.0 - ### License: https://spdx.org/licenses/Apache-2.0.html - -```text +``` Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ From 234c4dea7241f4472ecd2773ff8468c5d401d247 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Tue, 9 Jun 2026 08:52:20 -0700 Subject: [PATCH 11/35] fix: tighten secret detector heuristics Signed-off-by: Alex Fournier --- crates/pii-redaction/src/detectors.rs | 4 +- .../tests/unit/component_tests.rs | 79 
+++++++++++++++++++ docs/pii-redaction-plugin/configuration.mdx | 4 + 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/crates/pii-redaction/src/detectors.rs b/crates/pii-redaction/src/detectors.rs index 28043926..2a50ff9d 100644 --- a/crates/pii-redaction/src/detectors.rs +++ b/crates/pii-redaction/src/detectors.rs @@ -85,7 +85,7 @@ const BUILTIN_DETECTOR_SPECS: &[BuiltinDetectorSpec] = &[ detector: BuiltinDetector::BearerToken, name: "bearer_token", category: BuiltinDetectorCategory::StructuredSecret, - regex_pattern: r"(?i)\bBearer\s+[A-Za-z0-9._~+/\-]+=*\b", + regex_pattern: r"(?i)\bBearer\s+[A-Za-z0-9._~+/\-]{12,}={0,2}\b", }, BuiltinDetectorSpec { detector: BuiltinDetector::Jwt, @@ -121,7 +121,7 @@ const BUILTIN_DETECTOR_SPECS: &[BuiltinDetectorSpec] = &[ detector: BuiltinDetector::AzureStorageAccountKey, name: "azure_storage_account_key", category: BuiltinDetectorCategory::CloudCredential, - regex_pattern: r"\b[A-Za-z0-9+/]{86}==\b", + regex_pattern: r"\b[A-Za-z0-9+/]{86}==", }, ]; diff --git a/crates/pii-redaction/tests/unit/component_tests.rs b/crates/pii-redaction/tests/unit/component_tests.rs index 8e9f0528..47770048 100644 --- a/crates/pii-redaction/tests/unit/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -1047,6 +1047,53 @@ fn builtin_mask_with_bearer_token_detector_preserves_scheme_and_last_four() { clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_bearer_token_detector_ignores_short_benign_values() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "redact", + "detector": "bearer_token", + "target_paths": ["/auth"] + } + })))) + .unwrap(); + + let events = 
capture_events("pii-redaction-bearer-short-benign-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "auth": "Bearer token", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "auth": "Bearer token", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-bearer-short-benign-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_mask_with_credit_card_detector_preserves_last_four_digits() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); @@ -1262,6 +1309,38 @@ fn builtin_mask_with_cloud_key_detectors_preserves_expected_segments() { ); deregister_subscriber("pii-redaction-gcp-key-mask-events").unwrap(); clear_plugin_configuration().unwrap(); + + reset_runtime(); + setup_isolated_thread(); + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "mask", + "detector": "azure_storage_account_key", + "target_paths": ["/key"] + } + })))) + .unwrap(); + let events = capture_events("pii-redaction-azure-storage-key-mask-events"); + let azure_key = format!("{}==", "A".repeat(86)); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({"key": azure_key})) + .build(), + ) + .unwrap(); + assert_eq!( + captured_events_snapshot(&events)[0].input(), + Some(&json!({"key": mask_text(&azure_key, "*", 0, 4)})) + ); + deregister_subscriber("pii-redaction-azure-storage-key-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); } #[test] diff --git a/docs/pii-redaction-plugin/configuration.mdx b/docs/pii-redaction-plugin/configuration.mdx index 96abba34..41fdc53a 100644 --- 
a/docs/pii-redaction-plugin/configuration.mdx +++ b/docs/pii-redaction-plugin/configuration.mdx @@ -293,6 +293,10 @@ Structured secrets: - `jwt` - `credit_card` + `bearer_token` is heuristic rather than vendor-specific. It can still match + benign bearer-style values, so prefer a narrower detector when you know the + credential family. + Cloud credentials: - `aws_access_key_id` - `aws_secret_access_key` From 35e5c46a0057fb94cb58ed0229e86e6c5d41917f Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Tue, 9 Jun 2026 09:15:45 -0700 Subject: [PATCH 12/35] feat: add pii redaction binding helpers Signed-off-by: Alex Fournier --- crates/node/package.json | 4 + crates/node/pii_redaction.d.ts | 66 ++++++ crates/node/pii_redaction.js | 71 +++++++ crates/node/tests/pii_redaction_tests.mjs | 49 +++++ crates/wasm/scripts/prepare_pkg.mjs | 12 +- crates/wasm/tests-js/index_loader_tests.mjs | 3 + crates/wasm/tests-js/pii_redaction_tests.mjs | 64 ++++++ crates/wasm/wrappers/esm/pii_redaction.d.ts | 66 ++++++ crates/wasm/wrappers/esm/pii_redaction.js | 61 ++++++ crates/wasm/wrappers/nodejs/pii_redaction.js | 71 +++++++ python/nemo_relay/__init__.py | 2 + python/nemo_relay/__init__.pyi | 1 + python/nemo_relay/pii_redaction.py | 199 +++++++++++++++++++ python/nemo_relay/pii_redaction.pyi | 82 ++++++++ python/tests/test_pii_redaction_plugin.py | 68 +++++++ 15 files changed, 817 insertions(+), 2 deletions(-) create mode 100644 crates/node/pii_redaction.d.ts create mode 100644 crates/node/pii_redaction.js create mode 100644 crates/node/tests/pii_redaction_tests.mjs create mode 100644 crates/wasm/tests-js/pii_redaction_tests.mjs create mode 100644 crates/wasm/wrappers/esm/pii_redaction.d.ts create mode 100644 crates/wasm/wrappers/esm/pii_redaction.js create mode 100644 crates/wasm/wrappers/nodejs/pii_redaction.js create mode 100644 python/nemo_relay/pii_redaction.py create mode 100644 python/nemo_relay/pii_redaction.pyi create mode 100644 python/tests/test_pii_redaction_plugin.py diff 
--git a/crates/node/package.json b/crates/node/package.json index 2fe3a8fd..0f1811cf 100644 --- a/crates/node/package.json +++ b/crates/node/package.json @@ -44,6 +44,10 @@ "./observability": { "types": "./observability.d.ts", "default": "./observability.js" + }, + "./pii_redaction": { + "types": "./pii_redaction.d.ts", + "default": "./pii_redaction.js" } }, "engines": { diff --git a/crates/node/pii_redaction.d.ts b/crates/node/pii_redaction.d.ts new file mode 100644 index 00000000..24824d46 --- /dev/null +++ b/crates/node/pii_redaction.d.ts @@ -0,0 +1,66 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { ConfigDiagnostic, ConfigReport } from './plugin.js'; + +export { ConfigDiagnostic, ConfigReport }; + +export interface ConfigPolicy { + unknown_field?: 'ignore' | 'warn' | 'error' | string; + unsupported_value?: 'ignore' | 'warn' | 'error' | string; +} + +export interface BuiltinConfig { + action?: 'remove' | 'redact' | 'regex_replace' | 'hash' | 'mask' | string; + target_paths?: string[]; + pattern?: string; + detector?: string; + replacement?: string; + mask_char?: string; + unmasked_prefix?: number; + unmasked_suffix?: number; +} + +export interface LocalModelConfig { + backend?: string; + model_id?: string; + detector_profile?: string; + allow_network?: boolean; + max_latency_ms?: number; +} + +export interface Config { + version?: number; + mode?: 'builtin' | 'local_model' | string; + input?: boolean; + output?: boolean; + tool_input?: boolean; + tool_output?: boolean; + priority?: number; + codec?: 'openai_chat' | 'openai_responses' | 'anthropic_messages' | string; + builtin?: BuiltinConfig; + local?: LocalModelConfig; + policy?: ConfigPolicy; +} + +export interface ComponentSpec { + kind: 'pii_redaction'; + enabled?: boolean; + config: Config; +} + +/** Top-level plugin kind used by the built-in PII redaction component. 
*/ +export declare const PII_REDACTION_PLUGIN_KIND: 'pii_redaction'; +/** Create a default PII redaction component config. */ +export declare function defaultConfig(): Config; +/** Create deterministic built-in redaction backend settings with defaults applied. */ +export declare function builtinConfig(config?: BuiltinConfig): BuiltinConfig; +/** Create future local-model backend settings with defaults applied. */ +export declare function localModelConfig(config?: LocalModelConfig): LocalModelConfig; +/** Wrap PII redaction config as a top-level plugin component. */ +export declare function ComponentSpec( + config: Config, + options?: { + enabled?: boolean; + }, +): import('./plugin.js').ComponentSpec; diff --git a/crates/node/pii_redaction.js b/crates/node/pii_redaction.js new file mode 100644 index 00000000..192bfebc --- /dev/null +++ b/crates/node/pii_redaction.js @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +'use strict'; + +const plugin = require('./plugin.js'); + +const PII_REDACTION_PLUGIN_KIND = 'pii_redaction'; + +/** + * Create a default PII redaction component config. + * + * @returns {object} The minimal PII redaction config with schema version 1. + */ +function defaultConfig() { + return { + version: 1, + mode: 'builtin', + input: true, + output: true, + tool_input: true, + tool_output: true, + priority: 100, + }; +} + +/** + * Create deterministic built-in redaction backend settings with defaults applied. + * + * @param {object} [config={}] - Partial built-in settings to override. + * @returns {object} A normalized built-in backend config object. + */ +function builtinConfig(config = {}) { + return { + action: 'redact', + ...config, + }; +} + +/** + * Create future local-model backend settings with defaults applied. + * + * @param {object} [config={}] - Partial local-model settings to override. 
+ * @returns {object} A normalized local-model backend config object. + */ +function localModelConfig(config = {}) { + return { + ...config, + }; +} + +/** + * Wrap PII redaction config as a top-level plugin component. + * + * @param {object} config - PII redaction component configuration document. + * @param {{ enabled?: boolean }} [options={}] - Optional component-level flags. + * @returns {object} A plugin component spec for the PII redaction plugin. + */ +function ComponentSpec(config, { enabled = true } = {}) { + return plugin.ComponentSpec(PII_REDACTION_PLUGIN_KIND, config, { + enabled, + }); +} + +module.exports = { + PII_REDACTION_PLUGIN_KIND, + defaultConfig, + builtinConfig, + localModelConfig, + ComponentSpec, +}; diff --git a/crates/node/tests/pii_redaction_tests.mjs b/crates/node/tests/pii_redaction_tests.mjs new file mode 100644 index 00000000..10def956 --- /dev/null +++ b/crates/node/tests/pii_redaction_tests.mjs @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const plugin = require('../plugin.js'); +const piiRedaction = require('../pii_redaction.js'); + +describe('pii_redaction plugin helpers', () => { + it('builds defaults and plugin component shape', () => { + assert.deepEqual(piiRedaction.defaultConfig(), { + version: 1, + mode: 'builtin', + input: true, + output: true, + tool_input: true, + tool_output: true, + priority: 100, + }); + assert.deepEqual(piiRedaction.builtinConfig(), { action: 'redact' }); + assert.deepEqual(piiRedaction.localModelConfig(), {}); + + const component = piiRedaction.ComponentSpec({ + ...piiRedaction.defaultConfig(), + builtin: piiRedaction.builtinConfig({ detector: 'email' }), + }); + assert.equal(component.kind, piiRedaction.PII_REDACTION_PLUGIN_KIND); + assert.equal(component.enabled, true); + }); + + it('lists builtin pii_redaction kind and validates bad values', () => { + assert.equal(plugin.listKinds().includes(piiRedaction.PII_REDACTION_PLUGIN_KIND), true); + const report = plugin.validate({ + version: 1, + components: [ + piiRedaction.ComponentSpec({ + ...piiRedaction.defaultConfig(), + input: false, + output: false, + builtin: piiRedaction.builtinConfig({ action: 'mask', detector: 'not_a_detector' }), + }), + ], + }); + assert.deepEqual(report.diagnostics.map((diagnostic) => diagnostic.field), ['builtin.detector']); + }); +}); diff --git a/crates/wasm/scripts/prepare_pkg.mjs b/crates/wasm/scripts/prepare_pkg.mjs index 15f1c781..22b88ce1 100644 --- a/crates/wasm/scripts/prepare_pkg.mjs +++ b/crates/wasm/scripts/prepare_pkg.mjs @@ -12,8 +12,8 @@ const nodeJsWrapperDir = path.join(crateDir, 'wrappers', 'nodejs'); const pkgDir = process.argv[2] ? 
path.resolve(process.argv[2]) : path.join(crateDir, 'pkg'); const rootJsFiles = ['index.js']; -const jsWrapperFiles = ['typed.js', 'plugin.js', 'adaptive.js', 'observability.js']; -const typeWrapperFiles = ['typed.d.ts', 'plugin.d.ts', 'adaptive.d.ts', 'observability.d.ts']; +const jsWrapperFiles = ['typed.js', 'plugin.js', 'adaptive.js', 'observability.js', 'pii_redaction.js']; +const typeWrapperFiles = ['typed.d.ts', 'plugin.d.ts', 'adaptive.d.ts', 'observability.d.ts', 'pii_redaction.d.ts']; const wrapperFiles = [...rootJsFiles, ...jsWrapperFiles, ...typeWrapperFiles]; const packageMetadata = { description: 'WebAssembly bindings for the NeMo Relay agent runtime.', @@ -103,6 +103,10 @@ function updatePackageManifest(manifest) { types: './observability.d.ts', default: './observability.js', }, + './pii_redaction': { + types: './pii_redaction.d.ts', + default: './pii_redaction.js', + }, './typed.js': { types: './typed.d.ts', default: './typed.js', @@ -119,6 +123,10 @@ function updatePackageManifest(manifest) { types: './observability.d.ts', default: './observability.js', }, + './pii_redaction.js': { + types: './pii_redaction.d.ts', + default: './pii_redaction.js', + }, }; fs.writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`); diff --git a/crates/wasm/tests-js/index_loader_tests.mjs b/crates/wasm/tests-js/index_loader_tests.mjs index cf930a17..017c5afb 100644 --- a/crates/wasm/tests-js/index_loader_tests.mjs +++ b/crates/wasm/tests-js/index_loader_tests.mjs @@ -17,6 +17,7 @@ test('WebAssembly generated package exposes the expected package metadata', () = assert.equal(packageJson.exports['./typed'].default, './typed.js'); assert.equal(packageJson.exports['./plugin'].default, './plugin.js'); assert.equal(packageJson.exports['./adaptive'].default, './adaptive.js'); + assert.equal(packageJson.exports['./pii_redaction'].default, './pii_redaction.js'); assert.equal(packageJson.exports['./typed.js'].default, './typed.js'); assert.equal(typeof 
wasm.ScopeType.Agent, 'number'); assert.equal(wasm.ScopeType.Agent, 0); @@ -32,6 +33,8 @@ test('WebAssembly generated package includes the expected wrapper files', () => 'plugin.d.ts', 'adaptive.js', 'adaptive.d.ts', + 'pii_redaction.js', + 'pii_redaction.d.ts', ]; for (const fileName of expectedFiles) { diff --git a/crates/wasm/tests-js/pii_redaction_tests.mjs b/crates/wasm/tests-js/pii_redaction_tests.mjs new file mode 100644 index 00000000..21b21901 --- /dev/null +++ b/crates/wasm/tests-js/pii_redaction_tests.mjs @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import assert from 'node:assert/strict'; +import { test } from 'node:test'; + +import * as piiRedaction from '../pkg/pii_redaction.js'; +import * as plugin from '../pkg/plugin.js'; + +test('WebAssembly pii_redaction wrappers expose helper defaults', () => { + assert.deepEqual(piiRedaction.defaultConfig(), { + version: 1, + mode: 'builtin', + input: true, + output: true, + tool_input: true, + tool_output: true, + priority: 100, + }); + assert.deepEqual(piiRedaction.builtinConfig(), { + action: 'redact', + }); + assert.deepEqual(piiRedaction.localModelConfig(), {}); +}); + +test('WebAssembly pii_redaction wrappers build component specs and validate bad values', () => { + assert.equal(plugin.listKinds().includes(piiRedaction.PII_REDACTION_PLUGIN_KIND), true); + + const component = piiRedaction.ComponentSpec({ + ...piiRedaction.defaultConfig(), + builtin: piiRedaction.builtinConfig({ detector: 'email' }), + }); + + assert.deepEqual(component, { + kind: 'pii_redaction', + enabled: true, + config: { + version: 1, + mode: 'builtin', + input: true, + output: true, + tool_input: true, + tool_output: true, + priority: 100, + builtin: { + action: 'redact', + detector: 'email', + }, + }, + }); + + const report = plugin.validate({ + version: 1, + components: [ + piiRedaction.ComponentSpec({ + 
...piiRedaction.defaultConfig(), + input: false, + output: false, + builtin: piiRedaction.builtinConfig({ action: 'mask', detector: 'not_a_detector' }), + }), + ], + }); + assert.deepEqual(report.diagnostics.map((diagnostic) => diagnostic.field), ['builtin.detector']); +}); diff --git a/crates/wasm/wrappers/esm/pii_redaction.d.ts b/crates/wasm/wrappers/esm/pii_redaction.d.ts new file mode 100644 index 00000000..24824d46 --- /dev/null +++ b/crates/wasm/wrappers/esm/pii_redaction.d.ts @@ -0,0 +1,66 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { ConfigDiagnostic, ConfigReport } from './plugin.js'; + +export { ConfigDiagnostic, ConfigReport }; + +export interface ConfigPolicy { + unknown_field?: 'ignore' | 'warn' | 'error' | string; + unsupported_value?: 'ignore' | 'warn' | 'error' | string; +} + +export interface BuiltinConfig { + action?: 'remove' | 'redact' | 'regex_replace' | 'hash' | 'mask' | string; + target_paths?: string[]; + pattern?: string; + detector?: string; + replacement?: string; + mask_char?: string; + unmasked_prefix?: number; + unmasked_suffix?: number; +} + +export interface LocalModelConfig { + backend?: string; + model_id?: string; + detector_profile?: string; + allow_network?: boolean; + max_latency_ms?: number; +} + +export interface Config { + version?: number; + mode?: 'builtin' | 'local_model' | string; + input?: boolean; + output?: boolean; + tool_input?: boolean; + tool_output?: boolean; + priority?: number; + codec?: 'openai_chat' | 'openai_responses' | 'anthropic_messages' | string; + builtin?: BuiltinConfig; + local?: LocalModelConfig; + policy?: ConfigPolicy; +} + +export interface ComponentSpec { + kind: 'pii_redaction'; + enabled?: boolean; + config: Config; +} + +/** Top-level plugin kind used by the built-in PII redaction component. 
*/ +export declare const PII_REDACTION_PLUGIN_KIND: 'pii_redaction'; +/** Create a default PII redaction component config. */ +export declare function defaultConfig(): Config; +/** Create deterministic built-in redaction backend settings with defaults applied. */ +export declare function builtinConfig(config?: BuiltinConfig): BuiltinConfig; +/** Create future local-model backend settings with defaults applied. */ +export declare function localModelConfig(config?: LocalModelConfig): LocalModelConfig; +/** Wrap PII redaction config as a top-level plugin component. */ +export declare function ComponentSpec( + config: Config, + options?: { + enabled?: boolean; + }, +): import('./plugin.js').ComponentSpec; diff --git a/crates/wasm/wrappers/esm/pii_redaction.js b/crates/wasm/wrappers/esm/pii_redaction.js new file mode 100644 index 00000000..4cd684c7 --- /dev/null +++ b/crates/wasm/wrappers/esm/pii_redaction.js @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import * as plugin from './plugin.js'; + +export const PII_REDACTION_PLUGIN_KIND = 'pii_redaction'; + +/** + * Create a default PII redaction component config. + * + * @returns {object} The minimal PII redaction config with schema version 1. + */ +export function defaultConfig() { + return { + version: 1, + mode: 'builtin', + input: true, + output: true, + tool_input: true, + tool_output: true, + priority: 100, + }; +} + +/** + * Create deterministic built-in redaction backend settings with defaults applied. + * + * @param {object} [config={}] - Partial built-in settings to override. + * @returns {object} A normalized built-in backend config object. + */ +export function builtinConfig(config = {}) { + return { + action: 'redact', + ...config, + }; +} + +/** + * Create future local-model backend settings with defaults applied. + * + * @param {object} [config={}] - Partial local-model settings to override. 
+ * @returns {object} A normalized local-model backend config object. + */ +export function localModelConfig(config = {}) { + return { + ...config, + }; +} + +/** + * Wrap PII redaction config as a top-level plugin component. + * + * @param {object} config - PII redaction component configuration document. + * @param {{ enabled?: boolean }} [options={}] - Optional component-level flags. + * @returns {object} A plugin component spec for the PII redaction plugin. + */ +export function ComponentSpec(config, { enabled = true } = {}) { + return plugin.ComponentSpec(PII_REDACTION_PLUGIN_KIND, config, { + enabled, + }); +} diff --git a/crates/wasm/wrappers/nodejs/pii_redaction.js b/crates/wasm/wrappers/nodejs/pii_redaction.js new file mode 100644 index 00000000..192bfebc --- /dev/null +++ b/crates/wasm/wrappers/nodejs/pii_redaction.js @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +'use strict'; + +const plugin = require('./plugin.js'); + +const PII_REDACTION_PLUGIN_KIND = 'pii_redaction'; + +/** + * Create a default PII redaction component config. + * + * @returns {object} The minimal PII redaction config with schema version 1. + */ +function defaultConfig() { + return { + version: 1, + mode: 'builtin', + input: true, + output: true, + tool_input: true, + tool_output: true, + priority: 100, + }; +} + +/** + * Create deterministic built-in redaction backend settings with defaults applied. + * + * @param {object} [config={}] - Partial built-in settings to override. + * @returns {object} A normalized built-in backend config object. + */ +function builtinConfig(config = {}) { + return { + action: 'redact', + ...config, + }; +} + +/** + * Create future local-model backend settings with defaults applied. + * + * @param {object} [config={}] - Partial local-model settings to override. + * @returns {object} A normalized local-model backend config object. 
+ */ +function localModelConfig(config = {}) { + return { + ...config, + }; +} + +/** + * Wrap PII redaction config as a top-level plugin component. + * + * @param {object} config - PII redaction component configuration document. + * @param {{ enabled?: boolean }} [options={}] - Optional component-level flags. + * @returns {object} A plugin component spec for the PII redaction plugin. + */ +function ComponentSpec(config, { enabled = true } = {}) { + return plugin.ComponentSpec(PII_REDACTION_PLUGIN_KIND, config, { + enabled, + }); +} + +module.exports = { + PII_REDACTION_PLUGIN_KIND, + defaultConfig, + builtinConfig, + localModelConfig, + ComponentSpec, +}; diff --git a/python/nemo_relay/__init__.py b/python/nemo_relay/__init__.py index 9326edc0..364dfaa8 100644 --- a/python/nemo_relay/__init__.py +++ b/python/nemo_relay/__init__.py @@ -16,6 +16,7 @@ - ``nemo_relay.plugin`` for global plugin configuration and custom plugin registration - ``nemo_relay.adaptive`` for adaptive component configuration helpers - ``nemo_relay.observability`` for observability component configuration helpers +- ``nemo_relay.pii_redaction`` for PII redaction component configuration helpers Top-level exports also include: @@ -190,6 +191,7 @@ async def main(): intercepts, llm, observability, + pii_redaction, plugin, scope, scope_local, diff --git a/python/nemo_relay/__init__.pyi b/python/nemo_relay/__init__.pyi index 902654ce..d5a25931 100644 --- a/python/nemo_relay/__init__.pyi +++ b/python/nemo_relay/__init__.pyi @@ -31,6 +31,7 @@ from nemo_relay import guardrails as guardrails from nemo_relay import intercepts as intercepts from nemo_relay import llm as llm from nemo_relay import observability as observability +from nemo_relay import pii_redaction as pii_redaction from nemo_relay import plugin as plugin from nemo_relay import scope as scope from nemo_relay import scope_local as scope_local diff --git a/python/nemo_relay/pii_redaction.py b/python/nemo_relay/pii_redaction.py new file mode 
100644 index 00000000..5339a1d0 --- /dev/null +++ b/python/nemo_relay/pii_redaction.py @@ -0,0 +1,199 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""PII redaction plugin configuration helpers.""" + +from __future__ import annotations + +from dataclasses import dataclass, field, fields, is_dataclass +from typing import Literal, Protocol, TypedDict, cast + +from nemo_relay import Json, JsonObject, UnsupportedBehavior +from nemo_relay import plugin as plugin_module + + +class _ConfigDiagnosticRequired(TypedDict): + level: Literal["warning", "error"] + code: str + message: str + + +class ConfigDiagnostic(_ConfigDiagnosticRequired, total=False): + """One PII redaction validation diagnostic.""" + + component: str + field: str + + +class ConfigReport(TypedDict): + """Validation report for PII redaction configuration.""" + + diagnostics: list[ConfigDiagnostic] + + +class _SupportsToDict(Protocol): + def to_dict(self) -> JsonObject: ... 
+ + +def _normalize(value: object) -> Json: + if hasattr(value, "to_dict"): + return cast(_SupportsToDict, value).to_dict() + if is_dataclass(value) and not isinstance(value, type): + return { + field_info.name: _normalize(field_value) + for field_info in fields(value) + if (field_value := getattr(value, field_info.name)) is not None + } + if isinstance(value, list): + return [_normalize(item) for item in value] + if isinstance(value, dict): + return {cast(str, key): _normalize(val) for key, val in value.items() if val is not None} + return cast(Json, value) + + +def _normalize_object(value: object) -> JsonObject: + return cast(JsonObject, _normalize(value)) + + +@dataclass(slots=True) +class ConfigPolicy: + """Policy for unsupported PII redaction configuration.""" + + unknown_field: UnsupportedBehavior = "warn" + unsupported_value: UnsupportedBehavior = "error" + + def to_dict(self) -> JsonObject: + """Serialize this policy to the canonical JSON object shape.""" + return { + "unknown_field": self.unknown_field, + "unsupported_value": self.unsupported_value, + } + + +@dataclass(slots=True) +class BuiltinConfig: + """Deterministic built-in redaction backend settings.""" + + action: Literal["remove", "redact", "regex_replace", "hash", "mask"] = "redact" + target_paths: list[str] = field(default_factory=list) + pattern: str | None = None + detector: str | None = None + replacement: str | None = None + mask_char: str | None = None + unmasked_prefix: int | None = None + unmasked_suffix: int | None = None + + def to_dict(self) -> JsonObject: + """Serialize this built-in backend config to the canonical JSON object shape.""" + return _normalize_object( + { + "action": self.action, + "target_paths": self.target_paths, + "pattern": self.pattern, + "detector": self.detector, + "replacement": self.replacement, + "mask_char": self.mask_char, + "unmasked_prefix": self.unmasked_prefix, + "unmasked_suffix": self.unmasked_suffix, + } + ) + + +@dataclass(slots=True) +class 
LocalModelConfig: + """Future local-model backend seam settings.""" + + backend: str | None = None + model_id: str | None = None + detector_profile: str | None = None + allow_network: bool | None = None + max_latency_ms: int | None = None + + def to_dict(self) -> JsonObject: + """Serialize this local-model config to the canonical JSON object shape.""" + return _normalize_object( + { + "backend": self.backend, + "model_id": self.model_id, + "detector_profile": self.detector_profile, + "allow_network": self.allow_network, + "max_latency_ms": self.max_latency_ms, + } + ) + + +@dataclass(slots=True) +class PiiRedactionConfig: + """Canonical config document for the top-level PII redaction component.""" + + version: int = 1 + mode: Literal["builtin", "local_model"] = "builtin" + input: bool = True + output: bool = True + tool_input: bool = True + tool_output: bool = True + priority: int = 100 + codec: Literal["openai_chat", "openai_responses", "anthropic_messages"] | str | None = None + builtin: BuiltinConfig | None = None + local: LocalModelConfig | None = None + policy: ConfigPolicy = field(default_factory=ConfigPolicy) + + def to_dict(self) -> JsonObject: + """Serialize this PII redaction config to the canonical JSON object shape.""" + return _normalize_object( + { + "version": self.version, + "mode": self.mode, + "input": self.input, + "output": self.output, + "tool_input": self.tool_input, + "tool_output": self.tool_output, + "priority": self.priority, + "codec": self.codec, + "builtin": self.builtin, + "local": self.local, + "policy": self.policy, + } + ) + + +PII_REDACTION_PLUGIN_KIND = "pii_redaction" + + +@dataclass(slots=True) +class ComponentSpec: + """Top-level PII redaction component wrapper.""" + + config: PiiRedactionConfig | JsonObject + enabled: bool = True + + def to_dict(self) -> JsonObject: + """Serialize this component to the canonical plugin shape.""" + return { + "kind": PII_REDACTION_PLUGIN_KIND, + "enabled": self.enabled, + "config": 
_normalize_object(self.config), + } + + +def validate_config(config: PiiRedactionConfig | JsonObject) -> ConfigReport: + """Validate a PII redaction config document without activating it.""" + report = plugin_module.validate( + plugin_module.PluginConfig( + components=[ComponentSpec(config)], + ) + ) + return cast(ConfigReport, report) + + +__all__ = [ + "BuiltinConfig", + "ComponentSpec", + "ConfigDiagnostic", + "ConfigPolicy", + "ConfigReport", + "LocalModelConfig", + "PII_REDACTION_PLUGIN_KIND", + "PiiRedactionConfig", + "validate_config", +] diff --git a/python/nemo_relay/pii_redaction.pyi b/python/nemo_relay/pii_redaction.pyi new file mode 100644 index 00000000..ef6d993a --- /dev/null +++ b/python/nemo_relay/pii_redaction.pyi @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Type stubs for ``nemo_relay.pii_redaction``.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal, TypedDict + +from nemo_relay import JsonObject, UnsupportedBehavior + + +class ConfigDiagnostic(TypedDict, total=False): + level: Literal["warning", "error"] + code: str + message: str + component: str + field: str + + +class ConfigReport(TypedDict): + diagnostics: list[ConfigDiagnostic] + + +@dataclass(slots=True) +class ConfigPolicy: + unknown_field: UnsupportedBehavior = ... + unsupported_value: UnsupportedBehavior = ... + def to_dict(self) -> JsonObject: ... + + +@dataclass(slots=True) +class BuiltinConfig: + action: Literal["remove", "redact", "regex_replace", "hash", "mask"] = ... + target_paths: list[str] = field(default_factory=list) + pattern: str | None = ... + detector: str | None = ... + replacement: str | None = ... + mask_char: str | None = ... + unmasked_prefix: int | None = ... + unmasked_suffix: int | None = ... + def to_dict(self) -> JsonObject: ... 
+ + +@dataclass(slots=True) +class LocalModelConfig: + backend: str | None = ... + model_id: str | None = ... + detector_profile: str | None = ... + allow_network: bool | None = ... + max_latency_ms: int | None = ... + def to_dict(self) -> JsonObject: ... + + +@dataclass(slots=True) +class PiiRedactionConfig: + version: int = ... + mode: Literal["builtin", "local_model"] = ... + input: bool = ... + output: bool = ... + tool_input: bool = ... + tool_output: bool = ... + priority: int = ... + codec: Literal["openai_chat", "openai_responses", "anthropic_messages"] | str | None = ... + builtin: BuiltinConfig | None = ... + local: LocalModelConfig | None = ... + policy: ConfigPolicy = field(default_factory=ConfigPolicy) + def to_dict(self) -> JsonObject: ... + + +PII_REDACTION_PLUGIN_KIND: Literal["pii_redaction"] + + +@dataclass(slots=True) +class ComponentSpec: + config: PiiRedactionConfig | JsonObject + enabled: bool = ... + def to_dict(self) -> JsonObject: ... + + +def validate_config(config: PiiRedactionConfig | JsonObject) -> ConfigReport: ... diff --git a/python/tests/test_pii_redaction_plugin.py b/python/tests/test_pii_redaction_plugin.py new file mode 100644 index 00000000..fabc834e --- /dev/null +++ b/python/tests/test_pii_redaction_plugin.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the built-in PII redaction plugin config helpers.""" + +from __future__ import annotations + +from nemo_relay import plugin +from nemo_relay.pii_redaction import ( + PII_REDACTION_PLUGIN_KIND, + BuiltinConfig, + ComponentSpec, + LocalModelConfig, + PiiRedactionConfig, + validate_config, +) + + +class TestPiiRedactionConfigHelpers: + def test_defaults_and_component_wrapper(self): + assert BuiltinConfig().to_dict() == { + "action": "redact", + "target_paths": [], + } + assert LocalModelConfig().to_dict() == {} + + wrapped = ComponentSpec(PiiRedactionConfig()).to_dict() + assert wrapped["kind"] == PII_REDACTION_PLUGIN_KIND + assert wrapped["enabled"] is True + wrapped_config = wrapped["config"] + assert isinstance(wrapped_config, dict) + assert wrapped_config["version"] == 1 + assert wrapped_config["mode"] == "builtin" + + def test_validation_rejects_bad_values(self): + report = validate_config( + PiiRedactionConfig( + input=False, + output=False, + builtin=BuiltinConfig( + action="mask", + detector="not_a_detector", + ) + ) + ) + assert any(diag.get("field") == "builtin.detector" for diag in report["diagnostics"]) + + def test_component_configures_plugin_validation(self): + report = plugin.validate( + plugin.PluginConfig( + components=[ + ComponentSpec( + PiiRedactionConfig( + input=False, + output=False, + builtin=BuiltinConfig( + action="mask", + detector="email", + ) + ) + ) + ] + ) + ) + assert report["diagnostics"] == [] + + def test_list_kinds_includes_builtin_pii_redaction(self): + assert PII_REDACTION_PLUGIN_KIND in plugin.list_kinds() From f2d8c8fe73df2fc65e47ff8fa2ac4cfaccdcbd67 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Tue, 9 Jun 2026 09:33:02 -0700 Subject: [PATCH 13/35] feat: expose pii redaction binding helpers Signed-off-by: Alex Fournier --- python/nemo_relay/_config_normalize.py | 37 +++++++++++++ python/nemo_relay/adaptive.py | 73 +++++++++++--------------- 
python/nemo_relay/observability.py | 65 +++++++++++------------ python/nemo_relay/pii_redaction.py | 48 ++++++----------- python/nemo_relay/plugin.py | 36 +++---------- 5 files changed, 121 insertions(+), 138 deletions(-) create mode 100644 python/nemo_relay/_config_normalize.py diff --git a/python/nemo_relay/_config_normalize.py b/python/nemo_relay/_config_normalize.py new file mode 100644 index 00000000..50a5c115 --- /dev/null +++ b/python/nemo_relay/_config_normalize.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Private helpers for normalizing config helper dataclasses to JSON-like values.""" + +from __future__ import annotations + +from dataclasses import fields, is_dataclass +from typing import Any, Protocol, cast + + +class SupportsToDict(Protocol): + """Private protocol for helper objects that provide ``to_dict()``.""" + + def to_dict(self) -> dict[str, Any]: ... 
+ + +def normalize(value: object) -> Any: + """Recursively normalize dataclasses, lists, and dicts into JSON-like values.""" + if hasattr(value, "to_dict"): + return cast(SupportsToDict, value).to_dict() + if is_dataclass(value) and not isinstance(value, type): + return { + field_info.name: normalize(field_value) + for field_info in fields(value) + if (field_value := getattr(value, field_info.name)) is not None + } + if isinstance(value, list): + return [normalize(item) for item in value] + if isinstance(value, dict): + return {cast(str, key): normalize(val) for key, val in value.items() if val is not None} + return value + + +def normalize_object(value: object) -> dict[str, Any]: + """Normalize a helper value and assert the result is mapping-shaped.""" + return cast(dict[str, Any], normalize(value)) diff --git a/python/nemo_relay/adaptive.py b/python/nemo_relay/adaptive.py index e4f37545..84a3fa78 100644 --- a/python/nemo_relay/adaptive.py +++ b/python/nemo_relay/adaptive.py @@ -9,10 +9,11 @@ from __future__ import annotations -from dataclasses import dataclass, field, fields, is_dataclass -from typing import Literal, Protocol, TypedDict, cast +from dataclasses import dataclass, field +from typing import Literal, TypedDict, cast -from nemo_relay import Json, JsonObject, UnsupportedBehavior +from nemo_relay import JsonObject, UnsupportedBehavior +from nemo_relay._config_normalize import normalize, normalize_object from nemo_relay._native import AdaptiveRuntime as AdaptiveRuntime from nemo_relay._native import build_cache_telemetry_event as _build_cache_telemetry_event from nemo_relay._native import set_latency_sensitivity as _set_latency_sensitivity @@ -38,30 +39,6 @@ class ConfigReport(TypedDict): diagnostics: list[ConfigDiagnostic] -class _SupportsToDict(Protocol): - def to_dict(self) -> JsonObject: ... 
- - -def _normalize(value: object) -> Json: - if hasattr(value, "to_dict"): - return cast(_SupportsToDict, value).to_dict() - if is_dataclass(value) and not isinstance(value, type): - return { - field_info.name: _normalize(field_value) - for field_info in fields(value) - if (field_value := getattr(value, field_info.name)) is not None - } - if isinstance(value, list): - return [_normalize(item) for item in value] - if isinstance(value, dict): - return {cast(str, key): _normalize(val) for key, val in value.items() if val is not None} - return cast(Json, value) - - -def _normalize_object(value: object) -> JsonObject: - return cast(JsonObject, _normalize(value)) - - @dataclass(slots=True) class ConfigPolicy: """Policy for unsupported adaptive configuration. @@ -109,7 +86,7 @@ def redis(url: str, key_prefix: str = "nemo_relay:") -> "BackendSpec": def to_dict(self) -> JsonObject: """Serialize this backend spec to the canonical JSON object shape.""" - return {"kind": self.kind, "config": _normalize_object(self.config)} + return {"kind": self.kind, "config": cast(JsonObject, normalize_object(self.config))} @dataclass(slots=True) @@ -125,7 +102,7 @@ class StateConfig: def to_dict(self) -> JsonObject: """Serialize this state config to the canonical JSON object shape.""" - return {"backend": _normalize_object(self.backend)} + return {"backend": cast(JsonObject, normalize_object(self.backend))} @dataclass(slots=True) @@ -142,11 +119,14 @@ class TelemetryConfig: def to_dict(self) -> JsonObject: """Serialize this telemetry config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "subscriber_name": self.subscriber_name, "learners": self.learners, } + ), ) @@ -168,13 +148,16 @@ class AdaptiveHintsConfig: def to_dict(self) -> JsonObject: """Serialize this adaptive-hints config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "priority": 
self.priority, "break_chain": self.break_chain, "inject_header": self.inject_header, "inject_body_path": self.inject_body_path, } + ), ) @@ -194,7 +177,7 @@ class ToolParallelismConfig: def to_dict(self) -> JsonObject: """Serialize this tool-parallelism config to the canonical JSON object shape.""" - return _normalize_object({"priority": self.priority, "mode": self.mode}) + return cast(JsonObject, normalize_object({"priority": self.priority, "mode": self.mode})) @dataclass(slots=True) @@ -214,12 +197,15 @@ class AcgStabilityThresholds: def to_dict(self) -> JsonObject: """Serialize these ACG stability thresholds to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "stable_threshold": self.stable_threshold, "semi_stable_threshold": self.semi_stable_threshold, "min_observations_for_full_confidence": self.min_observations_for_full_confidence, } + ), ) @@ -241,13 +227,16 @@ class AcgConfig: def to_dict(self) -> JsonObject: """Serialize this ACG config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "provider": self.provider, "observation_window": self.observation_window, "priority": self.priority, - "stability_thresholds": _normalize(self.stability_thresholds), + "stability_thresholds": normalize(self.stability_thresholds), } + ), ) @@ -284,11 +273,11 @@ def to_dict(self) -> JsonObject: return { "version": self.version, "agent_id": self.agent_id, - "state": _normalize(self.state), - "telemetry": _normalize(self.telemetry), - "adaptive_hints": _normalize(self.adaptive_hints), - "tool_parallelism": _normalize(self.tool_parallelism), - "acg": _normalize(self.acg), + "state": normalize(self.state), + "telemetry": normalize(self.telemetry), + "adaptive_hints": normalize(self.adaptive_hints), + "tool_parallelism": normalize(self.tool_parallelism), + "acg": normalize(self.acg), "policy": self.policy.to_dict(), } @@ -316,13 +305,13 @@ def 
to_dict(self) -> JsonObject: return { "kind": ADAPTIVE_PLUGIN_KIND, "enabled": self.enabled, - "config": _normalize_object(self.config), + "config": cast(JsonObject, normalize_object(self.config)), } def validate_config(config: AdaptiveConfig | JsonObject) -> ConfigReport: """Validate an adaptive config document without constructing a runtime.""" - return cast(ConfigReport, _validate_adaptive_config(_normalize_object(config))) + return cast(ConfigReport, _validate_adaptive_config(cast(JsonObject, normalize_object(config)))) def build_cache_telemetry_event( diff --git a/python/nemo_relay/observability.py b/python/nemo_relay/observability.py index a067f4cc..154d6057 100644 --- a/python/nemo_relay/observability.py +++ b/python/nemo_relay/observability.py @@ -5,34 +5,11 @@ from __future__ import annotations -from dataclasses import dataclass, field, fields, is_dataclass -from typing import Literal, Protocol, cast +from dataclasses import dataclass, field +from typing import Literal, cast -from nemo_relay import Json, JsonObject, UnsupportedBehavior - - -class _SupportsToDict(Protocol): - def to_dict(self) -> JsonObject: ... 
- - -def _normalize(value: object) -> Json: - if hasattr(value, "to_dict"): - return cast(_SupportsToDict, value).to_dict() - if is_dataclass(value) and not isinstance(value, type): - return { - field_info.name: _normalize(field_value) - for field_info in fields(value) - if (field_value := getattr(value, field_info.name)) is not None - } - if isinstance(value, list): - return [_normalize(item) for item in value] - if isinstance(value, dict): - return {cast(str, key): _normalize(val) for key, val in value.items() if val is not None} - return cast(Json, value) - - -def _normalize_object(value: object) -> JsonObject: - return cast(JsonObject, _normalize(value)) +from nemo_relay import JsonObject, UnsupportedBehavior +from nemo_relay._config_normalize import normalize_object @dataclass(slots=True) @@ -63,13 +40,16 @@ class AtofEndpointConfig: def to_dict(self) -> JsonObject: """Serialize this ATOF endpoint config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "url": self.url, "transport": self.transport, "headers": self.headers, "timeout_millis": self.timeout_millis, } + ), ) @@ -85,7 +65,9 @@ class AtofConfig: def to_dict(self) -> JsonObject: """Serialize this ATOF config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "enabled": self.enabled, "output_directory": self.output_directory, @@ -93,6 +75,7 @@ def to_dict(self) -> JsonObject: "mode": self.mode, "endpoints": self.endpoints, } + ), ) @@ -118,7 +101,9 @@ class S3StorageConfig: def to_dict(self) -> JsonObject: """Serialize this S3 storage config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "type": "s3", "bucket": self.bucket, @@ -130,6 +115,7 @@ def to_dict(self) -> JsonObject: "endpoint_url": self.endpoint_url, "allow_http": self.allow_http, } + ), ) @@ -144,7 +130,9 @@ class HttpStorageConfig: def 
to_dict(self) -> JsonObject: """Serialize this HTTP storage config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "type": "http", "endpoint": self.endpoint, @@ -152,6 +140,7 @@ def to_dict(self) -> JsonObject: "header_env": self.header_env, "timeout_millis": self.timeout_millis, } + ), ) @@ -184,7 +173,7 @@ def to_dict(self) -> JsonObject: } if value["agent_version"] is None: value.pop("agent_version") - return _normalize_object(value) + return cast(JsonObject, normalize_object(value)) @dataclass(slots=True) @@ -204,7 +193,9 @@ class OtlpConfig: def to_dict(self) -> JsonObject: """Serialize this OTLP config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "enabled": self.enabled, "transport": self.transport, @@ -217,6 +208,7 @@ def to_dict(self) -> JsonObject: "instrumentation_scope": self.instrumentation_scope, "timeout_millis": self.timeout_millis, } + ), ) @@ -233,7 +225,9 @@ class ObservabilityConfig: def to_dict(self) -> JsonObject: """Serialize this observability config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "version": self.version, "atof": self.atof, @@ -242,6 +236,7 @@ def to_dict(self) -> JsonObject: "openinference": self.openinference, "policy": self.policy, } + ), ) @@ -260,7 +255,7 @@ def to_dict(self) -> JsonObject: return { "kind": OBSERVABILITY_PLUGIN_KIND, "enabled": self.enabled, - "config": _normalize_object(self.config), + "config": cast(JsonObject, normalize_object(self.config)), } diff --git a/python/nemo_relay/pii_redaction.py b/python/nemo_relay/pii_redaction.py index 5339a1d0..4cad3332 100644 --- a/python/nemo_relay/pii_redaction.py +++ b/python/nemo_relay/pii_redaction.py @@ -5,10 +5,11 @@ from __future__ import annotations -from dataclasses import dataclass, field, fields, is_dataclass -from typing import Literal, Protocol, 
TypedDict, cast +from dataclasses import dataclass, field +from typing import Literal, TypedDict, cast -from nemo_relay import Json, JsonObject, UnsupportedBehavior +from nemo_relay import JsonObject, UnsupportedBehavior +from nemo_relay._config_normalize import normalize_object from nemo_relay import plugin as plugin_module @@ -31,30 +32,6 @@ class ConfigReport(TypedDict): diagnostics: list[ConfigDiagnostic] -class _SupportsToDict(Protocol): - def to_dict(self) -> JsonObject: ... - - -def _normalize(value: object) -> Json: - if hasattr(value, "to_dict"): - return cast(_SupportsToDict, value).to_dict() - if is_dataclass(value) and not isinstance(value, type): - return { - field_info.name: _normalize(field_value) - for field_info in fields(value) - if (field_value := getattr(value, field_info.name)) is not None - } - if isinstance(value, list): - return [_normalize(item) for item in value] - if isinstance(value, dict): - return {cast(str, key): _normalize(val) for key, val in value.items() if val is not None} - return cast(Json, value) - - -def _normalize_object(value: object) -> JsonObject: - return cast(JsonObject, _normalize(value)) - - @dataclass(slots=True) class ConfigPolicy: """Policy for unsupported PII redaction configuration.""" @@ -85,7 +62,9 @@ class BuiltinConfig: def to_dict(self) -> JsonObject: """Serialize this built-in backend config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "action": self.action, "target_paths": self.target_paths, @@ -96,6 +75,7 @@ def to_dict(self) -> JsonObject: "unmasked_prefix": self.unmasked_prefix, "unmasked_suffix": self.unmasked_suffix, } + ), ) @@ -111,7 +91,9 @@ class LocalModelConfig: def to_dict(self) -> JsonObject: """Serialize this local-model config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "backend": self.backend, "model_id": self.model_id, @@ -119,6 +101,7 @@ def 
to_dict(self) -> JsonObject: "allow_network": self.allow_network, "max_latency_ms": self.max_latency_ms, } + ), ) @@ -140,7 +123,9 @@ class PiiRedactionConfig: def to_dict(self) -> JsonObject: """Serialize this PII redaction config to the canonical JSON object shape.""" - return _normalize_object( + return cast( + JsonObject, + normalize_object( { "version": self.version, "mode": self.mode, @@ -154,6 +139,7 @@ def to_dict(self) -> JsonObject: "local": self.local, "policy": self.policy, } + ), ) @@ -172,7 +158,7 @@ def to_dict(self) -> JsonObject: return { "kind": PII_REDACTION_PLUGIN_KIND, "enabled": self.enabled, - "config": _normalize_object(self.config), + "config": cast(JsonObject, normalize_object(self.config)), } diff --git a/python/nemo_relay/plugin.py b/python/nemo_relay/plugin.py index 7e7b2f0f..e88afa19 100644 --- a/python/nemo_relay/plugin.py +++ b/python/nemo_relay/plugin.py @@ -11,11 +11,10 @@ from __future__ import annotations from contextlib import asynccontextmanager -from dataclasses import dataclass, field, fields, is_dataclass +from dataclasses import dataclass, field from typing import TYPE_CHECKING, AsyncIterator, Callable, Literal, Protocol, TypedDict, cast from nemo_relay import ( - Json, JsonObject, LlmConditionalExecutionGuardrail, LlmExecutionIntercept, @@ -30,6 +29,7 @@ UnsupportedBehavior, subscribers, ) +from nemo_relay._config_normalize import normalize, normalize_object from nemo_relay._native import ( active_plugin_report as _active_plugin_report, ) @@ -180,30 +180,6 @@ def register(self, plugin_config: JsonObject, context: PluginContext) -> None: ... -class _SupportsToDict(Protocol): - def to_dict(self) -> JsonObject: ... 
- - -def _normalize(value: object) -> Json: - if hasattr(value, "to_dict"): - return cast(_SupportsToDict, value).to_dict() - if is_dataclass(value) and not isinstance(value, type): - return { - field_info.name: _normalize(field_value) - for field_info in fields(value) - if (field_value := getattr(value, field_info.name)) is not None - } - if isinstance(value, list): - return [_normalize(item) for item in value] - if isinstance(value, dict): - return {cast(str, key): _normalize(val) for key, val in value.items() if val is not None} - return cast(Json, value) - - -def _normalize_object(value: object) -> JsonObject: - return cast(JsonObject, _normalize(value)) - - @dataclass(slots=True) class ConfigPolicy: """Policy for unsupported plugin configuration. @@ -255,7 +231,7 @@ def to_dict(self) -> JsonObject: return { "kind": self.kind, "enabled": self.enabled, - "config": _normalize_object(self.config), + "config": cast(JsonObject, normalize_object(self.config)), } @@ -281,7 +257,7 @@ def to_dict(self) -> JsonObject: """Serialize this config to the canonical JSON document shape.""" return { "version": self.version, - "components": [_normalize(component) for component in self.components], + "components": [normalize(component) for component in self.components], "policy": self.policy.to_dict(), } @@ -299,7 +275,7 @@ def validate(config: PluginConfig | JsonObject) -> ConfigReport: Validation checks plugin-level compatibility, unknown component kinds, multiplicity rules, and per-plugin validation logic. """ - return cast(ConfigReport, _validate_plugin_config(_normalize_object(config))) + return cast(ConfigReport, _validate_plugin_config(cast(JsonObject, normalize_object(config)))) async def initialize(config: PluginConfig | JsonObject) -> ConfigReport: @@ -316,7 +292,7 @@ async def initialize(config: PluginConfig | JsonObject) -> ConfigReport: registration is rolled back on failure, and the previous configuration is restored when possible. 
""" - return cast(ConfigReport, await _initialize_plugins(_normalize_object(config))) + return cast(ConfigReport, await _initialize_plugins(cast(JsonObject, normalize_object(config)))) def clear() -> None: From a98e62cef8d3ea2c494aa3b9d013d61de38cb33f Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Tue, 9 Jun 2026 09:42:47 -0700 Subject: [PATCH 14/35] test: use pii redaction plugin kind constant Signed-off-by: Alex Fournier --- crates/cli/tests/coverage/plugins_tests.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/cli/tests/coverage/plugins_tests.rs b/crates/cli/tests/coverage/plugins_tests.rs index 657a85eb..bebe3124 100644 --- a/crates/cli/tests/coverage/plugins_tests.rs +++ b/crates/cli/tests/coverage/plugins_tests.rs @@ -11,7 +11,7 @@ use nemo_relay::plugins::nemo_guardrails::component::{ }; use nemo_relay_adaptive::AdaptiveConfig; use nemo_relay_adaptive::plugin_component::ADAPTIVE_PLUGIN_KIND; -use nemo_relay_pii_redaction::component::PiiRedactionConfig; +use nemo_relay_pii_redaction::component::{PII_REDACTION_PLUGIN_KIND, PiiRedactionConfig}; fn adaptive_component_config(agent_id: &str) -> serde_json::Map { json!({ @@ -818,7 +818,7 @@ fn editor_save_preserves_unknown_nemo_guardrails_fields_and_sections() { fn editor_save_preserves_unknown_pii_redaction_fields_and_prunes_version() { let mut config = PluginConfig { components: vec![PluginComponentSpec { - kind: "pii_redaction".to_string(), + kind: PII_REDACTION_PLUGIN_KIND.to_string(), enabled: true, config: json!({ "version": 1, @@ -876,7 +876,7 @@ fn editor_save_preserves_unknown_pii_redaction_fields_and_prunes_version() { let component = config .components .iter() - .find(|component| component.kind == "pii_redaction") + .find(|component| component.kind == PII_REDACTION_PLUGIN_KIND) .unwrap(); assert!(!component.enabled); assert!(!component.config.contains_key("version")); From 94732327299877a0aa809cb538cd7737109ea0c1 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: 
Tue, 9 Jun 2026 09:46:12 -0700 Subject: [PATCH 15/35] fix: support compressed ipv6 detector matches Signed-off-by: Alex Fournier --- crates/pii-redaction/src/detectors.rs | 2 +- .../tests/unit/component_tests.rs | 47 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/crates/pii-redaction/src/detectors.rs b/crates/pii-redaction/src/detectors.rs index 2a50ff9d..23396abd 100644 --- a/crates/pii-redaction/src/detectors.rs +++ b/crates/pii-redaction/src/detectors.rs @@ -67,7 +67,7 @@ const BUILTIN_DETECTOR_SPECS: &[BuiltinDetectorSpec] = &[ detector: BuiltinDetector::Ipv6, name: "ipv6", category: BuiltinDetectorCategory::CommonPii, - regex_pattern: r"\b(?:[A-Fa-f0-9]{1,4}:){2,7}[A-Fa-f0-9]{1,4}\b", + regex_pattern: r"(?:([A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}|([A-Fa-f0-9]{1,4}:){1,7}:|([A-Fa-f0-9]{1,4}:){1,6}:[A-Fa-f0-9]{1,4}|([A-Fa-f0-9]{1,4}:){1,5}(?::[A-Fa-f0-9]{1,4}){1,2}|([A-Fa-f0-9]{1,4}:){1,4}(?::[A-Fa-f0-9]{1,4}){1,3}|([A-Fa-f0-9]{1,4}:){1,3}(?::[A-Fa-f0-9]{1,4}){1,4}|([A-Fa-f0-9]{1,4}:){1,2}(?::[A-Fa-f0-9]{1,4}){1,5}|[A-Fa-f0-9]{1,4}:(?:(?::[A-Fa-f0-9]{1,4}){1,6})|:(?:(?::[A-Fa-f0-9]{1,4}){1,7}|:))", }, BuiltinDetectorSpec { detector: BuiltinDetector::Url, diff --git a/crates/pii-redaction/tests/unit/component_tests.rs b/crates/pii-redaction/tests/unit/component_tests.rs index 47770048..e13a7fb8 100644 --- a/crates/pii-redaction/tests/unit/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -1000,6 +1000,53 @@ fn builtin_mask_with_ipv6_detector_preserves_last_segment_by_default() { clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_mask_with_ipv6_detector_supports_compressed_addresses() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": false, + "output": false, + "tool_input": true, + 
"tool_output": false, + "builtin": { + "action": "mask", + "detector": "ipv6", + "target_paths": ["/ip"] + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-ipv6-compressed-mask-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("notify") + .args(json!({ + "ip": "2001:db8::1", + "keep": "unchanged" + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "ip": "****:****::1", + "keep": "unchanged" + })) + ); + + deregister_subscriber("pii-redaction-ipv6-compressed-mask-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_mask_with_bearer_token_detector_preserves_scheme_and_last_four() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); From 9d6e7ec1fee3a99f3e517cf2bc31fed70b2dd1a0 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Tue, 9 Jun 2026 09:47:24 -0700 Subject: [PATCH 16/35] fix: make mask bounds overflow safe Signed-off-by: Alex Fournier --- crates/pii-redaction/src/builtin.rs | 2 +- crates/pii-redaction/tests/unit/component_tests.rs | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/pii-redaction/src/builtin.rs b/crates/pii-redaction/src/builtin.rs index 4d5068ae..6317e5e4 100644 --- a/crates/pii-redaction/src/builtin.rs +++ b/crates/pii-redaction/src/builtin.rs @@ -316,7 +316,7 @@ pub(crate) fn mask_text( ) -> String { let chars: Vec = text.chars().collect(); let len = chars.len(); - if len <= unmasked_prefix + unmasked_suffix { + if len <= unmasked_prefix.saturating_add(unmasked_suffix) { return text.to_string(); } diff --git a/crates/pii-redaction/tests/unit/component_tests.rs b/crates/pii-redaction/tests/unit/component_tests.rs index e13a7fb8..ab6d422d 100644 --- a/crates/pii-redaction/tests/unit/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -1047,6 
+1047,12 @@ fn builtin_mask_with_ipv6_detector_supports_compressed_addresses() { clear_plugin_configuration().unwrap(); } +#[test] +fn mask_text_handles_extreme_unmasked_bounds_without_overflow() { + let masked = mask_text("secret", "*", usize::MAX, 4); + assert_eq!(masked, "secret"); +} + #[test] fn builtin_mask_with_bearer_token_detector_preserves_scheme_and_last_four() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); From 354f2e39de5fb2a915fa2777de58bf0446ef8b9c Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Tue, 9 Jun 2026 09:49:50 -0700 Subject: [PATCH 17/35] build: gate relay schema feature through pii schema Signed-off-by: Alex Fournier --- crates/pii-redaction/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/pii-redaction/Cargo.toml b/crates/pii-redaction/Cargo.toml index 235f7266..8f950bfa 100644 --- a/crates/pii-redaction/Cargo.toml +++ b/crates/pii-redaction/Cargo.toml @@ -15,10 +15,10 @@ workspace = true [features] default = [] -schema = ["dep:schemars"] +schema = ["dep:schemars", "nemo-relay/schema"] [dependencies] -nemo-relay = { workspace = true, features = ["schema"] } +nemo-relay.workspace = true serde = { version = "1", features = ["derive"] } serde_json = "1" regex = "1" From 9b6f4e49cbe9e998a6dc906f13e545b35bd6665f Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Tue, 9 Jun 2026 10:00:26 -0700 Subject: [PATCH 18/35] fix: make empty remove targets leaf-only Signed-off-by: Alex Fournier --- crates/pii-redaction/src/builtin.rs | 3 +- .../tests/unit/component_tests.rs | 52 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/crates/pii-redaction/src/builtin.rs b/crates/pii-redaction/src/builtin.rs index 6317e5e4..baa12c1f 100644 --- a/crates/pii-redaction/src/builtin.rs +++ b/crates/pii-redaction/src/builtin.rs @@ -135,7 +135,8 @@ impl CompiledBuiltinBackend { value: Json, path_segments: &mut Vec, ) -> Option { - if 
self.matches_current_preorder_path(path_segments) + if !self.target_paths.is_empty() + && self.matches_current_preorder_path(path_segments) && matches!(self.action, BuiltinAction::Remove) { return None; diff --git a/crates/pii-redaction/tests/unit/component_tests.rs b/crates/pii-redaction/tests/unit/component_tests.rs index ab6d422d..4cad9407 100644 --- a/crates/pii-redaction/tests/unit/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -438,6 +438,58 @@ fn builtin_remove_deletes_object_fields_and_nulls_array_or_root_targets() { clear_plugin_configuration().unwrap(); } +#[test] +fn builtin_remove_with_empty_target_paths_only_removes_string_leaves() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + setup_isolated_thread(); + + futures::executor::block_on(initialize_plugins(plugin_config(json!({ + "mode": "builtin", + "input": false, + "output": false, + "tool_input": true, + "tool_output": false, + "builtin": { + "action": "remove" + } + })))) + .unwrap(); + + let events = capture_events("pii-redaction-remove-empty-targets-events"); + let _handle = tool_call( + ToolCallParams::builder() + .name("search") + .args(json!({ + "secret": "abc", + "nested": { + "keep": "yes", + "count": 3 + }, + "items": ["a", "b", 9], + "public": true + })) + .build(), + ) + .unwrap(); + + let captured_events = captured_events_snapshot(&events); + assert_eq!(captured_events.len(), 1); + assert_eq!( + captured_events[0].input(), + Some(&json!({ + "nested": { + "count": 3 + }, + "items": [null, null, 9], + "public": true + })) + ); + + deregister_subscriber("pii-redaction-remove-empty-targets-events").unwrap(); + clear_plugin_configuration().unwrap(); +} + #[test] fn builtin_remove_deletes_targeted_object_and_array_container_fields() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); From 5cb207ac48e17329932f6ac93a2db3833e90d1f2 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Tue, 9 Jun 
2026 10:04:51 -0700 Subject: [PATCH 19/35] fix: fail closed on malformed tool call overlays Signed-off-by: Alex Fournier --- crates/pii-redaction/src/overlay.rs | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/crates/pii-redaction/src/overlay.rs b/crates/pii-redaction/src/overlay.rs index 0de0911c..f556b6c1 100644 --- a/crates/pii-redaction/src/overlay.rs +++ b/crates/pii-redaction/src/overlay.rs @@ -130,11 +130,13 @@ fn overlay_openai_chat_tool_calls( for (raw_call, sanitized_call) in raw_calls.iter_mut().zip(tool_calls.iter()) { let Some(raw_call) = raw_call.as_object_mut() else { - continue; + message.remove("tool_calls"); + return; }; set_optional_string_field(raw_call, "id", Some(sanitized_call.id.as_str())); let Some(function) = raw_call.get_mut("function").and_then(Json::as_object_mut) else { - continue; + message.remove("tool_calls"); + return; }; set_optional_string_field(function, "name", Some(sanitized_call.name.as_str())); set_optional_string_field( @@ -163,7 +165,7 @@ fn overlay_openai_responses_tool_calls( return true; } let Some(raw_call) = item.as_object_mut() else { - return true; + return false; }; let Some(sanitized_call) = sanitized_calls.next() else { return false; @@ -194,7 +196,7 @@ fn overlay_anthropic_tool_calls(blocks: &mut Vec, tool_calls: Option<&[Res return true; } let Some(raw_call) = block.as_object_mut() else { - return true; + return false; }; let Some(sanitized_call) = sanitized_calls.next() else { return false; @@ -367,6 +369,25 @@ mod tests { ); } + #[test] + fn openai_chat_overlay_removes_tool_calls_when_typed_entry_has_wrong_shape() { + let mut message = json!({ + "tool_calls": [ + {"id": "call_1", "arguments": "{\"secret\":\"raw-1\"}"} + ] + }) + .as_object() + .unwrap() + .clone(); + + overlay_openai_chat_tool_calls( + &mut message, + Some(&[tool_call("call_1", "one", json!({"secret": "[REDACTED]"}))]), + ); + + assert!(!message.contains_key("tool_calls")); + } + #[test] fn 
openai_responses_overlay_removes_extra_function_calls() { let mut items = vec![ From 998c655611183ea3bb0dcb5724d344744a65e284 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Tue, 9 Jun 2026 10:21:53 -0700 Subject: [PATCH 20/35] refactor: keep python helpers scoped to pii redaction Signed-off-by: Alex Fournier --- python/nemo_relay/_config_normalize.py | 37 --------- python/nemo_relay/adaptive.py | 73 ++++++++++------- python/nemo_relay/observability.py | 65 ++++++++------- python/nemo_relay/pii_redaction.py | 108 +++++++++++++------------ python/nemo_relay/plugin.py | 36 +++++++-- 5 files changed, 165 insertions(+), 154 deletions(-) delete mode 100644 python/nemo_relay/_config_normalize.py diff --git a/python/nemo_relay/_config_normalize.py b/python/nemo_relay/_config_normalize.py deleted file mode 100644 index 50a5c115..00000000 --- a/python/nemo_relay/_config_normalize.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Private helpers for normalizing config helper dataclasses to JSON-like values.""" - -from __future__ import annotations - -from dataclasses import fields, is_dataclass -from typing import Any, Protocol, cast - - -class SupportsToDict(Protocol): - """Private protocol for helper objects that provide ``to_dict()``.""" - - def to_dict(self) -> dict[str, Any]: ... 
- - -def normalize(value: object) -> Any: - """Recursively normalize dataclasses, lists, and dicts into JSON-like values.""" - if hasattr(value, "to_dict"): - return cast(SupportsToDict, value).to_dict() - if is_dataclass(value) and not isinstance(value, type): - return { - field_info.name: normalize(field_value) - for field_info in fields(value) - if (field_value := getattr(value, field_info.name)) is not None - } - if isinstance(value, list): - return [normalize(item) for item in value] - if isinstance(value, dict): - return {cast(str, key): normalize(val) for key, val in value.items() if val is not None} - return value - - -def normalize_object(value: object) -> dict[str, Any]: - """Normalize a helper value and assert the result is mapping-shaped.""" - return cast(dict[str, Any], normalize(value)) diff --git a/python/nemo_relay/adaptive.py b/python/nemo_relay/adaptive.py index 84a3fa78..e4f37545 100644 --- a/python/nemo_relay/adaptive.py +++ b/python/nemo_relay/adaptive.py @@ -9,11 +9,10 @@ from __future__ import annotations -from dataclasses import dataclass, field -from typing import Literal, TypedDict, cast +from dataclasses import dataclass, field, fields, is_dataclass +from typing import Literal, Protocol, TypedDict, cast -from nemo_relay import JsonObject, UnsupportedBehavior -from nemo_relay._config_normalize import normalize, normalize_object +from nemo_relay import Json, JsonObject, UnsupportedBehavior from nemo_relay._native import AdaptiveRuntime as AdaptiveRuntime from nemo_relay._native import build_cache_telemetry_event as _build_cache_telemetry_event from nemo_relay._native import set_latency_sensitivity as _set_latency_sensitivity @@ -39,6 +38,30 @@ class ConfigReport(TypedDict): diagnostics: list[ConfigDiagnostic] +class _SupportsToDict(Protocol): + def to_dict(self) -> JsonObject: ... 
+ + +def _normalize(value: object) -> Json: + if hasattr(value, "to_dict"): + return cast(_SupportsToDict, value).to_dict() + if is_dataclass(value) and not isinstance(value, type): + return { + field_info.name: _normalize(field_value) + for field_info in fields(value) + if (field_value := getattr(value, field_info.name)) is not None + } + if isinstance(value, list): + return [_normalize(item) for item in value] + if isinstance(value, dict): + return {cast(str, key): _normalize(val) for key, val in value.items() if val is not None} + return cast(Json, value) + + +def _normalize_object(value: object) -> JsonObject: + return cast(JsonObject, _normalize(value)) + + @dataclass(slots=True) class ConfigPolicy: """Policy for unsupported adaptive configuration. @@ -86,7 +109,7 @@ def redis(url: str, key_prefix: str = "nemo_relay:") -> "BackendSpec": def to_dict(self) -> JsonObject: """Serialize this backend spec to the canonical JSON object shape.""" - return {"kind": self.kind, "config": cast(JsonObject, normalize_object(self.config))} + return {"kind": self.kind, "config": _normalize_object(self.config)} @dataclass(slots=True) @@ -102,7 +125,7 @@ class StateConfig: def to_dict(self) -> JsonObject: """Serialize this state config to the canonical JSON object shape.""" - return {"backend": cast(JsonObject, normalize_object(self.backend))} + return {"backend": _normalize_object(self.backend)} @dataclass(slots=True) @@ -119,14 +142,11 @@ class TelemetryConfig: def to_dict(self) -> JsonObject: """Serialize this telemetry config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( + return _normalize_object( { "subscriber_name": self.subscriber_name, "learners": self.learners, } - ), ) @@ -148,16 +168,13 @@ class AdaptiveHintsConfig: def to_dict(self) -> JsonObject: """Serialize this adaptive-hints config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( + return _normalize_object( { "priority": 
self.priority, "break_chain": self.break_chain, "inject_header": self.inject_header, "inject_body_path": self.inject_body_path, } - ), ) @@ -177,7 +194,7 @@ class ToolParallelismConfig: def to_dict(self) -> JsonObject: """Serialize this tool-parallelism config to the canonical JSON object shape.""" - return cast(JsonObject, normalize_object({"priority": self.priority, "mode": self.mode})) + return _normalize_object({"priority": self.priority, "mode": self.mode}) @dataclass(slots=True) @@ -197,15 +214,12 @@ class AcgStabilityThresholds: def to_dict(self) -> JsonObject: """Serialize these ACG stability thresholds to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( + return _normalize_object( { "stable_threshold": self.stable_threshold, "semi_stable_threshold": self.semi_stable_threshold, "min_observations_for_full_confidence": self.min_observations_for_full_confidence, } - ), ) @@ -227,16 +241,13 @@ class AcgConfig: def to_dict(self) -> JsonObject: """Serialize this ACG config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( + return _normalize_object( { "provider": self.provider, "observation_window": self.observation_window, "priority": self.priority, - "stability_thresholds": normalize(self.stability_thresholds), + "stability_thresholds": _normalize(self.stability_thresholds), } - ), ) @@ -273,11 +284,11 @@ def to_dict(self) -> JsonObject: return { "version": self.version, "agent_id": self.agent_id, - "state": normalize(self.state), - "telemetry": normalize(self.telemetry), - "adaptive_hints": normalize(self.adaptive_hints), - "tool_parallelism": normalize(self.tool_parallelism), - "acg": normalize(self.acg), + "state": _normalize(self.state), + "telemetry": _normalize(self.telemetry), + "adaptive_hints": _normalize(self.adaptive_hints), + "tool_parallelism": _normalize(self.tool_parallelism), + "acg": _normalize(self.acg), "policy": self.policy.to_dict(), } @@ -305,13 +316,13 @@ def 
to_dict(self) -> JsonObject: return { "kind": ADAPTIVE_PLUGIN_KIND, "enabled": self.enabled, - "config": cast(JsonObject, normalize_object(self.config)), + "config": _normalize_object(self.config), } def validate_config(config: AdaptiveConfig | JsonObject) -> ConfigReport: """Validate an adaptive config document without constructing a runtime.""" - return cast(ConfigReport, _validate_adaptive_config(cast(JsonObject, normalize_object(config)))) + return cast(ConfigReport, _validate_adaptive_config(_normalize_object(config))) def build_cache_telemetry_event( diff --git a/python/nemo_relay/observability.py b/python/nemo_relay/observability.py index 154d6057..a067f4cc 100644 --- a/python/nemo_relay/observability.py +++ b/python/nemo_relay/observability.py @@ -5,11 +5,34 @@ from __future__ import annotations -from dataclasses import dataclass, field -from typing import Literal, cast +from dataclasses import dataclass, field, fields, is_dataclass +from typing import Literal, Protocol, cast -from nemo_relay import JsonObject, UnsupportedBehavior -from nemo_relay._config_normalize import normalize_object +from nemo_relay import Json, JsonObject, UnsupportedBehavior + + +class _SupportsToDict(Protocol): + def to_dict(self) -> JsonObject: ... 
+ + +def _normalize(value: object) -> Json: + if hasattr(value, "to_dict"): + return cast(_SupportsToDict, value).to_dict() + if is_dataclass(value) and not isinstance(value, type): + return { + field_info.name: _normalize(field_value) + for field_info in fields(value) + if (field_value := getattr(value, field_info.name)) is not None + } + if isinstance(value, list): + return [_normalize(item) for item in value] + if isinstance(value, dict): + return {cast(str, key): _normalize(val) for key, val in value.items() if val is not None} + return cast(Json, value) + + +def _normalize_object(value: object) -> JsonObject: + return cast(JsonObject, _normalize(value)) @dataclass(slots=True) @@ -40,16 +63,13 @@ class AtofEndpointConfig: def to_dict(self) -> JsonObject: """Serialize this ATOF endpoint config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( + return _normalize_object( { "url": self.url, "transport": self.transport, "headers": self.headers, "timeout_millis": self.timeout_millis, } - ), ) @@ -65,9 +85,7 @@ class AtofConfig: def to_dict(self) -> JsonObject: """Serialize this ATOF config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( + return _normalize_object( { "enabled": self.enabled, "output_directory": self.output_directory, @@ -75,7 +93,6 @@ def to_dict(self) -> JsonObject: "mode": self.mode, "endpoints": self.endpoints, } - ), ) @@ -101,9 +118,7 @@ class S3StorageConfig: def to_dict(self) -> JsonObject: """Serialize this S3 storage config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( + return _normalize_object( { "type": "s3", "bucket": self.bucket, @@ -115,7 +130,6 @@ def to_dict(self) -> JsonObject: "endpoint_url": self.endpoint_url, "allow_http": self.allow_http, } - ), ) @@ -130,9 +144,7 @@ class HttpStorageConfig: def to_dict(self) -> JsonObject: """Serialize this HTTP storage config to the canonical JSON object shape.""" - return 
cast( - JsonObject, - normalize_object( + return _normalize_object( { "type": "http", "endpoint": self.endpoint, @@ -140,7 +152,6 @@ def to_dict(self) -> JsonObject: "header_env": self.header_env, "timeout_millis": self.timeout_millis, } - ), ) @@ -173,7 +184,7 @@ def to_dict(self) -> JsonObject: } if value["agent_version"] is None: value.pop("agent_version") - return cast(JsonObject, normalize_object(value)) + return _normalize_object(value) @dataclass(slots=True) @@ -193,9 +204,7 @@ class OtlpConfig: def to_dict(self) -> JsonObject: """Serialize this OTLP config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( + return _normalize_object( { "enabled": self.enabled, "transport": self.transport, @@ -208,7 +217,6 @@ def to_dict(self) -> JsonObject: "instrumentation_scope": self.instrumentation_scope, "timeout_millis": self.timeout_millis, } - ), ) @@ -225,9 +233,7 @@ class ObservabilityConfig: def to_dict(self) -> JsonObject: """Serialize this observability config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( + return _normalize_object( { "version": self.version, "atof": self.atof, @@ -236,7 +242,6 @@ def to_dict(self) -> JsonObject: "openinference": self.openinference, "policy": self.policy, } - ), ) @@ -255,7 +260,7 @@ def to_dict(self) -> JsonObject: return { "kind": OBSERVABILITY_PLUGIN_KIND, "enabled": self.enabled, - "config": cast(JsonObject, normalize_object(self.config)), + "config": _normalize_object(self.config), } diff --git a/python/nemo_relay/pii_redaction.py b/python/nemo_relay/pii_redaction.py index 4cad3332..784c24e3 100644 --- a/python/nemo_relay/pii_redaction.py +++ b/python/nemo_relay/pii_redaction.py @@ -5,11 +5,10 @@ from __future__ import annotations -from dataclasses import dataclass, field -from typing import Literal, TypedDict, cast +from dataclasses import dataclass, field, fields, is_dataclass +from typing import Literal, Protocol, TypedDict, cast -from 
nemo_relay import JsonObject, UnsupportedBehavior -from nemo_relay._config_normalize import normalize_object +from nemo_relay import Json, JsonObject, UnsupportedBehavior from nemo_relay import plugin as plugin_module @@ -32,6 +31,30 @@ class ConfigReport(TypedDict): diagnostics: list[ConfigDiagnostic] +class _SupportsToDict(Protocol): + def to_dict(self) -> JsonObject: ... + + +def _normalize(value: object) -> Json: + if hasattr(value, "to_dict"): + return cast(_SupportsToDict, value).to_dict() + if is_dataclass(value) and not isinstance(value, type): + return { + field_info.name: _normalize(field_value) + for field_info in fields(value) + if (field_value := getattr(value, field_info.name)) is not None + } + if isinstance(value, list): + return [_normalize(item) for item in value] + if isinstance(value, dict): + return {cast(str, key): _normalize(val) for key, val in value.items() if val is not None} + return cast(Json, value) + + +def _normalize_object(value: object) -> JsonObject: + return cast(JsonObject, _normalize(value)) + + @dataclass(slots=True) class ConfigPolicy: """Policy for unsupported PII redaction configuration.""" @@ -62,21 +85,16 @@ class BuiltinConfig: def to_dict(self) -> JsonObject: """Serialize this built-in backend config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( - { - "action": self.action, - "target_paths": self.target_paths, - "pattern": self.pattern, - "detector": self.detector, - "replacement": self.replacement, - "mask_char": self.mask_char, - "unmasked_prefix": self.unmasked_prefix, - "unmasked_suffix": self.unmasked_suffix, - } - ), - ) + return _normalize_object({ + "action": self.action, + "target_paths": self.target_paths, + "pattern": self.pattern, + "detector": self.detector, + "replacement": self.replacement, + "mask_char": self.mask_char, + "unmasked_prefix": self.unmasked_prefix, + "unmasked_suffix": self.unmasked_suffix, + }) @dataclass(slots=True) @@ -91,18 +109,13 @@ class 
LocalModelConfig: def to_dict(self) -> JsonObject: """Serialize this local-model config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( - { - "backend": self.backend, - "model_id": self.model_id, - "detector_profile": self.detector_profile, - "allow_network": self.allow_network, - "max_latency_ms": self.max_latency_ms, - } - ), - ) + return _normalize_object({ + "backend": self.backend, + "model_id": self.model_id, + "detector_profile": self.detector_profile, + "allow_network": self.allow_network, + "max_latency_ms": self.max_latency_ms, + }) @dataclass(slots=True) @@ -123,24 +136,19 @@ class PiiRedactionConfig: def to_dict(self) -> JsonObject: """Serialize this PII redaction config to the canonical JSON object shape.""" - return cast( - JsonObject, - normalize_object( - { - "version": self.version, - "mode": self.mode, - "input": self.input, - "output": self.output, - "tool_input": self.tool_input, - "tool_output": self.tool_output, - "priority": self.priority, - "codec": self.codec, - "builtin": self.builtin, - "local": self.local, - "policy": self.policy, - } - ), - ) + return _normalize_object({ + "version": self.version, + "mode": self.mode, + "input": self.input, + "output": self.output, + "tool_input": self.tool_input, + "tool_output": self.tool_output, + "priority": self.priority, + "codec": self.codec, + "builtin": self.builtin, + "local": self.local, + "policy": self.policy, + }) PII_REDACTION_PLUGIN_KIND = "pii_redaction" @@ -158,7 +166,7 @@ def to_dict(self) -> JsonObject: return { "kind": PII_REDACTION_PLUGIN_KIND, "enabled": self.enabled, - "config": cast(JsonObject, normalize_object(self.config)), + "config": _normalize_object(self.config), } diff --git a/python/nemo_relay/plugin.py b/python/nemo_relay/plugin.py index e88afa19..7e7b2f0f 100644 --- a/python/nemo_relay/plugin.py +++ b/python/nemo_relay/plugin.py @@ -11,10 +11,11 @@ from __future__ import annotations from contextlib import asynccontextmanager 
-from dataclasses import dataclass, field +from dataclasses import dataclass, field, fields, is_dataclass from typing import TYPE_CHECKING, AsyncIterator, Callable, Literal, Protocol, TypedDict, cast from nemo_relay import ( + Json, JsonObject, LlmConditionalExecutionGuardrail, LlmExecutionIntercept, @@ -29,7 +30,6 @@ UnsupportedBehavior, subscribers, ) -from nemo_relay._config_normalize import normalize, normalize_object from nemo_relay._native import ( active_plugin_report as _active_plugin_report, ) @@ -180,6 +180,30 @@ def register(self, plugin_config: JsonObject, context: PluginContext) -> None: ... +class _SupportsToDict(Protocol): + def to_dict(self) -> JsonObject: ... + + +def _normalize(value: object) -> Json: + if hasattr(value, "to_dict"): + return cast(_SupportsToDict, value).to_dict() + if is_dataclass(value) and not isinstance(value, type): + return { + field_info.name: _normalize(field_value) + for field_info in fields(value) + if (field_value := getattr(value, field_info.name)) is not None + } + if isinstance(value, list): + return [_normalize(item) for item in value] + if isinstance(value, dict): + return {cast(str, key): _normalize(val) for key, val in value.items() if val is not None} + return cast(Json, value) + + +def _normalize_object(value: object) -> JsonObject: + return cast(JsonObject, _normalize(value)) + + @dataclass(slots=True) class ConfigPolicy: """Policy for unsupported plugin configuration. 
@@ -231,7 +255,7 @@ def to_dict(self) -> JsonObject: return { "kind": self.kind, "enabled": self.enabled, - "config": cast(JsonObject, normalize_object(self.config)), + "config": _normalize_object(self.config), } @@ -257,7 +281,7 @@ def to_dict(self) -> JsonObject: """Serialize this config to the canonical JSON document shape.""" return { "version": self.version, - "components": [normalize(component) for component in self.components], + "components": [_normalize(component) for component in self.components], "policy": self.policy.to_dict(), } @@ -275,7 +299,7 @@ def validate(config: PluginConfig | JsonObject) -> ConfigReport: Validation checks plugin-level compatibility, unknown component kinds, multiplicity rules, and per-plugin validation logic. """ - return cast(ConfigReport, _validate_plugin_config(cast(JsonObject, normalize_object(config)))) + return cast(ConfigReport, _validate_plugin_config(_normalize_object(config))) async def initialize(config: PluginConfig | JsonObject) -> ConfigReport: @@ -292,7 +316,7 @@ async def initialize(config: PluginConfig | JsonObject) -> ConfigReport: registration is rolled back on failure, and the previous configuration is restored when possible. 
""" - return cast(ConfigReport, await _initialize_plugins(cast(JsonObject, normalize_object(config)))) + return cast(ConfigReport, await _initialize_plugins(_normalize_object(config))) def clear() -> None: From 993d89c27b306b720d7af8663ba9d4934c7a1d06 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 08:24:25 -0700 Subject: [PATCH 21/35] style: format pii redaction python helpers Signed-off-by: Alex Fournier --- python/nemo_relay/pii_redaction.py | 66 ++++++++++++----------- python/nemo_relay/pii_redaction.pyi | 9 ---- python/tests/test_pii_redaction_plugin.py | 4 +- 3 files changed, 38 insertions(+), 41 deletions(-) diff --git a/python/nemo_relay/pii_redaction.py b/python/nemo_relay/pii_redaction.py index 784c24e3..5339a1d0 100644 --- a/python/nemo_relay/pii_redaction.py +++ b/python/nemo_relay/pii_redaction.py @@ -85,16 +85,18 @@ class BuiltinConfig: def to_dict(self) -> JsonObject: """Serialize this built-in backend config to the canonical JSON object shape.""" - return _normalize_object({ - "action": self.action, - "target_paths": self.target_paths, - "pattern": self.pattern, - "detector": self.detector, - "replacement": self.replacement, - "mask_char": self.mask_char, - "unmasked_prefix": self.unmasked_prefix, - "unmasked_suffix": self.unmasked_suffix, - }) + return _normalize_object( + { + "action": self.action, + "target_paths": self.target_paths, + "pattern": self.pattern, + "detector": self.detector, + "replacement": self.replacement, + "mask_char": self.mask_char, + "unmasked_prefix": self.unmasked_prefix, + "unmasked_suffix": self.unmasked_suffix, + } + ) @dataclass(slots=True) @@ -109,13 +111,15 @@ class LocalModelConfig: def to_dict(self) -> JsonObject: """Serialize this local-model config to the canonical JSON object shape.""" - return _normalize_object({ - "backend": self.backend, - "model_id": self.model_id, - "detector_profile": self.detector_profile, - "allow_network": self.allow_network, - "max_latency_ms": self.max_latency_ms, 
- }) + return _normalize_object( + { + "backend": self.backend, + "model_id": self.model_id, + "detector_profile": self.detector_profile, + "allow_network": self.allow_network, + "max_latency_ms": self.max_latency_ms, + } + ) @dataclass(slots=True) @@ -136,19 +140,21 @@ class PiiRedactionConfig: def to_dict(self) -> JsonObject: """Serialize this PII redaction config to the canonical JSON object shape.""" - return _normalize_object({ - "version": self.version, - "mode": self.mode, - "input": self.input, - "output": self.output, - "tool_input": self.tool_input, - "tool_output": self.tool_output, - "priority": self.priority, - "codec": self.codec, - "builtin": self.builtin, - "local": self.local, - "policy": self.policy, - }) + return _normalize_object( + { + "version": self.version, + "mode": self.mode, + "input": self.input, + "output": self.output, + "tool_input": self.tool_input, + "tool_output": self.tool_output, + "priority": self.priority, + "codec": self.codec, + "builtin": self.builtin, + "local": self.local, + "policy": self.policy, + } + ) PII_REDACTION_PLUGIN_KIND = "pii_redaction" diff --git a/python/nemo_relay/pii_redaction.pyi b/python/nemo_relay/pii_redaction.pyi index ef6d993a..1b7a9e9a 100644 --- a/python/nemo_relay/pii_redaction.pyi +++ b/python/nemo_relay/pii_redaction.pyi @@ -10,7 +10,6 @@ from typing import Literal, TypedDict from nemo_relay import JsonObject, UnsupportedBehavior - class ConfigDiagnostic(TypedDict, total=False): level: Literal["warning", "error"] code: str @@ -18,18 +17,15 @@ class ConfigDiagnostic(TypedDict, total=False): component: str field: str - class ConfigReport(TypedDict): diagnostics: list[ConfigDiagnostic] - @dataclass(slots=True) class ConfigPolicy: unknown_field: UnsupportedBehavior = ... unsupported_value: UnsupportedBehavior = ... def to_dict(self) -> JsonObject: ... - @dataclass(slots=True) class BuiltinConfig: action: Literal["remove", "redact", "regex_replace", "hash", "mask"] = ... 
@@ -42,7 +38,6 @@ class BuiltinConfig: unmasked_suffix: int | None = ... def to_dict(self) -> JsonObject: ... - @dataclass(slots=True) class LocalModelConfig: backend: str | None = ... @@ -52,7 +47,6 @@ class LocalModelConfig: max_latency_ms: int | None = ... def to_dict(self) -> JsonObject: ... - @dataclass(slots=True) class PiiRedactionConfig: version: int = ... @@ -68,15 +62,12 @@ class PiiRedactionConfig: policy: ConfigPolicy = field(default_factory=ConfigPolicy) def to_dict(self) -> JsonObject: ... - PII_REDACTION_PLUGIN_KIND: Literal["pii_redaction"] - @dataclass(slots=True) class ComponentSpec: config: PiiRedactionConfig | JsonObject enabled: bool = ... def to_dict(self) -> JsonObject: ... - def validate_config(config: PiiRedactionConfig | JsonObject) -> ConfigReport: ... diff --git a/python/tests/test_pii_redaction_plugin.py b/python/tests/test_pii_redaction_plugin.py index fabc834e..602b652a 100644 --- a/python/tests/test_pii_redaction_plugin.py +++ b/python/tests/test_pii_redaction_plugin.py @@ -40,7 +40,7 @@ def test_validation_rejects_bad_values(self): builtin=BuiltinConfig( action="mask", detector="not_a_detector", - ) + ), ) ) assert any(diag.get("field") == "builtin.detector" for diag in report["diagnostics"]) @@ -56,7 +56,7 @@ def test_component_configures_plugin_validation(self): builtin=BuiltinConfig( action="mask", detector="email", - ) + ), ) ) ] From 61a90b3ecc94260f3afc99459dddf9b297413d8b Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 08:37:14 -0700 Subject: [PATCH 22/35] fix: export pii redaction python facade module Signed-off-by: Alex Fournier --- python/nemo_relay/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/nemo_relay/__init__.py b/python/nemo_relay/__init__.py index 364dfaa8..6dc9379d 100644 --- a/python/nemo_relay/__init__.py +++ b/python/nemo_relay/__init__.py @@ -435,6 +435,7 @@ def worker() -> None: "plugin", "adaptive", "observability", + "pii_redaction", # Scope stack isolation 
"ScopeStack", "create_scope_stack", From 08dcbc636052b86dd2f7dc07b22b3ed8cf2fcda4 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 11:31:50 -0700 Subject: [PATCH 23/35] fix: align builtin backend config defaults Signed-off-by: Alex Fournier --- crates/pii-redaction/src/component.rs | 17 ++++++++++++++++- .../pii-redaction/tests/unit/component_tests.rs | 10 ++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/crates/pii-redaction/src/component.rs b/crates/pii-redaction/src/component.rs index 2b4da01c..a3cd4b73 100644 --- a/crates/pii-redaction/src/component.rs +++ b/crates/pii-redaction/src/component.rs @@ -123,7 +123,7 @@ impl Default for PiiRedactionConfig { } /// Built-in redaction backend settings. -#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct BuiltinBackendConfig { /// Action applied to matching string leaves. @@ -153,6 +153,21 @@ pub struct BuiltinBackendConfig { pub unmasked_suffix: Option, } +impl Default for BuiltinBackendConfig { + fn default() -> Self { + Self { + action: default_builtin_action(), + target_paths: Vec::new(), + pattern: None, + detector: None, + replacement: None, + mask_char: None, + unmasked_prefix: None, + unmasked_suffix: None, + } + } +} + /// Local-backend settings for a future in-process local-model runtime. 
#[derive(Debug, Clone, Default, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] diff --git a/crates/pii-redaction/tests/unit/component_tests.rs b/crates/pii-redaction/tests/unit/component_tests.rs index 4cad9407..4b20988f 100644 --- a/crates/pii-redaction/tests/unit/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -99,6 +99,16 @@ fn builtin_registry_includes_pii_redaction_component() { ); } +#[test] +fn builtin_backend_config_default_matches_documented_action_default() { + let config = BuiltinBackendConfig::default(); + + assert_eq!(config.action, "remove"); + assert!(config.target_paths.is_empty()); + assert!(config.pattern.is_none()); + assert!(config.detector.is_none()); +} + #[test] fn validate_rejects_config_with_no_enabled_surfaces() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); From c41d145adb502dc82721b036c4fb48b4b5f1f2b1 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 12:14:29 -0700 Subject: [PATCH 24/35] fix: preserve multiline text in response overlays Signed-off-by: Alex Fournier --- crates/pii-redaction/src/overlay.rs | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/crates/pii-redaction/src/overlay.rs b/crates/pii-redaction/src/overlay.rs index f556b6c1..b31693b4 100644 --- a/crates/pii-redaction/src/overlay.rs +++ b/crates/pii-redaction/src/overlay.rs @@ -230,11 +230,23 @@ fn overlay_output_text_blocks(items: &mut [Json], message_text: Option) let parts: Vec<&str> = text.split('\n').collect(); for content in text_items { + let output_text_count = content + .iter() + .filter(|block| block.get("type").and_then(Json::as_str) == Some("output_text")) + .count(); let mut text_blocks = content.iter_mut().filter_map(|block| { (block.get("type").and_then(Json::as_str) == Some("output_text")) .then_some(block.as_object_mut()) .flatten() }); + + if output_text_count <= 1 { + if let Some(block) = text_blocks.next() 
{ + set_optional_string_field(block, "text", Some(text.as_str())); + } + continue; + } + for (index, block) in text_blocks.by_ref().enumerate() { let part = parts .get(index) @@ -246,6 +258,10 @@ fn overlay_output_text_blocks(items: &mut [Json], message_text: Option) } fn overlay_anthropic_text_blocks(blocks: &mut [Json], message_text: Option) { + let text_block_count = blocks + .iter() + .filter(|block| block.get("type").and_then(Json::as_str) == Some("text")) + .count(); let parts = message_text .as_deref() .map(|text| text.split('\n').collect::>()); @@ -258,6 +274,11 @@ fn overlay_anthropic_text_blocks(blocks: &mut [Json], message_text: Option Date: Wed, 10 Jun 2026 12:21:00 -0700 Subject: [PATCH 25/35] fix: validate builtin regex patterns early Signed-off-by: Alex Fournier --- crates/pii-redaction/src/component.rs | 14 ++++++++++++++ .../tests/unit/component_tests.rs | 19 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/crates/pii-redaction/src/component.rs b/crates/pii-redaction/src/component.rs index a3cd4b73..6ff0c290 100644 --- a/crates/pii-redaction/src/component.rs +++ b/crates/pii-redaction/src/component.rs @@ -7,6 +7,7 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; +use regex::Regex; use nemo_relay::plugin::{ ConfigDiagnostic, ConfigPolicy, DiagnosticLevel, Plugin, PluginComponentSpec, PluginError, PluginRegistrationContext, Result as PluginResult, UnsupportedBehavior, deregister_plugin, @@ -616,6 +617,19 @@ fn validate_builtin_action_requirements( ); } + if let Some(pattern) = builtin.pattern.as_deref() { + if let Err(err) = Regex::new(pattern) { + push_policy_diag( + diagnostics, + policy.unsupported_value, + "pii_redaction.unsupported_value", + Some(PII_REDACTION_PLUGIN_KIND.to_string()), + Some("builtin.pattern".to_string()), + format!("invalid builtin matcher regex '{pattern}': {err}"), + ); + } + } + if builtin .detector .as_deref() diff --git a/crates/pii-redaction/tests/unit/component_tests.rs 
b/crates/pii-redaction/tests/unit/component_tests.rs index 4b20988f..b4be0867 100644 --- a/crates/pii-redaction/tests/unit/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -212,6 +212,25 @@ fn validate_rejects_regex_replace_without_pattern() { })); } +#[test] +fn validate_rejects_invalid_builtin_pattern_regex() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin", + "builtin": { + "action": "regex_replace", + "pattern": "[unterminated" + } + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("builtin.pattern") + && diag.message.contains("invalid builtin matcher regex") + })); +} + #[test] fn validate_rejects_mask_with_empty_mask_char() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); From ecdf49a69875ea0bbdb48ace1c8b843a433cb5a7 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 12:33:53 -0700 Subject: [PATCH 26/35] fix: allow documented pii policy field Signed-off-by: Alex Fournier --- crates/pii-redaction/src/component.rs | 2 +- .../tests/unit/component_tests.rs | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/crates/pii-redaction/src/component.rs b/crates/pii-redaction/src/component.rs index 6ff0c290..a2f8e610 100644 --- a/crates/pii-redaction/src/component.rs +++ b/crates/pii-redaction/src/component.rs @@ -766,7 +766,7 @@ fn validate_policy_fields( policy, plugin_config, "policy", - &["unknown_field", "unsupported_value"], + &["unknown_component", "unknown_field", "unsupported_value"], ); } diff --git a/crates/pii-redaction/tests/unit/component_tests.rs b/crates/pii-redaction/tests/unit/component_tests.rs index b4be0867..3a8aa8ce 100644 --- a/crates/pii-redaction/tests/unit/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -134,6 +134,33 @@ fn 
validate_rejects_config_with_no_enabled_surfaces() { })); } +#[test] +fn validate_allows_documented_policy_unknown_component_field() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "mode": "builtin", + "tool_input": true, + "tool_output": false, + "input": false, + "output": false, + "builtin": { + "action": "remove" + }, + "policy": { + "unknown_component": "warn", + "unknown_field": "warn", + "unsupported_value": "error" + } + }))); + + assert!(!report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("policy.unknown_component") + && diag.code == "pii_redaction.unknown_field" + })); +} + #[test] fn validate_rejects_local_section_outside_local_mode() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); From 0d0fa351af4b9f766aa1bac0ea0ab521f7586cb6 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 12:37:20 -0700 Subject: [PATCH 27/35] fix: align python pii helper defaults Signed-off-by: Alex Fournier --- python/nemo_relay/pii_redaction.py | 2 +- python/tests/test_pii_redaction_plugin.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/nemo_relay/pii_redaction.py b/python/nemo_relay/pii_redaction.py index 5339a1d0..02a96fcc 100644 --- a/python/nemo_relay/pii_redaction.py +++ b/python/nemo_relay/pii_redaction.py @@ -74,7 +74,7 @@ def to_dict(self) -> JsonObject: class BuiltinConfig: """Deterministic built-in redaction backend settings.""" - action: Literal["remove", "redact", "regex_replace", "hash", "mask"] = "redact" + action: Literal["remove", "redact", "regex_replace", "hash", "mask"] = "remove" target_paths: list[str] = field(default_factory=list) pattern: str | None = None detector: str | None = None diff --git a/python/tests/test_pii_redaction_plugin.py b/python/tests/test_pii_redaction_plugin.py index 602b652a..1d2efd6b 100644 --- 
a/python/tests/test_pii_redaction_plugin.py +++ b/python/tests/test_pii_redaction_plugin.py @@ -19,7 +19,7 @@ class TestPiiRedactionConfigHelpers: def test_defaults_and_component_wrapper(self): assert BuiltinConfig().to_dict() == { - "action": "redact", + "action": "remove", "target_paths": [], } assert LocalModelConfig().to_dict() == {} From ebec17591d018d6440fa623a1a6265994b2c5a8a Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 12:37:26 -0700 Subject: [PATCH 28/35] docs: clarify pii observability-only tool surfaces Signed-off-by: Alex Fournier --- docs/pii-redaction-plugin/configuration.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/pii-redaction-plugin/configuration.mdx b/docs/pii-redaction-plugin/configuration.mdx index 41fdc53a..7aa0c37b 100644 --- a/docs/pii-redaction-plugin/configuration.mdx +++ b/docs/pii-redaction-plugin/configuration.mdx @@ -45,8 +45,8 @@ The top-level PII redaction object contains: | `mode` | Backend mode. Current values are `builtin` and `local_model`. | | `input` | Enables managed LLM request sanitization. | | `output` | Enables managed LLM response sanitization. | -| `tool_input` | Enables managed tool-argument sanitization before execution. | -| `tool_output` | Enables managed tool-result sanitization after execution. | +| `tool_input` | Enables sanitization of emitted tool-request observability payloads. | +| `tool_output` | Enables sanitization of emitted tool-response observability payloads. | | `priority` | Guardrail priority. Lower values run earlier. | | `codec` | Managed LLM provider codec. Required when `input` or `output` is enabled. | | `builtin` | Built-in backend settings used when `mode = "builtin"`. 
| From 138ad3a7d7a55e027aa1452a6a059a0744226415 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 12:48:52 -0700 Subject: [PATCH 29/35] style: apply pii redaction pre-commit cleanup Signed-off-by: Alex Fournier --- crates/pii-redaction/src/component.rs | 24 ++++++++++++------------ crates/pii-redaction/src/overlay.rs | 5 +---- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/crates/pii-redaction/src/component.rs b/crates/pii-redaction/src/component.rs index a2f8e610..d7265ea6 100644 --- a/crates/pii-redaction/src/component.rs +++ b/crates/pii-redaction/src/component.rs @@ -7,12 +7,12 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; -use regex::Regex; use nemo_relay::plugin::{ ConfigDiagnostic, ConfigPolicy, DiagnosticLevel, Plugin, PluginComponentSpec, PluginError, PluginRegistrationContext, Result as PluginResult, UnsupportedBehavior, deregister_plugin, register_plugin, }; +use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value as Json}; @@ -617,17 +617,17 @@ fn validate_builtin_action_requirements( ); } - if let Some(pattern) = builtin.pattern.as_deref() { - if let Err(err) = Regex::new(pattern) { - push_policy_diag( - diagnostics, - policy.unsupported_value, - "pii_redaction.unsupported_value", - Some(PII_REDACTION_PLUGIN_KIND.to_string()), - Some("builtin.pattern".to_string()), - format!("invalid builtin matcher regex '{pattern}': {err}"), - ); - } + if let Some(pattern) = builtin.pattern.as_deref() + && let Err(err) = Regex::new(pattern) + { + push_policy_diag( + diagnostics, + policy.unsupported_value, + "pii_redaction.unsupported_value", + Some(PII_REDACTION_PLUGIN_KIND.to_string()), + Some("builtin.pattern".to_string()), + format!("invalid builtin matcher regex '{pattern}': {err}"), + ); } if builtin diff --git a/crates/pii-redaction/src/overlay.rs b/crates/pii-redaction/src/overlay.rs index b31693b4..37ace8a7 100644 --- a/crates/pii-redaction/src/overlay.rs +++ 
b/crates/pii-redaction/src/overlay.rs @@ -436,10 +436,7 @@ mod tests { overlay_output_text_blocks(&mut items, Some("line one\nline two".to_string())); - assert_eq!( - items[0]["content"][0]["text"], - json!("line one\nline two") - ); + assert_eq!(items[0]["content"][0]["text"], json!("line one\nline two")); } #[test] From 6f76edc943a28a829c2f898f2e437f18e2a96131 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 13:35:31 -0700 Subject: [PATCH 30/35] docs: expand pii redaction crate README Signed-off-by: Alex Fournier --- crates/pii-redaction/README.md | 125 ++++++++++++++++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/crates/pii-redaction/README.md b/crates/pii-redaction/README.md index 9457f38b..9ed6ae9f 100644 --- a/crates/pii-redaction/README.md +++ b/crates/pii-redaction/README.md @@ -5,4 +5,127 @@ SPDX-License-Identifier: Apache-2.0 # NeMo Relay PII Redaction -First-party PII redaction plugin crate for NeMo Relay. +`nemo-relay-pii-redaction` is the first-party NeMo Relay plugin crate for +deterministic privacy redaction on tool and LLM observability payloads. It +ships the `pii_redaction` plugin contract, a production-ready `builtin` +backend, and the future `local_model` seam for model-backed detection and +redaction. + +The plugin is designed for the common case where teams want a supported, +config-driven privacy policy surface instead of writing custom sanitize +middleware by hand. + +## Why Use It? + +- **Use a supported privacy plugin**: Install deterministic redaction behavior + through the NeMo Relay plugin system instead of custom sanitize callbacks. +- **Cover tool and LLM observability surfaces**: Sanitize emitted tool + request/response payloads and supported codec-backed LLM request/response + payloads through one shared config surface. +- **Choose explicit action semantics**: Use `remove`, `redact`, + `regex_replace`, `hash`, or `mask` depending on the privacy and debugging + tradeoff you need. 
+- **Start deterministic, leave room for model-backed policy later**: Ship the + lightweight `builtin` backend now and keep `local_model` as the future seam + for local inference-based detection. + +## What You Get + +- **`PiiRedactionConfig`**: Canonical config contract for the top-level + `pii_redaction` plugin component. +- **Deterministic `builtin` backend**: Production-ready action engine for + remove/redact/replace/hash/mask workflows. +- **Built-in detector presets**: First-party detectors for common PII, + structured secrets, and cloud credentials. +- **Codec-aware LLM handling**: Overlay support for `openai_chat`, + `openai_responses`, and `anthropic_messages`. +- **Local backend seam**: `local_model` config contract and provider + registration surface for future model-backed implementations. + +## Plugin Versus Raw Middleware + +Use raw middleware when you need bespoke runtime logic. Use +`nemo-relay-pii-redaction` when you want a reusable privacy policy surface. + +- **Raw middleware** gives you the generic hook mechanism and full code-level + control. +- **`pii_redaction`** packages the common privacy policy contract on top of + those hooks, including typed config, validation, editor support, detector + presets, and cross-runtime behavior. + +This crate does not change real callback arguments or return values. It +sanitizes emitted observability payloads through NeMo Relay sanitize guardrails. 
+ +## Installation + +Install the plugin crate alongside the core runtime: + +```bash +cargo add nemo-relay nemo-relay-pii-redaction +``` + +For local source development: + +```bash +cargo build -p nemo-relay-pii-redaction +cargo test -p nemo-relay-pii-redaction +``` + +## Getting Started + +Register the plugin component before validating or initializing plugin +configuration that includes a `pii_redaction` component: + +```rust +nemo_relay_pii_redaction::component::register_pii_redaction_component()?; +``` + +A minimal config can redact detected emails from emitted tool input payloads: + +```toml +[[plugins.components]] +kind = "pii_redaction" + +[plugins.components.config] +mode = "builtin" +tool_input = true + +[plugins.components.config.builtin] +action = "redact" +detector = "email" +target_paths = [] +``` + +## Built-In Backend + +The shipped `builtin` backend supports these actions: + +- `remove` +- `redact` +- `regex_replace` +- `hash` +- `mask` + +The detector catalog includes: + +- Common PII: `email`, `phone`, `ip_address`, `ipv6`, `url`, `uuid` +- Structured secrets: `api_key`, `bearer_token`, `jwt`, `credit_card` +- Cloud credentials: `aws_access_key_id`, `aws_secret_access_key`, + `gcp_api_key`, `azure_storage_account_key` + +Detector-aware masking defaults are available for the relevant detectors. For +high-risk secrets, prefer `redact` over partial `mask` behavior. + +## Local Model Seam + +`local_model` is included in the plugin contract now, but no runtime +implementation ships in this crate yet. + +The seam exists so a future local detector/redactor backend can be added +without redesigning the public plugin surface. If `mode = "local_model"` is +configured today, the runtime expects a registered local backend provider and +fails fast if one is not installed. 
+ +## Documentation + +NeMo Relay documentation: https://docs.nvidia.com/nemo/relay From ac7b4c5a84ebdc475f38d31bf190c17bd72be23b Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 13:44:02 -0700 Subject: [PATCH 31/35] docs: defer pii redaction documentation Signed-off-by: Alex Fournier --- crates/pii-redaction/README.md | 125 +------ docs/index.yml | 4 - docs/pii-redaction-plugin/about.mdx | 121 ------- docs/pii-redaction-plugin/configuration.mdx | 340 -------------------- 4 files changed, 1 insertion(+), 589 deletions(-) delete mode 100644 docs/pii-redaction-plugin/about.mdx delete mode 100644 docs/pii-redaction-plugin/configuration.mdx diff --git a/crates/pii-redaction/README.md b/crates/pii-redaction/README.md index 9ed6ae9f..9457f38b 100644 --- a/crates/pii-redaction/README.md +++ b/crates/pii-redaction/README.md @@ -5,127 +5,4 @@ SPDX-License-Identifier: Apache-2.0 # NeMo Relay PII Redaction -`nemo-relay-pii-redaction` is the first-party NeMo Relay plugin crate for -deterministic privacy redaction on tool and LLM observability payloads. It -ships the `pii_redaction` plugin contract, a production-ready `builtin` -backend, and the future `local_model` seam for model-backed detection and -redaction. - -The plugin is designed for the common case where teams want a supported, -config-driven privacy policy surface instead of writing custom sanitize -middleware by hand. - -## Why Use It? - -- **Use a supported privacy plugin**: Install deterministic redaction behavior - through the NeMo Relay plugin system instead of custom sanitize callbacks. -- **Cover tool and LLM observability surfaces**: Sanitize emitted tool - request/response payloads and supported codec-backed LLM request/response - payloads through one shared config surface. -- **Choose explicit action semantics**: Use `remove`, `redact`, - `regex_replace`, `hash`, or `mask` depending on the privacy and debugging - tradeoff you need. 
-- **Start deterministic, leave room for model-backed policy later**: Ship the - lightweight `builtin` backend now and keep `local_model` as the future seam - for local inference-based detection. - -## What You Get - -- **`PiiRedactionConfig`**: Canonical config contract for the top-level - `pii_redaction` plugin component. -- **Deterministic `builtin` backend**: Production-ready action engine for - remove/redact/replace/hash/mask workflows. -- **Built-in detector presets**: First-party detectors for common PII, - structured secrets, and cloud credentials. -- **Codec-aware LLM handling**: Overlay support for `openai_chat`, - `openai_responses`, and `anthropic_messages`. -- **Local backend seam**: `local_model` config contract and provider - registration surface for future model-backed implementations. - -## Plugin Versus Raw Middleware - -Use raw middleware when you need bespoke runtime logic. Use -`nemo-relay-pii-redaction` when you want a reusable privacy policy surface. - -- **Raw middleware** gives you the generic hook mechanism and full code-level - control. -- **`pii_redaction`** packages the common privacy policy contract on top of - those hooks, including typed config, validation, editor support, detector - presets, and cross-runtime behavior. - -This crate does not change real callback arguments or return values. It -sanitizes emitted observability payloads through NeMo Relay sanitize guardrails. 
- -## Installation - -Install the plugin crate alongside the core runtime: - -```bash -cargo add nemo-relay nemo-relay-pii-redaction -``` - -For local source development: - -```bash -cargo build -p nemo-relay-pii-redaction -cargo test -p nemo-relay-pii-redaction -``` - -## Getting Started - -Register the plugin component before validating or initializing plugin -configuration that includes a `pii_redaction` component: - -```rust -nemo_relay_pii_redaction::component::register_pii_redaction_component()?; -``` - -A minimal config can redact detected emails from emitted tool input payloads: - -```toml -[[plugins.components]] -kind = "pii_redaction" - -[plugins.components.config] -mode = "builtin" -tool_input = true - -[plugins.components.config.builtin] -action = "redact" -detector = "email" -target_paths = [] -``` - -## Built-In Backend - -The shipped `builtin` backend supports these actions: - -- `remove` -- `redact` -- `regex_replace` -- `hash` -- `mask` - -The detector catalog includes: - -- Common PII: `email`, `phone`, `ip_address`, `ipv6`, `url`, `uuid` -- Structured secrets: `api_key`, `bearer_token`, `jwt`, `credit_card` -- Cloud credentials: `aws_access_key_id`, `aws_secret_access_key`, - `gcp_api_key`, `azure_storage_account_key` - -Detector-aware masking defaults are available for the relevant detectors. For -high-risk secrets, prefer `redact` over partial `mask` behavior. - -## Local Model Seam - -`local_model` is included in the plugin contract now, but no runtime -implementation ships in this crate yet. - -The seam exists so a future local detector/redactor backend can be added -without redesigning the public plugin surface. If `mode = "local_model"` is -configured today, the runtime expects a registered local backend provider and -fails fast if one is not installed. - -## Documentation - -NeMo Relay documentation: https://docs.nvidia.com/nemo/relay +First-party PII redaction plugin crate for NeMo Relay. 
diff --git a/docs/index.yml b/docs/index.yml index bcdcaba3..ed921473 100644 --- a/docs/index.yml +++ b/docs/index.yml @@ -34,10 +34,6 @@ navigation: title: "NeMo Guardrails Plugin" slug: nemo-guardrails-plugin title-source: frontmatter - - folder: ./pii-redaction-plugin - title: "PII Redaction Plugin" - slug: pii-redaction-plugin - title-source: frontmatter - folder: ./integrate-into-frameworks title: "Integrate into Frameworks" slug: integrate-into-frameworks diff --git a/docs/pii-redaction-plugin/about.mdx b/docs/pii-redaction-plugin/about.mdx deleted file mode 100644 index f08328c0..00000000 --- a/docs/pii-redaction-plugin/about.mdx +++ /dev/null @@ -1,121 +0,0 @@ ---- -title: "PII Redaction Plugin" -sidebar-title: "About" -description: "" -position: 1 ---- -{/* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -SPDX-License-Identifier: Apache-2.0 */} - -Use the PII redaction plugin when you want first-party redaction, hashing, or -pattern-based replacement around managed NeMo Relay LLM and tool observability -surfaces through the shared plugin system. - -The built-in plugin component has kind `pii_redaction` and is available as a -first-party NeMo Relay plugin. - -The plugin is designed around backend modes: - -- `builtin` - - Uses a native Rust backend for deterministic payload sanitization. -- `local_model` - - Reserves a future local-model backend lane for more stochastic detection behavior. - -## Use This Plugin When - -Start here when you need to: - -- Remove sensitive fields from emitted tool or LLM payloads. -- Replace sensitive text with a deterministic marker such as `[REDACTED]`. -- Hash matching values before observability exporters or subscribers receive - them. -- Keep privacy behavior inside the same plugin config surface used by other - first-party NeMo Relay components. 
-- Use built-in detector presets for common values such as emails, phone - numbers, URLs, API keys, and IP addresses without writing custom regexes. - -## Plugin Versus Middleware - -`pii_redaction` is built on top of NeMo Relay's existing sanitize-guardrail -middleware. - -That means this plugin does **not** introduce a separate runtime mechanism. -Instead, it packages a common privacy policy behind a first-party config -surface. - -Use the plugin when you want: - -- a reusable privacy policy that many applications or teams can share -- declarative config through `plugins.toml` or `nemo-relay plugins edit` -- built-in actions, detector presets, and codec-aware LLM handling -- a supported, documented NeMo Relay surface instead of hand-registered callbacks - -Use raw sanitize-guardrail middleware when you want: - -- custom redaction logic authored directly in application code -- dynamic behavior based on runtime state, external lookups, or one-off heuristics -- experiments that are too app-specific to become a first-party plugin contract - -So the distinction is: - -- middleware is the **mechanism** -- `pii_redaction` is the **packaged policy** - -## Current Scope - -The built-in plugin currently exposes four managed sanitize surfaces: - -- `input` -- `output` -- `tool_input` -- `tool_output` - -The current built-in backend supports five actions: - -- `remove` -- `redact` -- `regex_replace` -- `hash` -- `mask` - -The current backend boundary is intentional: - -- Managed tool surfaces are sanitized as JSON payloads with exact JSON-pointer - targeting. -- Managed LLM surfaces use the selected built-in codec so redaction can target - normalized Relay request and response shapes such as `/messages/0/content` - and `/message`. - -## Observability Boundary - -This plugin installs sanitize guardrails, not execution intercepts. - -That means: - -- The plugin changes emitted observability payloads. -- The real provider request and response values remain unchanged. 
-- Subscribers and exporters receive sanitized payloads after the plugin runs. - -For managed LLM request payloads, codec decode and re-encode can canonicalize -the emitted provider-shaped start event. For example, an OpenAI Responses -request may be recorded in the codec's canonical `input` array form instead of -the original shorthand request shape. - -## Current Non-Goals - -This first-party PR scope does not turn NeMo Relay into a full local-model -runtime. - -In particular: - -- `local_model` is an extension point, not a complete backend implementation - in this PR. -- The plugin does not mutate the real callback arguments or return values. -- The plugin does not add a subtree or prefix selector language beyond exact - JSON-pointer matching. - -## Pages - -- [PII Redaction Configuration](/pii-redaction-plugin/configuration) - documents the built-in component shape, action semantics, supported codecs, - and example configs. diff --git a/docs/pii-redaction-plugin/configuration.mdx b/docs/pii-redaction-plugin/configuration.mdx deleted file mode 100644 index 7aa0c37b..00000000 --- a/docs/pii-redaction-plugin/configuration.mdx +++ /dev/null @@ -1,340 +0,0 @@ ---- -title: "PII Redaction Configuration" -sidebar-title: "Configuration" -description: "" -position: 2 ---- -{/* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -SPDX-License-Identifier: Apache-2.0 */} - -Use this page when you want to configure the built-in PII redaction plugin -component. The component kind is `pii_redaction`. - -For plugin file discovery, precedence, merge behavior, editor controls, and -gateway conflict rules, see -[Plugin Configuration Files](/build-plugins/plugin-configuration-files). - - -NeMo Relay plugin configuration uses the generic plugin document shape, so -field names stay `snake_case` in every binding and in `plugins.toml`. 
- - - -## Relation to Raw Middleware - -This plugin uses the same sanitize-guardrail middleware family documented in -[Middleware](/about-nemo-relay/concepts/middleware). - -The difference is the layer of abstraction: - -- raw middleware asks you to register sanitize callbacks directly in code -- `pii_redaction` gives you a first-party, config-driven privacy contract on - top of those same runtime hooks - -Choose `pii_redaction` when you want a reusable built-in policy surface. -Choose raw middleware when you need bespoke callback logic that does not fit -the plugin contract. - -## Component Shape - -The top-level PII redaction object contains: - -| Field | Purpose | -|---|---| -| `version` | PII redaction config schema version. Defaults to `1`. | -| `mode` | Backend mode. Current values are `builtin` and `local_model`. | -| `input` | Enables managed LLM request sanitization. | -| `output` | Enables managed LLM response sanitization. | -| `tool_input` | Enables sanitization of emitted tool-request observability payloads. | -| `tool_output` | Enables sanitization of emitted tool-response observability payloads. | -| `priority` | Guardrail priority. Lower values run earlier. | -| `codec` | Managed LLM provider codec. Required when `input` or `output` is enabled. | -| `builtin` | Built-in backend settings used when `mode = "builtin"`. | -| `local` | Local-backend settings used when `mode = "local_model"`. | -| `policy` | Component-local handling for unknown fields and unsupported values. | - -At least one managed redaction surface must be enabled. 
- -## Backend Support - -| Area | `builtin` | `local_model` | -|---|---|---| -| Built-in component kind and config validation | Supported | Supported | -| Managed LLM `input` | Supported | Extension point only in this PR | -| Managed LLM `output` | Supported | Extension point only in this PR | -| Managed `tool_input` | Supported | Extension point only in this PR | -| Managed `tool_output` | Supported | Extension point only in this PR | -| Built-in actions | `remove`, `redact`, `regex_replace`, `hash`, `mask` | N/A | -| Codec support | `openai_chat`, `openai_responses`, `anthropic_messages` | Runtime-specific future implementation | -| Runtime availability | Any runtime that includes the `nemo-relay-pii-redaction` plugin crate | Runtimes that install a local backend provider | - -## Built-In Mode - -Use `builtin` mode when NeMo Relay should sanitize emitted observability -payloads with a deterministic first-party backend. - -This is the recommended mode when the privacy behavior is common enough to be -described declaratively with built-in actions, detector presets, exact target -paths, and supported codecs. - -### Requirements - -To use `mode = "builtin"`: - -- `builtin` settings are required. -- `codec` is required when `input` or `output` is enabled. -- `builtin.action` must be `remove`, `redact`, `regex_replace`, `hash`, or `mask`. -- `builtin.pattern` or `builtin.detector` is required when `builtin.action = "regex_replace"` or `builtin.action = "redact"`. - -### `plugins.toml` Example - -You can write this config directly in `plugins.toml`, or create and edit it -through the CLI with `nemo-relay plugins edit`. For plugin file discovery, -precedence, merge behavior, and editor controls, see -[Plugin Configuration Files](/build-plugins/plugin-configuration-files). 
- -```toml -version = 1 - -[[components]] -kind = "pii_redaction" -enabled = true - -[components.config] -version = 1 -mode = "builtin" -codec = "openai_chat" -input = true -output = true -tool_input = true -tool_output = true - -[components.config.builtin] -action = "regex_replace" -pattern = "sk-[A-Za-z0-9_-]+" -replacement = "[REDACTED]" -target_paths = [ - "/messages/0/content", - "/message", - "/api_key", - "/result/secret", -] - -[components.config.policy] -unknown_component = "warn" -unknown_field = "warn" -unsupported_value = "error" -``` - -This example configures the built-in backend for: - -- LLM request redaction from the normalized request path - `/messages/0/content` -- LLM response redaction from the normalized response path `/message` -- tool argument redaction at `/api_key` -- tool result redaction at `/result/secret` - -### CLI Editor Support - -The NeMo Relay CLI plugin editor now exposes `pii_redaction` directly through -`nemo-relay plugins edit`. - -Use the editor when you want to: - -- toggle the component on or off -- choose `builtin` or `local_model` -- set the LLM `codec` -- edit `builtin` action settings such as `action`, `target_paths`, - `pattern`, `detector`, `replacement`, and masking fields -- edit `local.backend` for a runtime-provided future local-model backend - -The editor preserves unknown fields when it rewrites an existing -`pii_redaction` component, so future or runtime-specific settings are not -discarded by the interactive edit flow. - -If you find yourself needing callback code instead of editor/config fields, it -is a sign that raw middleware may be the better fit for that specific policy. - -## Built-In Settings - -The `builtin` section contains: - -| Field | Purpose | -|---|---| -| `action` | Sanitization action. Current values are `remove`, `redact`, `regex_replace`, `hash`, and `mask`. | -| `target_paths` | Exact JSON-pointer paths to sanitize. Empty means every matching string leaf. 
| -| `pattern` | Regex pattern used when `action = "regex_replace"` or `action = "redact"`. | -| `detector` | Optional built-in matcher preset. Current values are `email`, `phone`, `api_key`, `ip_address`, `ipv6`, `url`, `uuid`, `bearer_token`, `jwt`, `credit_card`, `aws_access_key_id`, `aws_secret_access_key`, `gcp_api_key`, and `azure_storage_account_key`. | -| `replacement` | Replacement text used when `action = "regex_replace"` or `action = "redact"`. Defaults to `[REDACTED]`. | -| `mask_char` | Masking token used when `action = "mask"`. Defaults to `*`. | -| `unmasked_prefix` | Leading character count to keep when `action = "mask"`. Defaults to `0`, unless a detector-specific masking preset is active. | -| `unmasked_suffix` | Trailing character count to keep when `action = "mask"`. Defaults to `0`, unless a detector-specific masking preset is active. | - -## Action Semantics - -### `remove` - -`remove` is structural. - -When a target matches: - -- object fields are removed -- array elements become `null` -- targeted scalar or root values become `null` - -### `regex_replace` - -`regex_replace` applies the configured regex to matching string leaves and -replaces matches with the configured `replacement`. - -If you set `detector` instead of `pattern`, the built-in backend uses the -detector's stock matcher regex. - -### `redact` - -`redact` is the deterministic whole-match replacement lane. - -It uses the same `pattern` or `detector` matcher flow as `regex_replace`, but -defaults the replacement token to `[REDACTED]` and is intended for cases where -you do not want to preserve any matched secret characters. - -Use `redact` when you want: - -- credential-style secrets fully replaced -- a consistent redaction token across detectors -- clearer policy intent than a custom `regex_replace` - -### `hash` - -`hash` replaces matching string leaves with their SHA-256 hex digest. 
- -When `pattern` or `detector` is set, `hash` only replaces the matching -substring instead of hashing the entire string leaf. - -### `mask` - -`mask` replaces the middle portion of each matching string leaf with the -configured `mask_char`. - -Use `unmasked_prefix` and `unmasked_suffix` when you want to preserve a small -leading or trailing segment for correlation or debugging, such as the last four -characters of a token. - -When `pattern` or `detector` is set, `mask` only masks matching substrings -inside the string leaf. - -When `detector` is set and you do not specify `unmasked_prefix` or -`unmasked_suffix`, the built-in backend applies detector-aware defaults: - -- `email`: preserves the domain and the first local-part character -- `phone`: preserves the last four digits while keeping separators intact -- `api_key`: preserves the vendor-style prefix such as `sk-` and the last four characters -- `ip_address`: preserves the last octet -- `ipv6`: preserves the last segment -- `url`: preserves the scheme and host, then collapses the path/query tail -- `uuid`: preserves the last four characters -- `bearer_token`: preserves the auth scheme and the last four characters -- `jwt`: preserves the header segment and the tail of the signature -- `credit_card`: preserves the last four digits while keeping separators intact -- `aws_access_key_id`: preserves the provider prefix and the last four characters -- `aws_secret_access_key`: preserves the last four characters -- `gcp_api_key`: preserves the `AIza`-style prefix and the last four characters -- `azure_storage_account_key`: preserves the last four characters - -## Path Semantics - -`target_paths` are exact JSON-pointer matches. - -The plugin uses different payload boundaries for tools and LLMs: - -- Tools use JSON-native payloads. Paths point into the emitted tool args or - tool result shape directly. -- LLMs use the selected built-in codec. 
Prefer normalized Relay paths such as: - - `/messages/0/content` for request message content - - `/message` for the normalized assistant response text - -The current implementation also preserves provider-shaped response-path -compatibility for the supported codecs, but normalized LLM paths are the -recommended contract for new configuration. - -## Choosing Between This Plugin and Middleware - -Use this plugin when: - -- the privacy behavior should be reusable across applications -- config-driven enablement matters more than hand-written callbacks -- you want built-in detectors and action semantics -- you want a documented first-party NeMo Relay privacy surface - -Use raw middleware when: - -- the policy depends on application-specific runtime state -- the sanitization logic is too custom for the plugin contract -- you need to prototype or experiment before standardizing behavior - -The runtime effect is still sanitize-guardrail middleware in both cases. The -plugin simply gives you a standardized policy layer on top. - -## Detector Presets - -The built-in detector presets are grouped into three deterministic families. - -Common PII: -- `email` -- `phone` -- `ip_address` -- `ipv6` -- `url` - -Structured secrets: -- `api_key` -- `uuid` -- `bearer_token` -- `jwt` -- `credit_card` - - `bearer_token` is heuristic rather than vendor-specific. It can still match - benign bearer-style values, so prefer a narrower detector when you know the - credential family. - -Cloud credentials: -- `aws_access_key_id` -- `aws_secret_access_key` -- `gcp_api_key` -- `azure_storage_account_key` - -They are deterministic regex-backed helpers, not model inference. - -If `target_paths` is empty, the built-in backend sanitizes every matching -string leaf in the selected payload boundary. - -## Observability Semantics - -The built-in plugin uses sanitize guardrails. 
- -That means: - -- the real provider response value is unchanged -- the emitted NeMo Relay start or end event payload is sanitized -- `annotated_response` is populated from the sanitized end-event payload when a - response codec is provided - -## Local Model Mode - -`local_model` is reserved for a future in-process local-model backend. - -### Current Status - -In this PR: - -- the plugin contract accepts `mode = "local_model"` -- the `local` section currently supports: - - `backend` - - `model_id` - - `detector_profile` - - `allow_network` - - `max_latency_ms` -- actual behavior depends on a runtime-installed local backend provider - -Without a provider, runtimes report the local backend as unavailable during -plugin initialization. From 19459792ba0a2cf6a4c9747a42734d2152b3ce21 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 13:55:04 -0700 Subject: [PATCH 32/35] build: align pii redaction sha2 version Signed-off-by: Alex Fournier --- ATTRIBUTIONS-Rust.md | 209 -------------------------------- Cargo.lock | 15 +-- crates/pii-redaction/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 223 deletions(-) diff --git a/ATTRIBUTIONS-Rust.md b/ATTRIBUTIONS-Rust.md index 6e064658..9ccd2236 100644 --- a/ATTRIBUTIONS-Rust.md +++ b/ATTRIBUTIONS-Rust.md @@ -32413,215 +32413,6 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ``` -## sha2 - 0.10.9 -**Repository URL**: https://github.com/RustCrypto/hashes -**License Type(s)**: Apache-2.0 -### License: https://spdx.org/licenses/Apache-2.0.html -``` - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - -4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the 
following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - -6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - -8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - -Copyright [yyyy] [name of copyright owner] - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -``` - ## sha2 - 0.11.0 **Repository URL**: https://github.com/RustCrypto/hashes **License Type(s)**: Apache-2.0 diff --git a/Cargo.lock b/Cargo.lock index b1fd0775..caabfb50 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1373,7 +1373,7 @@ dependencies = [ "serde", "serde_json", "serde_json_canonicalizer", - "sha2 0.11.0", + "sha2", "tdigest", "thiserror 2.0.18", "tokio", @@ -1459,7 +1459,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "sha2 0.10.9", + "sha2", "tokio", ] @@ -2467,17 +2467,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" -[[package]] -name = "sha2" -version = "0.10.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" -dependencies = [ - "cfg-if", - "cpufeatures 0.2.17", - "digest 0.10.7", -] - [[package]] name = "sha2" version = "0.11.0" diff --git a/crates/pii-redaction/Cargo.toml b/crates/pii-redaction/Cargo.toml index 8f950bfa..86c94680 100644 --- a/crates/pii-redaction/Cargo.toml +++ b/crates/pii-redaction/Cargo.toml @@ -22,7 +22,7 @@ nemo-relay.workspace = true serde = { version = "1", features = ["derive"] } serde_json = "1" regex = "1" -sha2 = "0.10" +sha2 = "0.11" schemars = { version = "0.8", optional = true } [dev-dependencies] From 32de1b6dbda1042020cfff8fb6ae0c7a2cbadaf8 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 14:05:34 -0700 Subject: [PATCH 33/35] fix: align pii redaction helper parity Signed-off-by: Alex Fournier --- crates/node/pii_redaction.d.ts | 1 + crates/node/pii_redaction.js | 2 +- crates/node/tests/pii_redaction_tests.mjs | 2 +- crates/pii-redaction/src/component.rs | 14 +++++++++++ .../tests/unit/component_tests.rs | 24 +++++++++++++++++++ crates/wasm/tests-js/pii_redaction_tests.mjs | 4 ++-- crates/wasm/wrappers/esm/pii_redaction.d.ts | 1 + 
crates/wasm/wrappers/esm/pii_redaction.js | 2 +- crates/wasm/wrappers/nodejs/pii_redaction.js | 2 +- python/nemo_relay/pii_redaction.py | 2 ++ python/nemo_relay/pii_redaction.pyi | 1 + python/tests/test_pii_redaction_plugin.py | 6 +++++ 12 files changed, 55 insertions(+), 6 deletions(-) diff --git a/crates/node/pii_redaction.d.ts b/crates/node/pii_redaction.d.ts index 24824d46..e4a9f9f2 100644 --- a/crates/node/pii_redaction.d.ts +++ b/crates/node/pii_redaction.d.ts @@ -6,6 +6,7 @@ import type { ConfigDiagnostic, ConfigReport } from './plugin.js'; export { ConfigDiagnostic, ConfigReport }; export interface ConfigPolicy { + unknown_component?: 'ignore' | 'warn' | 'error' | string; unknown_field?: 'ignore' | 'warn' | 'error' | string; unsupported_value?: 'ignore' | 'warn' | 'error' | string; } diff --git a/crates/node/pii_redaction.js b/crates/node/pii_redaction.js index 192bfebc..941bde9b 100644 --- a/crates/node/pii_redaction.js +++ b/crates/node/pii_redaction.js @@ -32,7 +32,7 @@ function defaultConfig() { */ function builtinConfig(config = {}) { return { - action: 'redact', + action: 'remove', ...config, }; } diff --git a/crates/node/tests/pii_redaction_tests.mjs b/crates/node/tests/pii_redaction_tests.mjs index 10def956..dad3db91 100644 --- a/crates/node/tests/pii_redaction_tests.mjs +++ b/crates/node/tests/pii_redaction_tests.mjs @@ -20,7 +20,7 @@ describe('pii_redaction plugin helpers', () => { tool_output: true, priority: 100, }); - assert.deepEqual(piiRedaction.builtinConfig(), { action: 'redact' }); + assert.deepEqual(piiRedaction.builtinConfig(), { action: 'remove' }); assert.deepEqual(piiRedaction.localModelConfig(), {}); const component = piiRedaction.ComponentSpec({ diff --git a/crates/pii-redaction/src/component.rs b/crates/pii-redaction/src/component.rs index d7265ea6..00429fbf 100644 --- a/crates/pii-redaction/src/component.rs +++ b/crates/pii-redaction/src/component.rs @@ -462,6 +462,7 @@ fn validate_pii_redaction_plugin_config( "max_latency_ms", ], 
); + validate_version(&mut diagnostics, &config.policy, config.version); validate_mode(&mut diagnostics, &config.policy, &config); validate_surface_selection(&mut diagnostics, &config.policy, &config); validate_codec_requirements(&mut diagnostics, &config.policy, &config); @@ -665,6 +666,19 @@ fn validate_builtin_action_requirements( } } +fn validate_version(diagnostics: &mut Vec, policy: &ConfigPolicy, version: u32) { + if version != default_pii_redaction_config_version() { + push_policy_diag( + diagnostics, + policy.unsupported_value, + "pii_redaction.unsupported_config_version", + Some(PII_REDACTION_PLUGIN_KIND.to_string()), + Some("version".to_string()), + format!("PII redaction config version {version} is unsupported"), + ); + } +} + fn validate_codec_requirements( diagnostics: &mut Vec, policy: &ConfigPolicy, diff --git a/crates/pii-redaction/tests/unit/component_tests.rs b/crates/pii-redaction/tests/unit/component_tests.rs index 3a8aa8ce..5ecff361 100644 --- a/crates/pii-redaction/tests/unit/component_tests.rs +++ b/crates/pii-redaction/tests/unit/component_tests.rs @@ -161,6 +161,30 @@ fn validate_allows_documented_policy_unknown_component_field() { })); } +#[test] +fn validate_rejects_unsupported_config_version() { + let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); + reset_runtime(); + + let report = validate_plugin_config(&plugin_config(json!({ + "version": 2, + "mode": "builtin", + "tool_input": true, + "input": false, + "output": false, + "tool_output": false, + "builtin": { + "action": "remove" + } + }))); + + assert!(report.diagnostics.iter().any(|diag| { + diag.field.as_deref() == Some("version") + && diag.code == "pii_redaction.unsupported_config_version" + && diag.message.contains("version 2 is unsupported") + })); +} + #[test] fn validate_rejects_local_section_outside_local_mode() { let _guard = crate::plugins::pii_redaction::test_mutex().lock().unwrap(); diff --git a/crates/wasm/tests-js/pii_redaction_tests.mjs 
b/crates/wasm/tests-js/pii_redaction_tests.mjs index 21b21901..f483c3a0 100644 --- a/crates/wasm/tests-js/pii_redaction_tests.mjs +++ b/crates/wasm/tests-js/pii_redaction_tests.mjs @@ -18,7 +18,7 @@ test('WebAssembly pii_redaction wrappers expose helper defaults', () => { priority: 100, }); assert.deepEqual(piiRedaction.builtinConfig(), { - action: 'redact', + action: 'remove', }); assert.deepEqual(piiRedaction.localModelConfig(), {}); }); @@ -43,7 +43,7 @@ test('WebAssembly pii_redaction wrappers build component specs and validate bad tool_output: true, priority: 100, builtin: { - action: 'redact', + action: 'remove', detector: 'email', }, }, diff --git a/crates/wasm/wrappers/esm/pii_redaction.d.ts b/crates/wasm/wrappers/esm/pii_redaction.d.ts index 24824d46..e4a9f9f2 100644 --- a/crates/wasm/wrappers/esm/pii_redaction.d.ts +++ b/crates/wasm/wrappers/esm/pii_redaction.d.ts @@ -6,6 +6,7 @@ import type { ConfigDiagnostic, ConfigReport } from './plugin.js'; export { ConfigDiagnostic, ConfigReport }; export interface ConfigPolicy { + unknown_component?: 'ignore' | 'warn' | 'error' | string; unknown_field?: 'ignore' | 'warn' | 'error' | string; unsupported_value?: 'ignore' | 'warn' | 'error' | string; } diff --git a/crates/wasm/wrappers/esm/pii_redaction.js b/crates/wasm/wrappers/esm/pii_redaction.js index 4cd684c7..8c73d5fe 100644 --- a/crates/wasm/wrappers/esm/pii_redaction.js +++ b/crates/wasm/wrappers/esm/pii_redaction.js @@ -30,7 +30,7 @@ export function defaultConfig() { */ export function builtinConfig(config = {}) { return { - action: 'redact', + action: 'remove', ...config, }; } diff --git a/crates/wasm/wrappers/nodejs/pii_redaction.js b/crates/wasm/wrappers/nodejs/pii_redaction.js index 192bfebc..941bde9b 100644 --- a/crates/wasm/wrappers/nodejs/pii_redaction.js +++ b/crates/wasm/wrappers/nodejs/pii_redaction.js @@ -32,7 +32,7 @@ function defaultConfig() { */ function builtinConfig(config = {}) { return { - action: 'redact', + action: 'remove', ...config, }; 
} diff --git a/python/nemo_relay/pii_redaction.py b/python/nemo_relay/pii_redaction.py index 02a96fcc..3f463f51 100644 --- a/python/nemo_relay/pii_redaction.py +++ b/python/nemo_relay/pii_redaction.py @@ -59,12 +59,14 @@ def _normalize_object(value: object) -> JsonObject: class ConfigPolicy: """Policy for unsupported PII redaction configuration.""" + unknown_component: UnsupportedBehavior = "warn" unknown_field: UnsupportedBehavior = "warn" unsupported_value: UnsupportedBehavior = "error" def to_dict(self) -> JsonObject: """Serialize this policy to the canonical JSON object shape.""" return { + "unknown_component": self.unknown_component, "unknown_field": self.unknown_field, "unsupported_value": self.unsupported_value, } diff --git a/python/nemo_relay/pii_redaction.pyi b/python/nemo_relay/pii_redaction.pyi index 1b7a9e9a..de103270 100644 --- a/python/nemo_relay/pii_redaction.pyi +++ b/python/nemo_relay/pii_redaction.pyi @@ -22,6 +22,7 @@ class ConfigReport(TypedDict): @dataclass(slots=True) class ConfigPolicy: + unknown_component: UnsupportedBehavior = ... unknown_field: UnsupportedBehavior = ... unsupported_value: UnsupportedBehavior = ... def to_dict(self) -> JsonObject: ... 
diff --git a/python/tests/test_pii_redaction_plugin.py b/python/tests/test_pii_redaction_plugin.py index 1d2efd6b..4b3943dd 100644 --- a/python/tests/test_pii_redaction_plugin.py +++ b/python/tests/test_pii_redaction_plugin.py @@ -10,6 +10,7 @@ PII_REDACTION_PLUGIN_KIND, BuiltinConfig, ComponentSpec, + ConfigPolicy, LocalModelConfig, PiiRedactionConfig, validate_config, @@ -22,6 +23,11 @@ def test_defaults_and_component_wrapper(self): "action": "remove", "target_paths": [], } + assert ConfigPolicy().to_dict() == { + "unknown_component": "warn", + "unknown_field": "warn", + "unsupported_value": "error", + } assert LocalModelConfig().to_dict() == {} wrapped = ComponentSpec(PiiRedactionConfig()).to_dict() From 12e3fff1c8d84aeef86bc0827b56c68ddc0c7ee0 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 16:48:18 -0700 Subject: [PATCH 34/35] test: add cli pii redaction coverage Signed-off-by: Alex Fournier --- crates/cli/tests/coverage/doctor_tests.rs | 42 ++++++++++++++++++++++ crates/cli/tests/coverage/plugins_tests.rs | 26 ++++++++++++++ crates/cli/tests/coverage/server_tests.rs | 42 ++++++++++++++++++++++ 3 files changed, 110 insertions(+) diff --git a/crates/cli/tests/coverage/doctor_tests.rs b/crates/cli/tests/coverage/doctor_tests.rs index 8f5d1453..5062955d 100644 --- a/crates/cli/tests/coverage/doctor_tests.rs +++ b/crates/cli/tests/coverage/doctor_tests.rs @@ -693,6 +693,48 @@ async fn collect_observability_registers_adaptive_before_validation() { ); } +#[tokio::test] +async fn collect_observability_registers_pii_redaction_before_validation() { + let gateway = GatewayConfig { + plugin_config: Some(serde_json::json!({ + "version": 1, + "components": [ + { + "kind": "observability", + "enabled": true, + "config": { "version": 1 } + }, + { + "kind": "pii_redaction", + "enabled": false, + "config": { + "version": 1, + "mode": "builtin", + "policy": { + "unknown_component": "warn", + "unknown_field": "warn", + "unsupported_value": "error" + }, + 
"builtin": { + "action": "remove" + } + } + } + ] + })), + ..GatewayConfig::default() + }; + + let checks = collect_observability(&gateway).await; + + assert!( + !checks.iter().any(|check| check + .details + .contains("plugin component kind 'pii_redaction' is unsupported")), + "doctor should register pii_redaction before plugin validation: {checks:?}" + ); +} + #[tokio::test] async fn collect_observability_probes_atof_streaming_endpoint() { let (url, body, server_thread) = start_doctor_http_capture_server(); diff --git a/crates/cli/tests/coverage/plugins_tests.rs b/crates/cli/tests/coverage/plugins_tests.rs index bebe3124..4d1086a6 100644 --- a/crates/cli/tests/coverage/plugins_tests.rs +++ b/crates/cli/tests/coverage/plugins_tests.rs @@ -1376,6 +1376,32 @@ fn validate_config_accepts_local_tool_only_nemo_guardrails_component() { validate_config(&config).unwrap(); } +#[test] +fn validate_config_accepts_pii_redaction_component() { + let config = PluginConfig { + components: vec![PluginComponentSpec { + kind: PII_REDACTION_PLUGIN_KIND.to_string(), + enabled: true, + config: json!({ + "mode": "builtin", + "codec": "openai_chat", + "input": true, + "output": true, + "builtin": { + "action": "redact", + "detector": "email" + } + }) + .as_object() + .unwrap() + .clone(), + }], + ..PluginConfig::default() + }; + + validate_config(&config).unwrap(); +} + #[test] fn validate_config_rejects_local_nemo_guardrails_request_defaults() { let config = PluginConfig { diff --git a/crates/cli/tests/coverage/server_tests.rs b/crates/cli/tests/coverage/server_tests.rs index d706b68a..4a9c524f 100644 --- a/crates/cli/tests/coverage/server_tests.rs +++ b/crates/cli/tests/coverage/server_tests.rs @@ -1488,6 +1488,48 @@ async fn serve_listener_activates_adaptive_plugin_config() { handle.await.unwrap().unwrap(); } +#[tokio::test] +async fn serve_listener_activates_pii_redaction_plugin_config() { + let _guard = PLUGIN_CONFIG_TEST_LOCK.lock().await; + let _ = 
nemo_relay::plugin::clear_plugin_configuration(); + + let mut config = test_config(); + config.plugin_config = Some(json!({ + "version": 1, + "components": [ + { + "kind": "pii_redaction", + "enabled": true, + "config": { + "version": 1, + "mode": "builtin", + "codec": "openai_chat", + "input": true, + "output": true, + "builtin": { + "action": "redact", + "detector": "email" + } + } + } + ] + })); + + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let address = listener.local_addr().unwrap(); + let url = format!("http://{address}"); + let (shutdown_tx, shutdown_rx) = oneshot::channel(); + let handle = + tokio::spawn(async move { serve_listener(listener, config, Some(shutdown_rx)).await }); + + wait_for_gateway(&url).await; + assert!(nemo_relay::plugin::active_plugin_report().is_some()); + + shutdown_tx.send(()).unwrap(); + handle.await.unwrap().unwrap(); + assert!(nemo_relay::plugin::active_plugin_report().is_none()); +} + #[tokio::test] async fn serve_listener_rejects_invalid_plugin_config() { let _guard = PLUGIN_CONFIG_TEST_LOCK.lock().await; From c0b00d8f9e0296829a670ee3b573de1326383164 Mon Sep 17 00:00:00 2001 From: Alex Fournier Date: Wed, 10 Jun 2026 17:40:27 -0700 Subject: [PATCH 35/35] test: cover invalid pii cli paths Signed-off-by: Alex Fournier --- crates/cli/tests/coverage/doctor_tests.rs | 32 ++++++++++++++++++++++ crates/cli/tests/coverage/server_tests.rs | 33 +++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/crates/cli/tests/coverage/doctor_tests.rs b/crates/cli/tests/coverage/doctor_tests.rs index 5062955d..ca04d463 100644 --- a/crates/cli/tests/coverage/doctor_tests.rs +++ b/crates/cli/tests/coverage/doctor_tests.rs @@ -735,6 +735,38 @@ async fn collect_observability_registers_pii_redaction_before_validation() { ); } +#[tokio::test] +async fn collect_observability_reports_invalid_pii_redaction_config() { + let gateway = GatewayConfig { + plugin_config: Some(serde_json::json!({ + "version": 1, + "components": 
[ + { + "kind": "pii_redaction", + "enabled": true, + "config": { + "version": 2, + "mode": "builtin", + "builtin": { + "action": "remove" + } + } + } + ] + })), + ..GatewayConfig::default() + }; + + let checks = collect_observability(&gateway).await; + + let diagnostic = checks + .iter() + .find(|check| check.name == "Plugin diagnostic") + .expect("plugin diagnostic check"); + assert_eq!(diagnostic.status, Status::Fail); + assert!(diagnostic.details.contains("unsupported_config_version")); +} + #[tokio::test] async fn collect_observability_probes_atof_streaming_endpoint() { let (url, body, server_thread) = start_doctor_http_capture_server(); diff --git a/crates/cli/tests/coverage/server_tests.rs b/crates/cli/tests/coverage/server_tests.rs index 4a9c524f..15e531b9 100644 --- a/crates/cli/tests/coverage/server_tests.rs +++ b/crates/cli/tests/coverage/server_tests.rs @@ -1562,6 +1562,39 @@ async fn serve_listener_rejects_invalid_plugin_config() { assert!(nemo_relay::plugin::active_plugin_report().is_none()); } +#[tokio::test] +async fn serve_listener_rejects_invalid_pii_redaction_plugin_config() { + let _guard = PLUGIN_CONFIG_TEST_LOCK.lock().await; + let _ = nemo_relay::plugin::clear_plugin_configuration(); + + let mut config = test_config(); + config.plugin_config = Some(json!({ + "version": 1, + "components": [ + { + "kind": "pii_redaction", + "enabled": true, + "config": { + "version": 2, + "mode": "builtin", + "builtin": { + "action": "remove" + } + } + } + ] + })); + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let (_shutdown_tx, shutdown_rx) = oneshot::channel(); + let error = serve_listener(listener, config, Some(shutdown_rx)) + .await + .unwrap_err(); + + assert!(error.to_string().contains("unsupported")); + assert!(error.to_string().contains("version")); + assert!(nemo_relay::plugin::active_plugin_report().is_none()); +} + #[tokio::test] async fn gateway_errors_render_structured_json_responses() { let response = 
CliError::InvalidPayload("bad input".into()).into_response();