From 4937dcfee7cddd77c34f2dcee821a748f83b50e8 Mon Sep 17 00:00:00 2001
From: Droid
Date: Wed, 21 Jan 2026 15:11:56 +0400
Subject: [PATCH] fix: sanitize input text to remove null bytes and control characters

---
 src/core/embeddings.rs | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/core/embeddings.rs b/src/core/embeddings.rs
index be77c5b..087de79 100644
--- a/src/core/embeddings.rs
+++ b/src/core/embeddings.rs
@@ -90,9 +90,10 @@ impl EmbeddingEngine {
         let mut results = Vec::with_capacity(texts.len());
 
         for text in texts {
+            let clean_text = sanitize_text(text);
             let tokens = self
                 .model
-                .str_to_token(text, AddBos::Always)
+                .str_to_token(&clean_text, AddBos::Always)
                 .context("Failed to tokenize")?;
 
             let tokens: Vec<_> = if tokens.len() > self.n_ctx {
@@ -139,10 +140,37 @@ fn normalize(input: &[f32]) -> Vec<f32> {
     input.iter().map(|x| x / magnitude).collect()
 }
 
+fn sanitize_text(text: &str) -> String {
+    text.chars()
+        .filter(|&c| !c.is_control() || matches!(c, '\n' | '\t' | '\r'))
+        .collect()
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
+    #[test]
+    fn test_sanitize_text() {
+        // Normal text should pass through
+        assert_eq!(sanitize_text("hello world"), "hello world");
+
+        // Null bytes should be removed
+        assert_eq!(sanitize_text("hello\0world"), "helloworld");
+
+        // Control characters should be removed
+        assert_eq!(sanitize_text("hello\x01world"), "helloworld");
+        assert_eq!(sanitize_text("hello\x1Fworld"), "helloworld");
+
+        // Allowed whitespace should be preserved
+        assert_eq!(sanitize_text("hello\nworld"), "hello\nworld");
+        assert_eq!(sanitize_text("hello\tworld"), "hello\tworld");
+        assert_eq!(sanitize_text("hello\r\nworld"), "hello\r\nworld");
+
+        // Mixed
+        assert_eq!(sanitize_text("h\0e\nl\x01l\to"), "he\nll\to");
+    }
+
     #[test]
     fn test_normalize() {
         let input = vec![3.0, 4.0];