From a3485d885af984dd00eeae70883c9891404784ea Mon Sep 17 00:00:00 2001 From: Leo Villalobos Date: Sat, 27 Dec 2025 19:41:20 -0800 Subject: [PATCH 1/3] implement support for quoting with single quotes --- src/command/handlers/echo.rs | 4 +-- src/command/mod.rs | 10 ++++-- src/util.rs | 64 +++++++++++++++++++----------------- 3 files changed, 42 insertions(+), 36 deletions(-) diff --git a/src/command/handlers/echo.rs b/src/command/handlers/echo.rs index a8fbec9..06e02cf 100644 --- a/src/command/handlers/echo.rs +++ b/src/command/handlers/echo.rs @@ -43,14 +43,14 @@ mod tests { #[test] fn quoted_args() { - let cmd = parse_cmd("echo \"hello world\" test").unwrap(); + let cmd = parse_cmd("echo \'hello world\' test").unwrap(); assert!(cmd.run().is_ok()); assert_eq!(cmd.args, vec!["echo", "hello world", "test"]); } #[test] fn empty_quoted_string() { - let cmd = parse_cmd("echo \"\"").unwrap(); + let cmd = parse_cmd("echo \'\'").unwrap(); assert!(cmd.run().is_ok()); assert_eq!(cmd.args, vec!["echo", ""]); } diff --git a/src/command/mod.rs b/src/command/mod.rs index 00d6ecd..6676d7b 100644 --- a/src/command/mod.rs +++ b/src/command/mod.rs @@ -3,7 +3,7 @@ pub(crate) mod path; use std::io; -use crate::util::{tokenize, RushError}; +use crate::util::{RushError, tokenize}; use self::{ handlers::{handle_cd, handle_echo, handle_executable, handle_pwd, handle_type}, @@ -94,7 +94,11 @@ impl Command { } #[cfg(test)] - pub(crate) fn handle_executable(&self, path: &str, name: &str) -> Result, RushError> { + pub(crate) fn handle_executable( + &self, + path: &str, + name: &str, + ) -> Result, RushError> { handle_executable(path, name, &self.args) } } @@ -249,7 +253,7 @@ mod tests { #[test] fn quoted_arguments_preserved() { - let cmd = parse_cmd("echo \"hello world\"").unwrap(); + let cmd = parse_cmd("echo \'hello world\'").unwrap(); assert_eq!(cmd.args, vec!["echo", "hello world"]); } diff --git a/src/util.rs b/src/util.rs index b0d1961..9dfa46f 100644 --- a/src/util.rs +++ b/src/util.rs 
@@ -35,7 +35,7 @@ pub fn tokenize(mut reader: R) -> Result, RushError> for (i, char) in input_tokens.chars().enumerate() { match char { - '"' => { + '\'' => { quote_count += 1; // Push buf to tokens when more than 1 quote is found @@ -45,15 +45,10 @@ pub fn tokenize(mut reader: R) -> Result, RushError> quote_count = 0; } } - char => { - // At the end, an odd num of quotes means a quote wasn't terminated - if i == input_tokens.len() - 1 && quote_count % 2 == 1 { - return Err(RushError::UnterminatedQuote); - } - + ' ' => { // If we haven't seen a quote yet and we encounter a space, push buf // into tokens and clear buf - if quote_count == 0 && char == ' ' { + if quote_count == 0 { // Skip over empty tokens if buf.trim().is_empty() { continue; @@ -64,7 +59,14 @@ pub fn tokenize(mut reader: R) -> Result, RushError> continue; } - // Push the current char into buf + buf.push(' '); + } + char => { + // At the end, an odd num of quotes means a quote wasn't terminated + if i == input_tokens.len() - 1 && quote_count % 2 == 1 { + return Err(RushError::UnterminatedQuote); + } + buf.push(char); // At the end, push any remaining chars into tokens @@ -138,7 +140,7 @@ mod tests { #[test] fn simple_quoted_string() { assert_eq!( - parse("echo \"hello world\"\n").unwrap(), + parse("echo \'hello world\'\n").unwrap(), vec!["echo", "hello world"] ); } @@ -146,7 +148,7 @@ mod tests { #[test] fn multiple_quoted_strings() { assert_eq!( - parse("\"first\" \"second\" \"third\"\n").unwrap(), + parse("\'first\' \'second\' \'third\'\n").unwrap(), vec!["first", "second", "third"] ); } @@ -154,7 +156,7 @@ mod tests { #[test] fn preserves_spaces_in_quotes() { assert_eq!( - parse("echo \"two spaces\" and some \"mo re\"\n").unwrap(), + parse("echo \'two spaces\' and some \'mo re\'\n").unwrap(), vec!["echo", "two spaces", "and", "some", "mo re"] ); } @@ -162,26 +164,26 @@ mod tests { #[test] fn single_quoted_token() { assert_eq!( - parse("\"single quoted token\"\n").unwrap(), + parse("\'single quoted 
token\'\n").unwrap(), vec!["single quoted token"] ); } #[test] fn empty_quoted_strings() { - assert_eq!(parse("\"\"\n").unwrap(), vec![""]); - assert_eq!(parse("echo \"\"\n").unwrap(), vec!["echo", ""]); - assert_eq!(parse("\"\" \"\" \"\"\n").unwrap(), vec!["", "", ""]); + assert_eq!(parse("\'\'\n").unwrap(), vec![""]); + assert_eq!(parse("echo \'\'\n").unwrap(), vec!["echo", ""]); + assert_eq!(parse("\'\' \'\' \'\'\n").unwrap(), vec!["", "", ""]); } #[test] fn quotes_with_special_characters() { assert_eq!( - parse("echo \"hello!@#$%^&*()world\"\n").unwrap(), + parse("echo \'hello!@#$%^&*()world\'\n").unwrap(), vec!["echo", "hello!@#$%^&*()world"] ); assert_eq!( - parse("echo \"path/to/file\"\n").unwrap(), + parse("echo \'path/to/file\'\n").unwrap(), vec!["echo", "path/to/file"] ); } @@ -189,25 +191,25 @@ mod tests { #[test] fn mixed_quoted_and_unquoted() { assert_eq!( - parse("cp \"source file\" dest\n").unwrap(), + parse("cp \'source file\' dest\n").unwrap(), vec!["cp", "source file", "dest"] ); assert_eq!( - parse("command arg1 \"quoted arg\" arg2 \"another quoted\"\n").unwrap(), + parse("command arg1 \'quoted arg\' arg2 \'another quoted\'\n").unwrap(), vec!["command", "arg1", "quoted arg", "arg2", "another quoted"] ); } #[test] fn consecutive_quotes() { - assert_eq!(parse("\"\"\"\" \n").unwrap(), vec!["", ""]); - assert_eq!(parse("\"a\"\"b\"\n").unwrap(), vec!["a", "b"]); + assert_eq!(parse("\'\'\'\' \n").unwrap(), vec!["", ""]); + assert_eq!(parse("\'a\'\'b\'\n").unwrap(), vec!["a", "b"]); } #[test] fn quotes_at_start() { assert_eq!( - parse("\"start\" middle end\n").unwrap(), + parse("\'start\' middle end\n").unwrap(), vec!["start", "middle", "end"] ); } @@ -215,25 +217,25 @@ mod tests { #[test] fn quotes_at_end() { assert_eq!( - parse("start middle \"end\"\n").unwrap(), + parse("start middle \'end\'\n").unwrap(), vec!["start", "middle", "end"] ); } #[test] fn only_quoted_token() { - assert_eq!(parse("\"only\"\n").unwrap(), vec!["only"]); + 
Subject: [PATCH 2/3] concatenate consecutive quoted tokens and drop empty quoted strings
parse_cmd("echo \'\'").unwrap(); assert!(cmd.run().is_ok()); - assert_eq!(cmd.args, vec!["echo", ""]); + assert_eq!(cmd.args, vec!["echo"]); } #[test] diff --git a/src/util.rs b/src/util.rs index 9dfa46f..5addeb4 100644 --- a/src/util.rs +++ b/src/util.rs @@ -30,7 +30,12 @@ pub fn tokenize(mut reader: R) -> Result, RushError> let input_tokens = input.trim(); let buf = &mut String::new(); - let mut tokens = Vec::new(); + enum TokenKind { + Literal(String), + Quoted(String), + } + + let mut tokens = Vec::::new(); let mut quote_count = 0; for (i, char) in input_tokens.chars().enumerate() { @@ -40,7 +45,19 @@ pub fn tokenize(mut reader: R) -> Result, RushError> // Push buf to tokens when more than 1 quote is found if quote_count > 1 { - tokens.push(buf.clone()); + // Ignore empty quoted tokens + if buf.len() == 0 { + quote_count = 0; + continue; + } + + // Concatenate consecutive quoted tokens + if let Some(TokenKind::Quoted(last_token)) = tokens.last_mut() { + last_token.push_str(&buf.clone()); + } else { + tokens.push(TokenKind::Quoted(buf.clone())); + } + buf.clear(); quote_count = 0; } @@ -54,7 +71,7 @@ pub fn tokenize(mut reader: R) -> Result, RushError> continue; } - tokens.push(buf.trim().to_string()); + tokens.push(TokenKind::Literal(buf.trim().into())); buf.clear(); continue; } @@ -67,17 +84,24 @@ pub fn tokenize(mut reader: R) -> Result, RushError> return Err(RushError::UnterminatedQuote); } + // Push the current char into buf buf.push(char); // At the end, push any remaining chars into tokens if i == input_tokens.len() - 1 && buf.len() > 0 { - tokens.push(buf.clone()); + tokens.push(TokenKind::Literal(buf.trim().into())); } } } } - Ok(tokens) + Ok(tokens + .iter() + .map(|token| match token { + TokenKind::Literal(literal) => literal.to_owned(), + TokenKind::Quoted(quoted) => quoted.to_owned(), + }) + .collect::>()) } #[cfg(test)] @@ -146,10 +170,10 @@ mod tests { } #[test] - fn multiple_quoted_strings() { + fn consecutive_quoted_strings_are_concatenated() { 
assert_eq!( parse("\'first\' \'second\' \'third\'\n").unwrap(), - vec!["first", "second", "third"] + vec!["firstsecondthird"] ); } @@ -171,9 +195,9 @@ mod tests { #[test] fn empty_quoted_strings() { - assert_eq!(parse("\'\'\n").unwrap(), vec![""]); - assert_eq!(parse("echo \'\'\n").unwrap(), vec!["echo", ""]); - assert_eq!(parse("\'\' \'\' \'\'\n").unwrap(), vec!["", "", ""]); + assert_eq!(parse("\'\'\n").unwrap(), Vec::<&str>::new()); + assert_eq!(parse("echo \'\'\n").unwrap(), vec!["echo"]); + assert_eq!(parse("\'\' \'\' \'\'\n").unwrap(), Vec::<&str>::new()); } #[test] @@ -202,8 +226,8 @@ mod tests { #[test] fn consecutive_quotes() { - assert_eq!(parse("\'\'\'\' \n").unwrap(), vec!["", ""]); - assert_eq!(parse("\'a\'\'b\'\n").unwrap(), vec!["a", "b"]); + assert_eq!(parse("\'\'\'\' \n").unwrap(), Vec::<&str>::new()); + assert_eq!(parse("\'a\'\'b\'\n").unwrap(), vec!["ab"]); } #[test] @@ -229,7 +253,7 @@ mod tests { #[test] fn single_char_quoted() { - assert_eq!(parse("\'a\' \'b\' \'c\'\n").unwrap(), vec!["a", "b", "c"]); + assert_eq!(parse("\'a\' \'b\' \'c\'\n").unwrap(), vec!["abc"]); } #[test] From 0de7d65e06d28b10ba3b5c33269d232886c830a0 Mon Sep 17 00:00:00 2001 From: Leonardo Villalobos Date: Wed, 31 Dec 2025 14:03:36 -0800 Subject: [PATCH 3/3] add a tokenizer --- src/command/mod.rs | 5 +- src/util.rs | 212 +++++++++++++++++++++++++++++++-------------- 2 files changed, 152 insertions(+), 65 deletions(-) diff --git a/src/command/mod.rs b/src/command/mod.rs index 6676d7b..97821af 100644 --- a/src/command/mod.rs +++ b/src/command/mod.rs @@ -3,7 +3,7 @@ pub(crate) mod path; use std::io; -use crate::util::{RushError, tokenize}; +use crate::util::{RushError, Tokenizer}; use self::{ handlers::{handle_cd, handle_echo, handle_executable, handle_pwd, handle_type}, @@ -56,7 +56,8 @@ pub(crate) struct Command { impl Command { pub(crate) fn new(reader: R) -> Result { - let args = tokenize(reader)?; + let mut tokenizer = Tokenizer::from(reader)?; + let args = 
tokenizer.tokenize()?; // Read the name of the command from the tokenized args let Some(name) = args.first() else { diff --git a/src/util.rs b/src/util.rs index 5addeb4..e67aeb4 100644 --- a/src/util.rs +++ b/src/util.rs @@ -21,87 +21,168 @@ pub enum RushError { UnterminatedQuote, } -pub fn tokenize(mut reader: R) -> Result, RushError> { - let mut input = String::new(); - reader - .read_line(&mut input) - .map_err(|_| RushError::UnexpectedEOF)?; - - let input_tokens = input.trim(); - let buf = &mut String::new(); - - enum TokenKind { - Literal(String), - Quoted(String), - } +#[derive(Debug)] +enum TokenKind { + Literal(String), + Quoted(String), + Space, +} - let mut tokens = Vec::::new(); - let mut quote_count = 0; +#[derive(Debug)] +pub struct Tokenizer { + input: String, + tokens: Vec, +} - for (i, char) in input_tokens.chars().enumerate() { - match char { - '\'' => { - quote_count += 1; +impl Tokenizer { + pub fn from(mut reader: R) -> Result + where + R: io::BufRead, + { + let mut input = String::new(); + reader + .read_line(&mut input) + .map_err(|_| RushError::UnexpectedEOF)?; + + Ok(Self { + input: input.trim().to_owned(), + tokens: Vec::new(), + }) + } - // Push buf to tokens when more than 1 quote is found - if quote_count > 1 { - // Ignore empty quoted tokens - if buf.len() == 0 { - quote_count = 0; + pub fn tokenize(&mut self) -> Result, RushError> { + let buf = &mut String::new(); + let mut quote_count = 0; + let mut has_seen_literal = false; + + for (i, char) in self.input.chars().enumerate() { + match char { + '\'' => { + quote_count += 1; + + if quote_count == 1 { + // If there's content in buf, push it as a Literal before + // starting the quoted string + if !buf.trim().is_empty() { + has_seen_literal = true; + self.tokens.push(TokenKind::Literal(buf.trim().into())); + } + buf.clear(); continue; } - // Concatenate consecutive quoted tokens - if let Some(TokenKind::Quoted(last_token)) = tokens.last_mut() { - last_token.push_str(&buf.clone()); - } 
else { - tokens.push(TokenKind::Quoted(buf.clone())); + if quote_count == 2 { + // Ignore empty quoted tokens + if buf.trim().len() == 0 { + buf.clear(); + quote_count = 0; + continue; + } + + // Concatenate consecutive tokens (only if last token is NOT Space) + if !matches!(self.tokens.last(), Some(TokenKind::Space)) { + match self.tokens.last_mut() { + Some(TokenKind::Quoted(last_token)) => { + last_token.push_str(&buf.clone()); + buf.clear(); + quote_count = 0; + continue; + } + Some(TokenKind::Literal(last_token)) => { + last_token.push_str(&buf.clone()); + // Convert the Literal to a Quoted since it now contains quoted content + let combined = last_token.clone(); + self.tokens.pop(); + self.tokens.push(TokenKind::Quoted(combined)); + buf.clear(); + quote_count = 0; + continue; + } + _ => {} + } + } else { + // There's a Space before this quoted string, so pop it before adding the new token + self.tokens.pop(); + } + + self.tokens.push(TokenKind::Quoted(buf.clone())); + + buf.clear(); + quote_count = 0; } - - buf.clear(); - quote_count = 0; } - } - ' ' => { - // If we haven't seen a quote yet and we encounter a space, push buf - // into tokens and clear buf - if quote_count == 0 { - // Skip over empty tokens - if buf.trim().is_empty() { + ' ' => { + if quote_count == 0 { + // Skip over empty tokens + if buf.trim().is_empty() { + buf.clear(); + // Push Space token after Literals, OR after Quoted if we've seen a literal before + // This allows pure quoted strings to concatenate, but separates tokens when literals are involved + if matches!(self.tokens.last(), Some(TokenKind::Literal(_))) { + self.tokens.push(TokenKind::Space); + } else if has_seen_literal + && matches!(self.tokens.last(), Some(TokenKind::Quoted(_))) + { + self.tokens.push(TokenKind::Space); + } + continue; + } + + // Since we aren't processing a quoted string, push the buf into + // self.tokens as a Literal token + has_seen_literal = true; + 
self.tokens.push(TokenKind::Literal(buf.trim().into())); + // Push a Space token after the Literal token to help the state machine + // determine whether to concatenate or not + self.tokens.push(TokenKind::Space); + + buf.clear(); continue; } - tokens.push(TokenKind::Literal(buf.trim().into())); - buf.clear(); - continue; + // We push a space into buf if we're processing a quoted string + buf.push(' '); } + char => { + // At the end, an odd num of quotes means a quote wasn't terminated + if i == self.input.len() - 1 && quote_count % 2 == 1 { + return Err(RushError::UnterminatedQuote); + } - buf.push(' '); + // Push the current char into buf + buf.push(char); + } } - char => { - // At the end, an odd num of quotes means a quote wasn't terminated - if i == input_tokens.len() - 1 && quote_count % 2 == 1 { - return Err(RushError::UnterminatedQuote); + } + + // Push remaining chars into self.tokens + if buf.len() > 0 { + // Concatenate with the last token if it's a Literal or Quoted (no Space between) + match self.tokens.last_mut() { + Some(TokenKind::Literal(last_token)) => { + last_token.push_str(buf.trim()); + } + Some(TokenKind::Quoted(last_token)) => { + last_token.push_str(buf.trim()); + } + _ => { + self.tokens.push(TokenKind::Literal(buf.trim().into())); } + } + } - // Push the current char into buf - buf.push(char); + let mut tokens = Vec::::new(); - // At the end, push any remaining chars into tokens - if i == input_tokens.len() - 1 && buf.len() > 0 { - tokens.push(TokenKind::Literal(buf.trim().into())); - } + for token in &self.tokens { + match token { + TokenKind::Literal(literal) => tokens.push(literal.to_owned()), + TokenKind::Quoted(quoted) => tokens.push(quoted.to_owned()), + TokenKind::Space => { /* state machine hint */ } } } - } - Ok(tokens - .iter() - .map(|token| match token { - TokenKind::Literal(literal) => literal.to_owned(), - TokenKind::Quoted(quoted) => quoted.to_owned(), - }) - .collect::>()) + Ok(tokens) + } } #[cfg(test)] @@ -111,7 +192,8 
@@ mod tests { // Shared test helper fn parse(input: &str) -> Result, RushError> { - tokenize(io::Cursor::new(input)) + let mut state_machine = Tokenizer::from(io::Cursor::new(input))?; + state_machine.tokenize() } mod basic_tokenization { @@ -222,6 +304,10 @@ mod tests { parse("command arg1 \'quoted arg\' arg2 \'another quoted\'\n").unwrap(), vec!["command", "arg1", "quoted arg", "arg2", "another quoted"] ); + assert_eq!( + parse("echo \'world shell\' \'script\'\'test\' example\'\'hello").unwrap(), + vec!["echo", "world shell", "scripttest", "examplehello"] + ); } #[test] @@ -372,7 +458,7 @@ mod tests { #[test] fn io_read_error_returns_unexpected_eof() { let reader = ErrReader; - let err = tokenize(reader).unwrap_err(); + let err = Tokenizer::from(reader).unwrap_err(); assert!(matches!(err, RushError::UnexpectedEOF)); } }