Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/command/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ pub(crate) mod path;

use std::io;

use crate::util::{RushError, tokenize};
use crate::util::{RushError, Tokenizer};

use self::{
handlers::{handle_cd, handle_echo, handle_executable, handle_pwd, handle_type},
Expand Down Expand Up @@ -56,7 +56,8 @@ pub(crate) struct Command {

impl Command {
pub(crate) fn new<R: io::BufRead>(reader: R) -> Result<Command, RushError> {
let args = tokenize(reader)?;
let mut tokenizer = Tokenizer::from(reader)?;
let args = tokenizer.tokenize()?;

// Read the name of the command from the tokenized args
let Some(name) = args.first() else {
Expand Down
212 changes: 149 additions & 63 deletions src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,87 +21,168 @@ pub enum RushError {
UnterminatedQuote,
}

pub fn tokenize<R: io::BufRead>(mut reader: R) -> Result<Vec<String>, RushError> {
let mut input = String::new();
reader
.read_line(&mut input)
.map_err(|_| RushError::UnexpectedEOF)?;

let input_tokens = input.trim();
let buf = &mut String::new();

enum TokenKind {
Literal(String),
Quoted(String),
}
#[derive(Debug)]
enum TokenKind {
Literal(String),
Quoted(String),
Space,
}

let mut tokens = Vec::<TokenKind>::new();
let mut quote_count = 0;
#[derive(Debug)]
pub struct Tokenizer {
input: String,
tokens: Vec<TokenKind>,
}

for (i, char) in input_tokens.chars().enumerate() {
match char {
'\'' => {
quote_count += 1;
impl Tokenizer {
pub fn from<R>(mut reader: R) -> Result<Self, RushError>
where
R: io::BufRead,
{
let mut input = String::new();
reader
.read_line(&mut input)
.map_err(|_| RushError::UnexpectedEOF)?;

Ok(Self {
input: input.trim().to_owned(),
tokens: Vec::new(),
})
}

// Push buf to tokens when more than 1 quote is found
if quote_count > 1 {
// Ignore empty quoted tokens
if buf.len() == 0 {
quote_count = 0;
pub fn tokenize(&mut self) -> Result<Vec<String>, RushError> {
let buf = &mut String::new();
let mut quote_count = 0;
let mut has_seen_literal = false;

for (i, char) in self.input.chars().enumerate() {
match char {
'\'' => {
quote_count += 1;

if quote_count == 1 {
// If there's content in buf, push it as a Literal before
// starting the quoted string
if !buf.trim().is_empty() {
has_seen_literal = true;
self.tokens.push(TokenKind::Literal(buf.trim().into()));
}
buf.clear();
continue;
}

// Concatenate consecutive quoted tokens
if let Some(TokenKind::Quoted(last_token)) = tokens.last_mut() {
last_token.push_str(&buf.clone());
} else {
tokens.push(TokenKind::Quoted(buf.clone()));
if quote_count == 2 {
// Ignore empty quoted tokens
if buf.trim().len() == 0 {
buf.clear();
quote_count = 0;
continue;
}

// Concatenate consecutive tokens (only if last token is NOT Space)
if !matches!(self.tokens.last(), Some(TokenKind::Space)) {
match self.tokens.last_mut() {
Some(TokenKind::Quoted(last_token)) => {
last_token.push_str(&buf.clone());
buf.clear();
quote_count = 0;
continue;
}
Some(TokenKind::Literal(last_token)) => {
last_token.push_str(&buf.clone());
// Convert the Literal to a Quoted since it now contains quoted content
let combined = last_token.clone();
self.tokens.pop();
self.tokens.push(TokenKind::Quoted(combined));
buf.clear();
quote_count = 0;
continue;
}
_ => {}
}
} else {
// There's a Space before this quoted string, so pop it before adding the new token
self.tokens.pop();
}

self.tokens.push(TokenKind::Quoted(buf.clone()));

buf.clear();
quote_count = 0;
}

buf.clear();
quote_count = 0;
}
}
' ' => {
// If we haven't seen a quote yet and we encounter a space, push buf
// into tokens and clear buf
if quote_count == 0 {
// Skip over empty tokens
if buf.trim().is_empty() {
' ' => {
if quote_count == 0 {
// Skip over empty tokens
if buf.trim().is_empty() {
buf.clear();
// Push Space token after Literals, OR after Quoted if we've seen a literal before
// This allows pure quoted strings to concatenate, but separates tokens when literals are involved
if matches!(self.tokens.last(), Some(TokenKind::Literal(_))) {
self.tokens.push(TokenKind::Space);
} else if has_seen_literal
&& matches!(self.tokens.last(), Some(TokenKind::Quoted(_)))
{
self.tokens.push(TokenKind::Space);
}
continue;
}

// Since we aren't processing a quoted string, push the buf into
// self.tokens as a Literal token
has_seen_literal = true;
self.tokens.push(TokenKind::Literal(buf.trim().into()));
// Push a Space token after the Literal token to help the state machine
// determine whether to concatenate or not
self.tokens.push(TokenKind::Space);

buf.clear();
continue;
}

tokens.push(TokenKind::Literal(buf.trim().into()));
buf.clear();
continue;
// We push a space into buf if we're processing a quoted string
buf.push(' ');
}
char => {
// At the end, an odd num of quotes means a quote wasn't terminated
if i == self.input.len() - 1 && quote_count % 2 == 1 {
return Err(RushError::UnterminatedQuote);
}

buf.push(' ');
// Push the current char into buf
buf.push(char);
}
}
char => {
// At the end, an odd num of quotes means a quote wasn't terminated
if i == input_tokens.len() - 1 && quote_count % 2 == 1 {
return Err(RushError::UnterminatedQuote);
}

// Push remaining chars into self.tokens
if buf.len() > 0 {
// Concatenate with the last token if it's a Literal or Quoted (no Space between)
match self.tokens.last_mut() {
Some(TokenKind::Literal(last_token)) => {
last_token.push_str(buf.trim());
}
Some(TokenKind::Quoted(last_token)) => {
last_token.push_str(buf.trim());
}
_ => {
self.tokens.push(TokenKind::Literal(buf.trim().into()));
}
}
}

// Push the current char into buf
buf.push(char);
let mut tokens = Vec::<String>::new();

// At the end, push any remaining chars into tokens
if i == input_tokens.len() - 1 && buf.len() > 0 {
tokens.push(TokenKind::Literal(buf.trim().into()));
}
for token in &self.tokens {
match token {
TokenKind::Literal(literal) => tokens.push(literal.to_owned()),
TokenKind::Quoted(quoted) => tokens.push(quoted.to_owned()),
TokenKind::Space => { /* state machine hint */ }
}
}
}

Ok(tokens
.iter()
.map(|token| match token {
TokenKind::Literal(literal) => literal.to_owned(),
TokenKind::Quoted(quoted) => quoted.to_owned(),
})
.collect::<Vec<_>>())
Ok(tokens)
}
}

#[cfg(test)]
Expand All @@ -111,7 +192,8 @@ mod tests {

// Shared test helper
/// Tokenizes `input` through an in-memory reader, exactly as a line read from
/// a real input source would be.
fn parse(input: &str) -> Result<Vec<String>, RushError> {
    let mut state_machine = Tokenizer::from(io::Cursor::new(input))?;
    state_machine.tokenize()
}

mod basic_tokenization {
Expand Down Expand Up @@ -222,6 +304,10 @@ mod tests {
parse("command arg1 \'quoted arg\' arg2 \'another quoted\'\n").unwrap(),
vec!["command", "arg1", "quoted arg", "arg2", "another quoted"]
);
assert_eq!(
parse("echo \'world shell\' \'script\'\'test\' example\'\'hello").unwrap(),
vec!["echo", "world shell", "scripttest", "examplehello"]
);
}

#[test]
Expand Down Expand Up @@ -372,7 +458,7 @@ mod tests {
/// A reader whose `read_line` fails must surface as `RushError::UnexpectedEOF`
/// when constructing the tokenizer.
#[test]
fn io_read_error_returns_unexpected_eof() {
    let reader = ErrReader;
    let err = Tokenizer::from(reader).unwrap_err();
    assert!(matches!(err, RushError::UnexpectedEOF));
}
}
Expand Down