From 9f1095e939f0929233c5e3d305b93c8fbbce31b9 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 27 Dec 2025 22:02:53 +0500 Subject: [PATCH 1/4] Update issue590 test to contain valid DOCTYPE. Previously we do not parse DTD in any way, so we might used invalid declarations. But because DTD cannot be parsed without knowing its structure, we decided to parse it at least partially to correctly handle some DTDs (as in #923) --- tests/issues.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/issues.rs b/tests/issues.rs index 916ba7ff..9a8b3ebf 100644 --- a/tests/issues.rs +++ b/tests/issues.rs @@ -175,11 +175,13 @@ mod issue514 { #[test] fn issue590() { let mut reader = Reader::from_reader(BufReader::with_capacity( - 16, - &b"]>"[..], - // 0 7 ^15 ^23 - //[ ] = capacity + 24, + &b"]>"[..], + // 0 7 ^23 ^44 + //[ ] = capacity )); + // Ensure, that capacity was not increased unexpectedly + assert_eq!(reader.get_ref().capacity(), 24); let mut buf = Vec::new(); loop { if reader.read_event_into(&mut buf).unwrap() == Event::Eof { From 2e861748716f8faa777345428397f5bd979e580c Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 26 Dec 2025 18:31:47 +0500 Subject: [PATCH 2/4] Add reusable parser for XML comment for use in the future --- src/parser/comment.rs | 190 ++++++++++++++++++++++++++++++++++++++++++ src/parser/mod.rs | 2 + 2 files changed, 192 insertions(+) create mode 100644 src/parser/comment.rs diff --git a/src/parser/comment.rs b/src/parser/comment.rs new file mode 100644 index 00000000..04517290 --- /dev/null +++ b/src/parser/comment.rs @@ -0,0 +1,190 @@ +//! Contains a parser for an XML comment. + +use crate::errors::SyntaxError; +use crate::parser::Parser; + +/// A parser that search a `-->` sequence in the slice. +/// +/// To use a parser create an instance of parser and [`feed`] data into it. +/// After successful search the parser will return [`Some`] with position where +/// comment is ended (the position after `-->`). If search was unsuccessful, +/// a [`None`] will be returned. You typically would expect positive result of +/// search, so that you should feed new data until yo'll get it. +/// +/// NOTE: after successful match the parser does not returned to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// new search. +/// +/// # Example +/// +/// ``` +/// # use pretty_assertions::assert_eq; +/// use quick_xml::parser::{CommentParser, Parser}; +/// +/// let mut parser = CommentParser::default(); +/// +/// // Parse `and the text follow...` +/// // splitted into three chunks +/// assert_eq!(parser.feed(b"and the text follow..."), Some(12)); +/// // ^ ^ +/// // 0 11 +/// ``` +/// +/// [`feed`]: Self::feed() +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum CommentParser { + /// The parser does not yet seen any dashes at the end of previous slice. + Seen0, + /// The parser already seen one dash on the end of previous slice. + Seen1, + /// The parser already seen two dashes on the end of previous slice. + Seen2, +} + +impl Default for CommentParser { + #[inline] + fn default() -> Self { + Self::Seen0 + } +} + +impl Parser for CommentParser { + /// Determines the end position of an XML comment in the provided slice. + /// Comments is a pieces of text enclosed in `` braces. + /// Comment ends on the first occurrence of `-->` which cannot be escaped. + /// + /// Returns position after the `-->` or `None` if such sequence was not found. + /// + /// # Parameters + /// - `bytes`: a slice to find the end of a comment. + /// Should contain text in ASCII-compatible encoding + #[inline] + fn feed(&mut self, bytes: &[u8]) -> Option { + let result = match self { + Self::Seen0 => seen0(bytes), + Self::Seen1 => seen1(bytes), + Self::Seen2 => seen2(bytes), + }; + if let Some(r) = result { + return Some(r); + } + if bytes.ends_with(b"--") { + *self = Self::Seen2; + } else { + self.next_state(bytes.last().copied()); + } + None + } + + #[inline] + fn eof_error(self, _content: &[u8]) -> SyntaxError { + SyntaxError::UnclosedComment + } +} + +impl CommentParser { + #[inline] + fn next_state(&mut self, last: Option) { + match (*self, last) { + (Self::Seen0, Some(b'-')) => *self = Self::Seen1, + + (Self::Seen1, Some(b'-')) => *self = Self::Seen2, + (Self::Seen1, Some(_)) => *self = Self::Seen0, + + (Self::Seen2, Some(b'-')) => {} + (Self::Seen2, Some(_)) => *self = Self::Seen0, + + _ => {} + } + } +} + +#[inline] +fn seen0(bytes: &[u8]) -> Option { + for i in memchr::memchr_iter(b'>', bytes) { + if bytes[..i].ends_with(b"--") { + // +1 for `>` which should be included in event + return Some(i + 1); + } + } + None +} + +#[inline] +fn seen1(bytes: &[u8]) -> Option { + // -|-> + if bytes.starts_with(b"->") { + return Some(2); + } + // Even if the first character is `-` it cannot be part of close sequence, + // because we checked that condition above. That means that we can forgot that + // we seen one `-` at the end of the previous chunk. + // -|x... + seen0(bytes) +} + +#[inline] +fn seen2(bytes: &[u8]) -> Option { + match bytes.get(0) { + // --| + None => None, + // --|> + Some(b'>') => Some(1), + // The end sequence here can be matched only if bytes starts with `->` + // which is handled in seen1(). + // --|x... + Some(_) => seen1(bytes), + } +} + +#[test] +fn parse() { + use pretty_assertions::assert_eq; + use CommentParser::*; + + /// Returns `Ok(pos)` with the position in the buffer where element is ended. + /// + /// Returns `Err(internal_state)` if parsing was not done yet. + fn parse_comment(bytes: &[u8], mut parser: CommentParser) -> Result { + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser), + } + } + + assert_eq!(parse_comment(b"", Seen0), Err(Seen0)); // xx| + assert_eq!(parse_comment(b"", Seen1), Err(Seen1)); // x-| + assert_eq!(parse_comment(b"", Seen2), Err(Seen2)); // --| + + assert_eq!(parse_comment(b"-", Seen0), Err(Seen1)); // xx|- + assert_eq!(parse_comment(b"-", Seen1), Err(Seen2)); // x-|- + assert_eq!(parse_comment(b"-", Seen2), Err(Seen2)); // --|- + + assert_eq!(parse_comment(b">", Seen0), Err(Seen0)); // xx|> + assert_eq!(parse_comment(b">", Seen1), Err(Seen0)); // x-|> + assert_eq!(parse_comment(b">", Seen2), Ok(1)); // --|> + + assert_eq!(parse_comment(b"--", Seen0), Err(Seen2)); // xx|-- + assert_eq!(parse_comment(b"--", Seen1), Err(Seen2)); // x-|-- + assert_eq!(parse_comment(b"--", Seen2), Err(Seen2)); // --|-- + + assert_eq!(parse_comment(b"->", Seen0), Err(Seen0)); // xx|-> + assert_eq!(parse_comment(b"->", Seen1), Ok(2)); // x-|-> + assert_eq!(parse_comment(b"->", Seen2), Ok(2)); // --|-> + + assert_eq!(parse_comment(b"-->", Seen0), Ok(3)); // xx|--> + assert_eq!(parse_comment(b"-->", Seen1), Ok(3)); // x-|--> + assert_eq!(parse_comment(b"-->", Seen2), Ok(3)); // --|--> + + assert_eq!(parse_comment(b">-->", Seen0), Ok(4)); // xx|>--> + assert_eq!(parse_comment(b">-->", Seen1), Ok(4)); // x-|>--> + assert_eq!(parse_comment(b">-->", Seen2), Ok(1)); // --|>--> + + assert_eq!(parse_comment(b"->-->", Seen0), Ok(5)); // xx|->--> + assert_eq!(parse_comment(b"->-->", Seen1), Ok(2)); // x-|->--> + assert_eq!(parse_comment(b"->-->", Seen2), Ok(2)); // --|->--> +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index bf08dcdf..cdc19b32 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2,9 +2,11 @@ use crate::errors::SyntaxError; +mod comment; mod element; mod pi; +pub use comment::CommentParser; pub use element::ElementParser; pub use pi::PiParser; From e794fdd511a6de7a081b52a82e430c59658a9113 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 6 Jan 2026 16:01:19 +0500 Subject: [PATCH 3/4] Move macros for testing syntax errors to the helpers module to reuse in dtd tests --- tests/helpers/mod.rs | 288 +++++++++++++++++++++++ tests/reader-errors.rs | 504 ++++++++++------------------------------- 2 files changed, 410 insertions(+), 382 deletions(-) diff --git a/tests/helpers/mod.rs b/tests/helpers/mod.rs index bbc7c0c0..bcf51350 100644 --- a/tests/helpers/mod.rs +++ b/tests/helpers/mod.rs @@ -149,3 +149,291 @@ macro_rules! small_buffers_tests { } }; } + +/// Tests for reader-errors and reader-dtd. +#[macro_export] +macro_rules! event_ok { + ($test:ident($xml:literal) => $pos:literal : $event:expr) => { + mod $test { + use super::*; + + mod reader { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn borrowed() { + let mut reader = Reader::from_str($xml); + reader.config_mut().enable_all_checks(true); + assert_eq!( + (reader.read_event().unwrap(), reader.buffer_position()), + ($event, $pos) + ); + } + + #[test] + fn buffered() { + let mut buf = Vec::new(); + let mut reader = Reader::from_str($xml); + reader.config_mut().enable_all_checks(true); + assert_eq!( + ( + reader.read_event_into(&mut buf).unwrap(), + reader.buffer_position() + ), + ($event, $pos) + ); + } + + #[cfg(feature = "async-tokio")] + #[tokio::test] + async fn async_tokio() { + let mut buf = Vec::new(); + let mut reader = Reader::from_str($xml); + reader.config_mut().enable_all_checks(true); + assert_eq!( + ( + reader.read_event_into_async(&mut buf).await.unwrap(), + reader.buffer_position() + ), + ($event, $pos) + ); + } + } + + mod ns_reader { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn borrowed() { + let mut reader = NsReader::from_str($xml); + reader.config_mut().enable_all_checks(true); + assert_eq!( + ( + reader.read_resolved_event().unwrap().1, + reader.buffer_position() + ), + ($event, $pos) + ); + } + + #[test] + fn buffered() { + let mut buf = Vec::new(); + let mut reader = NsReader::from_str($xml); + reader.config_mut().enable_all_checks(true); + assert_eq!( + ( + reader.read_resolved_event_into(&mut buf).unwrap().1, + reader.buffer_position() + ), + ($event, $pos) + ); + } + + #[cfg(feature = "async-tokio")] + #[tokio::test] + async fn async_tokio() { + let mut buf = Vec::new(); + let mut reader = NsReader::from_str($xml); + reader.config_mut().enable_all_checks(true); + assert_eq!( + ( + reader + .read_resolved_event_into_async(&mut buf) + .await + .unwrap() + .1, + reader.buffer_position() + ), + ($event, $pos) + ); + } + } + } + }; +} + +/// Tests for reader-errors and reader-dtd. +#[macro_export] +macro_rules! syntax_err { + ($test:ident($xml:literal) => $pos:expr, $cause:expr) => { + mod $test { + use super::*; + + mod reader { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn borrowed() { + let mut reader = Reader::from_str($xml); + assert_eq!( + reader + .read_event() + .expect("parser should return `Event::Text`"), + Event::Text(BytesText::new(".")) + ); + match reader.read_event() { + Err(Error::Syntax(cause)) => assert_eq!( + (cause, reader.error_position(), reader.buffer_position()), + ($cause, 1, $pos), + ), + x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), + } + assert_eq!( + reader + .read_event() + .expect("parser should return `Event::Eof` after error"), + Event::Eof + ); + } + + #[test] + fn buffered() { + let mut buf = Vec::new(); + let mut reader = Reader::from_str($xml); + assert_eq!( + reader + .read_event_into(&mut buf) + .expect("parser should return `Event::Text`"), + Event::Text(BytesText::new(".")) + ); + match reader.read_event_into(&mut buf) { + Err(Error::Syntax(cause)) => assert_eq!( + (cause, reader.error_position(), reader.buffer_position()), + ($cause, 1, $pos), + ), + x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), + } + assert_eq!( + reader + .read_event_into(&mut buf) + .expect("parser should return `Event::Eof` after error"), + Event::Eof + ); + } + + #[cfg(feature = "async-tokio")] + #[tokio::test] + async fn async_tokio() { + let mut buf = Vec::new(); + let mut reader = Reader::from_str($xml); + assert_eq!( + reader + .read_event_into_async(&mut buf) + .await + .expect("parser should return `Event::Text`"), + Event::Text(BytesText::new(".")) + ); + match reader.read_event_into_async(&mut buf).await { + Err(Error::Syntax(cause)) => assert_eq!( + (cause, reader.error_position(), reader.buffer_position()), + ($cause, 1, $pos), + ), + x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), + } + assert_eq!( + reader + .read_event_into_async(&mut buf) + .await + .expect("parser should return `Event::Eof` after error"), + Event::Eof + ); + } + } + + mod ns_reader { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn borrowed() { + let mut reader = NsReader::from_str($xml); + assert_eq!( + reader + .read_resolved_event() + .expect("parser should return `Event::Text`") + .1, + Event::Text(BytesText::new(".")) + ); + match reader.read_resolved_event() { + Err(Error::Syntax(cause)) => assert_eq!( + (cause, reader.error_position(), reader.buffer_position()), + ($cause, 1, $pos), + ), + x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), + } + assert_eq!( + reader + .read_resolved_event() + .expect("parser should return `Event::Eof` after error") + .1, + Event::Eof + ); + } + + #[test] + fn buffered() { + let mut buf = Vec::new(); + let mut reader = NsReader::from_str($xml); + assert_eq!( + reader + .read_resolved_event_into(&mut buf) + .expect("parser should return `Event::Text`") + .1, + Event::Text(BytesText::new(".")) + ); + match reader.read_resolved_event_into(&mut buf) { + Err(Error::Syntax(cause)) => assert_eq!( + (cause, reader.error_position(), reader.buffer_position()), + ($cause, 1, $pos), + ), + x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), + } + assert_eq!( + reader + .read_resolved_event_into(&mut buf) + .expect("parser should return `Event::Eof` after error") + .1, + Event::Eof + ); + } + + #[cfg(feature = "async-tokio")] + #[tokio::test] + async fn async_tokio() { + let mut buf = Vec::new(); + let mut reader = NsReader::from_str($xml); + assert_eq!( + reader + .read_resolved_event_into_async(&mut buf) + .await + .expect("parser should return `Event::Text`") + .1, + Event::Text(BytesText::new(".")) + ); + match reader.read_resolved_event_into_async(&mut buf).await { + Err(Error::Syntax(cause)) => assert_eq!( + (cause, reader.error_position(), reader.buffer_position()), + ($cause, 1, $pos), + ), + x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), + } + assert_eq!( + reader + .read_resolved_event_into_async(&mut buf) + .await + .expect("parser should return `Event::Eof` after error") + .1, + Event::Eof + ); + } + } + } + }; + ($test:ident($xml:literal) => $cause:expr) => { + syntax_err!($test($xml) => $xml.len() as u64, $cause); + }; +} diff --git a/tests/reader-errors.rs b/tests/reader-errors.rs index e6fce8be..b65e70cb 100644 --- a/tests/reader-errors.rs +++ b/tests/reader-errors.rs @@ -4,283 +4,23 @@ use quick_xml::errors::{Error, SyntaxError}; use quick_xml::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event}; use quick_xml::reader::{NsReader, Reader}; -macro_rules! ok { - ($test:ident($xml:literal) => $pos:literal : $event:expr) => { - mod $test { - use super::*; - - mod reader { - use super::*; - use pretty_assertions::assert_eq; - - #[test] - fn borrowed() { - let mut reader = Reader::from_str($xml); - reader.config_mut().enable_all_checks(true); - assert_eq!(reader.read_event().unwrap(), $event); - assert_eq!(reader.buffer_position(), $pos); - } - - #[test] - fn buffered() { - let mut buf = Vec::new(); - let mut reader = Reader::from_str($xml); - reader.config_mut().enable_all_checks(true); - assert_eq!(reader.read_event_into(&mut buf).unwrap(), $event); - assert_eq!(reader.buffer_position(), $pos); - } - - #[cfg(feature = "async-tokio")] - #[tokio::test] - async fn async_tokio() { - let mut buf = Vec::new(); - let mut reader = Reader::from_str($xml); - reader.config_mut().enable_all_checks(true); - assert_eq!( - reader.read_event_into_async(&mut buf).await.unwrap(), - $event - ); - assert_eq!(reader.buffer_position(), $pos); - } - } - - mod ns_reader { - use super::*; - use pretty_assertions::assert_eq; - - #[test] - fn borrowed() { - let mut reader = NsReader::from_str($xml); - reader.config_mut().enable_all_checks(true); - assert_eq!(reader.read_resolved_event().unwrap().1, $event); - assert_eq!(reader.buffer_position(), $pos); - } - - #[test] - fn buffered() { - let mut buf = Vec::new(); - let mut reader = NsReader::from_str($xml); - reader.config_mut().enable_all_checks(true); - assert_eq!(reader.read_resolved_event_into(&mut buf).unwrap().1, $event); - assert_eq!(reader.buffer_position(), $pos); - } - - #[cfg(feature = "async-tokio")] - #[tokio::test] - async fn async_tokio() { - let mut buf = Vec::new(); - let mut reader = NsReader::from_str($xml); - reader.config_mut().enable_all_checks(true); - assert_eq!( - reader - .read_resolved_event_into_async(&mut buf) - .await - .unwrap() - .1, - $event - ); - assert_eq!(reader.buffer_position(), $pos); - } - } - } - }; -} +// For event_ok and syntax_err macros +mod helpers; mod syntax { use super::*; - macro_rules! err { - ($test:ident($xml:literal) => $pos:expr, $cause:expr) => { - mod $test { - use super::*; - - mod reader { - use super::*; - use pretty_assertions::assert_eq; - - #[test] - fn borrowed() { - let mut reader = Reader::from_str($xml); - assert_eq!( - reader - .read_event() - .expect("parser should return `Event::Text`"), - Event::Text(BytesText::new(".")) - ); - match reader.read_event() { - Err(Error::Syntax(cause)) => assert_eq!( - (cause, reader.error_position(), reader.buffer_position()), - ($cause, 1, $pos), - ), - x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), - } - assert_eq!( - reader - .read_event() - .expect("parser should return `Event::Eof` after error"), - Event::Eof - ); - } - - #[test] - fn buffered() { - let mut buf = Vec::new(); - let mut reader = Reader::from_str($xml); - assert_eq!( - reader - .read_event_into(&mut buf) - .expect("parser should return `Event::Text`"), - Event::Text(BytesText::new(".")) - ); - match reader.read_event_into(&mut buf) { - Err(Error::Syntax(cause)) => assert_eq!( - (cause, reader.error_position(), reader.buffer_position()), - ($cause, 1, $pos), - ), - x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), - } - assert_eq!( - reader - .read_event_into(&mut buf) - .expect("parser should return `Event::Eof` after error"), - Event::Eof - ); - } - - #[cfg(feature = "async-tokio")] - #[tokio::test] - async fn async_tokio() { - let mut buf = Vec::new(); - let mut reader = Reader::from_str($xml); - assert_eq!( - reader - .read_event_into_async(&mut buf) - .await - .expect("parser should return `Event::Text`"), - Event::Text(BytesText::new(".")) - ); - match reader.read_event_into_async(&mut buf).await { - Err(Error::Syntax(cause)) => assert_eq!( - (cause, reader.error_position(), reader.buffer_position()), - ($cause, 1, $pos), - ), - x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), - } - assert_eq!( - reader - .read_event_into_async(&mut buf) - .await - .expect("parser should return `Event::Eof` after error"), - Event::Eof - ); - } - } - - mod ns_reader { - use super::*; - use pretty_assertions::assert_eq; - - #[test] - fn borrowed() { - let mut reader = NsReader::from_str($xml); - assert_eq!( - reader - .read_resolved_event() - .expect("parser should return `Event::Text`") - .1, - Event::Text(BytesText::new(".")) - ); - match reader.read_resolved_event() { - Err(Error::Syntax(cause)) => assert_eq!( - (cause, reader.error_position(), reader.buffer_position()), - ($cause, 1, $pos), - ), - x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), - } - assert_eq!( - reader - .read_resolved_event() - .expect("parser should return `Event::Eof` after error") - .1, - Event::Eof - ); - } - - #[test] - fn buffered() { - let mut buf = Vec::new(); - let mut reader = NsReader::from_str($xml); - assert_eq!( - reader - .read_resolved_event_into(&mut buf) - .expect("parser should return `Event::Text`") - .1, - Event::Text(BytesText::new(".")) - ); - match reader.read_resolved_event_into(&mut buf) { - Err(Error::Syntax(cause)) => assert_eq!( - (cause, reader.error_position(), reader.buffer_position()), - ($cause, 1, $pos), - ), - x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), - } - assert_eq!( - reader - .read_resolved_event_into(&mut buf) - .expect("parser should return `Event::Eof` after error") - .1, - Event::Eof - ); - } - - #[cfg(feature = "async-tokio")] - #[tokio::test] - async fn async_tokio() { - let mut buf = Vec::new(); - let mut reader = NsReader::from_str($xml); - assert_eq!( - reader - .read_resolved_event_into_async(&mut buf) - .await - .expect("parser should return `Event::Text`") - .1, - Event::Text(BytesText::new(".")) - ); - match reader.read_resolved_event_into_async(&mut buf).await { - Err(Error::Syntax(cause)) => assert_eq!( - (cause, reader.error_position(), reader.buffer_position()), - ($cause, 1, $pos), - ), - x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), - } - assert_eq!( - reader - .read_resolved_event_into_async(&mut buf) - .await - .expect("parser should return `Event::Eof` after error") - .1, - Event::Eof - ); - } - } - } - }; - ($test:ident($xml:literal) => $cause:expr) => { - err!($test($xml) => $xml.len() as u64, $cause); - }; - } - mod tag { use super::*; - err!(unclosed1(".<") => SyntaxError::UnclosedTag); - err!(unclosed2(". SyntaxError::UnclosedTag); - err!(unclosed3(". SyntaxError::UnclosedTag); - err!(unclosed4(".< ") => SyntaxError::UnclosedTag); - err!(unclosed5(".<\t") => SyntaxError::UnclosedTag); - err!(unclosed6(".<\r") => SyntaxError::UnclosedTag); - err!(unclosed7(".<\n") => SyntaxError::UnclosedTag); - err!(unclosed8(".< \t\r\nx") => SyntaxError::UnclosedTag); + syntax_err!(unclosed1(".<") => SyntaxError::UnclosedTag); + syntax_err!(unclosed2(". SyntaxError::UnclosedTag); + syntax_err!(unclosed3(". SyntaxError::UnclosedTag); + syntax_err!(unclosed4(".< ") => SyntaxError::UnclosedTag); + syntax_err!(unclosed5(".<\t") => SyntaxError::UnclosedTag); + syntax_err!(unclosed6(".<\r") => SyntaxError::UnclosedTag); + syntax_err!(unclosed7(".<\n") => SyntaxError::UnclosedTag); + syntax_err!(unclosed8(".< \t\r\nx") => SyntaxError::UnclosedTag); /// Closed tags can be tested only in pair with open tags, because otherwise /// `IllFormedError::UnmatchedEndTag` will be raised @@ -330,64 +70,64 @@ mod syntax { } // Incorrect after-bang symbol is detected early, so buffer_position() stay at `!` - err!(unclosed_bang1(". 2, SyntaxError::InvalidBangMarkup); - err!(unclosed_bang2(".") => 2, SyntaxError::InvalidBangMarkup); - err!(unclosed_bang3(". 2, SyntaxError::InvalidBangMarkup); - err!(unclosed_bang4(".") => 2, SyntaxError::InvalidBangMarkup); + syntax_err!(unclosed_bang1(". 2, SyntaxError::InvalidBangMarkup); + syntax_err!(unclosed_bang2(".") => 2, SyntaxError::InvalidBangMarkup); + syntax_err!(unclosed_bang3(". 2, SyntaxError::InvalidBangMarkup); + syntax_err!(unclosed_bang4(".") => 2, SyntaxError::InvalidBangMarkup); /// https://www.w3.org/TR/xml11/#NT-Comment mod comment { use super::*; - err!(unclosed01(". SyntaxError::UnclosedComment); - err!(unclosed02(".") => SyntaxError::UnclosedComment); - err!(unclosed07(".") => SyntaxError::UnclosedComment); - err!(unclosed10(".") => 7: Event::Comment(BytesText::new(""))); - ok!(normal2("rest") => 7: Event::Comment(BytesText::new(""))); + syntax_err!(unclosed01(". SyntaxError::UnclosedComment); + syntax_err!(unclosed02(".") => SyntaxError::UnclosedComment); + syntax_err!(unclosed07(".") => SyntaxError::UnclosedComment); + syntax_err!(unclosed10(".") => 7: Event::Comment(BytesText::new(""))); + event_ok!(normal2("rest") => 7: Event::Comment(BytesText::new(""))); } /// https://www.w3.org/TR/xml11/#NT-CDSect mod cdata { use super::*; - err!(unclosed01(". SyntaxError::UnclosedCData); - err!(unclosed02(". SyntaxError::UnclosedCData); - err!(unclosed03(". SyntaxError::UnclosedCData); - err!(unclosed04(".") => SyntaxError::UnclosedCData); - err!(unclosed05(". SyntaxError::UnclosedCData); - err!(unclosed06(". SyntaxError::UnclosedCData); - err!(unclosed07(".") => SyntaxError::UnclosedCData); - err!(unclosed08(". SyntaxError::UnclosedCData); - err!(unclosed09(". SyntaxError::UnclosedCData); - err!(unclosed10(".") => SyntaxError::UnclosedCData); - err!(unclosed11(". SyntaxError::UnclosedCData); - err!(unclosed12(". SyntaxError::UnclosedCData); - err!(unclosed13(".") => SyntaxError::UnclosedCData); - err!(unclosed14(". SyntaxError::UnclosedCData); - err!(unclosed15(". SyntaxError::UnclosedCData); - err!(unclosed16(".") => SyntaxError::UnclosedCData); - err!(unclosed17(". SyntaxError::UnclosedCData); - err!(unclosed18(". SyntaxError::UnclosedCData); - err!(unclosed19(".") => SyntaxError::UnclosedCData); - err!(unclosed20(". SyntaxError::UnclosedCData); - err!(unclosed21(". SyntaxError::UnclosedCData); - err!(unclosed22(".") => SyntaxError::UnclosedCData); - err!(unclosed23(". SyntaxError::UnclosedCData); - err!(unclosed24(". SyntaxError::UnclosedCData); - err!(unclosed25(".") => SyntaxError::UnclosedCData); - - err!(lowercase(".") => SyntaxError::UnclosedCData); - - ok!(normal1("") => 12: Event::CData(BytesCData::new(""))); - ok!(normal2("rest") => 12: Event::CData(BytesCData::new(""))); + syntax_err!(unclosed01(". SyntaxError::UnclosedCData); + syntax_err!(unclosed02(". SyntaxError::UnclosedCData); + syntax_err!(unclosed03(". SyntaxError::UnclosedCData); + syntax_err!(unclosed04(".") => SyntaxError::UnclosedCData); + syntax_err!(unclosed05(". SyntaxError::UnclosedCData); + syntax_err!(unclosed06(". SyntaxError::UnclosedCData); + syntax_err!(unclosed07(".") => SyntaxError::UnclosedCData); + syntax_err!(unclosed08(". SyntaxError::UnclosedCData); + syntax_err!(unclosed09(". SyntaxError::UnclosedCData); + syntax_err!(unclosed10(".") => SyntaxError::UnclosedCData); + syntax_err!(unclosed11(". SyntaxError::UnclosedCData); + syntax_err!(unclosed12(". SyntaxError::UnclosedCData); + syntax_err!(unclosed13(".") => SyntaxError::UnclosedCData); + syntax_err!(unclosed14(". SyntaxError::UnclosedCData); + syntax_err!(unclosed15(". SyntaxError::UnclosedCData); + syntax_err!(unclosed16(".") => SyntaxError::UnclosedCData); + syntax_err!(unclosed17(". SyntaxError::UnclosedCData); + syntax_err!(unclosed18(". SyntaxError::UnclosedCData); + syntax_err!(unclosed19(".") => SyntaxError::UnclosedCData); + syntax_err!(unclosed20(". SyntaxError::UnclosedCData); + syntax_err!(unclosed21(". SyntaxError::UnclosedCData); + syntax_err!(unclosed22(".") => SyntaxError::UnclosedCData); + syntax_err!(unclosed23(". SyntaxError::UnclosedCData); + syntax_err!(unclosed24(". SyntaxError::UnclosedCData); + syntax_err!(unclosed25(".") => SyntaxError::UnclosedCData); + + syntax_err!(lowercase(".") => SyntaxError::UnclosedCData); + + event_ok!(normal1("") => 12: Event::CData(BytesCData::new(""))); + event_ok!(normal2("rest") => 12: Event::CData(BytesCData::new(""))); } /// According to the grammar, only upper-case letters allowed for DOCTYPE writing. @@ -396,82 +136,82 @@ mod syntax { mod doctype { use super::*; - err!(unclosed01(". SyntaxError::UnclosedDoctype); - err!(unclosed02(". SyntaxError::UnclosedDoctype); - err!(unclosed03(". SyntaxError::UnclosedDoctype); - err!(unclosed04(".") => SyntaxError::UnclosedDoctype); - err!(unclosed05(". SyntaxError::UnclosedDoctype); - err!(unclosed06(". SyntaxError::UnclosedDoctype); - err!(unclosed07(".") => SyntaxError::UnclosedDoctype); - err!(unclosed08(". SyntaxError::UnclosedDoctype); - err!(unclosed09(". SyntaxError::UnclosedDoctype); - err!(unclosed10(".") => SyntaxError::UnclosedDoctype); - err!(unclosed11(". SyntaxError::UnclosedDoctype); - err!(unclosed12(". SyntaxError::UnclosedDoctype); - err!(unclosed13(".") => SyntaxError::UnclosedDoctype); - err!(unclosed14(". SyntaxError::UnclosedDoctype); - err!(unclosed15(". SyntaxError::UnclosedDoctype); - err!(unclosed16(".") => SyntaxError::UnclosedDoctype); - err!(unclosed17(". SyntaxError::UnclosedDoctype); - err!(unclosed18(". SyntaxError::UnclosedDoctype); - err!(unclosed19(".") => SyntaxError::UnclosedDoctype); - err!(unclosed20(". SyntaxError::UnclosedDoctype); - err!(unclosed21(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed01(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed02(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed03(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed04(".") => SyntaxError::UnclosedDoctype); + syntax_err!(unclosed05(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed06(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed07(".") => SyntaxError::UnclosedDoctype); + syntax_err!(unclosed08(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed09(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed10(".") => SyntaxError::UnclosedDoctype); + syntax_err!(unclosed11(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed12(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed13(".") => SyntaxError::UnclosedDoctype); + syntax_err!(unclosed14(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed15(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed16(".") => SyntaxError::UnclosedDoctype); + syntax_err!(unclosed17(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed18(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed19(".") => SyntaxError::UnclosedDoctype); + syntax_err!(unclosed20(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed21(". SyntaxError::UnclosedDoctype); // results in IllFormed(MissingDoctypeName), checked below - err!(unclosed22(". SyntaxError::UnclosedDoctype); + syntax_err!(unclosed22(". SyntaxError::UnclosedDoctype); // According to the grammar, XML declaration MUST contain at least one space // and an element name, but we do not consider this as a _syntax_ error. - ok!(normal1("") => 12: Event::DocType(BytesText::new("e"))); - ok!(normal2("rest") => 12: Event::DocType(BytesText::new("e"))); + event_ok!(normal1("") => 12: Event::DocType(BytesText::new("e"))); + event_ok!(normal2("rest") => 12: Event::DocType(BytesText::new("e"))); } /// https://www.w3.org/TR/xml11/#NT-PI mod pi { use super::*; - err!(unclosed01(". SyntaxError::UnclosedPI); - err!(unclosed02(". SyntaxError::UnclosedPI); - err!(unclosed03(".") => SyntaxError::UnclosedPI); - err!(unclosed04(". SyntaxError::UnclosedPI); - err!(unclosed05(". SyntaxError::UnclosedPI); - err!(unclosed06(". SyntaxError::UnclosedPI); - err!(unclosed07(". SyntaxError::UnclosedPI); - err!(unclosed08(". SyntaxError::UnclosedPI); - err!(unclosed09(". SyntaxError::UnclosedPI); - err!(unclosed10(". SyntaxError::UnclosedPI); + syntax_err!(unclosed01(". SyntaxError::UnclosedPI); + syntax_err!(unclosed02(". SyntaxError::UnclosedPI); + syntax_err!(unclosed03(".") => SyntaxError::UnclosedPI); + syntax_err!(unclosed04(". SyntaxError::UnclosedPI); + syntax_err!(unclosed05(". SyntaxError::UnclosedPI); + syntax_err!(unclosed06(". SyntaxError::UnclosedPI); + syntax_err!(unclosed07(". SyntaxError::UnclosedPI); + syntax_err!(unclosed08(". SyntaxError::UnclosedPI); + syntax_err!(unclosed09(". SyntaxError::UnclosedPI); + syntax_err!(unclosed10(". SyntaxError::UnclosedPI); // According to the grammar, processing instruction MUST contain a non-empty // target name, but we do not consider this as a _syntax_ error. - ok!(normal_empty1("") => 4: Event::PI(BytesPI::new(""))); - ok!(normal_empty2("rest") => 4: Event::PI(BytesPI::new(""))); - ok!(normal_xmlx1("") => 8: Event::PI(BytesPI::new("xmlx"))); - ok!(normal_xmlx2("rest") => 8: Event::PI(BytesPI::new("xmlx"))); + event_ok!(normal_empty1("") => 4: Event::PI(BytesPI::new(""))); + event_ok!(normal_empty2("rest") => 4: Event::PI(BytesPI::new(""))); + event_ok!(normal_xmlx1("") => 8: Event::PI(BytesPI::new("xmlx"))); + event_ok!(normal_xmlx2("rest") => 8: Event::PI(BytesPI::new("xmlx"))); } /// https://www.w3.org/TR/xml11/#NT-prolog mod decl { use super::*; - err!(unclosed1(". SyntaxError::UnclosedPI); - err!(unclosed2(". SyntaxError::UnclosedPI); - err!(unclosed3(". SyntaxError::UnclosedXmlDecl); - err!(unclosed4(". SyntaxError::UnclosedXmlDecl); - err!(unclosed5(". SyntaxError::UnclosedXmlDecl); - err!(unclosed6(". SyntaxError::UnclosedXmlDecl); - err!(unclosed7(". SyntaxError::UnclosedXmlDecl); - err!(unclosed8(". SyntaxError::UnclosedXmlDecl); + syntax_err!(unclosed1(". SyntaxError::UnclosedPI); + syntax_err!(unclosed2(". SyntaxError::UnclosedPI); + syntax_err!(unclosed3(". SyntaxError::UnclosedXmlDecl); + syntax_err!(unclosed4(". SyntaxError::UnclosedXmlDecl); + syntax_err!(unclosed5(". SyntaxError::UnclosedXmlDecl); + syntax_err!(unclosed6(". SyntaxError::UnclosedXmlDecl); + syntax_err!(unclosed7(". SyntaxError::UnclosedXmlDecl); + syntax_err!(unclosed8(". SyntaxError::UnclosedXmlDecl); // "xmls" is a PI target, not an XML declaration - err!(unclosed9(". SyntaxError::UnclosedPI); + syntax_err!(unclosed9(". SyntaxError::UnclosedPI); // According to the grammar, XML declaration MUST contain at least one space // and `version` attribute, but we do not consider this as a _syntax_ error. - ok!(normal1("") => 7: Event::Decl(BytesDecl::from_start(BytesStart::new("xml")))); - ok!(normal2("") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))); - ok!(normal3("") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\t", 3)))); - ok!(normal4("") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\r", 3)))); - ok!(normal5("") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\n", 3)))); - ok!(normal6("rest") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\n", 3)))); + event_ok!(normal1("") => 7: Event::Decl(BytesDecl::from_start(BytesStart::new("xml")))); + event_ok!(normal2("") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))); + event_ok!(normal3("") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\t", 3)))); + event_ok!(normal4("") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\r", 3)))); + event_ok!(normal5("") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\n", 3)))); + event_ok!(normal6("rest") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\n", 3)))); } /// Tests for UTF-16 encoded XML declarations. @@ -927,13 +667,13 @@ mod ill_formed { // ^= 9 err!(missing_doctype_name2("") => 13: IllFormedError::MissingDoctypeName); // ^= 13 - ok!(missing_doctype_name3("") => 15: Event::DocType(BytesText::new("x"))); + event_ok!(missing_doctype_name3("") => 15: Event::DocType(BytesText::new("x"))); err2!(unmatched_end_tag1(".") => 1: IllFormedError::UnmatchedEndTag("".to_string())); err2!(unmatched_end_tag2(".") => 1: IllFormedError::UnmatchedEndTag("end".to_string())); err2!(unmatched_end_tag3(".") => 1: IllFormedError::UnmatchedEndTag("end".to_string())); - ok!(mismatched_end_tag1("") => 7: Event::Start(BytesStart::new("start"))); + event_ok!(mismatched_end_tag1("") => 7: Event::Start(BytesStart::new("start"))); err2!(mismatched_end_tag2("") => 7: IllFormedError::MismatchedEndTag { // ^= 7 expected: "start".to_string(), @@ -950,7 +690,7 @@ mod ill_formed { found: "end".to_string(), }); - ok!(double_hyphen_in_comment1("") => 7: Event::Comment(BytesText::new(""))); + event_ok!(double_hyphen_in_comment1("") => 7: Event::Comment(BytesText::new(""))); err!(double_hyphen_in_comment2("") => 4: IllFormedError::DoubleHyphenInComment); // ^= 4 err!(double_hyphen_in_comment3("") => 5: IllFormedError::DoubleHyphenInComment); @@ -970,16 +710,16 @@ mod ill_formed { err2!(unclosed_hex2(".") => 1: IllFormedError::UnclosedReference); // We do not check correctness of references during parsing - ok!(empty("&;") => 2: Event::GeneralRef(BytesRef::new(""))); - ok!(normal1("&x;") => 3: Event::GeneralRef(BytesRef::new("x"))); - ok!(normal2("&x;rest") => 3: Event::GeneralRef(BytesRef::new("x"))); - ok!(num("&#;") => 3: Event::GeneralRef(BytesRef::new("#"))); - ok!(dec("") => 4: Event::GeneralRef(BytesRef::new("#2"))); - ok!(hex1("&#x;") => 4: Event::GeneralRef(BytesRef::new("#x"))); - ok!(hex2("") => 5: Event::GeneralRef(BytesRef::new("#xF"))); + event_ok!(empty("&;") => 2: Event::GeneralRef(BytesRef::new(""))); + event_ok!(normal1("&x;") => 3: Event::GeneralRef(BytesRef::new("x"))); + event_ok!(normal2("&x;rest") => 3: Event::GeneralRef(BytesRef::new("x"))); + event_ok!(num("&#;") => 3: Event::GeneralRef(BytesRef::new("#"))); + event_ok!(dec("") => 4: Event::GeneralRef(BytesRef::new("#2"))); + event_ok!(hex1("&#x;") => 4: Event::GeneralRef(BytesRef::new("#x"))); + event_ok!(hex2("") => 5: Event::GeneralRef(BytesRef::new("#xF"))); // XML specification explicitly allowed any number of leading zeroes - ok!(long_dec(" ") => 44: Event::GeneralRef(BytesRef::new("#00000000000000000000000000000000000000032"))); - ok!(long_hex(" ") => 45: Event::GeneralRef(BytesRef::new("#x00000000000000000000000000000000000000020"))); + event_ok!(long_dec(" ") => 44: Event::GeneralRef(BytesRef::new("#00000000000000000000000000000000000000032"))); + event_ok!(long_hex(" ") => 45: Event::GeneralRef(BytesRef::new("#x00000000000000000000000000000000000000020"))); } } From f78d112eab161e98cb1cf392048badc150bf65a1 Mon Sep 17 00:00:00 2001 From: tayu0110 Date: Sun, 21 Dec 2025 17:19:43 +0500 Subject: [PATCH 4/4] Correct DTD handling for skipping DTD correctly --- Changelog.md | 3 + src/parser/dtd.rs | 273 ++++++++++++++++++++++++++++++++++++++++++++ src/parser/mod.rs | 2 + src/reader/mod.rs | 25 ++-- src/reader/state.rs | 4 +- tests/issues.rs | 53 +++++++++ tests/reader-dtd.rs | 137 ++++++++++++++++++++++ 7 files changed, 477 insertions(+), 20 deletions(-) create mode 100644 src/parser/dtd.rs create mode 100644 tests/reader-dtd.rs diff --git a/Changelog.md b/Changelog.md index 09353119..688f3a2f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -27,6 +27,8 @@ struct and can be applied at once. When `serde-types` feature is enabled, config ### Bug Fixes +- [#923]: Implement correct skipping of well-formed DTD. + ### Misc Changes - [#908]: Increase minimal supported `serde` version from 1.0.139 to 1.0.180. @@ -41,6 +43,7 @@ struct and can be applied at once. When `serde-types` feature is enabled, config [#908]: https://github.com/tafia/quick-xml/pull/908 [#913]: https://github.com/tafia/quick-xml/pull/913 [#924]: https://github.com/tafia/quick-xml/pull/924 +[#923]: https://github.com/tafia/quick-xml/issues/923 ## 0.38.4 -- 2025-11-11 diff --git a/src/parser/dtd.rs b/src/parser/dtd.rs new file mode 100644 index 00000000..3b10758d --- /dev/null +++ b/src/parser/dtd.rs @@ -0,0 +1,273 @@ +use crate::parser::{CommentParser, ElementParser, Parser, PiParser}; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum DtdParser { + /// If inside a PubidLiteral or SystemLiteral, it holds the quote type (either `'` or `"`). + /// Otherwise, it holds `0` (this is an initial state). + /// + /// ```text + /// [28] doctypedecl ::= '' + /// ``` + BeforeInternalSubset(u8), + /// Inside of the `intSubset` rule. + /// + /// ```text + /// [28a] DeclSep ::= PEReference | S + /// [28b] intSubset ::= (markupdecl | DeclSep)* + /// [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment + /// ``` + InsideOfInternalSubset, + /// After `]` but before `>`. + AfterInternalSubset, + InComment(CommentParser), + InPi(PiParser), + /// ```text + /// [45] elementdecl ::= '' + /// ``` + InElementDecl, + /// This state handles ATTLIST, ENTITY and NOTATION elements, i.e. all elements that can have + /// quotes strings (`'...'` or `"..."`) inside their markup, in which `>` should not be threated + /// as the end of the markup. + /// + /// This state handles the following productions from XML grammar: + /// + /// ### ATTLIST + /// + /// ```text + /// [52] AttlistDecl ::= '' + /// [53] AttDef ::= S Name S AttType S DefaultDecl + /// [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) + /// ``` + /// + /// ### ENTITY + /// + /// ```text + /// [70] EntityDecl ::= GEDecl | PEDecl + /// [71] GEDecl ::= '' + /// [72] PEDecl ::= '' + /// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) + /// [74] PEDef ::= EntityValue | ExternalID + /// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral + /// [76] NDataDecl ::= S 'NDATA' S Name + /// ``` + /// + /// ### NOTATION + /// + /// ```text + /// [82] NotationDecl ::= '' + /// ``` + InQuoteSensitive(ElementParser), + /// The state where it was not possible to determine which markup it was during the previous iteration. \ + /// It holds the number of bytes read since the start of the markup. + UndecidedMarkup(usize), + Finished, +} + +impl DtdParser { + /// Skip DTD contents. + /// + /// # Parameters (as same as `reader::BangType::parse`) + /// - `buf`: buffer with data consumed on previous iterations + /// - `chunk`: data read on current iteration and not yet consumed from reader + pub fn feed<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { + // This method assumes the DTD is well-formed. + // Since this crate does not support parsing DTDs, the inability to read non-well-formed DTDs + // is not particularly problematic; the only point of interest is reporting well-formed DTDs + // to the user without errors. + + let mut cur = chunk; + while !cur.is_empty() { + match *self { + Self::BeforeInternalSubset(0) => { + // Find the + // - start of quoted string ('...' or "...") + // - start of internal subset ([...]) + // - end of DOCTYPE declaration (>) + if let Some(i) = cur + .iter() + .position(|&b| matches!(b, b'\'' | b'"' | b'[' | b'>')) + { + let b = cur[i]; + match b { + b'\'' | b'"' => { + // SystemLiteral or PubidLiteral + *self = Self::BeforeInternalSubset(b); + cur = &cur[i + 1..]; + continue; + } + b'[' => { + *self = Self::InsideOfInternalSubset; + cur = &cur[i + 1..]; + continue; + } + b'>' => { + *self = Self::Finished; + let len = chunk.len() - cur.len() + i; + // +1 for `>` + return Some((&chunk[..len], len + 1)); + } + _ => {} + } + continue; + } + break; + } + // Inside the quoted string (this is PubidLiteral or SystemLiteral) we do not want to + // recognize other special characters (namely [ and >). Find only the closing quote + Self::BeforeInternalSubset(quote) => { + // ExternalID handling + if let Some(i) = memchr::memchr(quote, cur) { + *self = Self::BeforeInternalSubset(0); + cur = &cur[i + 1..]; + continue; + } + break; + } + Self::InsideOfInternalSubset => { + // Find the end of internal subset ([) or the start of the markup inside (<) + if let Some(i) = memchr::memchr2(b']', b'<', cur) { + if cur[i] == b']' { + *self = Self::AfterInternalSubset; + cur = &cur[i + 1..]; // +1 to skip `]` + continue; + } + // +1 to start after `<` + if let Some(skip) = self.switch(&cur[i + 1..]) { + cur = &cur[i + 1 + skip..]; // +1 to skip `<` + continue; + } + // Keep the number of already looked bytes (started from byte after `<`, so -1), + // try to decide after feeding the new chunk + *self = Self::UndecidedMarkup(cur.len() - i - 1); + } + break; + } + Self::AfterInternalSubset => { + if let Some(i) = memchr::memchr(b'>', cur) { + *self = Self::Finished; + let len = chunk.len() - cur.len() + i; + // +1 for `>` + return Some((&chunk[..len], len + 1)); + } + break; + } + Self::InComment(ref mut parser) => { + // If comment is ended, return to the main state, otherwise keep in the current state + if let Some(i) = parser.feed(cur) { + *self = Self::InsideOfInternalSubset; + cur = &cur[i..]; + continue; + } + break; + } + Self::InPi(ref mut parser) => { + // If processing instruction is ended, return to the main state, + // otherwise keep in the current state + if let Some(i) = parser.feed(cur) { + *self = Self::InsideOfInternalSubset; + cur = &cur[i..]; + continue; + } + break; + } + Self::InElementDecl => { + // `` does not have places where `>` could be escaped + // so the first occurrence ends that state + if let Some(i) = memchr::memchr(b'>', cur) { + *self = Self::InsideOfInternalSubset; + cur = &cur[i + 1..]; // +1 for `>` + continue; + } + break; + } + Self::InQuoteSensitive(ref mut parser) => { + // If ATTLIST, ENTITY or NOTATION is ended, return to the main state, + // otherwise keep in the current state + if let Some(i) = parser.feed(cur) { + *self = Self::InsideOfInternalSubset; + cur = &cur[i..]; + continue; + } + break; + } + Self::UndecidedMarkup(skipped) => { + // Buffer is long enough to store the longest possible keyword `!NOTATION` + let mut bytes = [0u8; 9]; + + // Copy the last `skipped` bytes from the previous iteration into buffer, + // for example, "!NOT" (skipped = 4 in that case)... + bytes[..skipped].copy_from_slice(&buf[buf.len() - skipped..]); + + // ...add new bytes to the buffer from current iteration, + // for example, "ATION"... + let end = bytes.len().min(skipped + cur.len()); + bytes[skipped..end].copy_from_slice(&cur[..end - skipped]); + + // ...and try to match over it. + // For example, "!NOTATION" will return 9, and we skip 9-4=5 bytes of "ATION" + if let Some(skip) = self.switch(&bytes[..end]) { + cur = &cur[skip - skipped..]; + continue; + } + *self = Self::UndecidedMarkup(skipped + cur.len()); + break; + } + Self::Finished => break, + } + } + + None + } + + #[inline] + fn switch(&mut self, markup: &[u8]) -> Option { + match markup { + [b'?', ..] => { + // { + // Comment, /// . Contains balance of '<' (+1) and '>' (-1) - DocType(i32), + DocType(DtdParser), } impl BangType { #[inline(always)] @@ -1165,7 +1165,7 @@ impl BangType { Ok(match byte { Some(b'[') => Self::CData, Some(b'-') => Self::Comment, - Some(b'D') | Some(b'd') => Self::DocType(0), + Some(b'D') | Some(b'd') => Self::DocType(DtdParser::BeforeInternalSubset(0)), _ => return Err(SyntaxError::InvalidBangMarkup), }) } @@ -1222,18 +1222,7 @@ impl BangType { } } } - Self::DocType(ref mut balance) => { - for i in memchr::memchr2_iter(b'<', b'>', chunk) { - if chunk[i] == b'<' { - *balance += 1; - } else { - if *balance == 0 { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - *balance -= 1; - } - } - } + Self::DocType(ref mut parser) => return parser.feed(buf, chunk), } None } @@ -1266,7 +1255,7 @@ mod test { mod read_bang_element { use super::*; use crate::errors::{Error, SyntaxError}; - use crate::reader::BangType; + use crate::reader::{BangType, DtdParser}; use crate::utils::Bytes; /// Checks that reading CDATA content works correctly @@ -1552,7 +1541,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::DocType(0), Bytes(b"!DOCTYPE")) + (BangType::DocType(DtdParser::Finished), Bytes(b"!DOCTYPE")) ); assert_eq!(position, 10); } @@ -1626,7 +1615,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::DocType(0), Bytes(b"!doctype")) + (BangType::DocType(DtdParser::Finished), Bytes(b"!doctype")) ); assert_eq!(position, 10); } diff --git a/src/reader/state.rs b/src/reader/state.rs index 005d698b..ff08272d 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -7,7 +7,7 @@ use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesT use crate::parser::{Parser, PiParser}; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; -use crate::reader::{BangType, Config, ParseState}; +use crate::reader::{BangType, Config, DtdParser, ParseState}; use crate::utils::{is_whitespace, name_len}; /// A struct that holds a current reader state and a parser configuration. @@ -145,7 +145,7 @@ impl ReaderState { // https://www.w3.org/TR/xml11/#sec-prolog-dtd // HTML5 allows mixed case for doctype declarations: // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state - BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE") => { + BangType::DocType(DtdParser::Finished) if uncased_starts_with(buf, b"!DOCTYPE") => { match buf[8..].iter().position(|&b| !is_whitespace(b)) { Some(start) => Ok(Event::DocType(BytesText::wrap( // Cut of `!DOCTYPE` and any number of spaces from start diff --git a/tests/issues.rs b/tests/issues.rs index 9a8b3ebf..966aa9ad 100644 --- a/tests/issues.rs +++ b/tests/issues.rs @@ -514,3 +514,56 @@ fn issue801() { } } } + +/// Regression tests for https://github.com/tafia/quick-xml/issues/923 +mod issue923 { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn case1() { + let mut reader = Reader::from_str( + r#" + +"> + +]> +&ent;"#, + ); + let mut buf = Vec::new(); + reader.read_event_into(&mut buf).unwrap(); // + reader.read_event_into(&mut buf).unwrap(); + reader.read_event_into(&mut buf).unwrap(); // + reader.read_event_into(&mut buf).unwrap(); + reader.read_event_into(&mut buf).unwrap(); // DTD + reader.read_event_into(&mut buf).unwrap(); + reader.read_event_into(&mut buf).unwrap(); // + reader.read_event_into(&mut buf).unwrap(); // &ent; + reader.read_event_into(&mut buf).unwrap(); // + + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + #[test] + fn case2() { + let mut reader = Reader::from_str( + r#" + + +]> +"#, + ); + let mut buf = Vec::new(); + reader.read_event_into(&mut buf).unwrap(); // + reader.read_event_into(&mut buf).unwrap(); + reader.read_event_into(&mut buf).unwrap(); // + reader.read_event_into(&mut buf).unwrap(); + reader.read_event_into(&mut buf).unwrap(); // DTD + reader.read_event_into(&mut buf).unwrap(); + reader.read_event_into(&mut buf).unwrap(); // + + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } +} diff --git a/tests/reader-dtd.rs b/tests/reader-dtd.rs new file mode 100644 index 00000000..6ff4fef2 --- /dev/null +++ b/tests/reader-dtd.rs @@ -0,0 +1,137 @@ +use quick_xml::errors::{Error, SyntaxError}; +use quick_xml::events::{BytesText, Event}; +use quick_xml::reader::{NsReader, Reader}; + +// For event_ok and syntax_err macros +mod helpers; + +macro_rules! ok { + ($test:ident : $pos:literal $xml:literal $event:literal) => { + event_ok!($test ($xml) => $pos : Event::DocType(BytesText::from_escaped($event))); + }; +} + +mod without_internal_subset { + use super::*; + + ok!(simple: 15 + "" + "root" + ); + ok!(with_external_id_1: 21 + r#"['">"# + r#"root ">['""# + ); + ok!(with_external_id_2: 21 + r#"["'>"# + r#"root '>["'"# + ); + ok!(with_external_id_3: 22 + r#"['" >"# + r#"root ">['" "# + ); + ok!(with_external_id_4: 22 + r#"["' >"# + r#"root '>["' "# + ); +} + +ok!(with_external_id_1: 23 + r#"['"[]>"# + r#"root ">['"[]"# +); +ok!(with_external_id_2: 23 + r#"["'[]>"# + r#"root '>["'[]"# +); +ok!(with_external_id_3: 25 + r#"['" [] >"# + r#"root ">['" [] "# +); +ok!(with_external_id_4: 25 + r#"["' [] >"# + r#"root '>["' [] "# +); + +ok!(entity_1: 35 + r#"">]>"# + r#"root [">]"# +); +ok!(entity_2: 35 + r#"]>"# + r#"root []"# +); + +ok!(attlist: 86 + r#" 2 is true" att2 #FIXED '>>> in other quote'>]>"# + r#"root [ 2 is true" att2 #FIXED '>>> in other quote'>]"# +); +ok!(notation: 80 + r#">>some_system_id"'>]>"# + r#"root [>>some_system_id"'>]"# +); +ok!(comment: 51 + r#"]>"# + r#"root []"# +); +ok!(pi: 37 + r#"><<>><><>?>]>"# + r#"root [><<>><><>?>]"# +); +ok!(all_together: 164 + " + + '> + '> + --> + ?> + ] + >" + "e [ + + + '> + '> + --> + ?> + ] + " +); + +ok!(unknown_dtd_markup: 34 + " ] >" + "e [ ] " +); + +mod unclosed { + use super::*; + + syntax_err!(doctype_1(". SyntaxError::UnclosedDoctype); + syntax_err!(doctype_2(".['\" [ ] ") => SyntaxError::UnclosedDoctype); + syntax_err!(doctype_3(".[\"' [ ] ") => SyntaxError::UnclosedDoctype); + + syntax_err!(external_id_1(".[\" ") => SyntaxError::UnclosedDoctype); + syntax_err!(external_id_2(".[' ") => SyntaxError::UnclosedDoctype); + + syntax_err!(internal_subset_1(". SyntaxError::UnclosedDoctype); + syntax_err!(internal_subset_2(".['\" [ ") => SyntaxError::UnclosedDoctype); + syntax_err!(internal_subset_3(".[\"' [ ") => SyntaxError::UnclosedDoctype); + + syntax_err!(element(". SyntaxError::UnclosedDoctype); + syntax_err!(attlist(". SyntaxError::UnclosedDoctype); + + syntax_err!(entity_1(". SyntaxError::UnclosedDoctype); + syntax_err!(entity_2(".' ") => SyntaxError::UnclosedDoctype); + syntax_err!(entity_3(".\" ") => SyntaxError::UnclosedDoctype); + + syntax_err!(notation_1(". SyntaxError::UnclosedDoctype); + syntax_err!(notation_2(".' ") => SyntaxError::UnclosedDoctype); + syntax_err!(notation_3(".\" ") => SyntaxError::UnclosedDoctype); + + syntax_err!(comment_1(". SyntaxError::UnclosedDoctype); + syntax_err!(comment_2(". ") => SyntaxError::UnclosedDoctype); + + syntax_err!(pi_1(". SyntaxError::UnclosedDoctype); + syntax_err!(pi_2(". ") => SyntaxError::UnclosedDoctype); +}