diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 382ac855..70b3d233 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -6,11 +6,13 @@ mod comment; mod dtd; mod element; mod pi; +mod start_element; pub use comment::CommentParser; pub(crate) use dtd::DtdParser; pub use element::ElementParser; pub use pi::PiParser; +pub use start_element::StartElementParser; /// Used to decouple reading of data from data source and parsing XML structure from it. /// This is a state preserved between getting chunks of bytes from the reader. diff --git a/src/parser/start_element.rs b/src/parser/start_element.rs new file mode 100644 index 00000000..4164c61e --- /dev/null +++ b/src/parser/start_element.rs @@ -0,0 +1,242 @@ +//! Contains a parser for the start tag of an XML element. + +use crate::errors::SyntaxError; + +/// A parser that searches for a `>` symbol in the slice outside of quoted regions. +/// +/// The parser considers two quoted regions: a double-quoted (`"..."`) and +/// a single-quoted (`'...'`) region. Matches found inside those regions are not +/// considered as results. Each region starts and ends with its quote symbol, +/// which cannot be escaped (but can be encoded as an XML character entity or a named +/// entity; either way, that encoding does not contain literal quotes). +/// +/// To use the parser, create an instance of it and [`feed`] data into it. +/// After a successful search the parser will return [`Some`] with the length +/// of the element name and the position of the +/// found symbol in the fed chunk. If the search is unsuccessful, [`None`] will be returned. You +/// would typically expect a positive search result, so you should feed +/// new data until you get it. +/// +/// NOTE: after a successful match the parser does not return to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// a new search.
+/// +/// # Example +/// +/// ``` +/// # use pretty_assertions::assert_eq; +/// use quick_xml::parser::{ElementParser, Parser}; +/// +/// let mut parser = ElementParser::default(); +/// +/// // Parse `and the text follow...` +/// // split into three chunks +/// assert_eq!(parser.feed(b"and the text follow..."), Some(8)); +/// // ^ ^ +/// // 0 8 +/// ``` +/// +/// [`feed`]: Self::feed() +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum StartElementParser { + /// The initial state, inside the tag name. + /// Contains the current length of the tag name. + Tag(usize), + /// The name was completely parsed. Now look for the '>'. Contains the final length of the tag name and the current quoting state. + Attributes(usize, AttributeParser), +} + +/// The internal state of the attribute parser. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum AttributeParser { + /// The initial state, not within ' or ". + Outside, + /// Inside a single-quoted region (`'...'`). + SingleQ, + /// Inside a double-quoted region (`"..."`). + DoubleQ, +} + +impl StartElementParser { + /// Returns the length of the name (accumulated over all calls so far) and the number of bytes of the current call that precede the `>`, or `None` if `>` was not found in `bytes`. + /// A return value of `None` implies that the whole `bytes` slice was consumed. + /// Assumes that the initial '<' is already consumed. + #[inline] + pub fn feed(&mut self, bytes: &[u8]) -> Option<(usize, usize)> { + // The number of bytes consumed in the current feed iteration. + let mut consumed: usize = 0; + + let (name_len, mut attr_parser) = 'name_len: { + match *self { + Self::Tag(name_len) => { + for i in 0..bytes.len() { + let byte = bytes[i]; + + if matches!(byte, b' ' | b'\r' | b'\n' | b'\t' | b'/') { + // TODO(flxbe): Somehow make sure that we only expect a '>' after the '/'.
+ let name_len = name_len + i; + let attr_parser = AttributeParser::Outside; + *self = Self::Attributes(name_len, attr_parser); + + consumed += i; + break 'name_len (name_len, attr_parser); + } else if byte == b'>' { + return Some((name_len + i, consumed + i)); + } + } + + *self = Self::Tag(name_len + bytes.len()); + return None; + } + Self::Attributes(name_len, attr_parser) => (name_len, attr_parser), + } + }; + + let new_data = &bytes[consumed..]; + for i in memchr::memchr3_iter(b'>', b'\'', b'"', new_data) { + attr_parser = match (attr_parser, new_data[i]) { + // only allowed to match `>` while we are in state `Outside` + (AttributeParser::Outside, b'>') => return Some((name_len, consumed + i)), + (AttributeParser::Outside, b'\'') => AttributeParser::SingleQ, + (AttributeParser::Outside, b'"') => AttributeParser::DoubleQ, + + // the only byte that gets us out of a quoted region is the same quote character + (AttributeParser::SingleQ, b'\'') | (AttributeParser::DoubleQ, b'"') => { + AttributeParser::Outside + } + + // all other bytes: no state change + _ => continue, + }; + } + + *self = Self::Attributes(name_len, attr_parser); + None + } + + /// Return the correct EOF SyntaxError based on the current internal state.
+ #[inline] + pub fn eof_error(self, _content: &[u8]) -> SyntaxError { + match self { + Self::Tag(_) => SyntaxError::UnclosedTag, + Self::Attributes(_, attr) => match attr { + AttributeParser::Outside => SyntaxError::UnclosedTag, + AttributeParser::SingleQ => SyntaxError::UnclosedSingleQuotedAttributeValue, + AttributeParser::DoubleQ => SyntaxError::UnclosedDoubleQuotedAttributeValue, + }, + } + } +} + +impl Default for StartElementParser { + #[inline] + fn default() -> Self { + Self::Tag(0) + } +} + +#[test] +fn parse_all() { + use pretty_assertions::assert_eq; + + fn parse_input(input: &[u8], name_len: usize) { + let mut parser = StartElementParser::default(); + + assert_eq!(parser.feed(input), Some((name_len, input.len() - 1))); + } + + parse_input(b"tag key='value' key=\"value\">", 3); + parse_input(b"tag>", 3); + parse_input(b"tag />", 3); + parse_input(b"tag/>", 3); +} + +#[test] +fn parse_internal_state() { + use pretty_assertions::assert_eq; + + let mut parser = StartElementParser::default(); + assert_eq!(parser.feed(b""), None); + assert_eq!(parser, StartElementParser::Tag(0)); + + // Start feeding the tag name + assert_eq!(parser.feed(b"tag"), None); + assert_eq!(parser, StartElementParser::Tag(3)); + + // Finish parsing the tag name after seeing some whitespace + assert_eq!(parser.feed(b" "), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::Outside) + ); + + // Remain in state when no progress is made + assert_eq!(parser.feed(b""), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::Outside) + ); + assert_eq!(parser.feed(b"some random content"), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::Outside) + ); + + // Handle single quote + assert_eq!(parser.feed(b"\'"), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::SingleQ) + ); + + // Remain in state when no progress is made + assert_eq!(parser.feed(b""), None); + assert_eq!(
+ parser, + StartElementParser::Attributes(3, AttributeParser::SingleQ) + ); + assert_eq!(parser.feed(b"some random content \">"), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::SingleQ) + ); + + // Close single quote + assert_eq!(parser.feed(b"'"), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::Outside) + ); + + // Handle double quote + assert_eq!(parser.feed(b"\""), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::DoubleQ) + ); + + // Remain in state when no progress is made + assert_eq!(parser.feed(b""), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::DoubleQ) + ); + assert_eq!(parser.feed(b"some random content '>"), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::DoubleQ) + ); + + // Close double quote + assert_eq!(parser.feed(b"\""), None); + assert_eq!( + parser, + StartElementParser::Attributes(3, AttributeParser::Outside) + ); + + assert_eq!(parser.feed(b">"), Some((3, 0))); +} diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 32aa313e..5c960adc 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -8,7 +8,7 @@ use std::path::Path; use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; -use crate::parser::Parser; +use crate::parser::{Parser, StartElementParser}; use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource}; use crate::utils::is_whitespace; @@ -316,6 +316,7 @@ macro_rules! impl_buffered_source { // That method is called only when available buffer starts from '<' // We need to consume it self $(.$reader)? .consume(1); + let available = loop { break match self $(.$reader)? .fill_buf() $(.$await)? { Ok(n) => n, @@ -325,6 +326,48 @@ macro_rules! impl_buffered_source { }; Ok(available.first().cloned()) } + + #[inline] + $($async)?
+ fn read_start_element<'i>(&mut self, buf: &'i mut Vec, position: &mut u64) -> Result<(usize, &'i [u8])> { + let mut parser = StartElementParser::default(); + let mut read = 1; + // '<' was consumed in peek_one(), but not placed in buf; it is pushed here and the returned slice starts right after it + buf.push(b'<'); + + let start = buf.len(); + loop { + let available = match self $(.$reader)? .fill_buf() $(.$await)? { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e.into())); + } + }; + + if let Some((name_len, consumed)) = parser.feed(available) { + buf.extend_from_slice(&available[..consumed]); + + // +1 for `>` which we do not include + self $(.$reader)? .consume(consumed + 1); + read += consumed as u64 + 1; + + *position += read; + return Ok((name_len, &buf[start..])); + } + + // The `>` symbol was not found yet, continue reading + buf.extend_from_slice(available); + + let used = available.len(); + self $(.$reader)? .consume(used); + read += used as u64; + } + + *position += read; + Err(Error::Syntax(parser.eof_error(&buf[start..]))) + } }; } diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 79e94ebd..0a2e6290 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -456,10 +456,10 @@ macro_rules! read_until_close { }, // `<...` - opening or self-closed tag Ok(Some(_)) => match $reader - .read_with(ElementParser::Outside, $buf, &mut $self.state.offset) + .read_start_element($buf, &mut $self.state.offset) $(.$await)? { - Ok(bytes) => Ok($self.state.emit_start(bytes)), + Ok((name_len, bytes)) => Ok($self.state.emit_start(name_len, bytes)), Err(e) => { // We want to report error at `<` $self.state.last_error_offset = start; @@ -1136,6 +1136,28 @@ trait XmlSource<'r, B> { /// Return one character without consuming it, so that future `read_*` calls /// will still include it. On EOF, return `None`. fn peek_one(&mut self) -> io::Result>; + + /// Read input until the start element is finished.
+ /// + /// This method expects that the source is positioned at a start element; the leading `<` is counted in `position` but is not part of the returned slice. + /// + /// Returns a tuple of the length of the tag name and a slice of the data read up to, but not including, the closing `>`. + /// The end of the element and the name length are determined by `StartElementParser`. + /// + /// If input (`Self`) is exhausted before the + /// parser finds the closing `>` of the start element, returns a `SyntaxError`. + /// + /// # Parameters + /// - `buf`: Buffer that could be filled from an input (`Self`) and + /// from which [events] could borrow their data + /// - `position`: Will be increased by the amount of bytes consumed + /// + /// [events]: crate::events::Event + fn read_start_element( + &mut self, + buf: B, + position: &mut u64, + ) -> Result<(usize, &'r [u8]), Error>; } /// Possible elements started with ` XmlSource<'a, ()> for &'a [u8] { "markup must start from '<':\n{:?}", crate::utils::Bytes(self) ); + Ok(self.get(1).copied()) } + + #[inline] + fn read_start_element(&mut self, _buf: (), position: &mut u64) -> Result<(usize, &'a [u8])> { + *position += 1; + *self = &self[1..]; + + let mut parser = StartElementParser::default(); + + if let Some((name_len, consumed)) = parser.feed(self) { + // +1 for `>` which we do not include + *position += consumed as u64 + 1; + let bytes = &self[..consumed]; + *self = &self[consumed + 1..]; + return Ok((name_len, bytes)); + } + + *position += self.len() as u64; + Err(Error::Syntax(parser.eof_error(self))) + } } #[cfg(test)] diff --git a/src/reader/state.rs b/src/reader/state.rs index ef96ba7a..046c2390 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -301,18 +301,10 @@ impl ReaderState { /// /// # Parameters /// - `content`: Content of a tag between `<` and `>` - pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> { - debug_assert!( - content.starts_with(b"<"), - "start or empty tag must start from '<':\n{:?}", - crate::utils::Bytes(content) - ); - - // strip
`<` - let content = &content[1..]; + pub fn emit_start<'b>(&mut self, name_len: usize, content: &'b [u8]) -> Event<'b> { if let Some(content) = content.strip_suffix(b"/") { // This is self-closed tag `` - let event = BytesStart::wrap(content, name_len(content), self.decoder()); + let event = BytesStart::wrap(content, name_len, self.decoder()); if self.config.expand_empty_elements { self.state = ParseState::InsideEmpty; @@ -323,7 +315,7 @@ impl ReaderState { Event::Empty(event) } } else { - let event = BytesStart::wrap(content, name_len(content), self.decoder()); + let event = BytesStart::wrap(content, name_len, self.decoder()); // #514: Always store names event when .check_end_names == false, // because checks can be temporary disabled and when they would be