Skip to content

Commit

Permalink
Merge pull request #802 from BlueGreenMagick/fix-bufread-doctype
Browse files Browse the repository at this point in the history
Fix DocType closing bracket recognition in buffered reader
  • Loading branch information
Mingun committed Sep 20, 2024
2 parents 2f3824a + 3ebe221 commit 51d9e23
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 19 deletions.
4 changes: 4 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@

### Bug Fixes

- [#533]: Fix incorrect DocType closing bracket detection when parsing with buffered reader

### Misc Changes

[#623]: https://github.com/tafia/quick-xml/issues/533


## 0.36.1 -- 2024-07-23

Expand Down
2 changes: 1 addition & 1 deletion src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ macro_rules! impl_buffered_source {
buf.push(b'!');
self $(.$reader)? .consume(1);

let bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;
let mut bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

loop {
match self $(.$reader)? .fill_buf() $(.$await)? {
Expand Down
31 changes: 16 additions & 15 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1019,16 +1019,16 @@ enum BangType {
CData,
/// <!--...-->
Comment,
/// <!DOCTYPE...>
DocType,
/// <!DOCTYPE...>. Contains balance of '<' (+1) and '>' (-1)
DocType(i32),
}
impl BangType {
#[inline(always)]
const fn new(byte: Option<u8>) -> Result<Self> {
Ok(match byte {
Some(b'[') => Self::CData,
Some(b'-') => Self::Comment,
Some(b'D') | Some(b'd') => Self::DocType,
Some(b'D') | Some(b'd') => Self::DocType(0),
_ => return Err(Error::Syntax(SyntaxError::InvalidBangMarkup)),
})
}
Expand All @@ -1040,7 +1040,7 @@ impl BangType {
/// - `buf`: buffer with data consumed on previous iterations
/// - `chunk`: data read on current iteration and not yet consumed from reader
#[inline(always)]
fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
fn parse<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
match self {
Self::Comment => {
for i in memchr::memchr_iter(b'>', chunk) {
Expand Down Expand Up @@ -1085,14 +1085,15 @@ impl BangType {
}
}
}
Self::DocType => {
for i in memchr::memchr_iter(b'>', chunk) {
let content = &chunk[..i];
let balance = memchr::memchr2_iter(b'<', b'>', content)
.map(|p| if content[p] == b'<' { 1i32 } else { -1 })
.sum::<i32>();
if balance == 0 {
return Some((content, i + 1)); // +1 for `>`
Self::DocType(ref mut balance) => {
for i in memchr::memchr2_iter(b'<', b'>', chunk) {
if chunk[i] == b'<' {
*balance += 1;
} else {
if *balance == 0 {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
*balance -= 1;
}
}
}
Expand All @@ -1104,7 +1105,7 @@ impl BangType {
match self {
Self::CData => Error::Syntax(SyntaxError::UnclosedCData),
Self::Comment => Error::Syntax(SyntaxError::UnclosedComment),
Self::DocType => Error::Syntax(SyntaxError::UnclosedDoctype),
Self::DocType(_) => Error::Syntax(SyntaxError::UnclosedDoctype),
}
}
}
Expand Down Expand Up @@ -1414,7 +1415,7 @@ mod test {
.unwrap();
assert_eq!(
(ty, Bytes(bytes)),
(BangType::DocType, Bytes(b"!DOCTYPE"))
(BangType::DocType(0), Bytes(b"!DOCTYPE"))
);
assert_eq!(position, 10);
}
Expand Down Expand Up @@ -1488,7 +1489,7 @@ mod test {
.unwrap();
assert_eq!(
(ty, Bytes(bytes)),
(BangType::DocType, Bytes(b"!doctype"))
(BangType::DocType(0), Bytes(b"!doctype"))
);
assert_eq!(position, 10);
}
Expand Down
2 changes: 1 addition & 1 deletion src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
// start with it.
debug_assert_eq!(self[0], b'!');

let bang_type = BangType::new(self[1..].first().copied())?;
let mut bang_type = BangType::new(self[1..].first().copied())?;

if let Some((bytes, i)) = bang_type.parse(&[], self) {
*position += i as u64;
Expand Down
2 changes: 1 addition & 1 deletion src/reader/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ impl ReaderState {
// https://www.w3.org/TR/xml11/#sec-prolog-dtd
// HTML5 allows mixed case for doctype declarations:
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE") => {
match buf[8..].iter().position(|&b| !is_whitespace(b)) {
Some(start) => Ok(Event::DocType(BytesText::wrap(
// Cut of `!DOCTYPE` and any number of spaces from start
Expand Down
36 changes: 36 additions & 0 deletions tests/issues.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,24 @@ mod issue514 {
}
}

/// Regression test for https://github.com/tafia/quick-xml/issues/590
#[test]
fn issue590() {
let mut reader = Reader::from_reader(BufReader::with_capacity(
16,
&b"<!DOCTYPE t [<!1><!2>]>"[..],
// 0 7 ^15 ^23
//[ ] = capacity
));
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf).unwrap() {
Event::Eof => break,
_ => (),
}
}
}

/// Regression test for https://github.com/tafia/quick-xml/issues/604
mod issue604 {
use super::*;
Expand Down Expand Up @@ -477,3 +495,21 @@ fn issue776() {
Event::End(BytesEnd::new(r#"tag attr=">""#))
);
}

/// Regression test for https://github.com/tafia/quick-xml/issues/801
/// Angle brackets are read in different buffer
#[test]
fn issue801() {
let xml = b"<!DOCTYPE X [<!-- --> <!ENTITY a \"a\">]>";
let reader = BufReader::with_capacity(2, &xml[..]);
let mut reader = Reader::from_reader(reader);
let mut buf = Vec::new();
loop {
buf.clear();
match reader.read_event_into(&mut buf) {
Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
Ok(Event::Eof) => break,
Ok(_) => {}
}
}
}
14 changes: 13 additions & 1 deletion tests/serde-issues.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
//! Name each module / test as `issue<GH number>` and keep sorted by issue number

use pretty_assertions::assert_eq;
use quick_xml::de::from_str;
use quick_xml::de::{from_reader, from_str};
use quick_xml::se::{to_string, to_string_with_root};
use serde::de::{Deserializer, IgnoredAny};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::io::BufReader;

/// Regression tests for https://github.com/tafia/quick-xml/issues/252.
mod issue252 {
Expand Down Expand Up @@ -321,6 +322,17 @@ fn issue510() {
);
}

/// Regression test for https://github.com/tafia/quick-xml/issues/533.
#[test]
fn issue533() {
#[derive(Deserialize)]
struct X {}

let xml = "<!DOCTYPE X [<!-- -->]><X></X>";
let reader = BufReader::with_capacity(20, xml.as_bytes());
let _: X = from_reader(reader).unwrap();
}

/// Regression test for https://github.com/tafia/quick-xml/issues/537.
///
/// This test checks that special `xmlns:xxx` attributes uses full name of
Expand Down

0 comments on commit 51d9e23

Please sign in to comment.