From eb90e9f011784ae39acd67093154611d101dd86d Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 7 Jul 2024 18:35:31 +0500 Subject: [PATCH] Add `allow_dangling_amp` configuration option and allow dangling `&` --- Changelog.md | 3 ++ src/reader/buffered_reader.rs | 6 ++-- src/reader/mod.rs | 62 +++++++++++++++++++++++++------- src/reader/slice_reader.rs | 8 ++--- tests/reader-config.rs | 68 ++++++++++++++++++++++++++++++++++- 5 files changed, 127 insertions(+), 20 deletions(-) diff --git a/Changelog.md b/Changelog.md index 64b5eed2..5b17a160 100644 --- a/Changelog.md +++ b/Changelog.md @@ -24,6 +24,9 @@ XML specification. See the updated `custom_entities` example! - [#766]: Allow to parse resolved entities as XML fragments and stream events from them. - [#766]: Added new event `Event::GeneralRef` with content of [general entity]. +- [#766]: Added new configuration option `allow_dangling_amp`, which allows a `&` + that is not followed by `;` in textual data. Some applications require this + for compatibility reasons. ### Bug Fixes diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index a3d90da6..347f3e8d 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -161,7 +161,7 @@ macro_rules! impl_buffered_source { *position += read; - return ReadRefResult::UpToRef; + return ReadRefResult::UpToRef(&buf[start..]); } Some(i) => { let is_end = available[i] == b';'; @@ -177,7 +177,7 @@ macro_rules! impl_buffered_source { return if is_end { ReadRefResult::Ref(&buf[start..]) } else { - ReadRefResult::UpToMarkup + ReadRefResult::UpToMarkup(&buf[start..]) }; } None => { @@ -191,7 +191,7 @@ macro_rules! 
impl_buffered_source { } *position += read; - ReadRefResult::UpToEof + ReadRefResult::UpToEof(&buf[start..]) } #[inline] diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 82bcda47..f4644a9d 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -24,6 +24,32 @@ use crate::reader::state::ReaderState; #[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))] #[non_exhaustive] pub struct Config { + /// Whether lone ampersand character (without a paired semicolon) should be + /// allowed in textual content. Unless enabled, in case of a dangling ampersand, + /// the [`Error::IllFormed(UnclosedReference)`] is returned from read methods. + /// + /// Default: `false` + /// + /// # Example + /// + /// ``` + /// # use quick_xml::events::{BytesRef, BytesText, Event}; + /// # use quick_xml::reader::Reader; + /// # use pretty_assertions::assert_eq; + /// let mut reader = Reader::from_str("text with & &amp; & alone"); + /// reader.config_mut().allow_dangling_amp = true; + /// + /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new("text with "))); + /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& "))); + /// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(BytesRef::new("amp"))); + /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new(" "))); + /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& alone"))); + /// assert_eq!(reader.read_event().unwrap(), Event::Eof); + /// ``` + /// + /// [`Error::IllFormed(UnclosedReference)`]: crate::errors::IllFormedError::UnclosedReference + pub allow_dangling_amp: bool, + /// Whether unmatched closing tag names should be allowed. Unless enabled, /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`] /// is returned from read methods. 
@@ -210,6 +236,7 @@ impl Config { impl Default for Config { fn default() -> Self { Self { + allow_dangling_amp: false, allow_unmatched_ends: false, check_comments: false, check_end_names: true, @@ -261,18 +288,29 @@ macro_rules! read_event_impl { Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..], $self.decoder()))) } // Go to Done state - ReadRefResult::UpToEof => { + ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => { + $self.state.state = ParseState::Done; + Ok(Event::Text($self.state.emit_text(bytes))) + } + ReadRefResult::UpToEof(_) => { $self.state.state = ParseState::Done; $self.state.last_error_offset = start; Err(Error::IllFormed(IllFormedError::UnclosedReference)) } // Do not change state, stay in InsideRef - ReadRefResult::UpToRef => { + ReadRefResult::UpToRef(bytes) if $self.state.config.allow_dangling_amp => { + Ok(Event::Text($self.state.emit_text(bytes))) + } + ReadRefResult::UpToRef(_) => { $self.state.last_error_offset = start; Err(Error::IllFormed(IllFormedError::UnclosedReference)) } // Go to InsideMarkup state - ReadRefResult::UpToMarkup => { + ReadRefResult::UpToMarkup(bytes) if $self.state.config.allow_dangling_amp => { + $self.state.state = ParseState::InsideMarkup; + Ok(Event::Text($self.state.emit_text(bytes))) + } + ReadRefResult::UpToMarkup(_) => { $self.state.state = ParseState::InsideMarkup; $self.state.last_error_offset = start; Err(Error::IllFormed(IllFormedError::UnclosedReference)) @@ -997,13 +1035,13 @@ enum ReadRefResult<'r> { /// Contains text block up to EOF. Neither end of reference (`;`), start of /// another reference (`&`) or start of markup (`<`) characters was found. /// Result includes start `&`. - UpToEof, + UpToEof(&'r [u8]), /// Contains text block up to next possible reference (`&` character). /// Result includes start `&`. - UpToRef, + UpToRef(&'r [u8]), /// Contains text block up to start of markup (`<` character). /// Result includes start `&`. 
- UpToMarkup, + UpToMarkup(&'r [u8]), /// IO error occurred. Err(io::Error), } @@ -1717,8 +1755,8 @@ mod test { // ^= 2 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { - ReadRefResult::UpToEof => (), - x => panic!("Expected `UpToEof`, but got `{:?}`", x), + ReadRefResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")), + x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x), } assert_eq!(position, 2); } @@ -1731,8 +1769,8 @@ mod test { // ^= 2 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { - ReadRefResult::UpToRef => (), - x => panic!("Expected `UpToRef`, but got `{:?}`", x), + ReadRefResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")), + x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x), } assert_eq!(position, 2); } @@ -1745,8 +1783,8 @@ mod test { // ^= 3 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { - ReadRefResult::UpToMarkup => (), - x => panic!("Expected `UpToMarkup`, but got `{:?}`", x), + ReadRefResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")), + x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x), } assert_eq!(position, 3); } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 998e734a..5d20d8aa 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -306,11 +306,11 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { // Do not consume `&` because it may be lone and we would be need to // return it as part of Text event Some(i) if self[i + 1] == b'&' => { - let (_, rest) = self.split_at(i + 1); + let (bytes, rest) = self.split_at(i + 1); *self = rest; *position += i as u64 + 1; - ReadRefResult::UpToRef + ReadRefResult::UpToRef(bytes) } Some(i) => { let end = i + 1; @@ -323,7 +323,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { if is_end { ReadRefResult::Ref(bytes) } else { - ReadRefResult::UpToMarkup + ReadRefResult::UpToMarkup(bytes) } } None => { @@ -331,7 +331,7 @@ impl<'a> XmlSource<'a, ()> for &'a 
[u8] { *self = &[]; *position += bytes.len() as u64; - ReadRefResult::UpToEof + ReadRefResult::UpToEof(bytes) } } } diff --git a/tests/reader-config.rs b/tests/reader-config.rs index 8796075e..09f820a3 100644 --- a/tests/reader-config.rs +++ b/tests/reader-config.rs @@ -6,9 +6,75 @@ //! Please keep tests sorted (exceptions are allowed if options are tightly related). use quick_xml::errors::{Error, IllFormedError}; -use quick_xml::events::{BytesCData, BytesEnd, BytesPI, BytesStart, BytesText, Event}; +use quick_xml::events::{BytesCData, BytesEnd, BytesPI, BytesRef, BytesStart, BytesText, Event}; use quick_xml::reader::Reader; +mod allow_dangling_amp { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn false_() { + let mut reader = Reader::from_str("&&&lt;&"); + reader.config_mut().allow_dangling_amp = false; + + match reader.read_event() { + Err(Error::IllFormed(cause)) => { + assert_eq!(cause, IllFormedError::UnclosedReference); + } + x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x), + } + assert_eq!(reader.error_position()..reader.buffer_position(), 0..1); + + match reader.read_event() { + Err(Error::IllFormed(cause)) => { + assert_eq!(cause, IllFormedError::UnclosedReference); + } + x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x), + } + assert_eq!(reader.error_position()..reader.buffer_position(), 1..2); + + assert_eq!( + reader.read_event().unwrap(), + Event::GeneralRef(BytesRef::new("lt")) + ); + match reader.read_event() { + Err(Error::IllFormed(cause)) => { + assert_eq!(cause, IllFormedError::UnclosedReference); + } + x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x), + } + assert_eq!(reader.error_position()..reader.buffer_position(), 6..7); + + assert_eq!(reader.read_event().unwrap(), Event::Eof); + assert_eq!(reader.error_position()..reader.buffer_position(), 6..7); + } + + #[test] + fn true_() { + let mut reader = Reader::from_str("&&&lt;&"); + reader.config_mut().allow_dangling_amp = true; + + assert_eq!( + 
reader.read_event().unwrap(), + Event::Text(BytesText::from_escaped("&")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::Text(BytesText::from_escaped("&")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::GeneralRef(BytesRef::new("lt")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::Text(BytesText::from_escaped("&")) + ); + assert_eq!(reader.read_event().unwrap(), Event::Eof); + } +} + mod allow_unmatched_ends { use super::*; use pretty_assertions::assert_eq;