diff --git a/Changelog.md b/Changelog.md index 142c9e15..a41dae64 100644 --- a/Changelog.md +++ b/Changelog.md @@ -16,6 +16,8 @@ enums from textual content - [#556]: `to_writer` and `to_string` now accept `?Sized` types - [#556]: Add new `to_writer_with_root` and `to_string_with_root` helper functions +- [#520]: Add methods `BytesText::inplace_trim_start` and `BytesText::inplace_trim_end` + to trim leading and trailing spaces from text events ### Bug Fixes @@ -25,12 +27,18 @@ sequence type (for example, `Vec` or tuple) - [#540]: Fix a compilation error (probably a rustc bug) in some circumstances. `Serializer::new` and `Serializer::with_root` now accepts only references to `Write`r. +- [#520]: Merge consecutive (delimited only by comments and processing instructions) + texts and CDATA when deserializing using the serde deserializer. `DeEvent::Text` and + `DeEvent::CData` events were replaced by `DeEvent::Text` with merged content. + The same behavior for the `Reader` is not implemented (yet?) and should be + implemented manually ### Misc Changes [externally tagged]: https://serde.rs/enum-representations.html#externally-tagged [#490]: https://github.com/tafia/quick-xml/pull/490 [#510]: https://github.com/tafia/quick-xml/issues/510 +[#520]: https://github.com/tafia/quick-xml/pull/520 [#537]: https://github.com/tafia/quick-xml/issues/537 [#540]: https://github.com/tafia/quick-xml/issues/540 [#541]: https://github.com/tafia/quick-xml/pull/541 diff --git a/src/de/map.rs b/src/de/map.rs index e0e3f695..28fbbc79 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -239,14 +239,14 @@ where // We shouldn't have both `$value` and `$text` fields in the same // struct, so if we have `$value` field, the we should deserialize // text content to `$value` - DeEvent::Text(_) | DeEvent::CData(_) if self.has_value_field => { + DeEvent::Text(_) if self.has_value_field => { self.source = ValueSource::Content; // Deserialize `key` from special attribute name which means // that value should be 
taken from the text content of the // XML node seed.deserialize(VALUE_KEY.into_deserializer()).map(Some) } - DeEvent::Text(_) | DeEvent::CData(_) => { + DeEvent::Text(_) => { self.source = ValueSource::Text; // Deserialize `key` from special attribute name which means // that value should be taken from the text content of the @@ -307,19 +307,11 @@ where // // The whole map represented by an `` element, the map key // is implicit and equals to the `TEXT_KEY` constant, and the value - // is a `Text` or a `CData` event (the value deserializer will see one - // of that events) + // is a `Text` event (the value deserializer will see that event) // This case are checked by "xml_schema_lists::element" tests in tests/serde-de.rs ValueSource::Text => match self.de.next()? { - DeEvent::Text(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content( - // Comment to prevent auto-formatting - e.decode(true)?, - )), - DeEvent::CData(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content( - // Comment to prevent auto-formatting - e.decode()?, - )), - // SAFETY: We set `Text` only when we seen `Text` or `CData` + DeEvent::Text(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content(e)), + // SAFETY: We set `Text` only when we seen `Text` _ => unreachable!(), }, // This arm processes the following XML shape: @@ -431,7 +423,7 @@ where /// /// The whole map represented by an `` element, the map key is /// implicit and equals to the [`VALUE_KEY`] constant, and the value is - /// a [`Text`], a [`CData`], or a [`Start`] event (the value deserializer + /// a [`Text`], or a [`Start`] event (the value deserializer /// will see one of those events). In the first two cases the value of this /// field do not matter (because we already see the textual event and there /// no reasons to look "inside" something), but in the last case the primitives @@ -452,7 +444,6 @@ where /// as accepting "text content" which the currently `$text` means. 
/// /// [`Text`]: DeEvent::Text - /// [`CData`]: DeEvent::CData /// [`Start`]: DeEvent::Start allow_start: bool, } @@ -464,11 +455,11 @@ where /// Returns a next string as concatenated content of consequent [`Text`] and /// [`CData`] events, used inside [`deserialize_primitives!()`]. /// - /// [`Text`]: DeEvent::Text - /// [`CData`]: DeEvent::CData + /// [`Text`]: crate::events::Event::Text + /// [`CData`]: crate::events::Event::CData #[inline] - fn read_string(&mut self, unescape: bool) -> Result, DeError> { - self.map.de.read_string_impl(unescape, self.allow_start) + fn read_string(&mut self) -> Result, DeError> { + self.map.de.read_string_impl(self.allow_start) } } @@ -631,8 +622,8 @@ impl<'de> TagFilter<'de> { /// Depending on [`Self::filter`], only some of that possible constructs would be /// an element. /// -/// [`Text`]: DeEvent::Text -/// [`CData`]: DeEvent::CData +/// [`Text`]: crate::events::Event::Text +/// [`CData`]: crate::events::Event::CData struct MapValueSeqAccess<'de, 'a, 'm, R> where R: XmlRead<'de>, @@ -697,7 +688,7 @@ where // opened tag `self.map.start` DeEvent::Eof => Err(DeError::UnexpectedEof), - // Start(tag), Text, CData + // Start(tag), Text _ => seed .deserialize(SeqItemDeserializer { map: self.map }) .map(Some), @@ -725,11 +716,11 @@ where /// Returns a next string as concatenated content of consequent [`Text`] and /// [`CData`] events, used inside [`deserialize_primitives!()`]. /// - /// [`Text`]: DeEvent::Text - /// [`CData`]: DeEvent::CData + /// [`Text`]: crate::events::Event::Text + /// [`CData`]: crate::events::Event::CData #[inline] - fn read_string(&mut self, unescape: bool) -> Result, DeError> { - self.map.de.read_string_impl(unescape, true) + fn read_string(&mut self) -> Result, DeError> { + self.map.de.read_string_impl(true) } } @@ -781,31 +772,17 @@ where V: Visitor<'de>, { match self.map.de.next()? 
{ - DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content( - // Comment to prevent auto-formatting - e.decode(true)?, - ) - .deserialize_seq(visitor), - DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content( - // Comment to prevent auto-formatting - e.decode()?, - ) - .deserialize_seq(visitor), + DeEvent::Text(e) => { + SimpleTypeDeserializer::from_text_content(e).deserialize_seq(visitor) + } // This is a sequence element. We cannot treat it as another flatten // sequence if type will require `deserialize_seq` We instead forward // it to `xs:simpleType` implementation DeEvent::Start(e) => { let value = match self.map.de.next()? { - DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content( - // Comment to prevent auto-formatting - e.decode(true)?, - ) - .deserialize_seq(visitor), - DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content( - // Comment to prevent auto-formatting - e.decode()?, - ) - .deserialize_seq(visitor), + DeEvent::Text(e) => { + SimpleTypeDeserializer::from_text_content(e).deserialize_seq(visitor) + } e => Err(DeError::Unsupported( format!("unsupported event {:?}", e).into(), )), @@ -814,8 +791,8 @@ where self.map.de.read_to_end(e.name())?; value } - // SAFETY: we use that deserializer only when Start(element), Text, - // or CData event Start(tag), Text, CData was peeked already + // SAFETY: we use that deserializer only when Start(element) or Text + // event was peeked already _ => unreachable!(), } } diff --git a/src/de/mod.rs b/src/de/mod.rs index 03b6648e..b9ca4ae0 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -77,11 +77,7 @@ //! ```xml //! <...>texttext //! ``` -//!
-//! -//! Merging of the text / CDATA content is tracked in the issue [#474] and -//! will be available in the next release. -//!
+//! Mixed text / CDATA content represents one logical string, `"textcdatatext"` in that case. //! //! //! @@ -90,9 +86,7 @@ //! - [`Cow`] //! - [`u32`], [`f32`] and other numeric types //! - `enum`s, like -//! ```ignore -//! // FIXME: #474, merging mixed text / CDATA -//! // content does not work yet +//! ``` //! # use pretty_assertions::assert_eq; //! # use serde::Deserialize; //! # #[derive(Debug, PartialEq)] @@ -149,11 +143,6 @@ //! ... //! ]]> //! ``` -//!
-//! -//! Merging of the text / CDATA content is tracked in the issue [#474] and -//! will be available in the next release. -//!
//! //! [`xs:list`]: https://www.w3.org/TR/xmlschema11-2/#list-datatypes //! @@ -162,8 +151,6 @@ //! Use any type that deserialized using [`deserialize_seq()`] call, for example: //! //! ``` -//! // FIXME: #474, merging mixed text / CDATA -//! // content does not work yet //! type List = Vec; //! ``` //! @@ -520,8 +507,7 @@ //! } //! # assert_eq!(AnyName::One { field1: () }, quick_xml::de::from_str(r#"..."#).unwrap()); //! # assert_eq!(AnyName::Two { field2: () }, quick_xml::de::from_str(r#"..."#).unwrap()); -//! # assert_eq!(AnyName::Text("text".into()), quick_xml::de::from_str(r#"text"#).unwrap()); -//! # // TODO: After #474 parse mixed content +//! # assert_eq!(AnyName::Text("text cdata ".into()), quick_xml::de::from_str(r#"text "#).unwrap()); //! ``` //! ``` //! # use pretty_assertions::assert_eq; @@ -544,8 +530,7 @@ //! } //! # assert_eq!(AnyName::One, quick_xml::de::from_str(r#"..."#).unwrap()); //! # assert_eq!(AnyName::Two(Two { field2: () }), quick_xml::de::from_str(r#"..."#).unwrap()); -//! # assert_eq!(AnyName::Text, quick_xml::de::from_str(r#"text"#).unwrap()); -//! # // TODO: After #474 parse mixed content +//! # assert_eq!(AnyName::Text, quick_xml::de::from_str(r#"text "#).unwrap()); //! ``` //! ``` //! # use pretty_assertions::assert_eq; @@ -561,8 +546,7 @@ //! } //! # assert_eq!(AnyName::One, quick_xml::de::from_str(r#"..."#).unwrap()); //! # assert_eq!(AnyName::Other, quick_xml::de::from_str(r#"..."#).unwrap()); -//! # assert_eq!(AnyName::Other, quick_xml::de::from_str(r#"text"#).unwrap()); -//! # // TODO: After #474 parse mixed content +//! # assert_eq!(AnyName::Other, quick_xml::de::from_str(r#"text "#).unwrap()); //! ``` //!
//! @@ -643,9 +627,8 @@ //! # quick_xml::de::from_str(r#"..."#).unwrap(), //! # ); //! # assert_eq!( -//! # AnyName { field: (), any_name: Choice::Text("text".into()) }, -//! # // TODO: After #474 parse mixed content -//! # quick_xml::de::from_str(r#"text"#).unwrap(), +//! # AnyName { field: (), any_name: Choice::Text("text cdata ".into()) }, +//! # quick_xml::de::from_str(r#"text "#).unwrap(), //! # ); //! ``` //! @@ -967,8 +950,7 @@ //! from the full element (`...`), so they could use the element name //! to choose the right variant: //! -//! ```ignore -//! // FIXME: #474 +//! ``` //! # use pretty_assertions::assert_eq; //! # use serde::Deserialize; //! # type One = (); @@ -985,9 +967,7 @@ //! # quick_xml::de::from_str(r#"...text ......"#).unwrap(), //! # ); //! ``` -//! ```ignore -//! // FIXME: #474, Custom("unknown variant `two`, -//! // expected `one`") +//! ``` //! # use pretty_assertions::assert_eq; //! # use serde::Deserialize; //! # #[derive(Debug, PartialEq)] @@ -1011,11 +991,6 @@ //! NOTE: consequent text and CDATA nodes are merged into the one text node, //! so you cannot have two adjacent string types in your sequence. //!
-//!
-//! -//! Merging of the text / CDATA content is tracked in the issue [#474] and -//! will be available in the next release. -//!
//! //! //! @@ -1040,8 +1015,7 @@ //! //! A homogeneous sequence of elements with a fixed or dynamic size: //! -//! ```ignore -//! // FIXME: #474 +//! ``` //! # use pretty_assertions::assert_eq; //! # use serde::Deserialize; //! # #[derive(Debug, PartialEq)] @@ -1059,8 +1033,7 @@ //! # quick_xml::de::from_str::(r#"...text ......"#).unwrap(), //! # ); //! ``` -//! ```ignore -//! // FIXME: #474 +//! ``` //! # use pretty_assertions::assert_eq; //! # use serde::Deserialize; //! # #[derive(Debug, PartialEq)] @@ -1088,11 +1061,6 @@ //! NOTE: consequent text and CDATA nodes are merged into the one text node, //! so you cannot have two adjacent string types in your sequence. //! -//!
-//! -//! Merging of the text / CDATA content is tracked in the issue [#474] and -//! will be available in the next release. -//!
//! //! //! @@ -1119,8 +1087,7 @@ //! //! You MUST specify `#[serde(rename = "$value")]` on that field: //! -//! ```ignore -//! // FIXME: #474, Custom("duplicate field `$value`") +//! ``` //! # use pretty_assertions::assert_eq; //! # use serde::Deserialize; //! # type One = (); @@ -1157,8 +1124,7 @@ //! # ).unwrap(), //! # ); //! ``` -//! ```ignore -//! // FIXME: #474, Custom("duplicate field `$value`") +//! ``` //! # use pretty_assertions::assert_eq; //! # use serde::Deserialize; //! # type One = (); @@ -1204,11 +1170,6 @@ //! NOTE: consequent text and CDATA nodes are merged into the one text node, //! so you cannot have two adjacent string types in your sequence. //! -//!
-//! -//! Merging of the text / CDATA content is tracked in the issue [#474] and -//! will be available in the next release. -//!
//! //! //! @@ -1237,8 +1198,7 @@ //! //! You MUST specify `#[serde(rename = "$value")]` on that field: //! -//! ```ignore -//! // FIXME: #474 +//! ``` //! # use pretty_assertions::assert_eq; //! # use serde::Deserialize; //! # #[derive(Debug, PartialEq)] @@ -1282,8 +1242,7 @@ //! # ).unwrap(), //! # ); //! ``` -//! ```ignore -//! // FIXME: #474 +//! ``` //! # use pretty_assertions::assert_eq; //! # use serde::Deserialize; //! # #[derive(Debug, PartialEq)] @@ -1332,11 +1291,6 @@ //! NOTE: consequent text and CDATA nodes are merged into the one text node, //! so you cannot have two adjacent string types in your sequence. //! -//!
-//! -//! Merging of the text / CDATA content is tracked in the issue [#474] and -//! will be available in the next release. -//!
//! //! //! @@ -1720,7 +1674,6 @@ //! //! [specification]: https://www.w3.org/TR/xmlschema11-1/#Simple_Type_Definition //! [`deserialize_with`]: https://serde.rs/field-attrs.html#deserialize_with -//! [#474]: https://github.com/tafia/quick-xml/issues/474 //! [#497]: https://github.com/tafia/quick-xml/issues/497 // Macros should be defined before the modules that using them @@ -1734,7 +1687,7 @@ macro_rules! deserialize_type { V: Visitor<'de>, { // No need to unescape because valid integer representations cannot be escaped - let text = self.read_string(false)?; + let text = self.read_string()?; visitor.$visit(text.parse()?) } }; @@ -1766,8 +1719,7 @@ macro_rules! deserialize_primitives { where V: Visitor<'de>, { - // No need to unescape because valid boolean representations cannot be escaped - let text = self.read_string(false)?; + let text = self.read_string()?; str2bool(&text, visitor) } @@ -1792,7 +1744,7 @@ macro_rules! deserialize_primitives { where V: Visitor<'de>, { - let text = self.read_string(true)?; + let text = self.read_string()?; match text { Cow::Borrowed(string) => visitor.visit_borrowed_str(string), Cow::Owned(string) => visitor.visit_string(string), @@ -1873,7 +1825,6 @@ macro_rules! deserialize_option { ($de:expr, $deserializer:ident, $visitor:ident) => { match $de.peek()? { DeEvent::Text(t) if t.is_empty() => $visitor.visit_none(), - DeEvent::CData(t) if t.is_empty() => $visitor.visit_none(), DeEvent::Eof => $visitor.visit_none(), _ => $visitor.visit_some($deserializer), } @@ -1898,6 +1849,7 @@ use std::borrow::Cow; #[cfg(feature = "overlapped-lists")] use std::collections::VecDeque; use std::io::BufRead; +use std::mem::replace; #[cfg(feature = "overlapped-lists")] use std::num::NonZeroUsize; @@ -1909,6 +1861,38 @@ pub(crate) const VALUE_KEY: &str = "$value"; /// Simplified event which contains only these variants that used by deserializer #[derive(Debug, PartialEq, Eq)] pub enum DeEvent<'a> { + /// Start tag (with attributes) ``. 
+ Start(BytesStart<'a>), + /// End tag ``. + End(BytesEnd<'a>), + /// Decoded and concatenated content of consequent [`Text`] and [`CData`] + /// events. _Consequent_ means that events should follow each other or be + /// delimited only by (any count of) [`Comment`] or [`PI`] events. + /// + /// [`Text`]: Event::Text + /// [`CData`]: Event::CData + /// [`Comment`]: Event::Comment + /// [`PI`]: Event::PI + Text(Cow<'a, str>), + /// End of XML document. + Eof, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Simplified event which contains only those variants that are used by the deserializer, +/// but [`Text`] events are not yet fully processed. +/// +/// [`Text`] events should be trimmed if they are not surrounded by other +/// [`Text`] or [`CData`] events. This event contains intermediate state of [`Text`] +/// event, where they are trimmed from the start, but not from the end. To trim +/// end spaces we should lookahead by one deserializer event (i. e. skip all +/// comments and processing instructions). +/// +/// [`Text`]: Event::Text +/// [`CData`]: Event::CData +#[derive(Debug, PartialEq, Eq)] +pub enum PayloadEvent<'a> { /// Start tag (with attributes) ``. Start(BytesStart<'a>), /// End tag ``. @@ -1922,6 +1906,164 @@ pub enum DeEvent<'a> { Eof, } +impl<'a> PayloadEvent<'a> { + /// Ensures that all data is owned to extend the object's lifetime if necessary. + #[inline] + fn into_owned(self) -> PayloadEvent<'static> { + match self { + PayloadEvent::Start(e) => PayloadEvent::Start(e.into_owned()), + PayloadEvent::End(e) => PayloadEvent::End(e.into_owned()), + PayloadEvent::Text(e) => PayloadEvent::Text(e.into_owned()), + PayloadEvent::CData(e) => PayloadEvent::CData(e.into_owned()), + PayloadEvent::Eof => PayloadEvent::Eof, + } + } +} + +/// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s. 
+/// [`PayloadEvent::Text`] events that are followed by any event except +/// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end. +struct XmlReader<'i, R: XmlRead<'i>> { + /// A source of low-level XML events + reader: R, + /// Intermediate event, that could be returned by the next call to `next()`. + /// If that is the `Text` event then leading spaces are already trimmed, but + /// trailing spaces are not. Before the event is returned, trimming of + /// the spaces could be necessary + lookahead: Result, DeError>, +} + +impl<'i, R: XmlRead<'i>> XmlReader<'i, R> { + fn new(mut reader: R) -> Self { + // Lookahead by one event immediately, so we do not need to check in the + // loop if we need lookahead or not + let lookahead = reader.next(); + + Self { reader, lookahead } + } + + /// Read next event and put it in lookahead, return the current lookahead + #[inline(always)] + fn next_impl(&mut self) -> Result, DeError> { + replace(&mut self.lookahead, self.reader.next()) + } + + #[inline(always)] + fn need_trim_end(&self) -> bool { + // If next event is a text or CDATA, we should not trim trailing spaces + !matches!( + self.lookahead, + Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_)) + ) + } + + /// Read all consequent [`Text`] and [`CData`] events until non-text event + /// occurs. Content of all events would be appended to `result` and returned + /// as [`DeEvent::Text`]. 
+ /// + /// [`Text`]: PayloadEvent::Text + /// [`CData`]: PayloadEvent::CData + fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result, DeError> { + loop { + match self.lookahead { + Ok(PayloadEvent::Text(_) | PayloadEvent::CData(_)) => { + let text = self.next_text()?; + + let mut s = result.into_owned(); + s += &text; + result = Cow::Owned(s); + } + _ => break, + } + } + Ok(DeEvent::Text(result)) + } + + /// Read one text event, panics if current event is not a text event + /// + /// |Event |XML |Handling + /// |-----------------------|---------------------------|---------------------------------------- + /// |[`PayloadEvent::Start`]|`...` |Possible panic (unreachable) + /// |[`PayloadEvent::End`] |`
` |Possible panic (unreachable) + /// |[`PayloadEvent::Text`] |`text content` |Unescapes `text content` and returns it + /// |[`PayloadEvent::CData`]|``|Returns `cdata content` unchanged + /// |[`PayloadEvent::Eof`] | |Possible panic (unreachable) + #[inline(always)] + fn next_text(&mut self) -> Result, DeError> { + match self.next_impl()? { + PayloadEvent::Text(mut e) => { + if self.need_trim_end() { + e.inplace_trim_end(); + } + Ok(e.unescape()?) + } + PayloadEvent::CData(e) => Ok(e.decode()?), + + // SAFETY: this method is called only when we peeked Text or CData + _ => unreachable!("Only `Text` and `CData` events can come here"), + } + } + + /// Return an input-borrowing event. + fn next(&mut self) -> Result, DeError> { + loop { + return match self.next_impl()? { + PayloadEvent::Start(e) => Ok(DeEvent::Start(e)), + PayloadEvent::End(e) => Ok(DeEvent::End(e)), + PayloadEvent::Text(mut e) => { + if self.need_trim_end() && e.inplace_trim_end() { + continue; + } + self.drain_text(e.unescape()?) + } + PayloadEvent::CData(e) => self.drain_text(e.decode()?), + PayloadEvent::Eof => Ok(DeEvent::Eof), + }; + } + } + + #[inline] + fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { + match self.lookahead { + // We pre-read event with the same name that is required to be skipped. + // First call of `read_to_end` will end out pre-read event, the second + // will consume other events + Ok(PayloadEvent::Start(ref e)) if e.name() == name => { + let result1 = self.reader.read_to_end(name); + let result2 = self.reader.read_to_end(name); + + // In case of error `next` returns `Eof` + self.lookahead = self.reader.next(); + result1?; + result2?; + } + // We pre-read event with the same name that is required to be skipped. 
+ // Because this is end event, we already consume the whole tree, so + // nothing to do, just update lookahead + Ok(PayloadEvent::End(ref e)) if e.name() == name => { + self.lookahead = self.reader.next(); + } + Ok(_) => { + let result = self.reader.read_to_end(name); + + // In case of error `next` returns `Eof` + self.lookahead = self.reader.next(); + result?; + } + // Read next lookahead event, unpack error from the current lookahead + Err(_) => { + self.next_impl()?; + } + } + Ok(()) + } + + #[inline] + fn decoder(&self) -> Decoder { + self.reader.decoder() + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// A structure that deserializes XML into Rust values. @@ -1930,7 +2072,7 @@ where R: XmlRead<'de>, { /// An XML reader that streams events into this deserializer - reader: R, + reader: XmlReader<'de, R>, /// When deserializing sequences sometimes we have to skip unwanted events. /// That events should be stored and then replayed. This is a replay buffer, @@ -2035,8 +2177,8 @@ where /// - [`Deserializer::from_str`] /// - [`Deserializer::from_reader`] fn new(reader: R) -> Self { - Deserializer { - reader, + Self { + reader: XmlReader::new(reader), #[cfg(feature = "overlapped-lists")] read: VecDeque::new(), @@ -2240,19 +2382,19 @@ where } #[inline] - fn read_string(&mut self, unescape: bool) -> Result, DeError> { - self.read_string_impl(unescape, true) + fn read_string(&mut self) -> Result, DeError> { + self.read_string_impl(true) } - /// Consumes a one XML element or an XML tree, returns associated text or + /// Consumes consequent [`Text`] and [`CData`] (both a referred below as a _text_) + /// events, merge them into one string. If there are no such events, returns /// an empty string. /// - /// If `allow_start` is `false`, then only one event is consumed. If that - /// event is [`DeEvent::Start`], then [`DeError::UnexpectedStart`] is returned. 
+ /// If `allow_start` is `false`, then only text events are consumed, for other + /// events an error is returned (see table below). /// - /// If `allow_start` is `true`, then first text of CDATA event inside it is - /// returned and all other content is skipped until corresponding end tag - /// will be consumed. + /// If `allow_start` is `true`, then the first [`DeEvent::Text`] event is returned + /// and all other content is skipped until the corresponding end tag is consumed. /// /// # Handling events /// @@ -2262,8 +2404,7 @@ where /// |------------------|---------------------------|---------------------------------------- /// |[`DeEvent::Start`]|`...` |if `allow_start == true`, result determined by the second table, otherwise emits [`UnexpectedStart("tag")`](DeError::UnexpectedStart) /// |[`DeEvent::End`] |`
` |Emits [`UnexpectedEnd("any-tag")`](DeError::UnexpectedEnd) - /// |[`DeEvent::Text`] |`text content` |Unescapes `text content` and returns it - /// |[`DeEvent::CData`]|``|Returns `cdata content` unchanged + /// |[`DeEvent::Text`] |`text content` or `` (probably mixed)|Returns event content unchanged /// |[`DeEvent::Eof`] | |Emits [`UnexpectedEof`](DeError::UnexpectedEof) /// /// Second event, consumed if [`DeEvent::Start`] was received and `allow_start == true`: @@ -2273,39 +2414,27 @@ where /// |[`DeEvent::Start`]|`...` |Emits [`UnexpectedStart("any-tag")`](DeError::UnexpectedStart) /// |[`DeEvent::End`] |`` |Returns an empty slice, if close tag matched the open one /// |[`DeEvent::End`] |`` |Emits [`UnexpectedEnd("any-tag")`](DeError::UnexpectedEnd) - /// |[`DeEvent::Text`] |`text content` |Unescapes `text content` and returns it, consumes events up to `` - /// |[`DeEvent::CData`]|``|Returns `cdata content` unchanged, consumes events up to `` + /// |[`DeEvent::Text`] |`text content` or `` (probably mixed)|Returns event content unchanged, consumes events up to `` /// |[`DeEvent::Eof`] | |Emits [`UnexpectedEof`](DeError::UnexpectedEof) - fn read_string_impl( - &mut self, - unescape: bool, - allow_start: bool, - ) -> Result, DeError> { + /// + /// [`Text`]: Event::Text + /// [`CData`]: Event::CData + fn read_string_impl(&mut self, allow_start: bool) -> Result, DeError> { match self.next()? 
{ - DeEvent::Text(e) => Ok(e.decode(unescape)?), - DeEvent::CData(e) => Ok(e.decode()?), - DeEvent::Start(e) if allow_start => { - // allow one nested level - let inner = self.next()?; - let t = match inner { - DeEvent::Text(t) => t.decode(unescape)?, - DeEvent::CData(t) => t.decode()?, - DeEvent::Start(s) => { - return Err(DeError::UnexpectedStart(s.name().as_ref().to_owned())) - } - // We can get End event in case of `` or `` input - // Return empty text in that case - DeEvent::End(end) if end.name() == e.name() => { - return Ok("".into()); - } - DeEvent::End(end) => { - return Err(DeError::UnexpectedEnd(end.name().as_ref().to_owned())) - } - DeEvent::Eof => return Err(DeError::UnexpectedEof), - }; - self.read_to_end(e.name())?; - Ok(t) - } + DeEvent::Text(e) => Ok(e), + // allow one nested level + DeEvent::Start(e) if allow_start => match self.next()? { + DeEvent::Text(t) => { + self.read_to_end(e.name())?; + Ok(t) + } + DeEvent::Start(s) => Err(DeError::UnexpectedStart(s.name().as_ref().to_owned())), + // We can get End event in case of `` or `` input + // Return empty text in that case + DeEvent::End(end) if end.name() == e.name() => Ok("".into()), + DeEvent::End(end) => Err(DeError::UnexpectedEnd(end.name().as_ref().to_owned())), + DeEvent::Eof => Err(DeError::UnexpectedEof), + }, DeEvent::Start(e) => Err(DeError::UnexpectedStart(e.name().as_ref().to_owned())), DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().as_ref().to_owned())), DeEvent::Eof => Err(DeError::UnexpectedEof), @@ -2375,11 +2504,11 @@ impl<'de> Deserializer<'de, SliceReader<'de>> { #[allow(clippy::should_implement_trait)] pub fn from_str(s: &'de str) -> Self { let mut reader = Reader::from_str(s); - reader - .expand_empty_elements(true) - .check_end_names(true) - .trim_text(true); - Self::new(SliceReader { reader }) + reader.expand_empty_elements(true).check_end_names(true); + Self::new(SliceReader { + reader, + start_trimmer: StartTrimmer::default(), + }) } } @@ -2393,13 +2522,11 @@ 
where /// is known to represent UTF-8, you can decode it first before using [`from_str`]. pub fn from_reader(reader: R) -> Self { let mut reader = Reader::from_reader(reader); - reader - .expand_empty_elements(true) - .check_end_names(true) - .trim_text(true); + reader.expand_empty_elements(true).check_end_names(true); Self::new(IoReader { reader, + start_trimmer: StartTrimmer::default(), buf: Vec::new(), }) } @@ -2431,7 +2558,7 @@ where Ok(value) } DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().as_ref().to_owned())), - DeEvent::Text(_) | DeEvent::CData(_) => Err(DeError::ExpectedStart), + DeEvent::Text(_) => Err(DeError::ExpectedStart), DeEvent::Eof => Err(DeError::UnexpectedEof), } } @@ -2442,17 +2569,16 @@ where /// Produces unit struct from any of following inputs: /// - any `...` /// - any `` - /// - any text content - /// - any CDATA content + /// - any consequent text / CDATA content (can consist of several parts + /// delimited by comments and processing instructions) /// /// # Events handling /// /// |Event |XML |Handling /// |------------------|---------------------------|------------------------------------------- - /// |[`DeEvent::Start`]|`...` |Calls `visitor.visit_unit()`, consumes all events up to corresponding `End` event + /// |[`DeEvent::Start`]|`...` |Calls `visitor.visit_unit()`, consumes all events up to and including corresponding `End` event /// |[`DeEvent::End`] |`` |Emits [`UnexpectedEnd("tag")`](DeError::UnexpectedEnd) - /// |[`DeEvent::Text`] |`text content` |Calls `visitor.visit_unit()`. Text content is ignored - /// |[`DeEvent::CData`]|``|Calls `visitor.visit_unit()`. CDATA content is ignored + /// |[`DeEvent::Text`] |`text content` or `` (probably mixed)|Calls `visitor.visit_unit()`. 
The content is ignored /// |[`DeEvent::Eof`] | |Emits [`UnexpectedEof`](DeError::UnexpectedEof) fn deserialize_unit(self, visitor: V) -> Result where @@ -2463,7 +2589,7 @@ where self.read_to_end(s.name())?; visitor.visit_unit() } - DeEvent::Text(_) | DeEvent::CData(_) => visitor.visit_unit(), + DeEvent::Text(_) => visitor.visit_unit(), DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().as_ref().to_owned())), DeEvent::Eof => Err(DeError::UnexpectedEof), } @@ -2504,11 +2630,15 @@ where /// Always call `visitor.visit_unit()` because returned value ignored in any case. /// - /// This method consumes any single [event][DeEvent] except the [`Start`][DeEvent::Start] - /// event, in which case all events up to corresponding [`End`][DeEvent::End] event will - /// be consumed. + /// This method consumes any single [event][DeEvent] except the [`Start`] + /// event, in which case all events up to and including corresponding [`End`] + /// event will be consumed. + /// + /// This method returns error if current event is [`End`] or [`Eof`]. /// - /// This method returns error if current event is [`End`][DeEvent::End] or [`Eof`][DeEvent::Eof] + /// [`Start`]: DeEvent::Start + /// [`End`]: DeEvent::End + /// [`Eof`]: DeEvent::Eof fn deserialize_ignored_any(self, visitor: V) -> Result where V: Visitor<'de>, @@ -2553,7 +2683,7 @@ where match self.peek()? { DeEvent::Eof => Ok(None), - // Start(tag), End(tag), Text, CData + // Start(tag), End(tag), Text _ => seed.deserialize(&mut **self).map(Some), } } @@ -2561,6 +2691,53 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Helper struct that contains a state for an algorithm of converting events +/// from raw events to semi-trimmed events that is independent from a way of +/// events reading. +struct StartTrimmer { + /// If `true`, then leading whitespace will be removed from next returned + /// [`Event::Text`]. 
This field is set to `true` after reading each event + /// except [`Event::Text`] and [`Event::CData`], so [`Event::Text`] events + /// read right after them are not trimmed. + trim_start: bool, +} + +impl StartTrimmer { + /// Converts raw reader's event into a payload event. + /// Returns `None`, if event should be skipped. + #[inline(always)] + fn trim<'a>(&mut self, event: Event<'a>) -> Option> { + let (event, trim_next_event) = match event { + Event::Start(e) => (PayloadEvent::Start(e), true), + Event::End(e) => (PayloadEvent::End(e), true), + Event::Eof => (PayloadEvent::Eof, true), + + // Do not trim next text event after Text or CDATA event + Event::CData(e) => (PayloadEvent::CData(e), false), + Event::Text(mut e) => { + // If event is empty after trimming, skip it + if self.trim_start && e.inplace_trim_start() { + return None; + } + (PayloadEvent::Text(e), false) + } + + _ => return None, + }; + self.trim_start = trim_next_event; + Some(event) + } +} + +impl Default for StartTrimmer { + #[inline] + fn default() -> Self { + Self { trim_start: true } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Trait used by the deserializer for iterating over input. This is manually /// "specialized" for iterating over `&[u8]`. /// @@ -2569,7 +2746,7 @@ where /// deserializer pub trait XmlRead<'i> { /// Return an input-borrowing event. - fn next(&mut self) -> Result, DeError>; + fn next(&mut self) -> Result, DeError>; /// Skips until end element is found. Unlike `next()` it will not allocate /// when it cannot satisfy the lifetime. 
@@ -2585,27 +2762,20 @@ pub trait XmlRead<'i> { /// [`Deserializer::from_reader`] pub struct IoReader { reader: Reader, + start_trimmer: StartTrimmer, buf: Vec, } impl<'i, R: BufRead> XmlRead<'i> for IoReader { - fn next(&mut self) -> Result, DeError> { - let event = loop { - let e = self.reader.read_event_into(&mut self.buf)?; - match e { - Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())), - Event::End(e) => break Ok(DeEvent::End(e.into_owned())), - Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())), - Event::CData(e) => break Ok(DeEvent::CData(e.into_owned())), - Event::Eof => break Ok(DeEvent::Eof), - - _ => self.buf.clear(), - } - }; - - self.buf.clear(); + fn next(&mut self) -> Result, DeError> { + loop { + self.buf.clear(); - event + let event = self.reader.read_event_into(&mut self.buf)?; + if let Some(event) = self.start_trimmer.trim(event) { + return Ok(event.into_owned()); + } + } } fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { @@ -2627,20 +2797,15 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { /// [`Deserializer::from_str`]. 
pub struct SliceReader<'de> { reader: Reader<&'de [u8]>, + start_trimmer: StartTrimmer, } impl<'de> XmlRead<'de> for SliceReader<'de> { - fn next(&mut self) -> Result, DeError> { + fn next(&mut self) -> Result, DeError> { loop { - let e = self.reader.read_event()?; - match e { - Event::Start(e) => break Ok(DeEvent::Start(e)), - Event::End(e) => break Ok(DeEvent::End(e)), - Event::Text(e) => break Ok(DeEvent::Text(e)), - Event::CData(e) => break Ok(DeEvent::CData(e)), - Event::Eof => break Ok(DeEvent::Eof), - - _ => (), + let event = self.reader.read_event()?; + if let Some(event) = self.start_trimmer.trim(event) { + return Ok(event); } } } @@ -2667,7 +2832,7 @@ mod tests { mod skip { use super::*; use crate::de::DeEvent::*; - use crate::events::{BytesEnd, BytesText}; + use crate::events::BytesEnd; use pretty_assertions::assert_eq; /// Checks that `peek()` and `read()` behaves correctly after `skip()` @@ -2704,7 +2869,7 @@ mod tests { de.write, vec![ Start(BytesStart::new("inner")), - Text(BytesText::from_escaped("text")), + Text(Cow::Borrowed("text")), Start(BytesStart::new("inner")), End(BytesEnd::new("inner")), End(BytesEnd::new("inner")), @@ -2738,7 +2903,7 @@ mod tests { de.read, vec![ Start(BytesStart::new("inner")), - Text(BytesText::from_escaped("text")), + Text(Cow::Borrowed("text")), Start(BytesStart::new("inner")), End(BytesEnd::new("inner")), End(BytesEnd::new("inner")), @@ -2766,7 +2931,7 @@ mod tests { vec![ // This comment here to keep the same formatting of both arrays // otherwise rustfmt suggest one-line it - Text(BytesText::from_escaped("text")), + Text(Cow::Borrowed("text")), ] ); @@ -2786,12 +2951,14 @@ mod tests { assert_eq!( de.read, vec![ - Text(BytesText::from_escaped("text")), + // This comment here to keep the same formatting as others + // otherwise rustfmt suggest one-line it + Text(Cow::Borrowed("text")), End(BytesEnd::new("inner")), ] ); assert_eq!(de.write, vec![]); - assert_eq!(de.next().unwrap(), 
Text(BytesText::from_escaped("text"))); + assert_eq!(de.next().unwrap(), Text(Cow::Borrowed("text"))); assert_eq!(de.next().unwrap(), End(BytesEnd::new("inner"))); assert_eq!(de.next().unwrap(), Start(BytesStart::new("target"))); assert_eq!(de.next().unwrap(), End(BytesEnd::new("target"))); @@ -2833,14 +3000,14 @@ mod tests { de.write, vec![ Start(BytesStart::new("skip")), - Text(BytesText::from_escaped("text")), + Text(Cow::Borrowed("text")), Start(BytesStart::new("skip")), End(BytesEnd::new("skip")), End(BytesEnd::new("skip")), ] ); - // Drop all events thet represents tree. Now unconsumed XML looks like: + // Drop all events that represents tree. Now unconsumed XML looks like: // // // text @@ -2854,7 +3021,7 @@ mod tests { de.write, vec![ Start(BytesStart::new("skip")), - Text(BytesText::from_escaped("text")), + Text(Cow::Borrowed("text")), Start(BytesStart::new("skip")), End(BytesEnd::new("skip")), End(BytesEnd::new("skip")), @@ -2876,7 +3043,7 @@ mod tests { de.read, vec![ Start(BytesStart::new("skip")), - Text(BytesText::from_escaped("text")), + Text(Cow::Borrowed("text")), Start(BytesStart::new("skip")), End(BytesEnd::new("skip")), End(BytesEnd::new("skip")), @@ -3166,7 +3333,7 @@ mod tests { de.next().unwrap(), Start(BytesStart::from_content(r#"tag a="2""#, 3)) ); - assert_eq!(de.next().unwrap(), CData(BytesCData::new("cdata content"))); + assert_eq!(de.next().unwrap(), Text(Cow::Borrowed("cdata content"))); assert_eq!(de.next().unwrap(), End(BytesEnd::new("tag"))); assert_eq!(de.next().unwrap(), Start(BytesStart::new("self-closed"))); @@ -3177,7 +3344,7 @@ mod tests { } #[test] - fn invalid_xml() { + fn invalid_xml1() { let mut de = Deserializer::from_str(""); assert_eq!(de.next().unwrap(), Start(BytesStart::new("tag"))); @@ -3189,6 +3356,20 @@ mod tests { } assert_eq!(de.next().unwrap(), Eof); } + + #[test] + fn invalid_xml2() { + let mut de = Deserializer::from_str(""); + + assert_eq!(de.next().unwrap(), Start(BytesStart::new("tag"))); + 
assert_eq!(de.peek().unwrap(), &Text(Cow::Borrowed(""))); + + match de.read_to_end(QName(b"tag")) { + Err(DeError::UnexpectedEof) => (), + x => panic!("Expected `Err(UnexpectedEof)`, but found {:?}", x), + } + assert_eq!(de.next().unwrap(), Eof); + } } #[test] @@ -3201,17 +3382,19 @@ mod tests { let mut reader1 = IoReader { reader: Reader::from_reader(s.as_bytes()), + start_trimmer: StartTrimmer::default(), buf: Vec::new(), }; let mut reader2 = SliceReader { reader: Reader::from_str(s), + start_trimmer: StartTrimmer::default(), }; loop { let event1 = reader1.next().unwrap(); let event2 = reader2.next().unwrap(); - if let (DeEvent::Eof, DeEvent::Eof) = (&event1, &event2) { + if let (PayloadEvent::Eof, PayloadEvent::Eof) = (&event1, &event2) { break; } @@ -3230,11 +3413,11 @@ mod tests { let mut reader = SliceReader { reader: Reader::from_str(s), + start_trimmer: StartTrimmer::default(), }; reader .reader - .trim_text(true) .expand_empty_elements(true) .check_end_names(true); @@ -3242,13 +3425,13 @@ mod tests { loop { let event = reader.next().unwrap(); - if let DeEvent::Eof = event { + if let PayloadEvent::Eof = event { break; } events.push(event); } - use crate::de::DeEvent::*; + use crate::de::PayloadEvent::*; assert_eq!( events, @@ -3298,4 +3481,881 @@ mod tests { ), } } + + /// Tests for https://github.com/tafia/quick-xml/issues/474. + /// + /// These tests ensure that comments and processing instructions are ignored + /// and can split one logical string into pieces.
+ mod merge_text { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn text() { + let mut de = Deserializer::from_str("text"); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + } + + #[test] + fn cdata() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("cdata"))); + } + + #[test] + fn text_and_cdata() { + let mut de = Deserializer::from_str("text and "); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text and cdata")) + ); + } + + #[test] + fn text_and_empty_cdata() { + let mut de = Deserializer::from_str("text and "); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text and ")) + ); + } + + #[test] + fn cdata_and_text() { + let mut de = Deserializer::from_str(" and text"); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("cdata and text")) + ); + } + + #[test] + fn empty_cdata_and_text() { + let mut de = Deserializer::from_str(" and text"); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" and text")) + ); + } + + #[test] + fn cdata_and_cdata() { + let mut de = Deserializer::from_str( + "\ + \ + cdata]]>\ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("cdata]]>cdata")) + ); + } + + mod comment_between { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn text() { + let mut de = Deserializer::from_str( + "\ + text \ + \ + text\ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text text")) + ); + } + + #[test] + fn cdata() { + let mut de = Deserializer::from_str( + "\ + \ + \ + cdata]]>\ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("cdata]]>cdata")) + ); + } + + #[test] + fn text_and_cdata() { + let mut de = Deserializer::from_str( + "\ + text \ + \ + \ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text cdata")) + ); + } + + #[test] + fn text_and_empty_cdata() { + 
let mut de = Deserializer::from_str( + "\ + text \ + \ + \ + ", + ); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text "))); + } + + #[test] + fn cdata_and_text() { + let mut de = Deserializer::from_str( + "\ + \ + \ + text \ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("cdata text")) + ); + } + + #[test] + fn empty_cdata_and_text() { + let mut de = Deserializer::from_str( + "\ + \ + \ + text \ + ", + ); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" text"))); + } + + #[test] + fn cdata_and_cdata() { + let mut de = Deserializer::from_str( + "\ + \ + \ + cdata]]>\ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("cdata]]>cdata")) + ); + } + } + + mod pi_between { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn text() { + let mut de = Deserializer::from_str( + "\ + text \ + \ + text\ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text text")) + ); + } + + #[test] + fn cdata() { + let mut de = Deserializer::from_str( + "\ + \ + \ + cdata]]>\ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("cdata]]>cdata")) + ); + } + + #[test] + fn text_and_cdata() { + let mut de = Deserializer::from_str( + "\ + text \ + \ + \ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text cdata")) + ); + } + + #[test] + fn text_and_empty_cdata() { + let mut de = Deserializer::from_str( + "\ + text \ + \ + \ + ", + ); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text "))); + } + + #[test] + fn cdata_and_text() { + let mut de = Deserializer::from_str( + "\ + \ + \ + text \ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("cdata text")) + ); + } + + #[test] + fn empty_cdata_and_text() { + let mut de = Deserializer::from_str( + "\ + \ + \ + text \ + ", + ); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" text"))); + } + + #[test] + fn 
cdata_and_cdata() { + let mut de = Deserializer::from_str( + "\ + \ + \ + cdata]]>\ + ", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("cdata]]>cdata")) + ); + } + } + } + + /// Tests for https://github.com/tafia/quick-xml/issues/474. + /// + /// These tests ensure that any combination of payload data is processed + /// as expected. + mod triples { + use super::*; + use pretty_assertions::assert_eq; + + mod start { + use super::*; + + /// ... + mod start { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn start() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag1"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag3"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + /// Not matching end tag will result in error + #[test] + fn end() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag1"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn text() { + let mut de = Deserializer::from_str(" text "); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag1"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn cdata() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag1"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(),
DeEvent::Eof); + } + + #[test] + fn eof() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag1"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + } + + /// ... + mod end { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn start() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn end() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag"))); + match de.next() { + Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => { + assert_eq!(expected, ""); + assert_eq!(found, "tag2"); + } + x => panic!("Expected `InvalidXml(EndEventMismatch {{ expected = '', found = 'tag2' }})`, but got {:?}", x), + } + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn text() { + let mut de = Deserializer::from_str(" text "); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag"))); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn cdata() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), 
DeEvent::Eof); + } + + #[test] + fn eof() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + } + + /// text ... + mod text { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn start() { + let mut de = Deserializer::from_str(" text "); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn end() { + let mut de = Deserializer::from_str(" text "); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + // start::text::text has no difference from start::text + + #[test] + fn cdata() { + let mut de = Deserializer::from_str(" text "); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + // Text is trimmed from the start + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text cdata ")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn eof() { + let mut de = Deserializer::from_str(" text "); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + } + + /// ... 
+ mod cdata { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn start() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn end() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn text() { + let mut de = Deserializer::from_str(" text "); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + // Text is trimmed from the end + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata text")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn cdata() { + let mut de = + Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata cdata2 ")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn eof() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + } + } + + /// Start from End event will always generate an error + #[test] + fn end() { + let mut de = Deserializer::from_str(""); + match de.next() { + Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => { + assert_eq!(expected, ""); + assert_eq!(found, "tag"); + } + x => panic!("Expected 
`InvalidXml(EndEventMismatch {{ expected = '', found = 'tag' }})`, but got {:?}", x), + } + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + mod text { + use super::*; + use pretty_assertions::assert_eq; + + mod start { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn start() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag1"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + /// Not matching end tag will result in error + #[test] + fn end() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn text() { + let mut de = Deserializer::from_str(" text text2 "); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text2"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn cdata() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn eof() { + // Text is trimmed from both sides + let mut de = 
Deserializer::from_str(" text "); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + } + + /// End event without corresponding start event will always generate an error + #[test] + fn end() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + match de.next() { + Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => { + assert_eq!(expected, ""); + assert_eq!(found, "tag"); + } + x => panic!("Expected `InvalidXml(EndEventMismatch {{ expected = '', found = 'tag' }})`, but got {:?}", x), + } + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + // text::text::something is equivalent to text::something + + mod cdata { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn start() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from the start + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text cdata ")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn end() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from the start + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text cdata ")) + ); + match de.next() { + Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => { + assert_eq!(expected, ""); + assert_eq!(found, "tag"); + } + x => panic!("Expected `InvalidXml(EndEventMismatch {{ expected = '', found = 'tag' }})`, but got {:?}", x), + } + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn text() { + let mut de = Deserializer::from_str(" text text2 "); + // Text is trimmed from the start and from the end + assert_eq!( + 
de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text cdata text2")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn cdata() { + let mut de = + Deserializer::from_str(" text "); + // Text is trimmed from the start + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text cdata cdata2 ")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn eof() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from the start + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed("text cdata ")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + } + } + + mod cdata { + use super::*; + use pretty_assertions::assert_eq; + + mod start { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn start() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag1"))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + /// Not matching end tag will result in error + #[test] + fn end() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::End(BytesEnd::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn text() { + let mut de = Deserializer::from_str(" text "); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + // Text is trimmed from both sides + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed("text"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn cdata() { + let mut de = + 
Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata2 "))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn eof() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + } + + /// End event without corresponding start event will always generate an error + #[test] + fn end() { + let mut de = Deserializer::from_str(""); + assert_eq!(de.next().unwrap(), DeEvent::Text(Cow::Borrowed(" cdata "))); + match de.next() { + Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => { + assert_eq!(expected, ""); + assert_eq!(found, "tag"); + } + x => panic!("Expected `InvalidXml(EndEventMismatch {{ expected = '', found = 'tag' }})`, but got {:?}", x), + } + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + mod text { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn start() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from the end + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata text")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn end() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from the end + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata text")) + ); + match de.next() { + Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => { + assert_eq!(expected, ""); + assert_eq!(found, "tag"); + } + x => panic!("Expected `InvalidXml(EndEventMismatch {{ expected = '', found = 'tag' 
}})`, but got {:?}", x), + } + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + // cdata::text::text is equivalent to cdata::text + + #[test] + fn cdata() { + let mut de = + Deserializer::from_str(" text "); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata text cdata2 ")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn eof() { + let mut de = Deserializer::from_str(" text "); + // Text is trimmed from the end + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata text")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + } + + mod cdata { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn start() { + let mut de = + Deserializer::from_str(""); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata cdata2 ")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag"))); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn end() { + let mut de = + Deserializer::from_str(""); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata cdata2 ")) + ); + match de.next() { + Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => { + assert_eq!(expected, ""); + assert_eq!(found, "tag"); + } + x => panic!("Expected `InvalidXml(EndEventMismatch {{ expected = '', found = 'tag' }})`, but got {:?}", x), + } + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn text() { + let mut de = + Deserializer::from_str(" text "); + // Text is trimmed from the end + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata cdata2 text")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn cdata() { + let mut de = Deserializer::from_str( + "", + ); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata cdata2 cdata3 ")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + + #[test] + fn 
eof() { + let mut de = Deserializer::from_str(""); + assert_eq!( + de.next().unwrap(), + DeEvent::Text(Cow::Borrowed(" cdata cdata2 ")) + ); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + assert_eq!(de.next().unwrap(), DeEvent::Eof); + } + } + } + } } diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index c3ea9c93..bd6759a6 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -504,14 +504,12 @@ impl<'de, 'a> Deref for CowRef<'de, 'a> { /// /// Used for deserialize values from: /// - attribute values (`<... ...="value" ...>`) -/// - text content (`<...>text`) -/// - CDATA content (`<...>`) +/// - mixed text / CDATA content (`<...>text`) /// /// [simple types]: https://www.w3.org/TR/xmlschema11-1/#Simple_Type_Definition pub struct SimpleTypeDeserializer<'de, 'a> { /// - In case of attribute contains escaped attribute value - /// - In case of text contains escaped text value - /// - In case of CData contains unescaped cdata value + /// - In case of text contains unescaped text value content: CowRef<'de, 'a>, /// If `true`, `content` in escaped form and should be unescaped before use escaped: bool, diff --git a/src/de/var.rs b/src/de/var.rs index 8a7b0194..b5403e06 100644 --- a/src/de/var.rs +++ b/src/de/var.rs @@ -41,7 +41,7 @@ where seed.deserialize(QNameDeserializer::from_elem(e.name(), decoder)?)?, false, ), - DeEvent::Text(_) | DeEvent::CData(_) => ( + DeEvent::Text(_) => ( seed.deserialize(StrDeserializer::::new(TEXT_KEY))?, true, ), @@ -80,9 +80,9 @@ where DeEvent::Start(e) => self.de.read_to_end(e.name()), // Does not needed to deserialize using SimpleTypeDeserializer, because // it returns `()` when `deserialize_unit()` is requested - DeEvent::Text(_) | DeEvent::CData(_) => Ok(()), + DeEvent::Text(_) => Ok(()), // SAFETY: the other events are filtered in `variant_seed()` - _ => unreachable!("Only `Start`, `Text` or `CData` events are possible here"), + _ => unreachable!("Only `Start` or `Text` events are possible here"), } } @@ -92,14 
+92,9 @@ where { if self.is_text { match self.de.next()? { - DeEvent::Text(e) => { - seed.deserialize(SimpleTypeDeserializer::from_text_content(e.decode(true)?)) - } - DeEvent::CData(e) => { - seed.deserialize(SimpleTypeDeserializer::from_text_content(e.decode()?)) - } + DeEvent::Text(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content(e)), // SAFETY: the other events are filtered in `variant_seed()` - _ => unreachable!("Only `Text` or `CData` events are possible here"), + _ => unreachable!("Only `Text` events are possible here"), } } else { seed.deserialize(&mut *self.de) @@ -112,12 +107,11 @@ where { if self.is_text { match self.de.next()? { - DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content(e.decode(true)?) - .deserialize_tuple(len, visitor), - DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content(e.decode()?) - .deserialize_tuple(len, visitor), + DeEvent::Text(e) => { + SimpleTypeDeserializer::from_text_content(e).deserialize_tuple(len, visitor) + } // SAFETY: the other events are filtered in `variant_seed()` - _ => unreachable!("Only `Text` or `CData` events are possible here"), + _ => unreachable!("Only `Text` events are possible here"), } } else { self.de.deserialize_tuple(len, visitor) @@ -134,12 +128,10 @@ where { if self.is_text { match self.de.next()? { - DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content(e.decode(true)?) - .deserialize_struct("", fields, visitor), - DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content(e.decode()?) 
+ DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content(e) .deserialize_struct("", fields, visitor), // SAFETY: the other events are filtered in `variant_seed()` - _ => unreachable!("Only `Text` or `CData` events are possible here"), + _ => unreachable!("Only `Text` events are possible here"), } } else { self.de.deserialize_struct("", fields, visitor) diff --git a/src/events/mod.rs b/src/events/mod.rs index 1b813a8e..ebea6ebf 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -48,8 +48,10 @@ use crate::encoding::Decoder; use crate::errors::{Error, Result}; use crate::escape::{escape, partial_escape, unescape_with}; use crate::name::{LocalName, QName}; +use crate::reader::is_whitespace; use crate::utils::write_cow_string; use attributes::{Attribute, Attributes}; +use std::mem::replace; /// Opening tag data (`Event::Start`), with optional attributes. /// @@ -694,26 +696,23 @@ impl<'a> BytesText<'a> { } } - /// Gets content of this text buffer in the specified encoding and optionally - /// unescapes it. - #[cfg(feature = "serialize")] - pub(crate) fn decode(&self, unescape: bool) -> Result> { - let text = match &self.content { - Cow::Borrowed(bytes) => self.decoder.decode(bytes)?, - // Convert to owned, because otherwise Cow will be bound with wrong lifetime - Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(), - }; - let text = if unescape { - //FIXME: need to take into account entities defined in the document - match unescape_with(&text, |_| None)? { - // Because result is borrowed, no replacements was done and we can use original string - Cow::Borrowed(_) => text, - Cow::Owned(s) => s.into(), - } - } else { - text - }; - Ok(text) + /// Removes leading XML whitespace bytes from text content. 
+ /// + /// Returns `true` if content is empty after that + pub fn inplace_trim_start(&mut self) -> bool { + self.content = trim_cow( + replace(&mut self.content, Cow::Borrowed(b"")), + trim_xml_start, + ); + self.content.is_empty() + } + + /// Removes trailing XML whitespace bytes from text content. + /// + /// Returns `true` if content is empty after that + pub fn inplace_trim_end(&mut self) -> bool { + self.content = trim_cow(replace(&mut self.content, Cow::Borrowed(b"")), trim_xml_end); + self.content.is_empty() } } @@ -963,6 +962,54 @@ fn str_cow_to_bytes<'a, C: Into>>(content: C) -> Cow<'a, [u8]> { } } +/// Returns a byte slice with leading XML whitespace bytes removed. +/// +/// 'Whitespace' refers to the definition used by [`is_whitespace`]. +const fn trim_xml_start(mut bytes: &[u8]) -> &[u8] { + // Note: A pattern matching based approach (instead of indexing) allows + // making the function const. + while let [first, rest @ ..] = bytes { + if is_whitespace(*first) { + bytes = rest; + } else { + break; + } + } + bytes +} + +/// Returns a byte slice with trailing XML whitespace bytes removed. +/// +/// 'Whitespace' refers to the definition used by [`is_whitespace`]. +const fn trim_xml_end(mut bytes: &[u8]) -> &[u8] { + // Note: A pattern matching based approach (instead of indexing) allows + // making the function const. 
+ while let [rest @ .., last] = bytes { + if is_whitespace(*last) { + bytes = rest; + } else { + break; + } + } + bytes +} + +fn trim_cow<'a, F>(value: Cow<'a, [u8]>, trim: F) -> Cow<'a, [u8]> +where + F: FnOnce(&[u8]) -> &[u8], +{ + match value { + Cow::Borrowed(bytes) => Cow::Borrowed(trim(bytes)), + Cow::Owned(mut bytes) => { + let trimmed = trim(&bytes); + if trimmed.len() != bytes.len() { + bytes = trimmed.to_vec(); + } + Cow::Owned(bytes) + } + } +} + #[cfg(test)] mod test { use super::*; diff --git a/src/reader/mod.rs b/src/reader/mod.rs index d928230f..de0c7b32 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -42,10 +42,21 @@ macro_rules! configure_methods { /// /// Changing this option automatically changes the [`trim_text_end`] option. /// - /// (`false` by default) + /// (`false` by default). + /// + ///
+ /// + /// WARNING: With this option every text event will be trimmed, which is + /// incorrect behavior when text events are delimited by comments, processing + /// instructions or CDATA sections. To trim data correctly, manually apply + /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`] + /// only to the necessary events. + ///
/// /// [`Text`]: Event::Text /// [`trim_text_end`]: Self::trim_text_end + /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start + /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end pub fn trim_text(&mut self, val: bool) -> &mut Self { self $(.$holder)? .parser.trim_text_start = val; self $(.$holder)? .parser.trim_text_end = val; @@ -57,9 +68,20 @@ macro_rules! configure_methods { /// When set to `true`, trailing whitespace is trimmed in [`Text`] events. /// If after that the event is empty it will not be pushed. /// - /// (`false` by default) + /// (`false` by default). + /// + ///
+ /// + /// WARNING: With this option every text event will be trimmed, which is + /// incorrect behavior when text events are delimited by comments, processing + /// instructions or CDATA sections. To trim data correctly, manually apply + /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`] + /// only to the necessary events. + ///
/// /// [`Text`]: Event::Text + /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start + /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end pub fn trim_text_end(&mut self, val: bool) -> &mut Self { self $(.$holder)? .parser.trim_text_end = val; self @@ -848,7 +870,7 @@ impl ReadElementState { /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] -pub(crate) fn is_whitespace(b: u8) -> bool { +pub(crate) const fn is_whitespace(b: u8) -> bool { matches!(b, b' ' | b'\r' | b'\n' | b'\t') } diff --git a/tests/serde-de.rs b/tests/serde-de.rs index f78c2125..fa926941 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -530,7 +530,8 @@ mod seq { #[test] fn mixed_content() { - from_str::<[(); 3]>( + // Text and CDATA represents a one logical text item + from_str::<[(); 2]>( r#" text @@ -547,7 +548,8 @@ mod seq { "#, ) .unwrap(); - assert_eq!(data, vec![(), (), ()]); + // Text and CDATA represents a one logical text item + assert_eq!(data, vec![(), ()]); } /// This test ensures that composition of deserializer building blocks plays well @@ -2432,8 +2434,9 @@ mod seq { fn mixed_content() { #[derive(Debug, PartialEq, Deserialize)] struct List { + /// Text and CDATA represents a one logical text item #[serde(rename = "$value")] - item: [(); 3], + item: [(); 2], } from_str::( @@ -3540,7 +3543,8 @@ mod seq { assert_eq!( data, List { - item: vec![(), (), ()], + // Text and CDATA represents a one logical text item + item: vec![(), ()], } ); }