Avoid allocating new buffers if possible

tafia · Apr 3, 2022 · e45064f · e45064f
1 parent 1cdea62
commit e45064f
Showing 1 changed file with 75 additions and 83 deletions.
diff --git a/src/events/attributes.rs b/src/events/attributes.rs
@@ -331,93 +331,95 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
     }
 }
 
-// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line Handling, so the rest of this algorithm operates on text normalized in this way.
-// 2) Begin with a normalized value consisting of the empty string.
-// 3) For each character, entity reference, or character reference in the unnormalized attribute value, beginning with the first and continuing to the last, do the following:
-//   * For a character reference, append the referenced character to the normalized value.
-//   * For an entity reference, recursively apply step 3 of this algorithm to the replacement text of the entity.
-//   * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the normalized value.
-//   * For another character, append the character to the normalized value.
-//
-// If the attribute type is not CDATA, then the XML processor MUST further process the normalized attribute value by discarding any leading and trailing space (#x20) characters,
-// and by replacing sequences of space (#x20) characters by a single space (#x20) character.
-//
-// Note that if the unnormalized attribute value contains a character reference to a white space character other than space (#x20), the normalized value contains the referenced
-// character itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized value contains a white space character (not a reference), which is replaced with a
-// space character (#x20) in the normalized value and also contrasts with the case where the unnormalized value contains an entity reference whose replacement text contains a
-// white space character; being recursively processed, the white space character is replaced with a space character (#x20) in the normalized value.
-fn normalize_attribute_value(attr: Cow<[u8]>) -> Cow<[u8]> {
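+/// Normalizes an attribute value following the attribute-value normalization rules of the XML specification, quoted below.
+///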
+/// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line Handling, so the rest of this algorithm operates on text normalized in this way.
+/// 2) Begin with a normalized value consisting of the empty string.
+/// 3) For each character, entity reference, or character reference in the unnormalized attribute value, beginning with the first and continuing to the last, do the following:
+///   * For a character reference, append the referenced character to the normalized value.
+///   * For an entity reference, recursively apply step 3 of this algorithm to the replacement text of the entity.
+///   * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the normalized value.
+///   * For another character, append the character to the normalized value.
+///
+/// If the attribute type is not CDATA, then the XML processor MUST further process the normalized attribute value by discarding any leading and trailing space (#x20) characters,
+/// and by replacing sequences of space (#x20) characters by a single space (#x20) character.
+///
+/// Note that if the unnormalized attribute value contains a character reference to a white space character other than space (#x20), the normalized value contains the referenced
+/// character itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized value contains a white space character (not a reference), which is replaced with a
+/// space character (#x20) in the normalized value and also contrasts with the case where the unnormalized value contains an entity reference whose replacement text contains a
+/// white space character; being recursively processed, the white space character is replaced with a space character (#x20) in the normalized value.
+fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
     // TODO: character references, entity references, error handling associated with those
-    // TODO: don't allocated unless needed?
 
     #[derive(PartialEq)]
     enum ParseState {
-        SpaceOrStart,
+        Space,
         CDATA,
     }
 
-    let mut value: Vec<u8> = Vec::new();
-    // Starting in the state where we think we've added a space means we implicitly skip leading spaces
-    let mut current_state = ParseState::SpaceOrStart;
-    // Used for trimming trailing spaces
-    let mut last_cdata_idx = 0;
+    let is_whitespace_like = |c| matches!(c, b'\n' | b'\r' | b'\t' | b' ');
+
+    let first_non_space_char = attr.iter().position(|c| !is_whitespace_like(*c));
+
+    if first_non_space_char.is_none() {
+        // The entire value was whitespace-like characters
+        return Cow::Borrowed(b"");
+    }
+
+    let last_non_space_char = attr.iter().rposition(|c| !is_whitespace_like(*c));
+
+    // Trim all whitespace-like characters away from the beginning and end of the attribute value.
+    let begin = first_non_space_char.unwrap();
+    let end = last_non_space_char.unwrap_or(attr.len());
+    let trimmed_attr = &attr[begin..=end];
+
+    // The output buffer is created lazily, the first time a whitespace-like character is found inside the trimmed value.
+    let mut normalized: Option<Vec<u8>> = None;
+    // We start on character data because leading whitespace-like characters are already trimmed away, so the first byte is never whitespace.
+    let mut current_state = ParseState::CDATA;
 
-    // In one pass, strip leading and trailing spaces and replace sequences of spaces with a single one
-    for ch in attr.as_ref() {
+    // Perform a single pass over the trimmed attribute value. On the first whitespace-like character,
+    // copy everything processed thus far to a new buffer and continue using this buffer. Character /
+    // entity references are not handled yet (see the TODO at the top of this function).
+    for (idx, ch) in trimmed_attr.iter().enumerate() {
         match ch {
             b'\n' | b'\r' | b'\t' | b' ' => match current_state {
-                ParseState::SpaceOrStart => continue,
+                ParseState::Space => match normalized {
+                    Some(_) => continue,
+                    None => normalized = Some(Vec::from(&trimmed_attr[..idx])),
+                },
                 ParseState::CDATA => {
-                    current_state = ParseState::SpaceOrStart;
-                    value.push(b' ');
+                    current_state = ParseState::Space;
+                    match normalized.as_mut() {
+                        Some(buf) => buf.push(b' '),
+                        None => {
+                            let mut buf = Vec::from(&trimmed_attr[..idx]);
+                            buf.push(b' ');
+                            normalized = Some(buf);
+                        }
+                    }
                 }
             },
             c @ _ => match current_state {
-                ParseState::SpaceOrStart => {
+                ParseState::Space => {
                     current_state = ParseState::CDATA;
-                    last_cdata_idx = value.len();
-                    value.push(*c);
+                    if let Some(normalized) = normalized.as_mut() {
+                        normalized.push(*c);
+                    }
                 }
                 ParseState::CDATA => {
-                    last_cdata_idx = value.len();
-                    value.push(*c);
+                    if let Some(normalized) = normalized.as_mut() {
+                        normalized.push(*c);
+                    }
                 }
             },
         }
     }
 
-    // Trim any trailing spaces
-    if current_state == ParseState::SpaceOrStart {
-        value.truncate(last_cdata_idx + 1);
+    match normalized {
+        Some(normalized) => Cow::Owned(normalized),
+        None => Cow::Borrowed(trimmed_attr),
     }
-
-    Cow::Owned(value)
-
-    // let mut value: Vec<u8> = Vec::new();
-
-    // // TODO: replace sequences of spaces
-    // for i in 0..attr.len() {
-    //     let ch = attr[i];
-    //     match ch {
-    //         b'\n' => value.push(b' '),
-    //         b'\r' => value.push(b' '),
-    //         b'\t' => value.push(b' '),
-    //         c @ _ => value.push(c),
-    //     }
-    // }
-
-    // // Position where value starts after whitespace.
-    // let first_non_space_char = value
-    //     .iter()
-    //     .position(|c| !c.is_ascii_whitespace())
-    //     .unwrap_or(0);
-    // // Position where the trailing whitespace starts.
-    // let last_non_space_char = value
-    //     .iter()
-    //     .rposition(|c| !c.is_ascii_whitespace())
-    //     .and_then(|idx| Some(idx + 1))
-    //     .unwrap_or(0);
-    // Cow::Owned(value[first_non_space_char..last_non_space_char].to_vec())
 }
 
 impl<'a> Iterator for Attributes<'a> {
@@ -444,7 +446,7 @@ impl<'a> Iterator for Attributes<'a> {
             ($key:expr, $val:expr) => {
                 Some(Ok(Attribute {
                     key: &self.bytes[$key],
-                    value: normalize_attribute_value(Cow::Borrowed(&self.bytes[$val])),
+                    value: normalize_attribute_value(&self.bytes[$val]),
                 }))
             };
         }
@@ -605,37 +607,27 @@ mod tests {
 
     #[test]
     fn attribute_value_normalization() {
+        // empty value
+        assert_eq!(normalize_attribute_value(b"").as_ref(), b"");
         // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
         assert_eq!(
-            normalize_attribute_value(Cow::Borrowed(b"\rfoo\rbar\tbaz\ndelta\n")).as_ref(),
+            normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n").as_ref(),
             b"foo bar baz delta"
         );
         // leading and trailing spaces must be stripped
-        assert_eq!(
-            normalize_attribute_value(Cow::Borrowed(b"  foo ")).as_ref(),
-            b"foo"
-        );
+        assert_eq!(normalize_attribute_value(b"  foo ").as_ref(), b"foo");
         // leading space
-        assert_eq!(
-            normalize_attribute_value(Cow::Borrowed(b"  bar")).as_ref(),
-            b"bar"
-        );
+        assert_eq!(normalize_attribute_value(b" bar").as_ref(), b"bar");
         // trailing space
-        assert_eq!(
-            normalize_attribute_value(Cow::Borrowed(b"baz ")).as_ref(),
-            b"baz"
-        );
+        assert_eq!(normalize_attribute_value(b"baz ").as_ref(), b"baz");
         // sequences of spaces must be replaced with a single space
         assert_eq!(
-            normalize_attribute_value(Cow::Borrowed(b"   foo bar   baz ")).as_ref(),
+            normalize_attribute_value(b"   foo bar   baz ").as_ref(),
             b"foo bar baz"
         );
         // sequence replacement mixed with characters treated as whitespace (\t \r \n)
         assert_eq!(
-            normalize_attribute_value(Cow::Borrowed(
-                b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"
-            ))
-            .as_ref(),
+            normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r").as_ref(),
             b"foo bar baz delta echo foxtrot"
         );
     }