From 05a6d0a8174eba7e84ff73d5e50de746f577ca52 Mon Sep 17 00:00:00 2001 From: Philip Helger Date: Wed, 7 Jun 2023 19:49:19 +0200 Subject: [PATCH] Fixed newline handling and input character filtering for processing --- .../com/helger/css/parser/CSSParseHelper.java | 142 +++++++++++++++--- .../helger/css/parser/CSSParseHelperTest.java | 13 +- 2 files changed, 134 insertions(+), 21 deletions(-) diff --git a/ph-css/src/main/java/com/helger/css/parser/CSSParseHelper.java b/ph-css/src/main/java/com/helger/css/parser/CSSParseHelper.java index e9326f60..1b906613 100644 --- a/ph-css/src/main/java/com/helger/css/parser/CSSParseHelper.java +++ b/ph-css/src/main/java/com/helger/css/parser/CSSParseHelper.java @@ -84,9 +84,103 @@ public static String extractStringValue (@Nullable final String sStr) return sStr; } + /** + * A special char iterator based on + * https://www.w3.org/TR/css-syntax-3/#css-filter-code-points + * It maps NUL to U+FFFD and each of \f, \r and \r\n to a single \n. + * @author Philip Helger + */ + private static final class CSSCharIterator + { + private final char [] m_aSrc; + private final int m_nSrcLen; + private int m_nIndex = 0; + + public CSSCharIterator (@Nonnull final String sSrc) + { + m_aSrc = sSrc.toCharArray (); + m_nSrcLen = sSrc.length (); + } + + public boolean hasNext () + { + return m_nIndex < m_nSrcLen; + } + + private char _current () + { + return m_aSrc[m_nIndex]; + } + + private char _next () + { + return m_aSrc[m_nIndex + 1]; + } + + /** + * @return The filtered character at the current index; the index is not modified. No bounds check is done here, so hasNext () must be true. + */ + public char lookahead () + { + char ret = _current (); + switch (ret) + { + case 0: + ret = (char) 0xfffd; + break; + case '\f': + ret = '\n'; + break; + case '\r': + // No matter if followed by \n or not + ret = '\n'; + break; + } + return ret; + } + + /** + * @return The filtered character at the current index; the index is advanced by 1, or by 2 when \r\n is consumed as one \n + */ + public char next () + { + // Same filtering as in lookahead (), but \r\n is consumed as a unit + char ret = _current (); + int nAdvance = 1; + switch (ret) + { + case 0: + ret = (char) 0xfffd; + break; + case '\f': + ret = '\n'; + break; + 
case '\r': + { + if (m_nIndex + 1 < m_nSrcLen && _next () == '\n') + { + // Handle \r\n as one \n (bounds check: \r may be the last char) + nAdvance = 2; + } + ret = '\n'; + break; + } + } + // Move forward + m_nIndex += nAdvance; + return ret; + } + } + + private static boolean _isNewLine (final char c) + { + // \r is NOT a new line char + return c == '\n'; + } + private static boolean _isWhitespace (final char c) { - return c == '\n' || c == '\t' || c == ' '; + return _isNewLine (c) || c == '\t' || c == ' '; } private static boolean _isHexChar (final char c) @@ -113,26 +207,26 @@ public static String unescapeURL (@Nonnull final String sEscapedURL) return sEscapedURL; } - // The source length is always longer - final int nSrcLen = sEscapedURL.length (); - final StringBuilder aSB = new StringBuilder (nSrcLen); - int nCharIndex = 0; - while (nCharIndex < nSrcLen) + // The source length is never shorter + final StringBuilder aSB = new StringBuilder (sEscapedURL.length ()); + + final CSSCharIterator aIter = new CSSCharIterator (sEscapedURL); + while (aIter.hasNext ()) { - final char c = sEscapedURL.charAt (nCharIndex); - nCharIndex++; + final char c = aIter.next (); if (c == URL_ESCAPE_CHAR) { int nCodePoint = 0; int nHexCount = 0; - while (nHexCount <= 6 && nCharIndex < nSrcLen) + // A CSS escape has at most 6 hex digits + while (nHexCount < 6 && aIter.hasNext ()) { - final char cNext = sEscapedURL.charAt (nCharIndex); + final char cNext = aIter.lookahead (); if (_isHexChar (cNext)) { + aIter.next (); nHexCount++; - nCharIndex++; nCodePoint = (nCodePoint * 16) + StringHelper.getHexValue (cNext); } else @@ -142,11 +236,11 @@ public static String unescapeURL (@Nonnull final String sEscapedURL) if (nHexCount > 0) { // Check for a trailing whitespace and evtl. 
skip it - if (nCharIndex < nSrcLen) + if (aIter.hasNext ()) { - final char cNext = sEscapedURL.charAt (nCharIndex); + final char cNext = aIter.lookahead (); if (_isWhitespace (cNext)) - nCharIndex++; + aIter.next (); } if (nCodePoint > '\uFFFF') @@ -156,13 +250,27 @@ public static String unescapeURL (@Nonnull final String sEscapedURL) } else { - // Append \ verbose - aSB.append (c); + // No hex char found - check for an escaped newline + // Goal: backslash + newline + "x" becomes "x"; hasNext () guards a trailing backslash + if (aIter.hasNext () && _isNewLine (aIter.lookahead ())) + { + // Consume newline char + aIter.next (); + + // Take following char as it is (if there is one) + if (aIter.hasNext ()) + aSB.append (aIter.next ()); + } + else + { + // Append \ verbose + aSB.append (c); + } } } else { - // Copy as is + // Copy char as is aSB.append (c); } } diff --git a/ph-css/src/test/java/com/helger/css/parser/CSSParseHelperTest.java b/ph-css/src/test/java/com/helger/css/parser/CSSParseHelperTest.java index 72dd32f4..7d41f87f 100644 --- a/ph-css/src/test/java/com/helger/css/parser/CSSParseHelperTest.java +++ b/ph-css/src/test/java/com/helger/css/parser/CSSParseHelperTest.java @@ -56,12 +56,9 @@ public void testSplitNumber () @Test public void testUnescapeCSSURL () { + // Nothing to escape assertEquals ("bla.gif", CSSParseHelper.unescapeURL ("bla.gif")); assertEquals ("/foo/bla.gif", CSSParseHelper.unescapeURL ("/foo/bla.gif")); - if (false) - assertEquals ("/foo/bla().gif", CSSParseHelper.unescapeURL ("/foo/bla\\(\\).gif")); - if (false) - assertEquals ("\\\\server\\foo\\bla.gif", CSSParseHelper.unescapeURL ("\\\\\\\\server\\\\foo\\\\bla.gif")); assertEquals ("/home/data/image.png", CSSParseHelper.unescapeURL ("\\2f home\\2f data\\2f image.png")); assertEquals ("/home/data/image.png", CSSParseHelper.unescapeURL ("\\2fhome\\2f data\\2fimage.png")); assertEquals ("/home /data /image.png", CSSParseHelper.unescapeURL ("\\2fhome \\2f data \\2f image.png")); @@ -79,5 +76,13 @@ public void testUnescapeCSSURL () assertEquals ("AZ", 
CSSParseHelper.unescapeURL ("\\00041 Z")); assertEquals ("AZ", CSSParseHelper.unescapeURL ("\\000041Z")); assertEquals ("AZ", CSSParseHelper.unescapeURL ("\\000041 Z")); + assertEquals ("a not so very long title", CSSParseHelper.unescapeURL ("a not so very long title")); + assertEquals ("a not so very long title", CSSParseHelper.unescapeURL ("a not s\\\no very long title")); + assertEquals ("a not so very long title", CSSParseHelper.unescapeURL ("a not s\\\r\no very long title")); + assertEquals ("a not so very long title", CSSParseHelper.unescapeURL ("a not s\\\fo very long title")); + assertEquals ("A\nZ", CSSParseHelper.unescapeURL ("\\041 \nZ")); + assertEquals ("A\nZ", CSSParseHelper.unescapeURL ("\\041 \r\nZ")); + assertEquals ("A\n", CSSParseHelper.unescapeURL ("\\041 \n")); + assertEquals ("A\n", CSSParseHelper.unescapeURL ("\\041 \r\n")); } }