From 48179aa9acfedd5db95acb403c8417287c99281e Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Fri, 2 Dec 2022 02:13:53 +0100 Subject: [PATCH] Fix edge-case of ASCII set after Notonelazy that doesn't overlap with target --- .../gen/RegexGenerator.Emitter.cs | 5 +++-- .../System/Text/RegularExpressions/RegexCompiler.cs | 13 ++++++++++--- .../tests/FunctionalTests/Regex.Match.Tests.cs | 3 +++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 36b1d203bc2b7..780b956992666 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -379,7 +379,6 @@ private static string EmitIndexOfAnyValuesOrLiteral(ReadOnlySpan chars, Di private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary requiredHelpers) { Debug.Assert(RegexCharClass.IsAscii(asciiChars)); - Debug.Assert(asciiChars.AsSpan().SequenceEqual(asciiChars.OrderBy(c => c).ToArray())); // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key. byte[] bitmap = new byte[16]; @@ -404,6 +403,8 @@ private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary {fieldName} = IndexOfAnyValues.Create({Literal(new string(asciiChars))});", @@ -3168,8 +3169,8 @@ literal.SetChars is not null || } else if (literal.AsciiChars is not null) // set of only ASCII characters { - overlap = literal.AsciiChars.Contains(node.Ch); char[] asciiChars = literal.AsciiChars; + overlap = asciiChars.Contains(node.Ch); if (!overlap) { Debug.Assert(node.Ch < 128); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 04307e6053e95..510f0f2af350c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -3399,7 +3399,7 @@ node.Kind is RegexNodeKind.Notonelazy && !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method (literal.String is not null || literal.SetChars is not null || - literal.AsciiChars is not null || + (literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set literal.Range.LowInclusive == literal.Range.HighInclusive || (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union { @@ -3474,8 +3474,15 @@ literal.AsciiChars is not null || } else if (literal.AsciiChars is not null) // set of only ASCII characters { - overlap = literal.AsciiChars.AsSpan().Contains(node.Ch); - LoadIndexOfAnyValues(literal.AsciiChars); + char[] asciiChars = literal.AsciiChars; + overlap = asciiChars.AsSpan().Contains(node.Ch); + if (!overlap) + { + Debug.Assert(node.Ch < 128); + Array.Resize(ref asciiChars, asciiChars.Length + 1); + asciiChars[asciiChars.Length - 1] = node.Ch; + } + LoadIndexOfAnyValues(asciiChars); Call(s_spanIndexOfAnyIndexOfAnyValues); } else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 98a3ca21a356e..465f790b8fec5 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -382,6 +382,9 @@ public static IEnumerable Match_MemberData() yield return (@"a[^c]*?[bcdef]", "xyza12345e6789", lineOption, 0, 14, true, "a12345e"); yield return (@"a[^b]*?[bcdef]", "xyza12345f6789", lineOption, 0, 14, true, "a12345f"); yield return (@"a[^c]*?[bcdef]", "xyza12345g6789", lineOption, 0, 14, false, ""); + + yield return ("a[^b]*?[cdefgz]", "xyza123bc4", lineOption, 0, 10, false, ""); + yield return ("a[^b]*?[bdefgz]", "xyza123bc4", lineOption, 0, 10, true, "a123b"); } // Nested loops