From 604e50ed2f68f9b52feb8a2fb9910fa7f463f8a3 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Mon, 28 Nov 2022 18:27:09 +0100 Subject: [PATCH 1/7] Use IndexOfAnyValues in the RegexCompiler and source gen --- .../gen/RegexGenerator.Emitter.cs | 102 ++++++++++++++++-- .../gen/RegexGenerator.cs | 3 +- .../RegularExpressions/CompiledRegexRunner.cs | 11 +- .../CompiledRegexRunnerFactory.cs | 7 +- .../Text/RegularExpressions/RegexCharClass.cs | 17 +++ .../Text/RegularExpressions/RegexCompiler.cs | 64 ++++++++++- .../RegexFindOptimizations.cs | 2 + .../RegularExpressions/RegexLWCGCompiler.cs | 2 +- .../Text/RegularExpressions/RegexNode.cs | 36 +++++-- .../RegularExpressions/RegexPrefixAnalyzer.cs | 43 +++++++- 10 files changed, 258 insertions(+), 29 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index df851e496e7f5..86bac5416be2a 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -363,6 +363,57 @@ private static void AddIsECMABoundaryHelper(Dictionary require } } + /// Adds an IndexOfAnyValues instance declaration to the required helpers collection if the chars are ASCII. + private static string EmitIndexOfAnyValuesOrLiteral(ReadOnlySpan chars, Dictionary requiredHelpers) + { + // IndexOfAnyValues is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII. + // Only emit IndexOfAnyValues instances when we know they'll be faster to avoid increasing the startup cost too much. + Debug.Assert(chars.Length is 4 or 5); + + return RegexCharClass.IsAscii(chars) + ? EmitIndexOfAnyValues(chars.ToArray(), requiredHelpers) + : Literal(chars.ToString()); + } + + /// Adds an IndexOfAnyValues instance declaration to the required helpers collection. 
+ private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary requiredHelpers) + { + Debug.Assert(RegexCharClass.IsAscii(asciiChars)); + Debug.Assert(asciiChars.AsSpan().SequenceEqual(asciiChars.OrderBy(c => c).ToArray())); + + // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key. + byte[] bitmap = new byte[16]; + foreach (char c in asciiChars) + { + bitmap[c >> 3] |= (byte)(1 << (c & 7)); + } + + string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty); + + string fieldName = hexBitmap switch + { + "0000000000000000FEFFFF07FEFFFF07" => "AsciiLetter", + "000000000000FF03FEFFFF07FEFFFF07" => "AsciiLetterOrDigit", + "000000000000FF037E0000007E000000" => "AsciiHexDigit", + "000000000000FF03000000007E000000" => "AsciiHexDigitLower", + "000000000000FF037E00000000000000" => "AsciiHexDigitUpper", + _ => $"Ascii_{hexBitmap.TrimStart('0')}" + }; + + string helperName = $"IndexOfAnyValues_{fieldName}"; + + if (!requiredHelpers.ContainsKey(helperName)) + { + requiredHelpers.Add(helperName, new string[] + { + $"internal static readonly IndexOfAnyValues {fieldName} =", + $" IndexOfAnyValues.Create({Literal(new string(asciiChars))});", + }); + } + + return $"{HelpersTypeName}.{fieldName}"; + } + /// Emits the body of the Scan method override. 
private static (bool NeedsTryFind, bool NeedsTryMatch) EmitScan(IndentedTextWriter writer, RegexMethod rm) { @@ -810,7 +861,7 @@ void EmitFixedSet_LeftToRight() int setIndex = 0; bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass && - (primarySet.Chars is not null || primarySet.Range is not null); + (primarySet.Chars is not null || primarySet.Range is not null || primarySet.AsciiSet is not null); bool needLoop = !canUseIndexOf || setsToUse > 1; FinishEmitBlock loopBlock = default; @@ -841,7 +892,12 @@ void EmitFixedSet_LeftToRight() 1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})", 2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", 3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", - _ => $"{span}.IndexOfAny({Literal(new string(primarySet.Chars))})", + _ => $"{span}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(primarySet.Chars, requiredHelpers)})", + } : + primarySet.AsciiSet is not null ? primarySet.AsciiSet.Value.Negated switch + { + false => $"{span}.IndexOfAny({EmitIndexOfAnyValues(primarySet.AsciiSet.Value.Chars, requiredHelpers)})", + true => $"{span}.IndexOfAnyExcept({EmitIndexOfAnyValues(primarySet.AsciiSet.Value.Chars, requiredHelpers)})", } : (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Range.Value.Negated) switch { @@ -1010,7 +1066,7 @@ void EmitLiteralAfterAtomicLoop() { 2 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])});", 3 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])}, {Literal(literalChars[2])});", - _ => $"IndexOfAny({Literal(new string(literalChars))});", + _ => $"IndexOfAny({EmitIndexOfAnyValuesOrLiteral(literalChars, requiredHelpers)});", }); FinishEmitBlock indexOfFoundBlock = default; @@ -2920,7 +2976,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? 
subsequent = null, bool emitL if (!rtl && node.N > 1 && // no point in using IndexOf for small loops, in particular optionals subsequent?.FindStartingLiteralNode() is RegexNode literalNode && - TryEmitIndexOf(literalNode, useLast: true, negate: false, out int literalLength, out string indexOfExpr)) + TryEmitIndexOf(requiredHelpers, literalNode, useLast: true, negate: false, out int literalLength, out string indexOfExpr)) { writer.WriteLine($"if ({startingPos} >= {endingPos} ||"); @@ -3079,6 +3135,7 @@ node.Kind is RegexNodeKind.Notonelazy && !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method (literal.String is not null || literal.SetChars is not null || + (literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set literal.Range.LowInclusive == literal.Range.HighInclusive || (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union { @@ -3104,12 +3161,24 @@ literal.SetChars is not null || { (true, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});", (true, 3) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}, {Literal(literal.SetChars[2])});", - (true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars)});", + (true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(literal.SetChars.AsSpan(), requiredHelpers)});", (false, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});", - (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal($"{node.Ch}{literal.SetChars}")});", + (false, _) => $"{startingPos} = 
{sliceSpan}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral($"{node.Ch}{literal.SetChars}".AsSpan(), requiredHelpers)});", }); } + else if (literal.AsciiChars is not null) // set of only ASCII characters + { + overlap = literal.AsciiChars.Contains(node.Ch); + char[] asciiChars = literal.AsciiChars; + if (!overlap) + { + Debug.Assert(node.Ch < 128); + Array.Resize(ref asciiChars, asciiChars.Length + 1); + asciiChars[asciiChars.Length - 1] = node.Ch; + } + writer.WriteLine($"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValues(asciiChars, requiredHelpers)});"); + } else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One { overlap = literal.Range.LowInclusive == node.Ch; @@ -3144,7 +3213,7 @@ literal.SetChars is not null || node.Kind is RegexNodeKind.Setlazy && node.Str == RegexCharClass.AnyClass && subsequent?.FindStartingLiteralNode() is RegexNode literal2 && - TryEmitIndexOf(literal2, useLast: false, negate: false, out _, out string? indexOfExpr)) + TryEmitIndexOf(requiredHelpers, literal2, useLast: false, negate: false, out _, out string? indexOfExpr)) { // e.g. ".*?string" with RegexOptions.Singleline // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal @@ -3592,7 +3661,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) // For the loop, we're validating that each char matches the target node. // For IndexOf, we're looking for the first thing that _doesn't_ match the target node, // and thus similarly validating that everything does. - if (TryEmitIndexOf(node, useLast: false, negate: true, out _, out string? indexOfExpr)) + if (TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string? 
indexOfExpr)) { using (EmitBlock(writer, $"if ({sliceSpan}.Slice({sliceStaticPos}, {iterations}).{indexOfExpr} >= 0)")) { @@ -3685,7 +3754,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = TransferSliceStaticPosToPos(); writer.WriteLine($"int {iterationLocal} = inputSpan.Length - pos;"); } - else if (maxIterations == int.MaxValue && TryEmitIndexOf(node, useLast: false, negate: true, out _, out string indexOfExpr)) + else if (maxIterations == int.MaxValue && TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string indexOfExpr)) { // We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is // purely for simplicity; it could be removed in the future with additional code to handle that case. @@ -4337,6 +4406,7 @@ private static void EmitTimeoutCheckIfNeeded(IndentedTextWriter writer, RegexMet /// The resulting expression if it returns true; otherwise, null. /// true if an expression could be produced; otherwise, false. private static bool TryEmitIndexOf( + Dictionary requiredHelpers, RegexNode node, bool useLast, bool negate, out int literalLength, [NotNullWhen(true)] out string? indexOfExpr) @@ -4383,7 +4453,7 @@ private static bool TryEmitIndexOf( 1 => $"{last}{indexOfName}({Literal(setChars[0])})", 2 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])})", 3 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])})", - _ => $"{last}{indexOfAnyName}({Literal(setChars.ToString())})", + _ => $"{last}{indexOfAnyName}({EmitIndexOfAnyValuesOrLiteral(setChars, requiredHelpers)})", }; literalLength = 1; @@ -4401,6 +4471,18 @@ private static bool TryEmitIndexOf( literalLength = 1; return true; } + + if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars)) + { + string indexOfAnyName = !negated ? 
+ "IndexOfAny" : + "IndexOfAnyExcept"; + + indexOfExpr = $"{last}{indexOfAnyName}({EmitIndexOfAnyValues(asciiChars, requiredHelpers)})"; + + literalLength = 1; + return true; + } } indexOfExpr = null; diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs index ed506320a1a8f..2dd5c6d0d551b 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs @@ -198,6 +198,7 @@ x.Options is CSharpCompilationOptions options ? // a user's partial type. We can now rely on binding rules mapping to these usings and don't need to // use global-qualified names for the rest of the implementation. writer.WriteLine($" using System;"); + writer.WriteLine($" using System.Buffers;"); writer.WriteLine($" using System.CodeDom.Compiler;"); writer.WriteLine($" using System.Collections;"); writer.WriteLine($" using System.ComponentModel;"); @@ -240,7 +241,7 @@ x.Options is CSharpCompilationOptions options ? writer.WriteLine($"{{"); writer.Indent++; bool sawFirst = false; - foreach (KeyValuePair helper in requiredHelpers) + foreach (KeyValuePair helper in requiredHelpers.OrderBy(h => h.Key, StringComparer.Ordinal)) { if (sawFirst) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index a0e66e369ec97..75e34610da8e8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Buffers; using System.Globalization; namespace System.Text.RegularExpressions @@ -8,20 +9,24 @@ namespace System.Text.RegularExpressions internal sealed class CompiledRegexRunner : RegexRunner { private readonly ScanDelegate _scanMethod; - /// This field will only be set if the pattern contains backreferences and has RegexOptions.IgnoreCase - private readonly CultureInfo? _culture; #pragma warning disable CA1823 // Avoid unused private fields. Justification: Used via reflection to cache the Case behavior if needed. #pragma warning disable CS0169 + private readonly IndexOfAnyValues[]? _indexOfAnyValues; + + /// This field will only be set if the pattern contains backreferences and has RegexOptions.IgnoreCase + private readonly CultureInfo? _culture; + private RegexCaseBehavior _caseBehavior; #pragma warning restore CS0169 #pragma warning restore CA1823 internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan text); - public CompiledRegexRunner(ScanDelegate scan, CultureInfo? culture) + public CompiledRegexRunner(ScanDelegate scan, IndexOfAnyValues[]? indexOfAnyValues, CultureInfo? culture) { _scanMethod = scan; + _indexOfAnyValues = indexOfAnyValues; _culture = culture; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs index b7ec852f4cdbe..5d21799b339fe 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Buffers; using System.Globalization; using System.Reflection.Emit; @@ -9,20 +10,22 @@ namespace System.Text.RegularExpressions internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory { private readonly DynamicMethod _scanMethod; + private readonly IndexOfAnyValues[]? _indexOfAnyValues; /// This field will only be set if the pattern has backreferences and uses RegexOptions.IgnoreCase private readonly CultureInfo? _culture; // Delegate is lazily created to avoid forcing JIT'ing until the regex is actually executed. private CompiledRegexRunner.ScanDelegate? _scan; - public CompiledRegexRunnerFactory(DynamicMethod scanMethod, CultureInfo? culture) + public CompiledRegexRunnerFactory(DynamicMethod scanMethod, IndexOfAnyValues[]? indexOfAnyValues, CultureInfo? culture) { _scanMethod = scanMethod; + _indexOfAnyValues = indexOfAnyValues; _culture = culture; } protected internal override RegexRunner CreateInstance() => new CompiledRegexRunner( - _scan ??= _scanMethod.CreateDelegate(), _culture); + _scan ??= _scanMethod.CreateDelegate(), _indexOfAnyValues, _culture); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 70d2e6a72619a..215e2c1143cc2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.Numerics; using System.Runtime.CompilerServices; @@ -839,6 +840,22 @@ public static int GetSetChars(string set, Span chars) return count; } + public static bool TryGetAsciiSetChars(string set, [NotNullWhen(true)] out char[]? 
asciiChars) + { + Span chars = stackalloc char[128]; + + chars = chars.Slice(0, GetSetChars(set, chars)); + + if (chars.IsEmpty || !IsAscii(chars)) + { + asciiChars = null; + return false; + } + + asciiChars = chars.ToArray(); + return true; + } + /// /// Determines whether two sets may overlap. /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 1c592b9c89fb0..fcd5a7fe6aad3 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Buffers; using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; @@ -23,6 +24,7 @@ internal abstract class RegexCompiler private static readonly FieldInfo s_runstackField = RegexRunnerField("runstack"); private static readonly FieldInfo s_cultureField = typeof(CompiledRegexRunner).GetField("_culture", BindingFlags.Instance | BindingFlags.NonPublic)!; private static readonly FieldInfo s_caseBehaviorField = typeof(CompiledRegexRunner).GetField("_caseBehavior", BindingFlags.Instance | BindingFlags.NonPublic)!; + private static readonly FieldInfo s_indexOfAnyValuesArrayField = typeof(CompiledRegexRunner).GetField("_indexOfAnyValues", BindingFlags.Instance | BindingFlags.NonPublic)!; private static readonly MethodInfo s_captureMethod = RegexRunnerMethod("Capture"); private static readonly MethodInfo s_transferCaptureMethod = RegexRunnerMethod("TransferCapture"); @@ -65,21 +67,25 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanIndexOfAnyCharChar = 
typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfAnyIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { 
typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptSpan = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfAnyExceptIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyInRange = typeof(MemoryExtensions).GetMethod("IndexOfAnyInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptInRange = typeof(MemoryExtensions).GetMethod("IndexOfAnyExceptInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfChar = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), 
Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfAnyIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), 
Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptCharCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptSpan = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfAnyExceptIndexOfAnyValues = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(IndexOfAnyValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyInRange = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptInRange = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExceptInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; @@ -103,6 +109,9 @@ internal abstract class RegexCompiler /// Whether this expression has a 
non-infinite timeout. protected bool _hasTimeout; + /// instances used by the expression. For now these are only ASCII sets. + protected List>? _indexOfAnyValues; + /// Pool of Int32 LocalBuilders. private Stack? _int32LocalsPool; /// Pool of ReadOnlySpan of char locals. @@ -829,7 +838,7 @@ void EmitFixedSet_LeftToRight() int setIndex = 0; bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass && - (primarySet.Chars is not null || primarySet.Range is not null); + (primarySet.Chars is not null || primarySet.Range is not null || primarySet.AsciiSet is not null); bool needLoop = !canUseIndexOf || setsToUse > 1; Label checkSpanLengthLabel = default; @@ -879,7 +888,7 @@ void EmitFixedSet_LeftToRight() if (primarySet.Chars is not null) { - switch (primarySet.Chars!.Length) + switch (primarySet.Chars.Length) { case 1: // tmp = ...IndexOf(setChars[0]); @@ -909,18 +918,23 @@ void EmitFixedSet_LeftToRight() break; } } + else if (primarySet.AsciiSet is not null) + { + LoadIndexOfAnyValues(primarySet.AsciiSet.Value.Chars); + Call(primarySet.AsciiSet.Value.Negated ? s_spanIndexOfAnyExceptIndexOfAnyValues : s_spanIndexOfAnyIndexOfAnyValues); + } else { if (primarySet.Range!.Value.LowInclusive == primarySet.Range.Value.HighInclusive) { // tmp = ...IndexOf{AnyExcept}(low); - Ldc(primarySet.Range!.Value.LowInclusive); + Ldc(primarySet.Range.Value.LowInclusive); Call(primarySet.Range.Value.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); } else { // tmp = ...IndexOfAny{Except}InRange(low, high); - Ldc(primarySet.Range!.Value.LowInclusive); + Ldc(primarySet.Range.Value.LowInclusive); Ldc(primarySet.Range.Value.HighInclusive); Call(primarySet.Range.Value.Negated ? 
s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange); } @@ -3385,6 +3399,7 @@ node.Kind is RegexNodeKind.Notonelazy && !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method (literal.String is not null || literal.SetChars is not null || + literal.AsciiChars is not null || literal.Range.LowInclusive == literal.Range.HighInclusive || (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union { @@ -3457,6 +3472,12 @@ literal.SetChars is not null || break; } } + else if (literal.AsciiChars is not null) // set of only ASCII characters + { + overlap = literal.AsciiChars.AsSpan().Contains(node.Ch); + LoadIndexOfAnyValues(literal.AsciiChars); + Call(s_spanIndexOfAnyIndexOfAnyValues); + } else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One { overlap = literal.Range.LowInclusive == node.Ch; @@ -4919,6 +4940,12 @@ bool CanEmitIndexOf(RegexNode node, out int literalLength) literalLength = 1; return true; } + + if (RegexCharClass.TryGetAsciiSetChars(node.Str, out _)) + { + literalLength = 1; + return true; + } } literalLength = 0; @@ -5050,6 +5077,20 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate) }); return; } + + // IndexOfAny{Except}(IndexOfAnyValues) + if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars)) + { + LoadIndexOfAnyValues(asciiChars); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfAnyIndexOfAnyValues, + (false, true) => s_spanIndexOfAnyExceptIndexOfAnyValues, + (true, false) => s_spanLastIndexOfAnyIndexOfAnyValues, + (true, true) => s_spanLastIndexOfAnyExceptIndexOfAnyValues, + }); + return; + } } Debug.Fail("We should never get here. 
This method should only be called if CanEmitIndexOf returned true, and all of the same cases should be covered."); @@ -5941,5 +5982,20 @@ private void EmitTimeoutCheckIfNeeded() Call(s_checkTimeoutMethod); } } + + /// + /// Adds an entry in for the given and emits a load of that initialized value. + /// + private void LoadIndexOfAnyValues(char[] chars) + { + List> list = _indexOfAnyValues ??= new(); + int index = list.Count; + list.Add(IndexOfAnyValues.Create(chars)); + + // this._indexOfAnyValues[index] + Ldthisfld(s_indexOfAnyValuesArrayField); + _ilg!.Emit(OpCodes.Ldc_I4_S, index); + _ilg.Emit(OpCodes.Ldelem_Ref); + } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 099073da0cb82..75e16fbf8a545 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -247,6 +247,8 @@ public FixedDistanceSet(char[]? chars, string set, int distance) public int Distance; /// As an alternative to , a description of the single range the set represents, if it does. public (char LowInclusive, char HighInclusive, bool Negated)? Range; + /// As an alternative to , a description of the set of ASCII characters it represents, if it does. + public (char[] Chars, bool Negated)? AsciiSet; } /// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 3ed85cb6394dd..edeb6976c5312 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -63,7 +63,7 @@ internal sealed class RegexLWCGCompiler : RegexCompiler DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(ReadOnlySpan) }); EmitScan(options, tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod); - return new CompiledRegexRunnerFactory(scanMethod, regexTree.Culture); + return new CompiledRegexRunnerFactory(scanMethod, _indexOfAnyValues?.ToArray(), regexTree.Culture); } /// Begins the definition of a new method (no args) with a specified return value. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index a94be746767a0..d659026d0ae52 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -1428,10 +1428,10 @@ public char FirstCharOfOneOrMulti() switch (node.Kind) { case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy: - return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false); + return new StartingLiteralData(range: (node.Ch, node.Ch), negated: false); case RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy: - return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true); + return new StartingLiteralData(range: (node.Ch, node.Ch), negated: true); case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy: Span setChars = stackalloc char[maxSetCharacters]; @@ -1439,18 +1439,23 @@ public char FirstCharOfOneOrMulti() if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0) { setChars = setChars.Slice(0, numChars); - return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!)); + return new StartingLiteralData(setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!)); } if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive)) { Debug.Assert(lowInclusive < highInclusive); - return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!)); + return new 
StartingLiteralData(range: (lowInclusive, highInclusive), negated: RegexCharClass.IsNegated(node.Str!)); + } + + if (RegexCharClass.TryGetAsciiSetChars(node.Str!, out char[]? asciiChars)) + { + return new StartingLiteralData(asciiChars: asciiChars, negated: RegexCharClass.IsNegated(node.Str!)); } break; case RegexNodeKind.Multi: - return new StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false); + return new StartingLiteralData(@string: node.Str); } } @@ -1463,15 +1468,34 @@ public readonly struct StartingLiteralData public readonly (char LowInclusive, char HighInclusive) Range; public readonly string? String; public readonly string? SetChars; + public readonly char[]? AsciiChars; public readonly bool Negated; - public StartingLiteralData((char LowInclusive, char HighInclusive) range, string? @string, string? setChars, bool negated) + public StartingLiteralData((char LowInclusive, char HighInclusive) range, bool negated) { Range = range; + Negated = negated; + } + + public StartingLiteralData(string? @string) + { + Debug.Assert(@string is not null); String = @string; + } + + public StartingLiteralData(string? setChars, bool negated) + { + Debug.Assert(setChars is not null); SetChars = setChars; Negated = negated; } + + public StartingLiteralData(char[]? 
asciiChars, bool negated) + { + Debug.Assert(asciiChars is not null); + AsciiChars = asciiChars; + Negated = negated; + } } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index cacf02d321ed5..88e21595c29a1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -219,6 +219,11 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) result.Range = (lowInclusive, highInclusive, negated); results[i] = result; } + else if (RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars)) + { + result.AsciiSet = (asciiChars, negated); + results[i] = result; + } } } @@ -437,13 +442,47 @@ public static void SortFixedDistanceSetsByQuality(List { + char[]? s1Chars = s1.Chars ?? s1.AsciiSet?.Chars; + char[]? s2Chars = s2.Chars ?? s2.AsciiSet?.Chars; + int s1CharsLength = s1Chars?.Length ?? 0; + int s2CharsLength = s2Chars?.Length ?? 0; + bool s1Negated = s1.AsciiSet.GetValueOrDefault().Negated; + bool s2Negated = s2.AsciiSet.GetValueOrDefault().Negated; + + if (s1Negated) + { + s1CharsLength = char.MaxValue - s1CharsLength; + } + + if (s2Negated) + { + s2CharsLength = char.MaxValue - s2CharsLength; + } + // If both have chars, prioritize the one with the smaller frequency for those chars. - if (s1.Chars is not null && s2.Chars is not null) + if (s1Chars is not null && s2Chars is not null) { + // If they have different lengths, prefer the shorter one. + if (s1CharsLength != s2CharsLength) + { + return s1CharsLength.CompareTo(s2CharsLength); + } + + Debug.Assert(s1Negated == s2Negated, "The lengths should have been different"); + // Then of the ones that are the same length, prefer those with less frequent values. 
The frequency is // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True // frequencies will vary widely based on the actual data being searched, the language of the data, etc. - int c = SumFrequencies(s1.Chars).CompareTo(SumFrequencies(s2.Chars)); + float s1Frequency = SumFrequencies(s1Chars); + float s2Frequency = SumFrequencies(s2Chars); + + if (s1Negated) + { + s1Frequency = -s1Frequency; + s2Frequency = -s2Frequency; + } + + int c = s1Frequency.CompareTo(s2Frequency); if (c != 0) { return c; From a71911c2cd95326369c054c127fa15ca8ce3e19a Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Fri, 2 Dec 2022 01:20:55 +0100 Subject: [PATCH 2/7] Review feedback --- .../gen/RegexGenerator.Emitter.cs | 15 +++++++-------- .../Text/RegularExpressions/RegexCompiler.cs | 4 ++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 86bac5416be2a..96930dc6d22f1 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -392,12 +392,12 @@ private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary "AsciiLetter", - "000000000000FF03FEFFFF07FEFFFF07" => "AsciiLetterOrDigit", - "000000000000FF037E0000007E000000" => "AsciiHexDigit", - "000000000000FF03000000007E000000" => "AsciiHexDigitLower", - "000000000000FF037E00000000000000" => "AsciiHexDigitUpper", - _ => $"Ascii_{hexBitmap.TrimStart('0')}" + "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters", + "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits", + "000000000000FF037E0000007E000000" => "s_asciiHexDigits", + "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower", + "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper", + _ => 
$"s_ascii_{hexBitmap.TrimStart('0')}" }; string helperName = $"IndexOfAnyValues_{fieldName}"; @@ -406,8 +406,7 @@ private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary {fieldName} =", - $" IndexOfAnyValues.Create({Literal(new string(asciiChars))});", + $"internal static readonly IndexOfAnyValues {fieldName} = IndexOfAnyValues.Create({Literal(new string(asciiChars))});", }); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index fcd5a7fe6aad3..9a5c46fd91742 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -5994,8 +5994,8 @@ private void LoadIndexOfAnyValues(char[] chars) // this._indexOfAnyValues[index] Ldthisfld(s_indexOfAnyValuesArrayField); - _ilg!.Emit(OpCodes.Ldc_I4_S, index); - _ilg.Emit(OpCodes.Ldelem_Ref); + Ldc(index); + _ilg!.Emit(OpCodes.Ldelem_Ref); } } } From bc8a994841b586c70f66fb8220ab1b59cab19e0a Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Fri, 2 Dec 2022 02:13:53 +0100 Subject: [PATCH 3/7] Fix edge-case of ASCII set after Notonelazy that doesn't overlap with target --- .../gen/RegexGenerator.Emitter.cs | 5 +++-- .../System/Text/RegularExpressions/RegexCompiler.cs | 13 ++++++++++--- .../tests/FunctionalTests/Regex.Match.Tests.cs | 3 +++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 96930dc6d22f1..5b29445d350ff 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -379,7 +379,6 @@ private static string 
EmitIndexOfAnyValuesOrLiteral(ReadOnlySpan chars, Di private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary requiredHelpers) { Debug.Assert(RegexCharClass.IsAscii(asciiChars)); - Debug.Assert(asciiChars.AsSpan().SequenceEqual(asciiChars.OrderBy(c => c).ToArray())); // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key. byte[] bitmap = new byte[16]; @@ -404,6 +403,8 @@ private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary {fieldName} = IndexOfAnyValues.Create({Literal(new string(asciiChars))});", @@ -3168,8 +3169,8 @@ literal.SetChars is not null || } else if (literal.AsciiChars is not null) // set of only ASCII characters { - overlap = literal.AsciiChars.Contains(node.Ch); char[] asciiChars = literal.AsciiChars; + overlap = asciiChars.Contains(node.Ch); if (!overlap) { Debug.Assert(node.Ch < 128); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 9a5c46fd91742..0e14fb2278c0b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -3399,7 +3399,7 @@ node.Kind is RegexNodeKind.Notonelazy && !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method (literal.String is not null || literal.SetChars is not null || - literal.AsciiChars is not null || + (literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set literal.Range.LowInclusive == literal.Range.HighInclusive || (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, 
since there's no accelerated way to search for the union { @@ -3474,8 +3474,15 @@ literal.AsciiChars is not null || } else if (literal.AsciiChars is not null) // set of only ASCII characters { - overlap = literal.AsciiChars.AsSpan().Contains(node.Ch); - LoadIndexOfAnyValues(literal.AsciiChars); + char[] asciiChars = literal.AsciiChars; + overlap = asciiChars.AsSpan().Contains(node.Ch); + if (!overlap) + { + Debug.Assert(node.Ch < 128); + Array.Resize(ref asciiChars, asciiChars.Length + 1); + asciiChars[asciiChars.Length - 1] = node.Ch; + } + LoadIndexOfAnyValues(asciiChars); Call(s_spanIndexOfAnyIndexOfAnyValues); } else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index f138d610e2b94..970e60fc29ec3 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -382,6 +382,9 @@ public static IEnumerable Match_MemberData() yield return (@"a[^c]*?[bcdef]", "xyza12345e6789", lineOption, 0, 14, true, "a12345e"); yield return (@"a[^b]*?[bcdef]", "xyza12345f6789", lineOption, 0, 14, true, "a12345f"); yield return (@"a[^c]*?[bcdef]", "xyza12345g6789", lineOption, 0, 14, false, ""); + + yield return ("a[^b]*?[cdefgz]", "xyza123bc4", lineOption, 0, 10, false, ""); + yield return ("a[^b]*?[bdefgz]", "xyza123bc4", lineOption, 0, 10, true, "a123b"); } // Nested loops From aa01724738e959a7ca8585208e3ece9b1993c1d3 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Mon, 5 Dec 2022 05:32:45 +0100 Subject: [PATCH 4/7] Use ranges for short sets where applicable, avoid negated IndexOfAnyValues primary sets --- .../gen/RegexGenerator.Emitter.cs | 50 +++--- .../Text/RegularExpressions/RegexCompiler.cs | 80 
+++++----- .../RegexFindOptimizations.cs | 15 +- .../RegularExpressions/RegexPrefixAnalyzer.cs | 143 +++++++++--------- 4 files changed, 148 insertions(+), 140 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 5b29445d350ff..12683cbe7ec94 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -887,19 +887,21 @@ void EmitFixedSet_LeftToRight() }; string indexOf = - primarySet.Chars is not null ? primarySet.Chars!.Length switch + primarySet.Chars is not null ? (primarySet.Negated, primarySet.Chars.Length) switch { - 1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})", - 2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", - 3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", - _ => $"{span}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(primarySet.Chars, requiredHelpers)})", + (false, 1) => $"{span}.IndexOf({Literal(primarySet.Chars[0])})", + (false, 2) => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", + (false, 3) => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", + (false, _) => $"{span}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(primarySet.Chars, requiredHelpers)})", + (true, 1) => $"{span}.IndexOfAnyExcept({Literal(primarySet.Chars[0])})", + _ => throw new InvalidOperationException("Expected that negated sets will have at most 1 value in Chars."), } : - primarySet.AsciiSet is not null ? primarySet.AsciiSet.Value.Negated switch + primarySet.AsciiSet is not null ? 
primarySet.Negated switch { - false => $"{span}.IndexOfAny({EmitIndexOfAnyValues(primarySet.AsciiSet.Value.Chars, requiredHelpers)})", - true => $"{span}.IndexOfAnyExcept({EmitIndexOfAnyValues(primarySet.AsciiSet.Value.Chars, requiredHelpers)})", + false => $"{span}.IndexOfAny({EmitIndexOfAnyValues(primarySet.AsciiSet, requiredHelpers)})", + _ => throw new InvalidOperationException("Expected AsciiSets not to be negated."), } : - (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Range.Value.Negated) switch + (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch { (false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})", (true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})", @@ -4440,8 +4442,22 @@ private static bool TryEmitIndexOf( bool negated = RegexCharClass.IsNegated(node.Str) ^ negate; Span setChars = stackalloc char[5]; // current max that's vectorized - int setCharsCount; - if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0) + int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars); + + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 2-5 values that fit in a single range + if (setCharsCount != 1 && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + { + string indexOfAnyInRangeName = !negated ? + "IndexOfAnyInRange" : + "IndexOfAnyExceptInRange"; + + indexOfExpr = $"{last}{indexOfAnyInRangeName}({Literal(lowInclusive)}, {Literal(highInclusive)})"; + + literalLength = 1; + return true; + } + + if (setCharsCount > 0) { (string indexOfName, string indexOfAnyName) = !negated ? 
("IndexOf", "IndexOfAny") : @@ -4460,18 +4476,6 @@ private static bool TryEmitIndexOf( return true; } - if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) - { - string indexOfAnyInRangeName = !negated ? - "IndexOfAnyInRange" : - "IndexOfAnyExceptInRange"; - - indexOfExpr = $"{last}{indexOfAnyInRangeName}({Literal(lowInclusive)}, {Literal(highInclusive)})"; - - literalLength = 1; - return true; - } - if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars)) { string indexOfAnyName = !negated ? diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 0e14fb2278c0b..e3115ddc3665b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -888,12 +888,13 @@ void EmitFixedSet_LeftToRight() if (primarySet.Chars is not null) { + Debug.Assert(!primarySet.Negated || primarySet.Chars.Length == 1); switch (primarySet.Chars.Length) { case 1: - // tmp = ...IndexOf(setChars[0]); + // tmp = ...IndexOf{AnyExcept}(setChars[0]); Ldc(primarySet.Chars[0]); - Call(s_spanIndexOfChar); + Call(primarySet.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); break; case 2: @@ -920,8 +921,9 @@ void EmitFixedSet_LeftToRight() } else if (primarySet.AsciiSet is not null) { - LoadIndexOfAnyValues(primarySet.AsciiSet.Value.Chars); - Call(primarySet.AsciiSet.Value.Negated ? 
s_spanIndexOfAnyExceptIndexOfAnyValues : s_spanIndexOfAnyIndexOfAnyValues); + Debug.Assert(!primarySet.Negated); + LoadIndexOfAnyValues(primarySet.AsciiSet); + Call(s_spanIndexOfAnyIndexOfAnyValues); } else { @@ -929,14 +931,14 @@ void EmitFixedSet_LeftToRight() { // tmp = ...IndexOf{AnyExcept}(low); Ldc(primarySet.Range.Value.LowInclusive); - Call(primarySet.Range.Value.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); + Call(primarySet.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); } else { // tmp = ...IndexOfAny{Except}InRange(low, high); Ldc(primarySet.Range.Value.LowInclusive); Ldc(primarySet.Range.Value.HighInclusive); - Call(primarySet.Range.Value.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange); + Call(primarySet.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange); } } @@ -1060,7 +1062,7 @@ void EmitFixedSet_RightToLeft() RegexFindOptimizations.FixedDistanceSet set = _regexTree.FindOptimizations.FixedDistanceSets![0]; Debug.Assert(set.Distance == 0); - if (set.Chars is { Length: 1 }) + if (set.Chars is { Length: 1 } && !set.Negated) { // pos = inputSpan.Slice(0, pos).LastIndexOf(set.Chars[0]); Ldloca(inputSpan); @@ -4999,10 +5001,40 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate) { bool negated = RegexCharClass.IsNegated(node.Str) ^ negate; - // IndexOfAny{Except}(ch1, ...) 
Span setChars = stackalloc char[5]; // current max that's vectorized - int setCharsCount; - if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0) + int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars); + + // IndexOfAny{Except}InRange + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 2-5 values that fit in a single range + if (setCharsCount != 1 && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + { + if (lowInclusive == highInclusive) + { + Ldc(lowInclusive); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfChar, + (false, true) => s_spanIndexOfAnyExceptChar, + (true, false) => s_spanLastIndexOfChar, + (true, true) => s_spanLastIndexOfAnyExceptChar, + }); + return; + } + + Ldc(lowInclusive); + Ldc(highInclusive); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfAnyInRange, + (false, true) => s_spanIndexOfAnyExceptInRange, + (true, false) => s_spanLastIndexOfAnyInRange, + (true, true) => s_spanLastIndexOfAnyExceptInRange, + }); + return; + } + + // IndexOfAny{Except}(ch1, ...) 
+ if (setCharsCount > 0) { setChars = setChars.Slice(0, setCharsCount); switch (setChars.Length) @@ -5057,34 +5089,6 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate) } } - // IndexOfAny{Except}InRange - if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) - { - if (lowInclusive == highInclusive) - { - Ldc(lowInclusive); - Call((useLast, negated) switch - { - (false, false) => s_spanIndexOfChar, - (false, true) => s_spanIndexOfAnyExceptChar, - (true, false) => s_spanLastIndexOfChar, - (true, true) => s_spanLastIndexOfAnyExceptChar, - }); - return; - } - - Ldc(lowInclusive); - Ldc(highInclusive); - Call((useLast, negated) switch - { - (false, false) => s_spanIndexOfAnyInRange, - (false, true) => s_spanIndexOfAnyExceptInRange, - (true, false) => s_spanLastIndexOfAnyInRange, - (true, true) => s_spanLastIndexOfAnyExceptInRange, - }); - return; - } - // IndexOfAny{Except}(IndexOfAnyValues) if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars)) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 75e16fbf8a545..14b4c80a308cf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -159,7 +159,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) // the set's characteristics. 
if (!compiled && fixedDistanceSets.Count == 1 && - fixedDistanceSets[0].Chars is { Length: 1 }) + fixedDistanceSets[0].Chars is { Length: 1 } && + !fixedDistanceSets[0].Negated) { FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], null, fixedDistanceSets[0].Distance); FindMode = FindNextStartingPositionMode.FixedDistanceChar_LeftToRight; @@ -241,14 +242,16 @@ public FixedDistanceSet(char[]? chars, string set, int distance) /// The character class description. public string Set; + /// Whether the is negated. + public bool Negated; /// Small list of all of the characters that make up the set, if known; otherwise, null. public char[]? Chars; /// The distance of the set from the beginning of the match. public int Distance; /// As an alternative to , a description of the single range the set represents, if it does. - public (char LowInclusive, char HighInclusive, bool Negated)? Range; + public (char LowInclusive, char HighInclusive)? Range; /// As an alternative to , a description of the set of ASCII characters it represents, if it does. - public (char[] Chars, bool Negated)? AsciiSet; + public char[]? AsciiSet; } /// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop. @@ -273,7 +276,7 @@ private static (string String, int Distance)? 
FindFixedDistanceString(List textSpan, string set = primarySet.Set; ReadOnlySpan span = textSpan.Slice(pos); - if (chars is not null) + if (chars is not null && !primarySet.Negated) { int i = span.IndexOfAny(chars); if (i >= 0) @@ -621,7 +624,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, int endMinusRequiredLength = textSpan.Length - Math.Max(1, MinRequiredLength); - if (primarySet.Chars is not null) + if (primarySet.Chars is not null && !primarySet.Negated) { for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 88e21595c29a1..c769f9463de10 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -200,31 +200,29 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) for (int i = 0; i < results.Count; i++) { RegexFindOptimizations.FixedDistanceSet result = results[i]; - bool negated = RegexCharClass.IsNegated(result.Set); + result.Negated = RegexCharClass.IsNegated(result.Set); - if (!negated) + int count = RegexCharClass.GetSetChars(result.Set, scratch); + if (result.Negated ? 
(count == 1) : (count > 0)) { - int count = RegexCharClass.GetSetChars(result.Set, scratch); - if (count != 0) - { - result.Chars = scratch.Slice(0, count).ToArray(); - results[i] = result; - } + result.Chars = scratch.Slice(0, count).ToArray(); } - if (thorough && result.Chars is null) + if (thorough) { - if (RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 2-5 values that fit in a single range + if (count != 1 && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) { - result.Range = (lowInclusive, highInclusive, negated); - results[i] = result; + result.Chars = null; + result.Range = (lowInclusive, highInclusive); } - else if (RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars)) + else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars)) { - result.AsciiSet = (asciiChars, negated); - results[i] = result; + result.AsciiSet = asciiChars; } } + + results[i] = result; } return results; @@ -440,52 +438,53 @@ static bool TryFindFixedSets(RegexNode node, List results) => // Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search // for the fastest and that have the best chance of matching as few false positives as possible. - results.Sort((s1, s2) => + results.Sort(static (s1, s2) => { - char[]? s1Chars = s1.Chars ?? s1.AsciiSet?.Chars; - char[]? s2Chars = s2.Chars ?? s2.AsciiSet?.Chars; + char[]? s1Chars = s1.Chars ?? s1.AsciiSet; + char[]? s2Chars = s2.Chars ?? s2.AsciiSet; int s1CharsLength = s1Chars?.Length ?? 0; int s2CharsLength = s2Chars?.Length ?? 0; - bool s1Negated = s1.AsciiSet.GetValueOrDefault().Negated; - bool s2Negated = s2.AsciiSet.GetValueOrDefault().Negated; + bool s1Negated = s1.Negated; + bool s2Negated = s2.Negated; + int s1RangeLength = s1.Range is not null ? 
GetRangeLength(s1.Range.Value, s1Negated) : 0; + int s2RangeLength = s2.Range is not null ? GetRangeLength(s2.Range.Value, s2Negated) : 0; - if (s1Negated) + if (s1Negated && s1CharsLength > 0) { - s1CharsLength = char.MaxValue - s1CharsLength; + s1CharsLength = char.MaxValue + 1 - s1CharsLength; } - if (s2Negated) + if (s2Negated && s2CharsLength > 0) { - s2CharsLength = char.MaxValue - s2CharsLength; + s2CharsLength = char.MaxValue + 1 - s2CharsLength; } // If both have chars, prioritize the one with the smaller frequency for those chars. if (s1Chars is not null && s2Chars is not null) { - // If they have different lengths, prefer the shorter one. - if (s1CharsLength != s2CharsLength) + // If one is negated and the other isn't, prefer the non-negated one. + if (s1Negated != s2Negated) { - return s1CharsLength.CompareTo(s2CharsLength); + return s1Negated ? 1 : -1; } - Debug.Assert(s1Negated == s2Negated, "The lengths should have been different"); - - // Then of the ones that are the same length, prefer those with less frequent values. The frequency is - // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True - // frequencies will vary widely based on the actual data being searched, the language of the data, etc. + // Prefer sets with less frequent values. The frequency is only an approximation, + // used as a tie-breaker when we'd otherwise effectively be picking randomly. + // True frequencies will vary widely based on the actual data being searched, the language of the data, etc. float s1Frequency = SumFrequencies(s1Chars); float s2Frequency = SumFrequencies(s2Chars); - if (s1Negated) + if (s1Frequency != s2Frequency) { - s1Frequency = -s1Frequency; - s2Frequency = -s2Frequency; + return s1Negated + ? 
s2Frequency.CompareTo(s1Frequency) + : s1Frequency.CompareTo(s2Frequency); } - int c = s1Frequency.CompareTo(s2Frequency); - if (c != 0) + if (!RegexCharClass.IsAscii(s1Chars) && !RegexCharClass.IsAscii(s2Chars)) { - return c; + // Prefer the set with fewer values. + return s1CharsLength.CompareTo(s2CharsLength); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -494,45 +493,59 @@ static float SumFrequencies(char[] chars) float sum = 0; foreach (char c in chars) { - // Lookup each character in the table. For values > 255, this will end up truncating + // Lookup each character in the table. Values >= 128 are ignored // and thus we'll get skew in the data. It's already a gross approximation, though, // and it is primarily meant for disambiguation of ASCII letters. - sum += s_frequency[(byte)c]; + if (c < 128) + { + sum += s_frequency[c]; + } } return sum; } } + // If one has chars and the other has a range, prefer the shorter set. + if ((s1CharsLength > 0 && s2RangeLength > 0) || (s1RangeLength > 0 && s2CharsLength > 0)) + { + int c = Math.Max(s1CharsLength, s1RangeLength).CompareTo(Math.Max(s2CharsLength, s2RangeLength)); + if (c != 0) + { + return c; + } + + // If lengths are the same, prefer the chars. + return s1CharsLength > 0 ? -1 : 1; + } + // If one has chars and the other doesn't, prioritize the one with chars. - if ((s1.Chars is not null) != (s2.Chars is not null)) + if ((s1CharsLength > 0) != (s2CharsLength > 0)) { - return s1.Chars is not null ? -1 : 1; + return s1CharsLength > 0 ? -1 : 1; } // If one has a range and the other doesn't, prioritize the one with a range. - if ((s1.Range is not null) != (s2.Range is not null)) + if ((s1RangeLength > 0) != (s2RangeLength > 0)) { - return s1.Range is not null ? -1 : 1; + return s1RangeLength > 0 ? -1 : 1; } // If both have ranges, prefer the one that includes fewer characters. 
- if (s1.Range is not null) + if (s1RangeLength > 0) { - return - GetRangeLength(s1.Range.GetValueOrDefault()).CompareTo( - GetRangeLength(s2.Range.GetValueOrDefault())); - - static int GetRangeLength((char LowInclusive, char HighInclusive, bool Negated) range) - { - int length = range.HighInclusive - range.LowInclusive + 1; - return range.Negated ? - char.MaxValue + 1 - length : - length; - } + return s1RangeLength.CompareTo(s2RangeLength); } // As a tiebreaker, prioritize the earlier one. return s1.Distance.CompareTo(s2.Distance); + + static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool negated) + { + int length = range.HighInclusive - range.LowInclusive + 1; + return negated ? + char.MaxValue + 1 - length : + length; + } }); /// @@ -947,22 +960,6 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le 1.024f /* ' h' */, 3.750f /* ' i' */, 0.286f /* ' j' */, 0.439f /* ' k' */, 2.913f /* ' l' */, 1.459f /* ' m' */, 3.908f /* ' n' */, 3.230f /* ' o' */, 1.444f /* ' p' */, 0.231f /* ' q' */, 4.220f /* ' r' */, 3.924f /* ' s' */, 5.312f /* ' t' */, 2.112f /* ' u' */, 0.737f /* ' v' */, 0.573f /* ' w' */, 0.992f /* ' x' */, 1.067f /* ' y' */, 0.181f /* ' z' */, 0.391f /* ' {' */, 0.056f /* ' |' */, 0.391f /* ' }' */, 0.002f /* ' ~' */, 0.000f /* '\x7F' */, - 0.000f /* '\x80' */, 0.000f /* '\x81' */, 0.000f /* '\x82' */, 0.000f /* '\x83' */, 0.000f /* '\x84' */, 0.000f /* '\x85' */, 0.000f /* '\x86' */, 0.000f /* '\x87' */, - 0.000f /* '\x88' */, 0.000f /* '\x89' */, 0.000f /* '\x8A' */, 0.000f /* '\x8B' */, 0.000f /* '\x8C' */, 0.000f /* '\x8D' */, 0.000f /* '\x8E' */, 0.000f /* '\x8F' */, - 0.000f /* '\x90' */, 0.000f /* '\x91' */, 0.000f /* '\x92' */, 0.000f /* '\x93' */, 0.000f /* '\x94' */, 0.000f /* '\x95' */, 0.000f /* '\x96' */, 0.000f /* '\x97' */, - 0.000f /* '\x98' */, 0.000f /* '\x99' */, 0.000f /* '\x9A' */, 0.000f /* '\x9B' */, 0.000f /* '\x9C' */, 0.000f /* '\x9D' */, 0.000f /* '\x9E' */, 0.000f /* 
'\x9F' */, - 0.000f /* '\xA0' */, 0.000f /* '\xA1' */, 0.000f /* '\xA2' */, 0.000f /* '\xA3' */, 0.000f /* '\xA4' */, 0.000f /* '\xA5' */, 0.000f /* '\xA6' */, 0.000f /* '\xA7' */, - 0.000f /* '\xA8' */, 0.000f /* '\xA9' */, 0.000f /* '\xAA' */, 0.000f /* '\xAB' */, 0.000f /* '\xAC' */, 0.000f /* '\xAD' */, 0.000f /* '\xAE' */, 0.000f /* '\xAF' */, - 0.000f /* '\xB0' */, 0.000f /* '\xB1' */, 0.000f /* '\xB2' */, 0.000f /* '\xB3' */, 0.000f /* '\xB4' */, 0.000f /* '\xB5' */, 0.000f /* '\xB6' */, 0.000f /* '\xB7' */, - 0.000f /* '\xB8' */, 0.000f /* '\xB9' */, 0.000f /* '\xBA' */, 0.000f /* '\xBB' */, 0.000f /* '\xBC' */, 0.000f /* '\xBD' */, 0.000f /* '\xBE' */, 0.000f /* '\xBF' */, - 0.000f /* '\xC0' */, 0.000f /* '\xC1' */, 0.000f /* '\xC2' */, 0.000f /* '\xC3' */, 0.000f /* '\xC4' */, 0.000f /* '\xC5' */, 0.000f /* '\xC6' */, 0.000f /* '\xC7' */, - 0.000f /* '\xC8' */, 0.000f /* '\xC9' */, 0.000f /* '\xCA' */, 0.000f /* '\xCB' */, 0.000f /* '\xCC' */, 0.000f /* '\xCD' */, 0.000f /* '\xCE' */, 0.000f /* '\xCF' */, - 0.000f /* '\xD0' */, 0.000f /* '\xD1' */, 0.000f /* '\xD2' */, 0.000f /* '\xD3' */, 0.000f /* '\xD4' */, 0.000f /* '\xD5' */, 0.000f /* '\xD6' */, 0.000f /* '\xD7' */, - 0.000f /* '\xD8' */, 0.000f /* '\xD9' */, 0.000f /* '\xDA' */, 0.000f /* '\xDB' */, 0.000f /* '\xDC' */, 0.000f /* '\xDD' */, 0.000f /* '\xDE' */, 0.000f /* '\xDF' */, - 0.000f /* '\xE0' */, 0.000f /* '\xE1' */, 0.000f /* '\xE2' */, 0.000f /* '\xE3' */, 0.000f /* '\xE4' */, 0.000f /* '\xE5' */, 0.000f /* '\xE6' */, 0.000f /* '\xE7' */, - 0.000f /* '\xE8' */, 0.000f /* '\xE9' */, 0.000f /* '\xEA' */, 0.000f /* '\xEB' */, 0.000f /* '\xEC' */, 0.000f /* '\xED' */, 0.000f /* '\xEE' */, 0.000f /* '\xEF' */, - 0.000f /* '\xF0' */, 0.000f /* '\xF1' */, 0.000f /* '\xF2' */, 0.000f /* '\xF3' */, 0.000f /* '\xF4' */, 0.000f /* '\xF5' */, 0.000f /* '\xF6' */, 0.000f /* '\xF7' */, - 0.000f /* '\xF8' */, 0.000f /* '\xF9' */, 0.000f /* '\xFA' */, 0.000f /* '\xFB' */, 0.000f /* '\xFC' */, 0.000f /* 
'\xFD' */, 0.000f /* '\xFE' */, 0.000f /* '\xFF' */, }; // The above table was generated programmatically with the following. This can be augmented to incorporate additional data sources, @@ -992,7 +989,7 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le // Console.WriteLine("private static readonly float[] s_frequency = new float[]"); // Console.WriteLine("{"); // int i = 0; - // for (int row = 0; row < 32; row++) + // for (int row = 0; row < 16; row++) // { // Console.Write(" "); // for (int col = 0; col < 8; col++) From 768a5ebf325cd61fb4217d2f7b7647ab40516dc1 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Wed, 7 Dec 2022 23:26:16 +0100 Subject: [PATCH 5/7] Add XML comment to IndexOfAnyValues declarations --- .../gen/RegexGenerator.Emitter.cs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 12683cbe7ec94..c09c2df09342d 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -22,6 +22,10 @@ namespace System.Text.RegularExpressions.Generator { public partial class RegexGenerator { + /// Escapes '&', '<' and '>' characters. We aren't using HtmlEncode as that would also escape single and double quotes. + private static string EscapeXmlComment(string text) => + text.Replace("&", "&").Replace("<", "<").Replace(">", ">"); + /// Emits the definition of the partial method. This method just delegates to the property cache on the generated Regex-derived type. 
private static void EmitRegexPartialMethod(RegexMethod regexMethod, IndentedTextWriter writer) { @@ -405,9 +409,12 @@ private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary {fieldName} = IndexOfAnyValues.Create({Literal(new string(asciiChars))});", + $"/// Cached data to efficiently search for a character in the set {EscapeXmlComment(setLiteral)}.", + $"internal static readonly IndexOfAnyValues {fieldName} = IndexOfAnyValues.Create({setLiteral});", }); } @@ -5066,14 +5073,11 @@ RegexNodeKind.BackreferenceConditional when node.Parent.Child(1) == node => "Not _ => "", }; - // Get a textual description of the node, making it safe for an XML comment (escaping the minimal amount necessary to - // avoid compilation failures: we don't want to escape single and double quotes, as HtmlEncode would do). string nodeDescription = DescribeNode(node, rm); - nodeDescription = nodeDescription.Replace("&", "&").Replace("<", "<").Replace(">", ">"); // Write out the line for the node. const char BulletPoint = '\u25CB'; - writer.WriteLine($"/// {new string(' ', depth * 4)}{BulletPoint} {tag}{nodeDescription}
<br/>"); + writer.WriteLine($"/// {new string(' ', depth * 4)}{BulletPoint} {tag}{EscapeXmlComment(nodeDescription)}<br/>
"); } // Process each child. From 981d33c2ae465f9156f85817763d3926f5c6f3a7 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Fri, 9 Dec 2022 01:30:21 +0100 Subject: [PATCH 6/7] PR feedback --- .../gen/RegexGenerator.Emitter.cs | 4 ++-- .../Text/RegularExpressions/CompiledRegexRunner.cs | 4 ++-- .../System/Text/RegularExpressions/RegexCompiler.cs | 6 +++--- .../Text/RegularExpressions/RegexPrefixAnalyzer.cs | 11 ++++++++--- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index c09c2df09342d..a24655fc629a4 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -4451,8 +4451,8 @@ private static bool TryEmitIndexOf( Span setChars = stackalloc char[5]; // current max that's vectorized int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars); - // Prefer IndexOfAnyInRange over IndexOfAny for sets of 2-5 values that fit in a single range - if (setCharsCount != 1 && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range. + if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) { string indexOfAnyInRangeName = !negated ? 
"IndexOfAnyInRange" : diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index 75e34610da8e8..3517193477797 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs @@ -10,13 +10,13 @@ internal sealed class CompiledRegexRunner : RegexRunner { private readonly ScanDelegate _scanMethod; -#pragma warning disable CA1823 // Avoid unused private fields. Justification: Used via reflection to cache the Case behavior if needed. -#pragma warning disable CS0169 private readonly IndexOfAnyValues[]? _indexOfAnyValues; /// This field will only be set if the pattern contains backreferences and has RegexOptions.IgnoreCase private readonly CultureInfo? _culture; +#pragma warning disable CA1823 // Avoid unused private fields. Justification: Used via reflection to cache the Case behavior if needed. 
+#pragma warning disable CS0169 private RegexCaseBehavior _caseBehavior; #pragma warning restore CS0169 #pragma warning restore CA1823 diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index e3115ddc3665b..c797f2bf16bae 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -3482,7 +3482,7 @@ literal.SetChars is not null || { Debug.Assert(node.Ch < 128); Array.Resize(ref asciiChars, asciiChars.Length + 1); - asciiChars[asciiChars.Length - 1] = node.Ch; + asciiChars[^1] = node.Ch; } LoadIndexOfAnyValues(asciiChars); Call(s_spanIndexOfAnyIndexOfAnyValues); @@ -5005,8 +5005,8 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate) int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars); // IndexOfAny{Except}InRange - // Prefer IndexOfAnyInRange over IndexOfAny for sets of 2-5 values that fit in a single range - if (setCharsCount != 1 && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range. 
+ if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) { if (lowInclusive == highInclusive) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index c769f9463de10..2cb491a9646b5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -203,15 +203,20 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) result.Negated = RegexCharClass.IsNegated(result.Set); int count = RegexCharClass.GetSetChars(result.Set, scratch); + + // We only use IndexOfAnyExcept for primary sets with a single value. + // For non-negated sets, we use IndexOfAny for up to 5 values. if (result.Negated ? (count == 1) : (count > 0)) { result.Chars = scratch.Slice(0, count).ToArray(); } - if (thorough) + // 'Count == 1' will always be handeled by Chars above. + if (thorough && count != 1) { - // Prefer IndexOfAnyInRange over IndexOfAny for sets of 2-5 values that fit in a single range - if (count != 1 && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range. + // Chars may be null for 'Count == 2' if we're dealing with a negated set. 
+ if ((count != 2 || result.Chars is null) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) { result.Chars = null; result.Range = (lowInclusive, highInclusive); From b2951db705e9fd20731fa40e994aaf7b114f6d4a Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sat, 10 Dec 2022 01:49:59 +0100 Subject: [PATCH 7/7] Revert useless code around length=1 negated chars --- .../gen/RegexGenerator.Emitter.cs | 20 +++++------- .../Text/RegularExpressions/RegexCompiler.cs | 9 +++--- .../RegexFindOptimizations.cs | 7 ++--- .../RegularExpressions/RegexPrefixAnalyzer.cs | 31 ++++--------------- 4 files changed, 22 insertions(+), 45 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index a24655fc629a4..809d5d29c9c74 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -893,21 +893,17 @@ void EmitFixedSet_LeftToRight() (true, _) => $"{span}.Slice(i + {primarySet.Distance})", }; + Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null)); + string indexOf = - primarySet.Chars is not null ? 
(primarySet.Negated, primarySet.Chars.Length) switch - { - (false, 1) => $"{span}.IndexOf({Literal(primarySet.Chars[0])})", - (false, 2) => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", - (false, 3) => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", - (false, _) => $"{span}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(primarySet.Chars, requiredHelpers)})", - (true, 1) => $"{span}.IndexOfAnyExcept({Literal(primarySet.Chars[0])})", - _ => throw new InvalidOperationException("Expected that negated sets will have at most 1 value in Chars."), - } : - primarySet.AsciiSet is not null ? primarySet.Negated switch + primarySet.Chars is not null ? primarySet.Chars.Length switch { - false => $"{span}.IndexOfAny({EmitIndexOfAnyValues(primarySet.AsciiSet, requiredHelpers)})", - _ => throw new InvalidOperationException("Expected AsciiSets not to be negated."), + 1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})", + 2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", + 3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", + _ => $"{span}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(primarySet.Chars, requiredHelpers)})", } : + primarySet.AsciiSet is not null ? 
$"{span}.IndexOfAny({EmitIndexOfAnyValues(primarySet.AsciiSet, requiredHelpers)})" : (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch { (false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})", diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index c797f2bf16bae..d85473e28e73b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -886,15 +886,16 @@ void EmitFixedSet_LeftToRight() Ldloc(textSpanLocal); } + Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null)); + if (primarySet.Chars is not null) { - Debug.Assert(!primarySet.Negated || primarySet.Chars.Length == 1); switch (primarySet.Chars.Length) { case 1: - // tmp = ...IndexOf{AnyExcept}(setChars[0]); + // tmp = ...IndexOf(setChars[0]); Ldc(primarySet.Chars[0]); - Call(primarySet.Negated ? 
s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); + Call(s_spanIndexOfChar); break; case 2: @@ -1062,7 +1063,7 @@ void EmitFixedSet_RightToLeft() RegexFindOptimizations.FixedDistanceSet set = _regexTree.FindOptimizations.FixedDistanceSets![0]; Debug.Assert(set.Distance == 0); - if (set.Chars is { Length: 1 } && !set.Negated) + if (set.Chars is { Length: 1 }) { // pos = inputSpan.Slice(0, pos).LastIndexOf(set.Chars[0]); Ldloca(inputSpan); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 14b4c80a308cf..d02c74a70c7b6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -159,8 +159,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) // the set's characteristics. 
if (!compiled && fixedDistanceSets.Count == 1 && - fixedDistanceSets[0].Chars is { Length: 1 } && - !fixedDistanceSets[0].Negated) + fixedDistanceSets[0].Chars is { Length: 1 }) { FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], null, fixedDistanceSets[0].Distance); FindMode = FindNextStartingPositionMode.FixedDistanceChar_LeftToRight; @@ -557,7 +556,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, string set = primarySet.Set; ReadOnlySpan span = textSpan.Slice(pos); - if (chars is not null && !primarySet.Negated) + if (chars is not null) { int i = span.IndexOfAny(chars); if (i >= 0) @@ -624,7 +623,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, int endMinusRequiredLength = textSpan.Length - Math.Max(1, MinRequiredLength); - if (primarySet.Chars is not null && !primarySet.Negated) + if (primarySet.Chars is not null) { for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 2cb491a9646b5..0e0badd650ebf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -204,19 +204,15 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) int count = RegexCharClass.GetSetChars(result.Set, scratch); - // We only use IndexOfAnyExcept for primary sets with a single value. - // For non-negated sets, we use IndexOfAny for up to 5 values. - if (result.Negated ? (count == 1) : (count > 0)) + if (!result.Negated && count > 0) { result.Chars = scratch.Slice(0, count).ToArray(); } - // 'Count == 1' will always be handeled by Chars above. 
- if (thorough && count != 1) + if (thorough) { // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range. - // Chars may be null for 'Count == 2' if we're dealing with a negated set. - if ((count != 2 || result.Chars is null) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) + if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) { result.Chars = null; result.Range = (lowInclusive, highInclusive); @@ -454,25 +450,12 @@ public static void SortFixedDistanceSetsByQuality(List 0) - { - s1CharsLength = char.MaxValue + 1 - s1CharsLength; - } - - if (s2Negated && s2CharsLength > 0) - { - s2CharsLength = char.MaxValue + 1 - s2CharsLength; - } + Debug.Assert(!s1Negated || s1Chars is null); + Debug.Assert(!s2Negated || s2Chars is null); // If both have chars, prioritize the one with the smaller frequency for those chars. if (s1Chars is not null && s2Chars is not null) { - // If one is negated and the other isn't, prefer the non-negated one. - if (s1Negated != s2Negated) - { - return s1Negated ? 1 : -1; - } - // Prefer sets with less frequent values. The frequency is only an approximation, // used as a tie-breaker when we'd otherwise effectively be picking randomly. // True frequencies will vary widely based on the actual data being searched, the language of the data, etc. @@ -481,9 +464,7 @@ public static void SortFixedDistanceSetsByQuality(List