diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 547b4115161d7f..b7e23bbce5ec17 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -771,8 +771,8 @@ void EmitFixedSet_LeftToRight() { Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - List<(char[]? Chars, string Set, int Distance)>? sets = regexTree.FindOptimizations.FixedDistanceSets; - (char[]? Chars, string Set, int Distance) primarySet = sets![0]; + List? sets = regexTree.FindOptimizations.FixedDistanceSets; + RegexFindOptimizations.FixedDistanceSet primarySet = sets![0]; const int MaxSets = 4; int setsToUse = Math.Min(sets.Count, MaxSets); @@ -784,7 +784,7 @@ void EmitFixedSet_LeftToRight() // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix. // We can use it if this is a case-sensitive class with a small number of characters in the class. int setIndex = 0; - bool canUseIndexOf = primarySet.Chars is not null; + bool canUseIndexOf = primarySet.Chars is not null || primarySet.Range is not null; bool needLoop = !canUseIndexOf || setsToUse > 1; FinishEmitBlock loopBlock = default; @@ -809,13 +809,21 @@ void EmitFixedSet_LeftToRight() (true, _) => $"{span}.Slice(i + {primarySet.Distance})", }; - string indexOf = primarySet.Chars!.Length switch - { - 1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})", - 2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", - 3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", - _ => $"{span}.IndexOfAny({Literal(new string(primarySet.Chars))})", - }; + string indexOf = + primarySet.Chars is not null ? 
primarySet.Chars!.Length switch + { + 1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})", + 2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", + 3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", + _ => $"{span}.IndexOfAny({Literal(new string(primarySet.Chars))})", + } : + (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Range.Value.Negated) switch + { + (false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})", + (true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})", + (false, true) => $"{span}.IndexOfAnyExceptInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})", + (true, true) => $"{span}.IndexOfAnyExcept({Literal(primarySet.Range.Value.LowInclusive)})", + }; if (needLoop) { @@ -910,7 +918,7 @@ void EmitFixedSet_RightToLeft() { Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - (char[]? Chars, string Set, int Distance) set = regexTree.FindOptimizations.FixedDistanceSets![0]; + RegexFindOptimizations.FixedDistanceSet set = regexTree.FindOptimizations.FixedDistanceSets![0]; Debug.Assert(set.Distance == 0); writer.WriteLine($"// The pattern begins with {DescribeSet(set.Set)}."); @@ -2883,21 +2891,33 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL // We're backtracking. Check the timeout. EmitTimeoutCheckIfNeeded(writer, rm); - if (!rtl && subsequent?.FindStartingLiteral() is ValueTuple literal) // char, string, chars, negated + if (!rtl && subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal) { writer.WriteLine($"if ({startingPos} >= {endingPos} ||"); - (string lastIndexOfName, string lastIndexOfAnyName) = !literal.Item4 ? 
+ (string lastIndexOfName, string lastIndexOfAnyName) = !literal.Negated ? ("LastIndexOf", "LastIndexOfAny") : ("LastIndexOfAnyExcept", "LastIndexOfAnyExcept"); - using (EmitBlock(writer, - literal.Item2 is not null ? $" ({endingPos} = inputSpan.Slice({startingPos}, Math.Min(inputSpan.Length, {endingPos} + {literal.Item2.Length - 1}) - {startingPos}).{lastIndexOfName}({Literal(literal.Item2)})) < 0)" : - literal.Item3 is null ? $" ({endingPos} = inputSpan.Slice({startingPos}, {endingPos} - {startingPos}).{lastIndexOfName}({Literal(literal.Item1)})) < 0)" : - literal.Item3.Length switch + + string setEndingPosCondition = $" ({endingPos} = inputSpan.Slice({startingPos}, "; + if (literal.String is not null) + { + setEndingPosCondition += $"Math.Min(inputSpan.Length, {endingPos} + {literal.String.Length - 1}) - {startingPos}).{lastIndexOfName}({Literal(literal.String)}"; + } + else + { + setEndingPosCondition += $"{endingPos} - {startingPos})."; + setEndingPosCondition += literal.SetChars is not null ? literal.SetChars.Length switch { - 2 => $" ({endingPos} = inputSpan.Slice({startingPos}, {endingPos} - {startingPos}).{lastIndexOfAnyName}({Literal(literal.Item3[0])}, {Literal(literal.Item3[1])})) < 0)", - 3 => $" ({endingPos} = inputSpan.Slice({startingPos}, {endingPos} - {startingPos}).{lastIndexOfAnyName}({Literal(literal.Item3[0])}, {Literal(literal.Item3[1])}, {Literal(literal.Item3[2])})) < 0)", - _ => $" ({endingPos} = inputSpan.Slice({startingPos}, {endingPos} - {startingPos}).{lastIndexOfAnyName}({Literal(literal.Item3)})) < 0)", - })) + 2 => $"{lastIndexOfAnyName}({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}", + 3 => $"{lastIndexOfAnyName}({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}, {Literal(literal.SetChars[2])}", + _ => $"{lastIndexOfAnyName}({Literal(literal.SetChars)}", + } : + literal.Range.LowInclusive == literal.Range.HighInclusive ? 
$"{lastIndexOfName}({Literal(literal.Range.LowInclusive)}" : + $"{lastIndexOfAnyName}InRange({Literal(literal.Range.LowInclusive)}, {Literal(literal.Range.HighInclusive)}"; + } + setEndingPosCondition += ")) < 0)"; + + using (EmitBlock(writer, setEndingPosCondition)) { Goto(doneLabel); } @@ -3043,8 +3063,12 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL { if (iterationCount is null && node.Kind is RegexNodeKind.Notonelazy && - subsequent?.FindStartingLiteral(4) is ValueTuple literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch - !literal.Item4) // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method + subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch + !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method + (literal.String is not null || + literal.SetChars is not null || + literal.Range.LowInclusive == literal.Range.HighInclusive || + (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union { // e.g. "<[^>]*?>" @@ -3054,32 +3078,37 @@ node.Kind is RegexNodeKind.Notonelazy && // This lazy loop will consume all characters other than node.Ch until the subsequent literal. // We can implement it to search for either that char or the literal, whichever comes first. - if (literal.Item2 is not null) // string literal + if (literal.String is not null) // string literal { - overlap = literal.Item2[0] == node.Ch; + overlap = literal.String[0] == node.Ch; writer.WriteLine(overlap ? 
$"{startingPos} = {sliceSpan}.IndexOf({Literal(node.Ch)});" : - $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.Item2[0])});"); + $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.String[0])});"); } - else if (literal.Item3 is null) // char literal + else if (literal.SetChars is not null) // set literal { - overlap = literal.Item1 == node.Ch; + overlap = literal.SetChars.Contains(node.Ch); + writer.WriteLine((overlap, literal.SetChars.Length) switch + { + (true, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});", + (true, 3) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}, {Literal(literal.SetChars[2])});", + (true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars)});", + + (false, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});", + (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal($"{node.Ch}{literal.SetChars}")});", + }); + } + else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char + { + overlap = literal.Range.LowInclusive == node.Ch; writer.WriteLine(overlap ? 
$"{startingPos} = {sliceSpan}.IndexOf({Literal(node.Ch)});" : - $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.Item1)});"); + $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.Range.LowInclusive)});"); } - else // set literal + else // char range { - overlap = literal.Item3.Contains(node.Ch); - writer.WriteLine((overlap, literal.Item3.Length) switch - { - (true, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.Item3[0])}, {Literal(literal.Item3[1])});", - (true, 3) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.Item3[0])}, {Literal(literal.Item3[1])}, {Literal(literal.Item3[2])});", - (true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.Item3)});", - - (false, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.Item3[0])}, {Literal(literal.Item3[1])});", - (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal($"{node.Ch}{literal.Item3}")});", - }); + overlap = true; + writer.WriteLine($"{startingPos} = {sliceSpan}.IndexOfAnyInRange({Literal(literal.Range.LowInclusive)}, {Literal(literal.Range.HighInclusive)});"); } // If the search didn't find anything, fail the match. If it did find something, then we need to consider whether @@ -3102,23 +3131,26 @@ node.Kind is RegexNodeKind.Notonelazy && else if (iterationCount is null && node.Kind is RegexNodeKind.Setlazy && node.Str == RegexCharClass.AnyClass && - subsequent?.FindStartingLiteral() is ValueTuple literal2) + subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal2) { // e.g. ".*?string" with RegexOptions.Singleline // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal // isn't found, the loop fails. We can implement it to just search for that literal. - (string indexOfName, string indexOfAnyName) = !literal2.Item4 ? + (string indexOfName, string indexOfAnyName) = !literal2.Negated ? 
("IndexOf", "IndexOfAny") : ("IndexOfAnyExcept", "IndexOfAnyExcept"); + writer.Write($"{startingPos} = {sliceSpan}."); writer.WriteLine( - literal2.Item2 is not null ? $"{startingPos} = {sliceSpan}.{indexOfName}({Literal(literal2.Item2)});" : - literal2.Item3 is null ? $"{startingPos} = {sliceSpan}.{indexOfName}({Literal(literal2.Item1)});" : - literal2.Item3.Length switch + literal2.String is not null ? $"{indexOfName}({Literal(literal2.String)});" : + literal2.SetChars is not null ? literal2.SetChars.Length switch { - 2 => $"{startingPos} = {sliceSpan}.{indexOfAnyName}({Literal(literal2.Item3[0])}, {Literal(literal2.Item3[1])});", - 3 => $"{startingPos} = {sliceSpan}.{indexOfAnyName}({Literal(literal2.Item3[0])}, {Literal(literal2.Item3[1])}, {Literal(literal2.Item3[2])});", - _ => $"{startingPos} = {sliceSpan}.{indexOfAnyName}({Literal(literal2.Item3)});", - }); + 2 => $"{indexOfAnyName}({Literal(literal2.SetChars[0])}, {Literal(literal2.SetChars[1])});", + 3 => $"{indexOfAnyName}({Literal(literal2.SetChars[0])}, {Literal(literal2.SetChars[1])}, {Literal(literal2.SetChars[2])});", + _ => $"{indexOfAnyName}({Literal(literal2.SetChars)});", + } : + literal2.Range.LowInclusive == literal2.Range.HighInclusive ? $"{indexOfName}({Literal(literal2.Range.LowInclusive)});" : + $"{indexOfAnyName}InRange({Literal(literal2.Range.LowInclusive)}, {Literal(literal2.Range.HighInclusive)});"); + using (EmitBlock(writer, $"if ({startingPos} < 0)")) { Goto(doneLabel); @@ -3686,6 +3718,28 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = TransferSliceStaticPosToPos(); writer.WriteLine($"int {iterationLocal} = inputSpan.Length - pos;"); } + else if (node.IsSetFamily && + maxIterations == int.MaxValue && + RegexCharClass.TryGetSingleRange(node.Str!, out char rangeLowInclusive, out char rangeHighInclusive)) + { + // If the set contains a single range, we can use an IndexOfAny{Except}InRange to find any of the target characters. 
+ // As with the cases above, the unbounded constraint is purely for simplicity. + string indexOfMethod = RegexCharClass.IsNegated(node.Str!) ? "IndexOfAnyInRange" : "IndexOfAnyExceptInRange"; + + writer.Write($"int {iterationLocal} = {sliceSpan}"); + if (sliceStaticPos != 0) + { + writer.Write($".Slice({sliceStaticPos})"); + } + writer.WriteLine($".{indexOfMethod}({Literal(rangeLowInclusive)}, {Literal(rangeHighInclusive)});"); + using (EmitBlock(writer, $"if ({iterationLocal} < 0)")) + { + writer.WriteLine(sliceStaticPos > 0 ? + $"{iterationLocal} = {sliceSpan}.Length - {sliceStaticPos};" : + $"{iterationLocal} = {sliceSpan}.Length;"); + } + writer.WriteLine(); + } else { // For everything else, do a normal loop. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index bdb513a3f0704d..17e2432c356cfe 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -7,7 +7,6 @@ using System.Globalization; using System.Reflection; using System.Reflection.Emit; -using System.Runtime.InteropServices; using System.Threading; namespace System.Text.RegularExpressions @@ -67,6 +66,8 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanIndexOfAnyExceptCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), 
Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyExceptSpan = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfAnyInRange = typeof(MemoryExtensions).GetMethod("IndexOfAnyInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfAnyExceptInRange = typeof(MemoryExtensions).GetMethod("IndexOfAnyExceptInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfChar = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), 
Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); @@ -76,6 +77,8 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanLastIndexOfAnyExceptCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptCharCharChar = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanLastIndexOfAnyExceptSpan = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfAnyInRange = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfAnyExceptInRange = typeof(MemoryExtensions).GetMethod("LastIndexOfAnyExceptInRange", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanSliceIntIntMethod 
= typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_spanStartsWithSpan = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -802,8 +805,8 @@ void EmitFixedSet_LeftToRight() { Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - List<(char[]? Chars, string Set, int Distance)>? sets = _regexTree.FindOptimizations.FixedDistanceSets; - (char[]? Chars, string Set, int Distance) primarySet = sets![0]; + List? sets = _regexTree.FindOptimizations.FixedDistanceSets; + RegexFindOptimizations.FixedDistanceSet primarySet = sets![0]; const int MaxSets = 4; int setsToUse = Math.Min(sets.Count, MaxSets); @@ -819,7 +822,7 @@ void EmitFixedSet_LeftToRight() // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix. // We can use it if this is a case-sensitive class with a small number of characters in the class. 
int setIndex = 0; - bool canUseIndexOf = primarySet.Chars is not null; + bool canUseIndexOf = primarySet.Chars is not null || primarySet.Range is not null; bool needLoop = !canUseIndexOf || setsToUse > 1; Label checkSpanLengthLabel = default; @@ -867,34 +870,53 @@ void EmitFixedSet_LeftToRight() Ldloc(textSpanLocal); } - switch (primarySet.Chars!.Length) + if (primarySet.Chars is not null) { - case 1: - // tmp = ...IndexOf(setChars[0]); - Ldc(primarySet.Chars[0]); - Call(s_spanIndexOfChar); - break; + switch (primarySet.Chars!.Length) + { + case 1: + // tmp = ...IndexOf(setChars[0]); + Ldc(primarySet.Chars[0]); + Call(s_spanIndexOfChar); + break; - case 2: - // tmp = ...IndexOfAny(setChars[0], setChars[1]); - Ldc(primarySet.Chars[0]); - Ldc(primarySet.Chars[1]); - Call(s_spanIndexOfAnyCharChar); - break; + case 2: + // tmp = ...IndexOfAny(setChars[0], setChars[1]); + Ldc(primarySet.Chars[0]); + Ldc(primarySet.Chars[1]); + Call(s_spanIndexOfAnyCharChar); + break; - case 3: - // tmp = ...IndexOfAny(setChars[0], setChars[1], setChars[2]}); - Ldc(primarySet.Chars[0]); - Ldc(primarySet.Chars[1]); - Ldc(primarySet.Chars[2]); - Call(s_spanIndexOfAnyCharCharChar); - break; + case 3: + // tmp = ...IndexOfAny(setChars[0], setChars[1], setChars[2]}); + Ldc(primarySet.Chars[0]); + Ldc(primarySet.Chars[1]); + Ldc(primarySet.Chars[2]); + Call(s_spanIndexOfAnyCharCharChar); + break; - default: - Ldstr(new string(primarySet.Chars)); - Call(s_stringAsSpanMethod); - Call(s_spanIndexOfAnySpan); - break; + default: + Ldstr(new string(primarySet.Chars)); + Call(s_stringAsSpanMethod); + Call(s_spanIndexOfAnySpan); + break; + } + } + else + { + if (primarySet.Range!.Value.LowInclusive == primarySet.Range.Value.HighInclusive) + { + // tmp = ...IndexOf{AnyExcept}(low); + Ldc(primarySet.Range!.Value.LowInclusive); + Call(primarySet.Range.Value.Negated ? 
s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); + } + else + { + // tmp = ...IndexOfAny{Except}InRange(low, high); + Ldc(primarySet.Range!.Value.LowInclusive); + Ldc(primarySet.Range.Value.HighInclusive); + Call(primarySet.Range.Value.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange); + } } if (needLoop) @@ -1014,7 +1036,7 @@ void EmitFixedSet_RightToLeft() { Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - (char[]? Chars, string Set, int Distance) set = _regexTree.FindOptimizations.FixedDistanceSets![0]; + RegexFindOptimizations.FixedDistanceSet set = _regexTree.FindOptimizations.FixedDistanceSets![0]; Debug.Assert(set.Distance == 0); if (set.Chars is { Length: 1 }) @@ -3151,29 +3173,28 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL BleFar(doneLabel); } - if (!rtl && subsequent?.FindStartingLiteral() is ValueTuple literal) + if (!rtl && subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal) { // endingPos = inputSpan.Slice(startingPos, Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos).LastIndexOf(literal); // if (endingPos < 0) // { // goto doneLabel; // } - bool negated = literal.Item4; Ldloca(inputSpan); Ldloc(startingPos); - if (literal.Item2 is not null) + if (literal.String is not null) { - Debug.Assert(!negated, "strings should not be negated"); + Debug.Assert(!literal.Negated, "strings should not be negated"); Ldloca(inputSpan); Call(s_spanGetLengthMethod); Ldloc(endingPos); - Ldc(literal.Item2.Length - 1); + Ldc(literal.String.Length - 1); Add(); Call(s_mathMinIntInt); Ldloc(startingPos); Sub(); Call(s_spanSliceIntIntMethod); - Ldstr(literal.Item2); + Ldstr(literal.String); Call(s_stringAsSpanMethod); Call(s_spanLastIndexOfSpan); } @@ -3183,34 +3204,40 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? 
subsequent = null, bool emitL Ldloc(startingPos); Sub(); Call(s_spanSliceIntIntMethod); - if (literal.Item3 is not null) + if (literal.SetChars is not null) { - switch (literal.Item3.Length) + switch (literal.SetChars.Length) { case 2: - Ldc(literal.Item3[0]); - Ldc(literal.Item3[1]); - Call(negated ? s_spanLastIndexOfAnyExceptCharChar : s_spanLastIndexOfAnyCharChar); + Ldc(literal.SetChars[0]); + Ldc(literal.SetChars[1]); + Call(literal.Negated ? s_spanLastIndexOfAnyExceptCharChar : s_spanLastIndexOfAnyCharChar); break; case 3: - Ldc(literal.Item3[0]); - Ldc(literal.Item3[1]); - Ldc(literal.Item3[2]); - Call(negated ? s_spanLastIndexOfAnyExceptCharCharChar : s_spanLastIndexOfAnyCharCharChar); + Ldc(literal.SetChars[0]); + Ldc(literal.SetChars[1]); + Ldc(literal.SetChars[2]); + Call(literal.Negated ? s_spanLastIndexOfAnyExceptCharCharChar : s_spanLastIndexOfAnyCharCharChar); break; default: - Ldstr(literal.Item3); + Ldstr(literal.SetChars); Call(s_stringAsSpanMethod); - Call(negated ? s_spanLastIndexOfAnyExceptSpan : s_spanLastIndexOfAnySpan); + Call(literal.Negated ? s_spanLastIndexOfAnyExceptSpan : s_spanLastIndexOfAnySpan); break; } } + else if (literal.Range.LowInclusive == literal.Range.HighInclusive) + { + Ldc(literal.Range.LowInclusive); + Call(literal.Negated ? s_spanLastIndexOfAnyExceptChar : s_spanLastIndexOfChar); + } else { - Ldc(literal.Item1); - Call(negated ? s_spanLastIndexOfAnyExceptChar : s_spanLastIndexOfChar); + Ldc(literal.Range.LowInclusive); + Ldc(literal.Range.HighInclusive); + Call(literal.Negated ? s_spanLastIndexOfAnyExceptInRange : s_spanLastIndexOfAnyInRange); } } Stloc(endingPos); @@ -3381,8 +3408,12 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? 
subsequent = null, bool emitL if (!rtl && iterationCount is null && node.Kind is RegexNodeKind.Notonelazy && - subsequent?.FindStartingLiteral(4) is ValueTuple literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch - !literal.Item4) // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method + subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch + !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method + (literal.String is not null || + literal.SetChars is not null || + literal.Range.LowInclusive == literal.Range.HighInclusive || + (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union { // e.g. "<[^>]*?>" @@ -3393,9 +3424,9 @@ node.Kind is RegexNodeKind.Notonelazy && // This lazy loop will consume all characters other than node.Ch until the subsequent literal. // We can implement it to search for either that char or the literal, whichever comes first. 
Ldloc(slice); - if (literal.Item2 is not null) // string literal + if (literal.String is not null) // string literal { - overlap = literal.Item2[0] == node.Ch; + overlap = literal.String[0] == node.Ch; if (overlap) { // startingPos = slice.IndexOf(node.Ch); @@ -3404,72 +3435,80 @@ node.Kind is RegexNodeKind.Notonelazy && } else { - // startingPos = slice.IndexOfAny(node.Ch, literal.Item2[0]); + // startingPos = slice.IndexOfAny(node.Ch, literal.String[0]); Ldc(node.Ch); - Ldc(literal.Item2[0]); + Ldc(literal.String[0]); Call(s_spanIndexOfAnyCharChar); } } - else if (literal.Item3 is null) // char literal + else if (literal.SetChars is not null) // set literal { - overlap = literal.Item1 == node.Ch; - if (overlap) - { - // startingPos = slice.IndexOf(node.Ch); - Ldc(node.Ch); - Call(s_spanIndexOfChar); - } - else - { - // startingPos = slice.IndexOfAny(node.Ch, literal.Item1); - Ldc(node.Ch); - Ldc(literal.Item1); - Call(s_spanIndexOfAnyCharChar); - } - } - else // set literal - { - overlap = literal.Item3.Contains(node.Ch); - switch ((overlap, literal.Item3.Length)) + overlap = literal.SetChars.Contains(node.Ch); + switch ((overlap, literal.SetChars.Length)) { case (true, 2): - // startingPos = slice.IndexOfAny(literal.Item3[0], literal.Item3[1]); - Ldc(literal.Item3[0]); - Ldc(literal.Item3[1]); + // startingPos = slice.IndexOfAny(literal.SetChars[0], literal.SetChars[1]); + Ldc(literal.SetChars[0]); + Ldc(literal.SetChars[1]); Call(s_spanIndexOfAnyCharChar); break; case (true, 3): - // startingPos = slice.IndexOfAny(literal.Item3[0], literal.Item3[1], literal.Item3[2]); - Ldc(literal.Item3[0]); - Ldc(literal.Item3[1]); - Ldc(literal.Item3[2]); + // startingPos = slice.IndexOfAny(literal.SetChars[0], literal.SetChars[1], literal.SetChars[2]); + Ldc(literal.SetChars[0]); + Ldc(literal.SetChars[1]); + Ldc(literal.SetChars[2]); Call(s_spanIndexOfAnyCharCharChar); break; case (true, _): - // startingPos = slice.IndexOfAny(literal.Item3); - Ldstr(literal.Item3); + // 
startingPos = slice.IndexOfAny(literal.SetChars); + Ldstr(literal.SetChars); Call(s_stringAsSpanMethod); Call(s_spanIndexOfAnySpan); break; case (false, 2): - // startingPos = slice.IndexOfAny(node.Ch, literal.Item3[0], literal.Item3[1]); + // startingPos = slice.IndexOfAny(node.Ch, literal.SetChars[0], literal.SetChars[1]); Ldc(node.Ch); - Ldc(literal.Item3[0]); - Ldc(literal.Item3[1]); + Ldc(literal.SetChars[0]); + Ldc(literal.SetChars[1]); Call(s_spanIndexOfAnyCharCharChar); break; case (false, _): - // startingPos = slice.IndexOfAny($"{node.Ch}{literal.Item3}"); - Ldstr($"{node.Ch}{literal.Item3}"); + // startingPos = slice.IndexOfAny($"{node.Ch}{literal.SetChars}"); + Ldstr($"{node.Ch}{literal.SetChars}"); Call(s_stringAsSpanMethod); Call(s_spanIndexOfAnySpan); break; } } + else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // char literal + { + overlap = literal.Range.LowInclusive == node.Ch; + if (overlap) + { + // startingPos = slice.IndexOf(node.Ch); + Ldc(node.Ch); + Call(s_spanIndexOfChar); + } + else + { + // startingPos = slice.IndexOfAny(node.Ch, literal.Range.LowInclusive); + Ldc(node.Ch); + Ldc(literal.Range.LowInclusive); + Call(s_spanIndexOfAnyCharChar); + } + } + else // range literal + { + // startingPos = slice.IndexOfAnyInRange(literal.Range.LowInclusive, literal.Range.HighInclusive); + overlap = true; + Ldc(literal.Range.LowInclusive); + Ldc(literal.Range.HighInclusive); + Call(s_spanIndexOfAnyInRange); + } Stloc(startingPos); // If the search didn't find anything, fail the match. If it did find something, then we need to consider whether @@ -3515,50 +3554,57 @@ node.Kind is RegexNodeKind.Notonelazy && iterationCount is null && node.Kind is RegexNodeKind.Setlazy && node.Str == RegexCharClass.AnyClass && - subsequent?.FindStartingLiteral() is ValueTuple literal2) + subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal2) { // e.g. 
".*?string" with RegexOptions.Singleline // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal // isn't found, the loop fails. We can implement it to just search for that literal. - bool negated = literal2.Item4; // startingPos = slice.IndexOf(literal); Ldloc(slice); - if (literal2.Item2 is not null) + if (literal2.String is not null) { - Debug.Assert(!negated, "strings should not be negated"); - Ldstr(literal2.Item2); + Debug.Assert(!literal2.Negated, "strings should not be negated"); + Ldstr(literal2.String); Call(s_stringAsSpanMethod); Call(s_spanIndexOfSpan); } - else if (literal2.Item3 is not null) + else if (literal2.SetChars is not null) { - switch (literal2.Item3.Length) + switch (literal2.SetChars.Length) { case 2: - Ldc(literal2.Item3[0]); - Ldc(literal2.Item3[1]); - Call(negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar); + Ldc(literal2.SetChars[0]); + Ldc(literal2.SetChars[1]); + Call(literal2.Negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar); break; case 3: - Ldc(literal2.Item3[0]); - Ldc(literal2.Item3[1]); - Ldc(literal2.Item3[2]); - Call(negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar); + Ldc(literal2.SetChars[0]); + Ldc(literal2.SetChars[1]); + Ldc(literal2.SetChars[2]); + Call(literal2.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar); break; default: - Ldstr(literal2.Item3); + Ldstr(literal2.SetChars); Call(s_stringAsSpanMethod); - Call(negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan); + Call(literal2.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan); break; } } else { - Ldc(literal2.Item1); - Call(negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); + Ldc(literal2.Range.LowInclusive); + if (literal2.Range.LowInclusive == literal2.Range.HighInclusive) + { + Call(literal2.Negated ? 
s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); + } + else + { + Ldc(literal2.Range.HighInclusive); + Call(literal2.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange); + } } Stloc(startingPos); @@ -4338,6 +4384,44 @@ void EmitSingleCharAtomicLoop(RegexNode node) Sub(); Stloc(iterationLocal); } + else if (node.IsSetFamily && + maxIterations == int.MaxValue && + RegexCharClass.TryGetSingleRange(node.Str!, out char rangeLowInclusive, out char rangeHighInclusive)) + { + // If the set contains a single range, we can use an IndexOfAny{Except}InRange to find any of the target characters. + // As with the cases above, the unbounded constraint is purely for simplicity. + + // int i = slice.Slice(sliceStaticPos).IndexOfAny{Except}InRange(rangeLowInclusive, rangeHighInclusive); + if (sliceStaticPos > 0) + { + Ldloca(slice); + Ldc(sliceStaticPos); + Call(s_spanSliceIntMethod); + } + else + { + Ldloc(slice); + } + Ldc(rangeLowInclusive); + Ldc(rangeHighInclusive); + Call(RegexCharClass.IsNegated(node.Str!) ? s_spanIndexOfAnyInRange : s_spanIndexOfAnyExceptInRange); + Stloc(iterationLocal); + + // if (i >= 0) goto atomicLoopDoneLabel; + Ldloc(iterationLocal); + Ldc(0); + BgeFar(atomicLoopDoneLabel); + + // i = slice.Length - sliceStaticPos; + Ldloca(slice); + Call(s_spanGetLengthMethod); + if (sliceStaticPos > 0) + { + Ldc(sliceStaticPos); + Sub(); + } + Stloc(iterationLocal); + } else { // For everything else, do a normal loop. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 673ae8118f42c3..823e3617e3b5e9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -115,9 +115,9 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) else { // The set may match multiple characters. Search for that. - FixedDistanceSets = new List<(char[]? Chars, string Set, int Distance)>() + FixedDistanceSets = new List() { - (chars, charClass, 0) + new FixedDistanceSet(chars, charClass, 0) }; FindMode = FindNextStartingPositionMode.LeadingSet_RightToLeft; _asciiLookups = new uint[1][]; @@ -129,7 +129,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) // We're now left-to-right only and looking for sets. // Build up a list of all of the sets that are a fixed distance from the start of the expression. - List<(char[]? Chars, string Set, int Distance)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter); + List? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter); Debug.Assert(fixedDistanceSets is null || fixedDistanceSets.Count != 0); // See if we can make a string of at least two characters long out of those sets. We should have already caught @@ -227,13 +227,33 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) /// When in fixed distance set mode, gets the set and how far it is from the start of the pattern. /// The case-insensitivity of the 0th entry will always match the mode selected, but subsequent entries may not. - public List<(char[]? Chars, string Set, int Distance)>? FixedDistanceSets { get; } + public List? 
FixedDistanceSets { get; } + + /// Data about a character class at a fixed offset from the start of any match to a pattern. + public struct FixedDistanceSet + { + public FixedDistanceSet(char[]? chars, string set, int distance) + { + Chars = chars; + Set = set; + Distance = distance; + } + + /// The character class description. + public string Set; + /// Small list of all of the characters that make up the set, if known; otherwise, null. + public char[]? Chars; + /// The distance of the set from the beginning of the match. + public int Distance; + /// As an alternative to <see cref="Chars"/>, a description of the single range the set represents, if it does; otherwise, null. Not assigned by the constructor, so it stays null unless set after construction. + public (char LowInclusive, char HighInclusive, bool Negated)? Range; + } /// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop. public (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? LiteralAfterLoop { get; } /// Analyzes a list of fixed-distance sets to extract a case-sensitive string at a fixed distance. - private static (string String, int Distance)? FindFixedDistanceString(List<(char[]? Chars, string Set, int Distance)> fixedDistanceSets) + private static (string String, int Distance)? FindFixedDistanceString(List fixedDistanceSets) { (string String, int Distance)? best = null; @@ -487,7 +507,9 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos case FindNextStartingPositionMode.LeadingSet_LeftToRight: { - (char[]? chars, string set, _) = FixedDistanceSets![0]; + FixedDistanceSet primarySet = FixedDistanceSets![0]; + char[]? chars = primarySet.Chars; + string set = primarySet.Set; ReadOnlySpan span = textSpan.Slice(pos); if (chars is not null) @@ -571,16 +593,17 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight: { - List<(char[]? Chars, string Set, int Distance)> sets = FixedDistanceSets!; - (char[]? 
primaryChars, string primarySet, int primaryDistance) = sets[0]; + List sets = FixedDistanceSets!; + FixedDistanceSet primarySet = sets[0]; + int endMinusRequiredLength = textSpan.Length - Math.Max(1, MinRequiredLength); - if (primaryChars is not null) + if (primarySet.Chars is not null) { for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) { - int offset = inputPosition + primaryDistance; - int index = textSpan.Slice(offset).IndexOfAny(primaryChars); + int offset = inputPosition + primarySet.Distance; + int index = textSpan.Slice(offset).IndexOfAny(primarySet.Chars); if (index < 0) { break; @@ -588,7 +611,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos index += offset; // The index here will be offset indexed due to the use of span, so we add offset to get // real position on the string. - inputPosition = index - primaryDistance; + inputPosition = index - primarySet.Distance; if (inputPosition > endMinusRequiredLength) { break; @@ -596,9 +619,9 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos for (int i = 1; i < sets.Count; i++) { - (_, string nextSet, int nextDistance) = sets[i]; - char c = textSpan[inputPosition + nextDistance]; - if (!RegexCharClass.CharInClass(c, nextSet, ref _asciiLookups![i])) + FixedDistanceSet nextSet = sets[i]; + char c = textSpan[inputPosition + nextSet.Distance]; + if (!RegexCharClass.CharInClass(c, nextSet.Set, ref _asciiLookups![i])) { goto Bumpalong; } @@ -616,17 +639,17 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) { - char c = textSpan[inputPosition + primaryDistance]; - if (!RegexCharClass.CharInClass(c, primarySet, ref startingAsciiLookup)) + char c = textSpan[inputPosition + primarySet.Distance]; + if (!RegexCharClass.CharInClass(c, primarySet.Set, ref startingAsciiLookup)) { goto Bumpalong; } for (int i = 1; i < 
sets.Count; i++) { - (_, string nextSet, int nextDistance) = sets[i]; - c = textSpan[inputPosition + nextDistance]; - if (!RegexCharClass.CharInClass(c, nextSet, ref _asciiLookups![i])) + FixedDistanceSet nextSet = sets[i]; + c = textSpan[inputPosition + nextSet.Distance]; + if (!RegexCharClass.CharInClass(c, nextSet.Set, ref _asciiLookups![i])) { goto Bumpalong; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 3ed8102ed49d1f..d2eef1c622f6d1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -1381,7 +1381,7 @@ public char FirstCharOfOneOrMulti() /// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant. /// The Negated value indicates whether the Char/SetChars should be considered exclusionary. /// - public (char Char, string? String, string? SetChars, bool Negated)? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today + public StartingLiteralData? 
FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today { Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated."); @@ -1394,14 +1394,14 @@ public char FirstCharOfOneOrMulti() { case RegexNodeKind.One: case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when node.M > 0: - return (node.Ch, null, null, false); + return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false); case RegexNodeKind.Notone: case RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when node.M > 0: - return (node.Ch, null, null, true); + return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true); case RegexNodeKind.Multi: - return ('\0', node.Str, null, false); + return new StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false); case RegexNodeKind.Set: case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when node.M > 0: @@ -1410,7 +1410,13 @@ public char FirstCharOfOneOrMulti() if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0) { setChars = setChars.Slice(0, numChars); - return ('\0', null, setChars.ToString(), RegexCharClass.IsNegated(node.Str!)); + return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!)); + } + + if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive)) + { + Debug.Assert(lowInclusive < highInclusive); + return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!)); } break; @@ -1429,6 +1435,23 @@ public char FirstCharOfOneOrMulti() } } + /// Data about a starting literal as returned by <see cref="FindStartingLiteral"/>. Only one of String, SetChars, and Range is relevant: String for a multi-character literal, SetChars for a small set of characters, and otherwise Range (LowInclusive equals HighInclusive for a single character). Negated indicates whether SetChars/Range should be considered exclusionary. 
+ public readonly struct StartingLiteralData + { + public readonly (char LowInclusive, char HighInclusive) Range; + public readonly string? String; + public readonly string? SetChars; + public readonly bool Negated; + + public StartingLiteralData((char LowInclusive, char HighInclusive) range, string? @string, string? setChars, bool negated) + { + Range = range; + String = @string; + SetChars = setChars; + Negated = negated; + } + } + /// /// Optimizes a concatenation by coalescing adjacent characters and strings, /// coalescing adjacent loops, converting loops to be atomic where applicable, diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 00f2892e8118f4..1aeb8f6d4fdb21 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -159,13 +159,13 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) /// The RegexNode tree root. /// true to spend more time finding sets (e.g. through alternations); false to do a faster analysis that's potentially more incomplete. /// The array of found sets, or null if there aren't any. - public static List<(char[]? Chars, string Set, int Distance)>? FindFixedDistanceSets(RegexNode root, bool thorough) + public static List? FindFixedDistanceSets(RegexNode root, bool thorough) { const int MaxLoopExpansion = 20; // arbitrary cut-off to avoid loops adding significant overhead to processing const int MaxFixedResults = 50; // arbitrary cut-off to avoid generating lots of sets unnecessarily // Find all fixed-distance sets. - var results = new List<(char[]? 
Chars, string Set, int Distance)>(); + var results = new List(); int distance = 0; TryFindFixedSets(root, results, ref distance, thorough); @@ -193,7 +193,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) string? charClass = FindFirstCharClass(root); if (charClass is not null) { - results.Add((null, charClass, 0)); + results.Add(new RegexFindOptimizations.FixedDistanceSet(null, charClass, 0)); } if (results.Count == 0) @@ -203,11 +203,14 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) } // For every entry, try to get the chars that make up the set, if there are few enough. + // For any for which we couldn't get the small chars list, see if we can get other useful info. Span scratch = stackalloc char[5]; // max optimized by IndexOfAny today for (int i = 0; i < results.Count; i++) { - (char[]? Chars, string Set, int Distance) result = results[i]; - if (!RegexCharClass.IsNegated(result.Set)) + RegexFindOptimizations.FixedDistanceSet result = results[i]; + bool negated = RegexCharClass.IsNegated(result.Set); + + if (!negated) { int count = RegexCharClass.GetSetChars(result.Set, scratch); if (count != 0) @@ -216,6 +219,15 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) results[i] = result; } } + + if (thorough && result.Chars is null) + { + if (RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) + { + result.Range = (lowInclusive, highInclusive, negated); + results[i] = result; + } + } } return results; @@ -226,7 +238,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) // of the node. If it returns false, the node isn't entirely fixed, in which case subsequent nodes // shouldn't be examined and distance should no longer be trusted. However, regardless of whether it // returns true or false, it may have populated results, and all populated results are valid. - static bool TryFindFixedSets(RegexNode node, List<(char[]? 
Chars, string Set, int Distance)> results, ref int distance, bool thorough) + static bool TryFindFixedSets(RegexNode node, List results, ref int distance, bool thorough) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { @@ -244,7 +256,7 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in if (results.Count < MaxFixedResults) { string setString = RegexCharClass.OneToStringClass(node.Ch); - results.Add((null, setString, distance++)); + results.Add(new RegexFindOptimizations.FixedDistanceSet(null, setString, distance++)); return true; } return false; @@ -256,7 +268,7 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in int i = 0; for (; i < minIterations && results.Count < MaxFixedResults; i++) { - results.Add((null, setString, distance++)); + results.Add(new RegexFindOptimizations.FixedDistanceSet(null, setString, distance++)); } return i == node.M && i == node.N; } @@ -268,7 +280,7 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in for (; i < s.Length && results.Count < MaxFixedResults; i++) { string setString = RegexCharClass.OneToStringClass(s[i]); - results.Add((null, setString, distance++)); + results.Add(new RegexFindOptimizations.FixedDistanceSet(null, setString, distance++)); } return i == s.Length; } @@ -276,7 +288,7 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in case RegexNodeKind.Set: if (results.Count < MaxFixedResults) { - results.Add((null, node.Str!, distance++)); + results.Add(new RegexFindOptimizations.FixedDistanceSet(null, node.Str!, distance++)); return true; } return false; @@ -287,7 +299,7 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? 
Chars, string Set, in int i = 0; for (; i < minIterations && results.Count < MaxFixedResults; i++) { - results.Add((null, node.Str!, distance++)); + results.Add(new RegexFindOptimizations.FixedDistanceSet(null, node.Str!, distance++)); } return i == node.M && i == node.N; } @@ -356,7 +368,7 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in int? sameDistance = null; var combined = new Dictionary(); - var localResults = new List<(char[]? Chars, string Set, int Distance)>(); + var localResults = new List (); for (int i = 0; i < childCount; i++) { localResults.Clear(); @@ -380,7 +392,7 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in } } - foreach ((char[]? Chars, string Set, int Distance) fixedSet in localResults) + foreach (RegexFindOptimizations.FixedDistanceSet fixedSet in localResults) { if (combined.TryGetValue(fixedSet.Distance, out (RegexCharClass Set, int Count) value)) { @@ -407,7 +419,7 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in if (pair.Value.Count == childCount) { - results.Add((null, pair.Value.Set.ToStringClass(), pair.Key + distance)); + results.Add(new RegexFindOptimizations.FixedDistanceSet(null, pair.Value.Set.ToStringClass(), pair.Key + distance)); } } @@ -428,11 +440,12 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in } /// Sorts a set of fixed-distance set results from best to worst quality. - public static void SortFixedDistanceSetsByQuality(List<(char[]? Chars, string Set, int Distance)> results) => + public static void SortFixedDistanceSetsByQuality(List results) => // Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search // for the fastest and that have the best chance of matching as few false positives as possible. results.Sort((s1, s2) => { + // If both have chars, prioritize the one with the smaller frequency for those chars. 
if (s1.Chars is not null && s2.Chars is not null) { // Then of the ones that are the same length, prefer those with less frequent values. The frequency is @@ -458,17 +471,20 @@ static float SumFrequencies(char[] chars) return sum; } } - else if (s1.Chars is not null) + + // If one has chars and the other doesn't, prioritize the one with chars. + if ((s1.Chars is not null) != (s2.Chars is not null)) { - // If s1 has chars and s2 doesn't, then s1 has fewer chars. - return -1; + return s1.Chars is not null ? -1 : 1; } - else if (s2.Chars is not null) + + // If one has a range and the other doesn't, prioritize the one with a range. + if ((s1.Range is not null) != (s2.Range is not null)) { - // If s2 has chars and s1 doesn't, then s2 has fewer chars. - return 1; + return s1.Range is not null ? -1 : 1; } + // As a tiebreaker, prioritize the earlier one. return s1.Distance.CompareTo(s2.Distance); }); diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index b5b27236a56a7d..98a3ca21a356e8 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -336,27 +336,52 @@ public static IEnumerable Match_MemberData() yield return (@"b.*?", "abc", lineOption, 1, 2, true, "b"); yield return (@".*?", "abc", lineOption, 2, 1, true, ""); + yield return (@"a.*?[b\n]", "xyza12345b6789", lineOption, 0, 14, true, "a12345b"); + yield return (@"a.*?[b\n]", "xyza12345c6789", lineOption, 0, 14, false, ""); + yield return (@"a.*?[bc]", "xyza12345b6789", lineOption, 0, 14, true, "a12345b"); yield return (@"a.*?[bc]", "xyza12345c6789", lineOption, 0, 14, true, "a12345c"); yield return (@"a.*?[bc]", "xyza12345d6789", lineOption, 0, 14, false, ""); + yield return (@"a.*?[bc\n]", "xyza12345b6789", lineOption, 0, 14, true, 
"a12345b"); + yield return (@"a.*?[bc\n]", "xyza12345c6789", lineOption, 0, 14, true, "a12345c"); + yield return (@"a.*?[bc\n]", "xyza12345d6789", lineOption, 0, 14, false, ""); + yield return (@"a.*?[bcd]", "xyza12345b6789", lineOption, 0, 14, true, "a12345b"); yield return (@"a.*?[bcd]", "xyza12345c6789", lineOption, 0, 14, true, "a12345c"); yield return (@"a.*?[bcd]", "xyza12345d6789", lineOption, 0, 14, true, "a12345d"); yield return (@"a.*?[bcd]", "xyza12345e6789", lineOption, 0, 14, false, ""); + yield return (@"a.*?[bcd\n]", "xyza12345b6789", lineOption, 0, 14, true, "a12345b"); + yield return (@"a.*?[bcd\n]", "xyza12345c6789", lineOption, 0, 14, true, "a12345c"); + yield return (@"a.*?[bcd\n]", "xyza12345d6789", lineOption, 0, 14, true, "a12345d"); + yield return (@"a.*?[bcd\n]", "xyza12345e6789", lineOption, 0, 14, false, ""); + yield return (@"a.*?[bcde]", "xyza12345b6789", lineOption, 0, 14, true, "a12345b"); yield return (@"a.*?[bcde]", "xyza12345c6789", lineOption, 0, 14, true, "a12345c"); yield return (@"a.*?[bcde]", "xyza12345d6789", lineOption, 0, 14, true, "a12345d"); yield return (@"a.*?[bcde]", "xyza12345e6789", lineOption, 0, 14, true, "a12345e"); yield return (@"a.*?[bcde]", "xyza12345f6789", lineOption, 0, 14, false, ""); + yield return (@"a.*?[bcde\n]", "xyza12345b6789", lineOption, 0, 14, true, "a12345b"); + yield return (@"a.*?[bcde\n]", "xyza12345c6789", lineOption, 0, 14, true, "a12345c"); + yield return (@"a.*?[bcde\n]", "xyza12345d6789", lineOption, 0, 14, true, "a12345d"); + yield return (@"a.*?[bcde\n]", "xyza12345e6789", lineOption, 0, 14, true, "a12345e"); + yield return (@"a.*?[bcde\n]", "xyza12345f6789", lineOption, 0, 14, false, ""); + yield return (@"a.*?[bcdef]", "xyza12345b6789", lineOption, 0, 14, true, "a12345b"); yield return (@"a.*?[bcdef]", "xyza12345c6789", lineOption, 0, 14, true, "a12345c"); yield return (@"a.*?[bcdef]", "xyza12345d6789", lineOption, 0, 14, true, "a12345d"); yield return (@"a.*?[bcdef]", 
"xyza12345e6789", lineOption, 0, 14, true, "a12345e"); yield return (@"a.*?[bcdef]", "xyza12345f6789", lineOption, 0, 14, true, "a12345f"); yield return (@"a.*?[bcdef]", "xyza12345g6789", lineOption, 0, 14, false, ""); + + yield return (@"a[^b]*?[bcdef]", "xyza12345b6789", lineOption, 0, 14, true, "a12345b"); + yield return (@"a[^c]*?[bcdef]", "xyza12345c6789", lineOption, 0, 14, true, "a12345c"); + yield return (@"a[^b]*?[bcdef]", "xyza12345d6789", lineOption, 0, 14, true, "a12345d"); + yield return (@"a[^c]*?[bcdef]", "xyza12345e6789", lineOption, 0, 14, true, "a12345e"); + yield return (@"a[^b]*?[bcdef]", "xyza12345f6789", lineOption, 0, 14, true, "a12345f"); + yield return (@"a[^c]*?[bcdef]", "xyza12345g6789", lineOption, 0, 14, false, ""); } // Nested loops