@@ -88,55 +88,7 @@ private int IndexOf(ref char searchSpace, int searchSpaceLength)
     nuint ch2ByteOffset = _ch2ByteOffset;
     nuint ch3ByteOffset = _ch3ByteOffset;

-    if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector512<ushort>.Count >= 0)
-    {
-        Vector512<ushort> ch1 = Vector512.Create(_ch1);
-        Vector512<ushort> ch2 = Vector512.Create(_ch2);
-        Vector512<ushort> ch3 = Vector512.Create(_ch3);
-
-        ref char lastSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceMinusValueTailLength - Vector512<ushort>.Count);
-
-        while (true)
-        {
-            ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector512<ushort>.Count);
-            ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector512<ushort>.Count + (int)(_ch2ByteOffset / 2));
-            ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector512<ushort>.Count + (int)(_ch3ByteOffset / 2));
-
-            // Find which starting positions likely contain a match (likely match all 3 anchor characters).
-            Vector512<byte> result = GetComparisonResult(ref searchSpace, ch2ByteOffset, ch3ByteOffset, ch1, ch2, ch3);
-
-            if (result != Vector512<byte>.Zero)
-            {
-                goto CandidateFound;
-            }
-
-        LoopFooter:
-            // We haven't found a match. Update the input position and check if we've reached the end.
-            searchSpace = ref Unsafe.Add(ref searchSpace, Vector512<ushort>.Count);
-
-            if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpace))
-            {
-                if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpace, Vector512<ushort>.Count)))
-                {
-                    return -1;
-                }
-
-                // We have fewer than 32 characters remaining. Adjust the input position such that we will do one last loop iteration.
-                searchSpace = ref lastSearchSpace;
-            }
-
-            continue;
-
-        CandidateFound:
-            // We found potential matches, but they may be false-positives, so we must verify each one.
-            if (TryMatch(ref searchSpaceStart, searchSpaceLength, ref searchSpace, result.ExtractMostSignificantBits(), out int offset))
-            {
-                return offset;
-            }
-            goto LoopFooter;
-        }
-    }
-    else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
+    if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
Member:
I'd really rather we not delete this. The issue isn't really V512, but the algorithm/loop being suboptimal for all the vector paths here; this is particularly prevalent in the scalar fallback, which pessimizes larger vector sizes even more.

Fixing it isn't that much more work and would be a bigger win.

Member Author:
What do you mean by scalar fallback in this case?
Throughput numbers are just stressing the vectorized inner loop for large inputs with no matches.
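For context, the shape of such a throughput measurement is roughly the following (a hypothetical BenchmarkDotNet sketch, not the actual dotnet/performance benchmark):

using System;
using BenchmarkDotNet.Attributes;

// Hypothetical sketch: a long haystack with no match, so time is dominated by the
// vectorized inner loop rather than match verification or startup costs.
public class IndexOfNoMatchThroughput
{
    private string _haystack = "";

    [GlobalSetup]
    public void Setup() => _haystack = new string('a', 64 * 1024);

    [Benchmark]
    public int IndexOfString() => _haystack.AsSpan().IndexOf("abc!", StringComparison.Ordinal);
}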

Member Author:
Here's what the main loops look like in this case:

; Vector256 (AVX2) loop:
M01_L00:
       vpcmpeqw  ymm3,ymm0,[rdx]
       vpcmpeqw  ymm4,ymm1,[rdx+r10]
       vpcmpeqw  ymm5,ymm2,[rdx+r9]
       vpternlogd ymm5,ymm4,ymm3,80
       vptest    ymm5,ymm5
       jne       short M01_L02
M01_L01:
       add       rdx,20
       cmp       rdx,r8
       jbe       short M01_L00

; Vector512 (AVX-512) loop:
M01_L00:
       vpcmpeqw  k1,zmm0,[rdx]
       vpmovm2w  zmm3,k1
       vpcmpeqw  k1,zmm1,[rdx+r10]
       vpmovm2w  zmm4,k1
       vpcmpeqw  k1,zmm2,[rdx+r9]
       vpmovm2w  zmm5,k1
       vpternlogd zmm5,zmm4,zmm3,80
       vptestmb  k1,zmm5,zmm5
       kortestq  k1,k1
       nop       dword ptr [rax]
       jne       short M01_L02
M01_L01:
       add       rdx,40
       cmp       rdx,r8
       jbe       short M01_L00

Member:
> What do you mean by scalar fallback in this case?

The ShortInput path that is hit is pessimized for larger vector sizes as it must process 2-4x as many elements as the non-vectorized path would. While this doesn't necessarily show up for bigger inputs, it will show up for small inputs and for inputs that have trailing elements. Additionally, for the whole method, the non-idiomatic loops with goto can pessimize various JIT optimizations, control-flow analysis, and other things.

Longer term, these should all be rewritten to follow a "better" pattern that helps minimize the dispatch branching and allows the trailing elements to also be vectorized. Ideally we're generally using a pattern like TensorPrimitives uses, where the core loop/trailing logic is centralized and we're just specifying the inner loops and exit conditions.
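
Roughly, that pattern centralizes the loop and trailing handling in one driver and plugs in per-vector kernels; a hypothetical sketch (invented names, not the actual TensorPrimitives code; uses System.Runtime.Intrinsics and System.Numerics.BitOperations):

using System.Numerics;
using System.Runtime.Intrinsics;

// Hypothetical kernel shape: callers supply only the per-vector comparison.
interface IVectorKernel
{
    static abstract Vector256<ushort> Compare(Vector256<ushort> chunk);
}

static class CentralizedLoopDemo
{
    // The shared driver owns the stride, exit condition, and the overlapped trailing
    // vector, so those decisions are made once for every kernel.
    // Assumes length >= Vector256<ushort>.Count.
    public static int IndexOfCore<TKernel>(ref ushort searchSpace, int length)
        where TKernel : IVectorKernel
    {
        nuint offset = 0;
        nuint lastVectorStart = (nuint)(length - Vector256<ushort>.Count);

        while (true)
        {
            Vector256<ushort> eq = TKernel.Compare(Vector256.LoadUnsafe(ref searchSpace, offset));
            if (eq != Vector256<ushort>.Zero)
            {
                // One mask bit per ushort element, so the trailing zero count is the index.
                return (int)offset + BitOperations.TrailingZeroCount(eq.ExtractMostSignificantBits());
            }

            offset += (nuint)Vector256<ushort>.Count;
            if (offset > lastVectorStart)
            {
                if (offset >= (nuint)length)
                {
                    return -1;
                }

                // Trailing elements: overlap the final full vector instead of going scalar.
                offset = lastVectorStart;
            }
        }
    }
}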

> Here's what the main loops look like in this case:

The problem with the main loop is the vpcmpeqw + vpmovm2w sequences. This is a really trivial issue related to the fact that the bitwise operations (and/andn/or/xor) are normalized to having a base type of int/uint, since the underlying instructions only support these sizes due to embedded broadcast/masking support.

The check that looks for and(cvtmasktovec(op1), cvtmasktovec(op2)) sequences was requiring all three base types to match, when it actually only needs cvtmasktovec(op1) and cvtmasktovec(op2) to match, with the replacement andmask(op1, op2) then tracking that base type.
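
In pseudocode, the relaxed check is roughly (hypothetical, not the actual RyuJIT code):

// Hypothetical pseudocode of the pattern match described above; not actual RyuJIT code.
// Before: the AND node and both ConvertMaskToVector operands had to share a base type,
// which the int/uint-normalized bitwise ops never satisfied for ushort compares.
// After: only the two ConvertMaskToVector operands must agree, and the rewritten
// AndMask node carries their base type forward.
if (node.OperIs(AND)
    && node.Op1.OperIs(ConvertMaskToVector)
    && node.Op2.OperIs(ConvertMaskToVector)
    && node.Op1.SimdBaseType == node.Op2.SimdBaseType) // no longer compared to the AND's own base type
{
    node.ReplaceWith(ConvertMaskToVector(AndMask(node.Op1.Mask, node.Op2.Mask),
                                         simdBaseType: node.Op1.SimdBaseType));
}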

The following PR resolves that: #117887

-      vpcmpeqw k1, zmm6, zmmword ptr [rsi]
-      vpmovm2w zmm0, k1
-      vpcmpeqw k1, zmm7, zmmword ptr [rsi+r14]
-      vpmovm2w zmm1, k1
-      vpcmpeqw k1, zmm8, zmmword ptr [rsi+r15]
-      vpmovm2w zmm2, k1
-      vpternlogd zmm2, zmm1, zmm0, -128
-      vptestmb k1, zmm2, zmm2
-      kortestq k1, k1
+      vpcmpeqw k1, zmm6, zmmword ptr [rsi]
+      vpcmpeqw k2, zmm7, zmmword ptr [rsi+r14]
+      kandd    k1, k1, k2
+      vpcmpeqw k2, zmm8, zmmword ptr [rsi+r15]
+      kandd    k1, k1, k2
+      vpmovm2w zmm0, k1
+      vptestmb k1, zmm0, zmm0
+      kortestq k1, k1

Now the codegen still isn't "ideal" because we end up converting the mask to a vector to do the "are there any matches" check (this is the vpmovm2w, vptestmb, kortestq sequence). That is a little more complicated to fix since it requires moving some of the op_Inequality transforms from LIR (lowering) into HIR (morph, value numbering, etc.). This is planned work, just not something we've completed yet.
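
For illustration, once masks can be tested directly, the loop tail could presumably collapse to something like this (hypothetical codegen, not actual JIT output):

; Hypothetical ideal sequence once direct mask tests are supported (not actual JIT output).
       vpcmpeqw  k1, zmm6, zmmword ptr [rsi]
       vpcmpeqw  k2, zmm7, zmmword ptr [rsi+r14]
       kandd     k1, k1, k2
       vpcmpeqw  k2, zmm8, zmmword ptr [rsi+r15]
       kandd     k1, k1, k2
       kortestd  k1, k1            ; test the mask register directly; no vpmovm2w/vptestmb
       jne       short CANDIDATE_FOUND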

Member Author:
> The ShortInput path that is hit is pessimized for larger vector sizes as it must process 2-4x as many elements as the non-vectorized path would. While this doesn't necessarily show up for bigger inputs, it will show up for small inputs and for inputs that have trailing elements.

I'm not sure I follow? This PR doesn't change how short inputs behave here.

The ShortInput path is only used for inputs that are short relative to Vector128's length; whether it's taken does not depend on whether the system has Avx512 or Avx2 support. Trailing elements are also processed with a vectorized step.
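
To illustrate that vectorized trailing step, here is a minimal sketch of the overlapping-load technique (illustrative only, not the actual SearchValues code):

using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

static class TrailingDemo
{
    // Sketch only: the final partial chunk is handled by re-reading the last full
    // Vector128 worth of chars, overlapping positions that were already searched.
    public static bool ContainsChar(ReadOnlySpan<char> span, char target)
    {
        if (span.Length < Vector128<ushort>.Count)
        {
            return span.IndexOf(target) >= 0; // stand-in for the real short-input path
        }

        ref ushort start = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(span));
        Vector128<ushort> needle = Vector128.Create((ushort)target);

        int i = 0;
        for (; i <= span.Length - Vector128<ushort>.Count; i += Vector128<ushort>.Count)
        {
            if (Vector128.EqualsAny(needle, Vector128.LoadUnsafe(ref start, (nuint)i)))
                return true;
        }

        // Trailing elements: one overlapped vector instead of a scalar loop.
        return i < span.Length
            && Vector128.EqualsAny(needle, Vector128.LoadUnsafe(ref start, (nuint)(span.Length - Vector128<ushort>.Count)));
    }
}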

> The following PR resolves that: #117887
> This is planned work, just not something we've completed yet.

Thanks! I'll double-check what the numbers look like with your change.

Assuming it's still worse/not better compared to the Vector256 paths, does it make sense to keep it around?
E.g. we've reverted Avx512 support from IndexOfAnyAsciiSearcher over much smaller regressions, even though there are meaningful throughput benefits on longer inputs there (#93222), whereas it's just worse across the board here.

Member:
> I'm not sure I follow? This PR doesn't change how short inputs behave here.

It was a general comment about how this and several other vectorized code paths in corelib are written in a way that, in general, pessimizes the larger vector sizes and/or small inputs. It wasn't a comment about the changes in this PR, rather a general "issue" that helps make V512 perform worse than it should. If we were to fix those issues, all the paths should get faster.

> Assuming it's still worse/not better compared to the Vector256 paths, does it make sense to keep it around?
> E.g. we've reverted Avx512 support from IndexOfAnyAsciiSearcher over much smaller regressions, even though there are meaningful throughput benefits on longer inputs there (#93222), whereas it's just worse across the board here.

I believe it's still worth keeping and continuing to incrementally track the improvements. The more we revert, the harder it is to test/validate the improvements as they go in. That applies to AVX512 and SVE alike, both of which have different considerations for mask handling.

The long term goal is to have centralized SIMD looping logic and to utilize things like (the currently internal) ISimdVector; we're getting closer to that each release and continuing to get large improvements to the handling and codegen across the board.

Member:
A lot of the places where the perf is found to be suboptimal are also fairly trivial fixes, like the one I did. If we file issues for them and ensure the pattern recognition is being handled correctly, it is far better for all the vector paths. The same goes for utilizing helpers like the Any, All, None, Count, IndexOf, and LastIndexOf helpers that now exist on the vector types.
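
As a sketch of what using those helpers looks like (the exact shipped signatures may differ slightly; Vector256.IndexOf is assumed here to return the element index or -1):

using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

static class VectorHelperDemo
{
    // Sketch only: assumes the new helper shape Vector256.IndexOf(vector, value) -> index or -1.
    public static int IndexOfChar(ReadOnlySpan<ushort> chars, ushort target)
    {
        ref ushort start = ref MemoryMarshal.GetReference(chars);

        for (int i = 0; i <= chars.Length - Vector256<ushort>.Count; i += Vector256<ushort>.Count)
        {
            // Replaces hand-rolled Equals + ExtractMostSignificantBits + TrailingZeroCount.
            int index = Vector256.IndexOf(Vector256.LoadUnsafe(ref start, (nuint)i), target);
            if (index >= 0)
            {
                return i + index;
            }
        }

        return -1; // trailing elements omitted for brevity
    }
}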

Member Author:
With your change, the Regex SliceSlice benchmark (just a ton of span.IndexOf(string)-like searches) shows a 40% regression (as in taking 1.4x as long) with the Avx512 paths compared to Avx2 on Zen hardware.

Should we reconsider targeted arch-specific opt outs for such cases if performance diverges this much and we consider the affected code paths important?

Member:
We generally don't do target-specific opt outs.

If you're really that concerned with the regression and it showing up in the real world, then I'd just go with the removal for now. But please make sure a tracking issue is filed so this gets added back when the direct kortest logic is added in .NET 11.

Member Author:
With #118108 now merged, Regex won't hit these paths anymore for ASCII values, so I'm less concerned about the real-world impact.
IndexOf(string) is still impacted, but switching to SearchValues<string> can mitigate that.

In general, it's unfortunate that we would keep around Vector512 paths that aren't improving perf, but hopefully the potential future changes you mentioned can help here.
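
For reference, that SearchValues<string> mitigation looks like this (the needle string here is illustrative):

using System;
using System.Buffers;

static class SearchValuesDemo
{
    // Create once and cache: SearchValues<string> commits to a specialized
    // algorithm up front instead of re-deriving it on every call.
    private static readonly SearchValues<string> s_needle =
        SearchValues.Create(["needle"], StringComparison.Ordinal);

    public static int Find(ReadOnlySpan<char> haystack)
        => haystack.IndexOfAny(s_needle); // instead of haystack.IndexOf("needle", StringComparison.Ordinal)
}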

    {
        Vector256<ushort> ch1 = Vector256.Create(_ch1);
        Vector256<ushort> ch2 = Vector256.Create(_ch2);
@@ -300,29 +252,6 @@ private static Vector256<byte> GetComparisonResult(ref char searchSpace, nuint c
        }
    }

-    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    private static Vector512<byte> GetComparisonResult(ref char searchSpace, nuint ch2ByteOffset, nuint ch3ByteOffset, Vector512<ushort> ch1, Vector512<ushort> ch2, Vector512<ushort> ch3)
-    {
-        // See comments in 'GetComparisonResult' for Vector128<byte> above.
-        // This method is the same, but operates on 64 input characters at a time.
-        if (typeof(TCaseSensitivity) == typeof(CaseSensitive))
-        {
-            Vector512<ushort> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace));
-            Vector512<ushort> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref Unsafe.As<char, byte>(ref searchSpace), ch2ByteOffset).AsUInt16());
-            Vector512<ushort> cmpCh3 = Vector512.Equals(ch3, Vector512.LoadUnsafe(ref Unsafe.As<char, byte>(ref searchSpace), ch3ByteOffset).AsUInt16());
-            return (cmpCh1 & cmpCh2 & cmpCh3).AsByte();
-        }
-        else
-        {
-            Vector512<ushort> caseConversion = Vector512.Create(CaseConversionMask);
-
-            Vector512<ushort> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace) & caseConversion);
-            Vector512<ushort> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref Unsafe.As<char, byte>(ref searchSpace), ch2ByteOffset).AsUInt16() & caseConversion);
-            Vector512<ushort> cmpCh3 = Vector512.Equals(ch3, Vector512.LoadUnsafe(ref Unsafe.As<char, byte>(ref searchSpace), ch3ByteOffset).AsUInt16() & caseConversion);
-            return (cmpCh1 & cmpCh2 & cmpCh3).AsByte();
-        }
-    }
-
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char searchSpace, uint mask, out int offsetFromStart)
    {
@@ -351,35 +280,6 @@ private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char
        return false;
    }

-    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char searchSpace, ulong mask, out int offsetFromStart)
-    {
-        // 'mask' encodes the input positions where at least 3 characters likely matched.
-        // Verify each one to see if we've found a match, otherwise return back to the vectorized loop.
-        do
-        {
-            int bitPos = BitOperations.TrailingZeroCount(mask);
-            Debug.Assert(bitPos % 2 == 0);
-
-            ref char matchRef = ref Unsafe.AddByteOffset(ref searchSpace, bitPos);
-
-            ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref matchRef, _valueState.Value.Length);
-
-            if (CanSkipAnchorMatchVerification || TCaseSensitivity.Equals<TValueLength>(ref matchRef, in _valueState))
-            {
-                offsetFromStart = (int)((nuint)Unsafe.ByteOffset(ref searchSpaceStart, ref matchRef) / 2);
-                return true;
-            }
-
-            mask = BitOperations.ResetLowestSetBit(BitOperations.ResetLowestSetBit(mask));
-        }
-        while (mask != 0);
-
-        offsetFromStart = 0;
-        return false;
-    }
-

    internal override bool ContainsCore(string value) => HasUniqueValues
        ? base.ContainsCore(value)
        : _valueState.Value.Equals(value, IgnoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
@@ -68,74 +68,7 @@ ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + 1)),
    // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
    // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
SEARCH_TWO_CHARS:
-    if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector512<ushort>.Count >= 0)
-    {
-        // Find the last unique (which is not equal to ch1) character
-        // the algorithm is fine if both are equal, just a little bit less efficient
-        ushort ch2Val = Unsafe.Add(ref value, valueTailLength);
-        nint ch1ch2Distance = (nint)(uint)valueTailLength;
-        while (ch2Val == valueHead && ch1ch2Distance > 1)
-            ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
-
-        Vector512<ushort> ch1 = Vector512.Create((ushort)valueHead);
-        Vector512<ushort> ch2 = Vector512.Create(ch2Val);
-
-        nint searchSpaceMinusValueTailLengthAndVector =
-            searchSpaceMinusValueTailLength - (nint)Vector512<ushort>.Count;
-
-        do
-        {
-            // Make sure we don't go out of bounds
-            Debug.Assert(offset + ch1ch2Distance + Vector512<ushort>.Count <= searchSpaceLength);
-
-            Vector512<ushort> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
-            Vector512<ushort> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset));
-            Vector512<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
-
-            // Early out: cmpAnd is all zeros
-            if (cmpAnd != Vector512<byte>.Zero)
-            {
-                goto CANDIDATE_FOUND;
-            }
-
-        LOOP_FOOTER:
-            offset += Vector512<ushort>.Count;
-
-            if (offset == searchSpaceMinusValueTailLength)
-                return -1;
-
-            // Overlap with the current chunk for trailing elements
-            if (offset > searchSpaceMinusValueTailLengthAndVector)
-                offset = searchSpaceMinusValueTailLengthAndVector;
-
-            continue;
-
-        CANDIDATE_FOUND:
-            ulong mask = cmpAnd.ExtractMostSignificantBits();
-            do
-            {
-                int bitPos = BitOperations.TrailingZeroCount(mask);
-                // div by 2 (shr) because we work with 2-byte chars
-                nint charPos = (nint)((uint)bitPos / 2);
-                if (valueLength == 2 || // we already matched two chars
-                    SequenceEqual(
-                        ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
-                        ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
-                {
-                    return (int)(offset + charPos);
-                }
-
-                // Clear the two lowest set bits
-                if (Bmi1.X64.IsSupported)
-                    mask = Bmi1.X64.ResetLowestSetBit(Bmi1.X64.ResetLowestSetBit(mask));
-                else
-                    mask &= ~(ulong)((ulong)0b11 << bitPos);
-            } while (mask != 0);
-            goto LOOP_FOOTER;
-
-        } while (true);
-    }
-    else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
+    if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
    {
        // Find the last unique (which is not equal to ch1) character
        // the algorithm is fine if both are equal, just a little bit less efficient