From daddd95263f1a413c17b7ca80514c62e184c5924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Mon, 7 Jan 2019 21:21:07 +0100 Subject: [PATCH 01/23] Optimized scalar code-path --- .../src/System/Buffers/Text/Base64Decoder.cs | 145 ++++++++++-------- .../src/System/Buffers/Text/Base64Encoder.cs | 97 ++++++------ 2 files changed, 133 insertions(+), 109 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index d54a8d59ba57..ec95b7daff6e 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -4,7 +4,6 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using Internal.Runtime.CompilerServices; namespace System.Buffers.Text { @@ -35,8 +34,8 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span int srcLength = utf8.Length & ~0x3; // only decode input up to the closest multiple of 4. 
int destLength = bytes.Length; - int sourceIndex = 0; - int destIndex = 0; + uint sourceIndex = 0; + uint destIndex = 0; if (utf8.Length == 0) goto DoneExit; @@ -59,14 +58,21 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span maxSrcLength = (destLength / 3) * 4; } - while (sourceIndex < maxSrcLength) + // In order to elide the movsxd in the loop + if (sourceIndex < maxSrcLength) { - int result = Decode(ref Unsafe.Add(ref srcBytes, sourceIndex), ref decodingMap); - if (result < 0) - goto InvalidExit; - WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, destIndex), result); - destIndex += 3; - sourceIndex += 4; + do + { + int result = Decode(ref Unsafe.Add(ref srcBytes, (IntPtr)sourceIndex), ref decodingMap); + + if (result < 0) + goto InvalidExit; + + WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), result); + destIndex += 3; + sourceIndex += 4; + } + while (sourceIndex < (uint)maxSrcLength); } if (maxSrcLength != srcLength - skipLastChunk) @@ -83,23 +89,25 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span // if isFinalBlock is false, we will never reach this point - int i0 = Unsafe.Add(ref srcBytes, srcLength - 4); - int i1 = Unsafe.Add(ref srcBytes, srcLength - 3); - int i2 = Unsafe.Add(ref srcBytes, srcLength - 2); - int i3 = Unsafe.Add(ref srcBytes, srcLength - 1); + // Handle last four bytes. There are 0, 1, 2 padding chars. 
+ uint t0, t1, t2, t3; + t0 = Unsafe.Add(ref srcBytes, (IntPtr)(uint)(srcLength - 4)); + t1 = Unsafe.Add(ref srcBytes, (IntPtr)(uint)(srcLength - 3)); + t2 = Unsafe.Add(ref srcBytes, (IntPtr)(uint)(srcLength - 2)); + t3 = Unsafe.Add(ref srcBytes, (IntPtr)(uint)(srcLength - 1)); - i0 = Unsafe.Add(ref decodingMap, i0); - i1 = Unsafe.Add(ref decodingMap, i1); + int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); + int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); i0 <<= 18; i1 <<= 12; i0 |= i1; - if (i3 != EncodingPad) + if (t3 != EncodingPad) { - i2 = Unsafe.Add(ref decodingMap, i2); - i3 = Unsafe.Add(ref decodingMap, i3); + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); + int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3); i2 <<= 6; @@ -110,12 +118,13 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span goto InvalidExit; if (destIndex > destLength - 3) goto DestinationSmallExit; - WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, destIndex), i0); + + WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), i0); destIndex += 3; } - else if (i2 != EncodingPad) + else if (t2 != EncodingPad) { - i2 = Unsafe.Add(ref decodingMap, i2); + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); i2 <<= 6; @@ -125,8 +134,9 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span goto InvalidExit; if (destIndex > destLength - 2) goto DestinationSmallExit; - Unsafe.Add(ref destBytes, destIndex) = (byte)(i0 >> 16); - Unsafe.Add(ref destBytes, destIndex + 1) = (byte)(i0 >> 8); + + Unsafe.Add(ref destBytes, (IntPtr)destIndex) = (byte)(i0 >> 16); + Unsafe.Add(ref destBytes, (IntPtr)(destIndex + 1)) = (byte)(i0 >> 8); destIndex += 2; } else @@ -135,7 +145,8 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span goto InvalidExit; if (destIndex > destLength - 1) goto DestinationSmallExit; - Unsafe.Add(ref destBytes, destIndex) = (byte)(i0 >> 16); + + Unsafe.Add(ref destBytes, (IntPtr)destIndex) = (byte)(i0 >> 16); 
destIndex += 1; } @@ -145,25 +156,26 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span goto InvalidExit; DoneExit: - bytesConsumed = sourceIndex; - bytesWritten = destIndex; + bytesConsumed = (int)sourceIndex; + bytesWritten = (int)destIndex; return OperationStatus.Done; DestinationSmallExit: if (srcLength != utf8.Length && isFinalBlock) goto InvalidExit; // if input is not a multiple of 4, and there is no more data, return invalid data instead - bytesConsumed = sourceIndex; - bytesWritten = destIndex; + + bytesConsumed = (int)sourceIndex; + bytesWritten = (int)destIndex; return OperationStatus.DestinationTooSmall; NeedMoreExit: - bytesConsumed = sourceIndex; - bytesWritten = destIndex; + bytesConsumed = (int)sourceIndex; + bytesWritten = (int)destIndex; return OperationStatus.NeedMoreData; InvalidExit: - bytesConsumed = sourceIndex; - bytesWritten = destIndex; + bytesConsumed = (int)sourceIndex; + bytesWritten = (int)destIndex; return OperationStatus.InvalidData; } @@ -200,8 +212,8 @@ public static int GetMaxDecodedFromUtf8Length(int length) public static OperationStatus DecodeFromUtf8InPlace(Span buffer, out int bytesWritten) { int bufferLength = buffer.Length; - int sourceIndex = 0; - int destIndex = 0; + uint sourceIndex = 0; + uint destIndex = 0; // only decode input if it is a multiple of 4 if (bufferLength != ((bufferLength >> 2) * 4)) @@ -215,31 +227,33 @@ public static OperationStatus DecodeFromUtf8InPlace(Span buffer, out int b while (sourceIndex < bufferLength - 4) { - int result = Decode(ref Unsafe.Add(ref bufferBytes, sourceIndex), ref decodingMap); + int result = Decode(ref Unsafe.Add(ref bufferBytes, (IntPtr)sourceIndex), ref decodingMap); if (result < 0) goto InvalidExit; - WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, destIndex), result); + WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, (IntPtr)destIndex), result); destIndex += 3; sourceIndex += 4; } - int i0 = Unsafe.Add(ref bufferBytes, bufferLength - 4); 
- int i1 = Unsafe.Add(ref bufferBytes, bufferLength - 3); - int i2 = Unsafe.Add(ref bufferBytes, bufferLength - 2); - int i3 = Unsafe.Add(ref bufferBytes, bufferLength - 1); + uint t0, t1, t2, t3; + uint n = (uint)(bufferLength - 4); + t0 = Unsafe.Add(ref bufferBytes, (IntPtr)n); + t1 = Unsafe.Add(ref bufferBytes, (IntPtr)(n+1)); + t2 = Unsafe.Add(ref bufferBytes, (IntPtr)(n+2)); + t3 = Unsafe.Add(ref bufferBytes, (IntPtr)(n+3)); - i0 = Unsafe.Add(ref decodingMap, i0); - i1 = Unsafe.Add(ref decodingMap, i1); + int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); + int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); i0 <<= 18; i1 <<= 12; i0 |= i1; - if (i3 != EncodingPad) + if (t3 != EncodingPad) { - i2 = Unsafe.Add(ref decodingMap, i2); - i3 = Unsafe.Add(ref decodingMap, i3); + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); + int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3); i2 <<= 6; @@ -248,12 +262,13 @@ public static OperationStatus DecodeFromUtf8InPlace(Span buffer, out int b if (i0 < 0) goto InvalidExit; - WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, destIndex), i0); + + WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, (IntPtr)destIndex), i0); destIndex += 3; } - else if (i2 != EncodingPad) + else if (t2 != EncodingPad) { - i2 = Unsafe.Add(ref decodingMap, i2); + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); i2 <<= 6; @@ -261,39 +276,43 @@ public static OperationStatus DecodeFromUtf8InPlace(Span buffer, out int b if (i0 < 0) goto InvalidExit; - Unsafe.Add(ref bufferBytes, destIndex) = (byte)(i0 >> 16); - Unsafe.Add(ref bufferBytes, destIndex + 1) = (byte)(i0 >> 8); + + Unsafe.Add(ref bufferBytes, (IntPtr)destIndex) = (byte)(i0 >> 16); + Unsafe.Add(ref bufferBytes, (IntPtr)(destIndex + 1)) = (byte)(i0 >> 8); destIndex += 2; } else { if (i0 < 0) goto InvalidExit; - Unsafe.Add(ref bufferBytes, destIndex) = (byte)(i0 >> 16); + + Unsafe.Add(ref bufferBytes, (IntPtr)destIndex) = (byte)(i0 >> 16); destIndex += 1; } DoneExit: - bytesWritten = 
destIndex; + bytesWritten = (int)destIndex; return OperationStatus.Done; InvalidExit: - bytesWritten = destIndex; + bytesWritten = (int)destIndex; return OperationStatus.InvalidData; } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Decode(ref byte encodedBytes, ref sbyte decodingMap) { - int i0 = encodedBytes; - int i1 = Unsafe.Add(ref encodedBytes, 1); - int i2 = Unsafe.Add(ref encodedBytes, 2); - int i3 = Unsafe.Add(ref encodedBytes, 3); - - i0 = Unsafe.Add(ref decodingMap, i0); - i1 = Unsafe.Add(ref decodingMap, i1); - i2 = Unsafe.Add(ref decodingMap, i2); - i3 = Unsafe.Add(ref decodingMap, i3); + uint t0, t1, t2, t3; + + t0 = encodedBytes; + t1 = Unsafe.Add(ref encodedBytes, 1); + t2 = Unsafe.Add(ref encodedBytes, 2); + t3 = Unsafe.Add(ref encodedBytes, 3); + + int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); + int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); + int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3); i0 <<= 18; i1 <<= 12; diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 4bb7cabc40f7..5fa5f2604534 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -4,7 +4,6 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using Internal.Runtime.CompilerServices; namespace System.Buffers.Text { @@ -46,18 +45,24 @@ public static OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span maxSrcLength = (destLength >> 2) * 3 - 2; } - int sourceIndex = 0; - int destIndex = 0; - int result = 0; + // PERF: use uint to avoid the sign-extensions + uint sourceIndex = 0; + uint destIndex = 0; + uint result = 0; ref byte encodingMap = ref s_encodingMap[0]; - while (sourceIndex < maxSrcLength) + // In order to elide the movsxd in the loop + if (sourceIndex < maxSrcLength) { - result = Encode(ref 
Unsafe.Add(ref srcBytes, sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, destIndex), result); - destIndex += 4; - sourceIndex += 3; + do + { + result = Encode(ref Unsafe.Add(ref srcBytes, (IntPtr)sourceIndex), ref encodingMap); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), result); + destIndex += 4; + sourceIndex += 3; + } + while (sourceIndex < (uint)maxSrcLength); } if (maxSrcLength != srcLength - 2) @@ -68,31 +73,31 @@ public static OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span if (sourceIndex == srcLength - 1) { - result = EncodeAndPadTwo(ref Unsafe.Add(ref srcBytes, sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, destIndex), result); + result = EncodeAndPadTwo(ref Unsafe.Add(ref srcBytes, (IntPtr)sourceIndex), ref encodingMap); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), result); destIndex += 4; sourceIndex += 1; } else if (sourceIndex == srcLength - 2) { - result = EncodeAndPadOne(ref Unsafe.Add(ref srcBytes, sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, destIndex), result); + result = EncodeAndPadOne(ref Unsafe.Add(ref srcBytes, (IntPtr)sourceIndex), ref encodingMap); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), result); destIndex += 4; sourceIndex += 2; } - bytesConsumed = sourceIndex; - bytesWritten = destIndex; + bytesConsumed = (int)sourceIndex; + bytesWritten = (int)destIndex; return OperationStatus.Done; NeedMoreDataExit: - bytesConsumed = sourceIndex; - bytesWritten = destIndex; + bytesConsumed = (int)sourceIndex; + bytesWritten = (int)destIndex; return OperationStatus.NeedMoreData; DestinationSmallExit: - bytesConsumed = sourceIndex; - bytesWritten = destIndex; + bytesConsumed = (int)sourceIndex; + bytesWritten = (int)destIndex; return OperationStatus.DestinationTooSmall; } @@ -134,9 +139,10 @@ public static OperationStatus 
EncodeToUtf8InPlace(Span buffer, int dataLen int leftover = dataLength - (dataLength / 3) * 3; // how many bytes after packs of 3 - int destinationIndex = encodedLength - 4; - int sourceIndex = dataLength - leftover; - int result = 0; + // PERF: use uint to avoid the sign-extensions + uint destinationIndex = (uint)(encodedLength - 4); + uint sourceIndex = (uint)(dataLength - leftover); + uint result = 0; ref byte encodingMap = ref s_encodingMap[0]; ref byte bufferBytes = ref MemoryMarshal.GetReference(buffer); @@ -146,23 +152,22 @@ public static OperationStatus EncodeToUtf8InPlace(Span buffer, int dataLen { if (leftover == 1) { - result = EncodeAndPadTwo(ref Unsafe.Add(ref bufferBytes, sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, destinationIndex), result); - destinationIndex -= 4; + result = EncodeAndPadTwo(ref Unsafe.Add(ref bufferBytes, (IntPtr)sourceIndex), ref encodingMap); } else { - result = EncodeAndPadOne(ref Unsafe.Add(ref bufferBytes, sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, destinationIndex), result); - destinationIndex -= 4; + result = EncodeAndPadOne(ref Unsafe.Add(ref bufferBytes, (IntPtr)sourceIndex), ref encodingMap); } + + Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, (IntPtr)destinationIndex), result); + destinationIndex -= 4; } sourceIndex -= 3; - while (sourceIndex >= 0) + while ((int)sourceIndex >= 0) { - result = Encode(ref Unsafe.Add(ref bufferBytes, sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, destinationIndex), result); + result = Encode(ref Unsafe.Add(ref bufferBytes, (IntPtr)sourceIndex), ref encodingMap); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, (IntPtr)destinationIndex), result); destinationIndex -= 4; sourceIndex -= 3; } @@ -176,37 +181,37 @@ public static OperationStatus EncodeToUtf8InPlace(Span buffer, int dataLen } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private 
static int Encode(ref byte threeBytes, ref byte encodingMap) + private static uint Encode(ref byte threeBytes, ref byte encodingMap) { - int i = (threeBytes << 16) | (Unsafe.Add(ref threeBytes, 1) << 8) | Unsafe.Add(ref threeBytes, 2); + uint i = (uint)((threeBytes << 16) | (Unsafe.Add(ref threeBytes, 1) << 8) | Unsafe.Add(ref threeBytes, 2)); - int i0 = Unsafe.Add(ref encodingMap, i >> 18); - int i1 = Unsafe.Add(ref encodingMap, (i >> 12) & 0x3F); - int i2 = Unsafe.Add(ref encodingMap, (i >> 6) & 0x3F); - int i3 = Unsafe.Add(ref encodingMap, i & 0x3F); + uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); + uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); + uint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); + uint i3 = Unsafe.Add(ref encodingMap, (IntPtr)(i & 0x3F)); return i0 | (i1 << 8) | (i2 << 16) | (i3 << 24); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int EncodeAndPadOne(ref byte twoBytes, ref byte encodingMap) + private static uint EncodeAndPadOne(ref byte twoBytes, ref byte encodingMap) { - int i = (twoBytes << 16) | (Unsafe.Add(ref twoBytes, 1) << 8); + uint i = (uint)((twoBytes << 16) | (Unsafe.Add(ref twoBytes, 1) << 8)); - int i0 = Unsafe.Add(ref encodingMap, i >> 18); - int i1 = Unsafe.Add(ref encodingMap, (i >> 12) & 0x3F); - int i2 = Unsafe.Add(ref encodingMap, (i >> 6) & 0x3F); + uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); + uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); + uint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); return i0 | (i1 << 8) | (i2 << 16) | (EncodingPad << 24); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int EncodeAndPadTwo(ref byte oneByte, ref byte encodingMap) + private static uint EncodeAndPadTwo(ref byte oneByte, ref byte encodingMap) { - int i = (oneByte << 8); + uint i = (uint)(oneByte << 8); - int i0 = Unsafe.Add(ref encodingMap, i >> 10); - int i1 = Unsafe.Add(ref encodingMap, (i >> 
4) & 0x3F); + uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 10)); + uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 4) & 0x3F)); return i0 | (i1 << 8) | (EncodingPad << 16) | (EncodingPad << 24); } From 144c431f31bb179e202099d8f7f155a7ce6f7a0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Thu, 10 Jan 2019 20:03:51 +0100 Subject: [PATCH 02/23] Fixed label names --- .../src/System/Buffers/Text/Base64Decoder.cs | 32 +++++++++---------- .../src/System/Buffers/Text/Base64Encoder.cs | 4 +-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index ec95b7daff6e..75360388ff72 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -66,7 +66,7 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span int result = Decode(ref Unsafe.Add(ref srcBytes, (IntPtr)sourceIndex), ref decodingMap); if (result < 0) - goto InvalidExit; + goto InvalidDataExit; WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), result); destIndex += 3; @@ -76,15 +76,15 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span } if (maxSrcLength != srcLength - skipLastChunk) - goto DestinationSmallExit; + goto DestinationTooSmallExit; // If input is less than 4 bytes, srcLength == sourceIndex == 0 // If input is not a multiple of 4, sourceIndex == srcLength != 0 if (sourceIndex == srcLength) { if (isFinalBlock) - goto InvalidExit; - goto NeedMoreExit; + goto InvalidDataExit; + goto NeedMoreDataExit; } // if isFinalBlock is false, we will never reach this point @@ -115,9 +115,9 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span i0 |= i2; if (i0 < 0) - goto InvalidExit; + goto InvalidDataExit; if (destIndex > destLength - 3) - goto DestinationSmallExit; + goto DestinationTooSmallExit; 
WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), i0); destIndex += 3; @@ -131,9 +131,9 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span i0 |= i2; if (i0 < 0) - goto InvalidExit; + goto InvalidDataExit; if (destIndex > destLength - 2) - goto DestinationSmallExit; + goto DestinationTooSmallExit; Unsafe.Add(ref destBytes, (IntPtr)destIndex) = (byte)(i0 >> 16); Unsafe.Add(ref destBytes, (IntPtr)(destIndex + 1)) = (byte)(i0 >> 8); @@ -142,9 +142,9 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span else { if (i0 < 0) - goto InvalidExit; + goto InvalidDataExit; if (destIndex > destLength - 1) - goto DestinationSmallExit; + goto DestinationTooSmallExit; Unsafe.Add(ref destBytes, (IntPtr)destIndex) = (byte)(i0 >> 16); destIndex += 1; @@ -153,27 +153,27 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span sourceIndex += 4; if (srcLength != utf8.Length) - goto InvalidExit; + goto InvalidDataExit; - DoneExit: + DoneExit: bytesConsumed = (int)sourceIndex; bytesWritten = (int)destIndex; return OperationStatus.Done; - DestinationSmallExit: + DestinationTooSmallExit: if (srcLength != utf8.Length && isFinalBlock) - goto InvalidExit; // if input is not a multiple of 4, and there is no more data, return invalid data instead + goto InvalidDataExit; // if input is not a multiple of 4, and there is no more data, return invalid data instead bytesConsumed = (int)sourceIndex; bytesWritten = (int)destIndex; return OperationStatus.DestinationTooSmall; - NeedMoreExit: + NeedMoreDataExit: bytesConsumed = (int)sourceIndex; bytesWritten = (int)destIndex; return OperationStatus.NeedMoreData; - InvalidExit: + InvalidDataExit: bytesConsumed = (int)sourceIndex; bytesWritten = (int)destIndex; return OperationStatus.InvalidData; diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 5fa5f2604534..d3b8b81fc859 100644 --- 
a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -66,7 +66,7 @@ public static OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span } if (maxSrcLength != srcLength - 2) - goto DestinationSmallExit; + goto DestinationTooSmallExit; if (!isFinalBlock) goto NeedMoreDataExit; @@ -95,7 +95,7 @@ public static OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span bytesWritten = (int)destIndex; return OperationStatus.NeedMoreData; - DestinationSmallExit: + DestinationTooSmallExit: bytesConsumed = (int)sourceIndex; bytesWritten = (int)destIndex; return OperationStatus.DestinationTooSmall; From 81a3c157ba761f0ecbe2b9f7280a1ef07ffdeeb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Thu, 10 Jan 2019 20:00:39 +0100 Subject: [PATCH 03/23] Implemented vectorized versions --- src/System.Memory/src/System.Memory.csproj | 1 + .../src/System/Buffers/Text/Base64.cs | 171 +++++++++++++++++ .../src/System/Buffers/Text/Base64Decoder.cs | 172 +++++++++++++++++- .../src/System/Buffers/Text/Base64Encoder.cs | 165 ++++++++++++++++- 4 files changed, 493 insertions(+), 16 deletions(-) create mode 100644 src/System.Memory/src/System/Buffers/Text/Base64.cs diff --git a/src/System.Memory/src/System.Memory.csproj b/src/System.Memory/src/System.Memory.csproj index e393188ac2c4..0fa7e2ca283d 100644 --- a/src/System.Memory/src/System.Memory.csproj +++ b/src/System.Memory/src/System.Memory.csproj @@ -27,6 +27,7 @@ + diff --git a/src/System.Memory/src/System/Buffers/Text/Base64.cs b/src/System.Memory/src/System/Buffers/Text/Base64.cs new file mode 100644 index 000000000000..048c838a71a2 --- /dev/null +++ b/src/System.Memory/src/System/Buffers/Text/Base64.cs @@ -0,0 +1,171 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace System.Buffers.Text +{ + public static partial class Base64 + { + static Base64() + { + if (Ssse3.IsSupported) + { + s_sseEncodeShuffleVec = Vector128.Create( + 1, 0, 2, 1, + 4, 3, 5, 4, + 7, 6, 8, 7, + 10, 9, 11, 10 + ); + + s_sseEncodeLut = Vector128.Create( + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0 + ); + + s_sseDecodeShuffleVec = Vector128.Create( + 2, 1, 0, 6, + 5, 4, 10, 9, + 8, 14, 13, 12, + -1, -1, -1, -1 + ); + + s_sseDecodeLutLo = Vector128.Create( + 0x15, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, + 0x1B, 0x1B, 0x1B, 0x1A + ); + + s_sseDecodeLutHi = Vector128.Create( + 0x10, 0x10, 0x01, 0x02, + 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10 + ); + + s_sseDecodeLutShift = Vector128.Create( + 0, 16, 19, 4, + -65, -65, -71, -71, + 0, 0, 0, 0, + 0, 0, 0, 0 + ); + + s_sseDecodeMask2F = Vector128.Create((sbyte)0x2F); // ASCII: / + } + + if (Avx2.IsSupported) + { + s_avxEncodePermuteVec = Vector256.Create(0, 0, 1, 2, 3, 4, 5, 6); + + s_avxEncodeShuffleVec = Vector256.Create( + 5, 4, 6, 5, + 8, 7, 9, 8, + 11, 10, 12, 11, + 14, 13, 15, 14, + 1, 0, 2, 1, + 4, 3, 5, 4, + 7, 6, 8, 7, + 10, 9, 11, 10 + ); + + s_avxEncodeLut = Vector256.Create( + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0, + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0 + ); + + s_avxDecodeLutLo = Vector256.Create( + 0x15, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, + 0x1B, 0x1B, 0x1B, 0x1A, + 0x15, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, + 0x1B, 0x1B, 0x1B, 0x1A + ); + + s_avxDecodeLutHi = Vector256.Create( + 0x10, 0x10, 0x01, 0x02, + 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x01, 0x02, + 0x04, 0x08, 0x04, 0x08, + 
0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10 + ); + + s_avxDecodeLutShift = Vector256.Create( + 0, 16, 19, 4, + -65, -65, -71, -71, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 16, 19, 4, + -65, -65, -71, -71, + 0, 0, 0, 0, + 0, 0, 0, 0 + ); + + s_avxDecodeShuffleVec = Vector256.Create( + 2, 1, 0, 6, + 5, 4, 10, 9, + 8, 14, 13, 12, + -1, -1, -1, -1, + 2, 1, 0, 6, + 5, 4, 10, 9, + 8, 14, 13, 12, + -1, -1, -1, -1 + ); + + s_avxDecodePermuteVec = Vector256.Create(0, 1, 2, 4, 5, 6, -1, -1); + + s_avxDecodeMask2F = Vector256.Create((sbyte)0x2F); // ASCII: / + } + } + + [Conditional("DEBUG")] + private static void AssertRead(ref byte src, ref byte srcStart, int srcLength) + { + int vectorElements = Unsafe.SizeOf(); + ref byte readEnd = ref Unsafe.Add(ref src, vectorElements); + ref byte srcEnd = ref Unsafe.Add(ref srcStart, srcLength + 1); + + bool isSafe = Unsafe.IsAddressLessThan(ref readEnd, ref srcEnd); + + if (!isSafe) + { + int srcIndex = Unsafe.ByteOffset(ref srcStart, ref src).ToInt32(); + throw new InvalidOperationException($"Read for {typeof(TVector)} is not within safe bounds. srcIndex: {srcIndex}, srcLength: {srcLength}"); + } + } + + [Conditional("DEBUG")] + private static void AssertWrite(ref byte dest, ref byte destStart, int destLength) + { + int vectorElements = Unsafe.SizeOf(); + ref byte writeEnd = ref Unsafe.Add(ref dest, vectorElements); + ref byte destEnd = ref Unsafe.Add(ref destStart, destLength + 1); + + bool isSafe = Unsafe.IsAddressLessThan(ref writeEnd, ref destEnd); + + if (!isSafe) + { + int destIndex = Unsafe.ByteOffset(ref destStart, ref dest).ToInt32(); + throw new InvalidOperationException($"Write for {typeof(TVector)} is not within safe bounds. 
destIndex: {destIndex}, destLength: {destLength}"); + } + } + } +} diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 75360388ff72..2a7a762d5a08 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -4,6 +4,8 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; namespace System.Buffers.Text { @@ -28,26 +30,48 @@ public static partial class Base64 /// public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span bytes, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) { + // PERF: use uint to avoid the sign-extensions + uint sourceIndex = 0; + uint destIndex = 0; + + if (utf8.IsEmpty) + goto DoneExit; + ref byte srcBytes = ref MemoryMarshal.GetReference(utf8); ref byte destBytes = ref MemoryMarshal.GetReference(bytes); int srcLength = utf8.Length & ~0x3; // only decode input up to the closest multiple of 4. int destLength = bytes.Length; + int maxSrcLength = srcLength; + int decodedLength = GetMaxDecodedFromUtf8Length(srcLength); - uint sourceIndex = 0; - uint destIndex = 0; + // max. 
2 padding chars + if (destLength + 2 < decodedLength) + { + // For overflow see comment below + maxSrcLength = destLength / 3 * 4; + } - if (utf8.Length == 0) - goto DoneExit; + if (Avx2.IsSupported && maxSrcLength >= 45) + { + Avx2Decode(ref srcBytes, ref destBytes, maxSrcLength, destLength, ref sourceIndex, ref destIndex); - ref sbyte decodingMap = ref s_decodingMap[0]; + if (sourceIndex == srcLength) + goto DoneExit; + } + else if (Ssse3.IsSupported && maxSrcLength >= 24) + { + Ssse3Decode(ref srcBytes, ref destBytes, maxSrcLength, destLength, ref sourceIndex, ref destIndex); + + if (sourceIndex == srcLength) + goto DoneExit; + } // Last bytes could have padding characters, so process them separately and treat them as valid only if isFinalBlock is true // if isFinalBlock is false, padding characters are considered invalid int skipLastChunk = isFinalBlock ? 4 : 0; - int maxSrcLength = 0; - if (destLength >= GetMaxDecodedFromUtf8Length(srcLength)) + if (destLength >= decodedLength) { maxSrcLength = srcLength - skipLastChunk; } @@ -58,6 +82,8 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span maxSrcLength = (destLength / 3) * 4; } + ref sbyte decodingMap = ref s_decodingMap[0]; + // In order to elide the movsxd in the loop if (sourceIndex < maxSrcLength) { @@ -238,9 +264,9 @@ public static OperationStatus DecodeFromUtf8InPlace(Span buffer, out int b uint t0, t1, t2, t3; uint n = (uint)(bufferLength - 4); t0 = Unsafe.Add(ref bufferBytes, (IntPtr)n); - t1 = Unsafe.Add(ref bufferBytes, (IntPtr)(n+1)); - t2 = Unsafe.Add(ref bufferBytes, (IntPtr)(n+2)); - t3 = Unsafe.Add(ref bufferBytes, (IntPtr)(n+3)); + t1 = Unsafe.Add(ref bufferBytes, (IntPtr)(n + 1)); + t2 = Unsafe.Add(ref bufferBytes, (IntPtr)(n + 2)); + t3 = Unsafe.Add(ref bufferBytes, (IntPtr)(n + 3)); int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); @@ -299,6 +325,119 @@ public static OperationStatus DecodeFromUtf8InPlace(Span buffer, out 
int b return OperationStatus.InvalidData; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Avx2Decode(ref byte src, ref byte destBytes, int sourceLength, int destLength, ref uint sourceIndex, ref uint destIndex) + { + ref byte srcStart = ref src; + ref byte destStart = ref destBytes; + ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 45 + 1)); + + // The JIT won't hoist these "constants", so help him + Vector256 lutHi = s_avxDecodeLutHi; + Vector256 lutLo = s_avxDecodeLutLo; + Vector256 lutShift = s_avxDecodeLutShift; + Vector256 mask2F = s_avxDecodeMask2F; + Vector256 shuffleConstant0 = Vector256.Create(0x01400140).AsSByte(); + Vector256 shuffleConstant1 = Vector256.Create(0x00011000).AsInt16(); + Vector256 shuffleVec = s_avxDecodeShuffleVec; + Vector256 permuteVec = s_avxDecodePermuteVec; + + //while (remaining >= 45) + do + { + AssertRead>(ref src, ref srcStart, sourceLength); + Vector256 str = Unsafe.As>(ref src); + + Vector256 hiNibbles = Avx2.And(Avx2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F); + Vector256 loNibbles = Avx2.And(str, mask2F); + Vector256 hi = Avx2.Shuffle(lutHi, hiNibbles); + Vector256 lo = Avx2.Shuffle(lutLo, loNibbles); + Vector256 zero = Vector256.Zero; + + // https://github.com/dotnet/coreclr/issues/21247 + if (Avx2.MoveMask(Avx2.CompareGreaterThan(Avx2.And(lo, hi), zero)) != 0) + break; + + Vector256 eq2F = Avx2.CompareEqual(str, mask2F); + Vector256 shift = Avx2.Shuffle(lutShift, Avx2.Add(eq2F, hiNibbles)); + str = Avx2.Add(str, shift); + + Vector256 merge_ab_and_bc = Avx2.MultiplyAddAdjacent(str.AsByte(), shuffleConstant0); + Vector256 @out = Avx2.MultiplyAddAdjacent(merge_ab_and_bc, shuffleConstant1); + @out = Avx2.Shuffle(@out.AsSByte(), shuffleVec).AsInt32(); + str = Avx2.PermuteVar8x32(@out, permuteVec).AsSByte(); + + AssertWrite>(ref destBytes, ref destStart, destLength); + Unsafe.As>(ref destBytes) = str; + + src = ref Unsafe.Add(ref src, 32); + destBytes = ref 
Unsafe.Add(ref destBytes, 24); + } + while (Unsafe.IsAddressLessThan(ref src, ref simdSrcEnd)); + + // Cast to ulong to avoid the overflow-check. Codegen for x86 is still good. + sourceIndex = (uint)(ulong)Unsafe.ByteOffset(ref srcStart, ref src); + destIndex = (uint)(ulong)Unsafe.ByteOffset(ref destStart, ref destBytes); + + src = ref srcStart; + destBytes = ref destStart; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Ssse3Decode(ref byte src, ref byte destBytes, int sourceLength, int destLength, ref uint sourceIndex, ref uint destIndex) + { + ref byte srcStart = ref src; + ref byte destStart = ref destBytes; + ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 24 + 1)); + + // The JIT won't hoist these "constants", so help him + Vector128 lutHi = s_sseDecodeLutHi; + Vector128 lutLo = s_sseDecodeLutLo; + Vector128 lutShift = s_sseDecodeLutShift; + Vector128 mask2F = s_sseDecodeMask2F; + Vector128 shuffleConstant0 = Vector128.Create(0x01400140).AsSByte(); + Vector128 shuffleConstant1 = Vector128.Create(0x00011000).AsInt16(); + Vector128 shuffleVec = s_sseDecodeShuffleVec; + + //while (remaining >= 24) + do + { + AssertRead>(ref src, ref srcStart, sourceLength); + Vector128 str = Unsafe.As>(ref src); + + Vector128 hiNibbles = Sse2.And(Sse2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F); + Vector128 loNibbles = Sse2.And(str, mask2F); + Vector128 hi = Ssse3.Shuffle(lutHi, hiNibbles); + Vector128 lo = Ssse3.Shuffle(lutLo, loNibbles); + Vector128 zero = Vector128.Zero; + + if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.And(lo, hi), zero)) != 0) + break; + + Vector128 eq2F = Sse2.CompareEqual(str, mask2F); + Vector128 shift = Ssse3.Shuffle(lutShift, Sse2.Add(eq2F, hiNibbles)); + str = Sse2.Add(str, shift); + + Vector128 merge_ab_and_bc = Ssse3.MultiplyAddAdjacent(str.AsByte(), shuffleConstant0); + Vector128 @out = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, shuffleConstant1); + str = 
Ssse3.Shuffle(@out.AsSByte(), shuffleVec); + + AssertWrite>(ref destBytes, ref destStart, destLength); + Unsafe.As>(ref destBytes) = str; + + src = ref Unsafe.Add(ref src, 16); + destBytes = ref Unsafe.Add(ref destBytes, 12); + } + while (Unsafe.IsAddressLessThan(ref src, ref simdSrcEnd)); + + // Cast to ulong to avoid the overflow-check. Codegen for x86 is still good. + sourceIndex = (uint)(ulong)Unsafe.ByteOffset(ref srcStart, ref src); + destIndex = (uint)(ulong)Unsafe.ByteOffset(ref destStart, ref destBytes); + + src = ref srcStart; + destBytes = ref destStart; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Decode(ref byte encodedBytes, ref sbyte decodingMap) { @@ -352,5 +491,18 @@ private static void WriteThreeLowOrderBytes(ref byte destination, int value) -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, }; + + private static readonly Vector128 s_sseDecodeShuffleVec; + private static readonly Vector128 s_sseDecodeLutLo; + private static readonly Vector128 s_sseDecodeLutHi; + private static readonly Vector128 s_sseDecodeLutShift; + private static readonly Vector128 s_sseDecodeMask2F; + + private static readonly Vector256 s_avxDecodeShuffleVec; + private static readonly Vector256 s_avxDecodePermuteVec; + private static readonly Vector256 s_avxDecodeLutLo; + private static readonly Vector256 s_avxDecodeLutHi; + private static readonly Vector256 s_avxDecodeLutShift; + private static readonly Vector256 s_avxDecodeMask2F; } } diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index d3b8b81fc859..3ab82582b1b5 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -4,6 +4,8 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using 
System.Runtime.Intrinsics.X86; namespace System.Buffers.Text { @@ -29,25 +31,50 @@ public static partial class Base64 /// public static OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span utf8, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) { + // PERF: use uint to avoid the sign-extensions + uint sourceIndex = 0; + uint destIndex = 0; + + if (bytes.IsEmpty) + goto DoneExit; + ref byte srcBytes = ref MemoryMarshal.GetReference(bytes); ref byte destBytes = ref MemoryMarshal.GetReference(utf8); int srcLength = bytes.Length; int destLength = utf8.Length; + int maxSrcLength = srcLength; - int maxSrcLength = 0; if (srcLength <= MaximumEncodeLength && destLength >= GetMaxEncodedToUtf8Length(srcLength)) { - maxSrcLength = srcLength - 2; + maxSrcLength = srcLength; } else { - maxSrcLength = (destLength >> 2) * 3 - 2; + maxSrcLength = (destLength >> 2) * 3; } - // PERF: use uint to avoid the sign-extensions - uint sourceIndex = 0; - uint destIndex = 0; + if (srcLength < 16) + goto Scalar; + + if (Avx2.IsSupported && maxSrcLength >= 32) + { + Avx2Encode(ref srcBytes, ref destBytes, maxSrcLength, destLength, ref sourceIndex, ref destIndex); + + if (sourceIndex == srcLength) + goto DoneExit; + } + + if (Ssse3.IsSupported && (maxSrcLength >= (int)sourceIndex + 16)) + { + Ssse3Encode(ref srcBytes, ref destBytes, maxSrcLength, destLength, ref sourceIndex, ref destIndex); + + if (sourceIndex == srcLength) + goto DoneExit; + } + + Scalar: + maxSrcLength -= 2; uint result = 0; ref byte encodingMap = ref s_encodingMap[0]; @@ -86,6 +113,7 @@ public static OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span sourceIndex += 2; } + DoneExit: bytesConsumed = (int)sourceIndex; bytesWritten = (int)destIndex; return OperationStatus.Done; @@ -180,6 +208,124 @@ public static OperationStatus EncodeToUtf8InPlace(Span buffer, int dataLen return OperationStatus.DestinationTooSmall; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void 
Avx2Encode(ref byte src, ref byte dest, int sourceLength, int destLength, ref uint sourceIndex, ref uint destIndex) + { + ref byte srcStart = ref src; + ref byte destStart = ref dest; + ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 28)); + + // The JIT won't hoist these "constants", so help him + Vector256 shuffleVec = s_avxEncodeShuffleVec; + Vector256 shuffleConstant0 = Vector256.Create(0x0fc0fc00).AsSByte(); + Vector256 shuffleConstant2 = Vector256.Create(0x003f03f0).AsSByte(); + Vector256 shuffleConstant1 = Vector256.Create(0x04000040).AsUInt16(); + Vector256 shuffleConstant3 = Vector256.Create(0x01000010).AsInt16(); + Vector256 translationContant0 = Vector256.Create((byte)51); + Vector256 translationContant1 = Vector256.Create((sbyte)25); + Vector256 lut = s_avxEncodeLut; + + // first load is done at c-0 not to get a segfault + AssertRead>(ref src, ref srcStart, sourceLength); + Vector256 str = Unsafe.As>(ref src); + + // shift by 4 bytes, as required by enc_reshuffle + str = Avx2.PermuteVar8x32(str.AsInt32(), s_avxEncodePermuteVec).AsSByte(); + + while (true) + { + // Reshuffle + str = Avx2.Shuffle(str, shuffleVec); + Vector256 t0 = Avx2.And(str, shuffleConstant0); + Vector256 t2 = Avx2.And(str, shuffleConstant2); + Vector256 t1 = Avx2.MultiplyHigh(t0.AsUInt16(), shuffleConstant1); + Vector256 t3 = Avx2.MultiplyLow(t2.AsInt16(), shuffleConstant3); + str = Avx2.Or(t1.AsSByte(), t3.AsSByte()); + + // Translation + Vector256 indices = Avx2.SubtractSaturate(str.AsByte(), translationContant0); + Vector256 mask = Avx2.CompareGreaterThan(str, translationContant1); + Vector256 tmp = Avx2.Subtract(indices.AsSByte(), mask); + str = Avx2.Add(str, Avx2.Shuffle(lut, tmp)); + + AssertWrite>(ref dest, ref destStart, destLength); + Unsafe.As>(ref dest) = str; + + src = ref Unsafe.Add(ref src, 24); + dest = ref Unsafe.Add(ref dest, 32); + + if (Unsafe.IsAddressGreaterThan(ref src, ref simdSrcEnd)) + break; + + // Load at c-4, as required by 
enc_reshuffle + AssertRead>(ref Unsafe.Subtract(ref src, 4), ref srcStart, sourceLength); + str = Unsafe.As>(ref Unsafe.Subtract(ref src, 4)); + } + + // Cast to ulong to avoid the overflow-check. Codegen for x86 is still good. + sourceIndex = (uint)(ulong)Unsafe.ByteOffset(ref srcStart, ref src); + destIndex = (uint)(ulong)Unsafe.ByteOffset(ref destStart, ref dest); + + src = ref srcStart; + dest = ref destStart; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Ssse3Encode(ref byte src, ref byte dest, int sourceLength, int destLength, ref uint sourceIndex, ref uint destIndex) + { + ref byte srcStart = ref src; + ref byte destStart = ref dest; + ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 16 + 1)); + + // Shift to workspace + src = ref Unsafe.Add(ref src, (IntPtr)sourceIndex); + dest = ref Unsafe.Add(ref dest, (IntPtr)destIndex); + + // The JIT won't hoist these "constants", so help him + Vector128 shuffleVec = s_sseEncodeShuffleVec; + Vector128 shuffleConstant0 = Vector128.Create(0x0fc0fc00).AsSByte(); + Vector128 shuffleConstant2 = Vector128.Create(0x003f03f0).AsSByte(); + Vector128 shuffleConstant1 = Vector128.Create(0x04000040).AsUInt16(); + Vector128 shuffleConstant3 = Vector128.Create(0x01000010).AsInt16(); + Vector128 translationContant0 = Vector128.Create((byte)51); + Vector128 translationContant1 = Vector128.Create((sbyte)25); + Vector128 lut = s_sseEncodeLut; + + //while (remaining >= 16) + while (Unsafe.IsAddressLessThan(ref src, ref simdSrcEnd)) + { + AssertRead>(ref src, ref srcStart, sourceLength); + Vector128 str = Unsafe.As>(ref src); + + // Reshuffle + str = Ssse3.Shuffle(str, shuffleVec); + Vector128 t0 = Sse2.And(str, shuffleConstant0); + Vector128 t2 = Sse2.And(str, shuffleConstant2); + Vector128 t1 = Sse2.MultiplyHigh(t0.AsUInt16(), shuffleConstant1); + Vector128 t3 = Sse2.MultiplyLow(t2.AsInt16(), shuffleConstant3); + str = Sse2.Or(t1.AsSByte(), t3.AsSByte()); + + // Translation + 
Vector128 indices = Sse2.SubtractSaturate(str.AsByte(), translationContant0); + Vector128 mask = Sse2.CompareGreaterThan(str, translationContant1); + Vector128 tmp = Sse2.Subtract(indices.AsSByte(), mask); + str = Sse2.Add(str, Ssse3.Shuffle(lut, tmp)); + + AssertWrite>(ref dest, ref destStart, destLength); + Unsafe.As>(ref dest) = str; + + src = ref Unsafe.Add(ref src, 12); + dest = ref Unsafe.Add(ref dest, 16); + } + + // Cast to ulong to avoid the overflow-check. Codegen for x86 is still good. + sourceIndex = (uint)(ulong)Unsafe.ByteOffset(ref srcStart, ref src); + destIndex = (uint)(ulong)Unsafe.ByteOffset(ref destStart, ref dest); + + src = ref srcStart; + dest = ref destStart; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static uint Encode(ref byte threeBytes, ref byte encodingMap) { @@ -228,6 +374,13 @@ private static uint EncodeAndPadTwo(ref byte oneByte, ref byte encodingMap) 52, 53, 54, 55, 56, 57, 43, 47 //4..9, +, / }; + private static readonly Vector128 s_sseEncodeShuffleVec; + private static readonly Vector128 s_sseEncodeLut; + + private static readonly Vector256 s_avxEncodePermuteVec; + private static readonly Vector256 s_avxEncodeShuffleVec; + private static readonly Vector256 s_avxEncodeLut; + private const byte EncodingPad = (byte)'='; // '=', for padding private const int MaximumEncodeLength = (int.MaxValue / 4) * 3; // 1610612733 From a46b9b265b1faf02a4ddfb6a8f89a79300b69f7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Thu, 10 Jan 2019 21:16:57 +0100 Subject: [PATCH 04/23] Added reference to source of algorithm --- src/System.Memory/src/System/Buffers/Text/Base64.cs | 3 +++ src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs | 3 +++ src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs | 3 +++ 3 files changed, 9 insertions(+) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64.cs b/src/System.Memory/src/System/Buffers/Text/Base64.cs index 048c838a71a2..888576581e29 100644 --- 
a/src/System.Memory/src/System/Buffers/Text/Base64.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64.cs @@ -9,6 +9,9 @@ namespace System.Buffers.Text { + // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2 + // SSSE3 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3 + public static partial class Base64 { static Base64() diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 2a7a762d5a08..3af47e8f434c 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -9,6 +9,9 @@ namespace System.Buffers.Text { + // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2 + // SSSE3 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3 + public static partial class Base64 { /// diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 3ab82582b1b5..2c899e9c5b83 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -9,6 +9,9 @@ namespace System.Buffers.Text { + // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2 + // SSSE3 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3 + /// /// Convert between binary data and UTF-8 encoded text that is represented in base 64. 
/// From 2ac8aa4bd316541f05d96178c04ca50280bece76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Thu, 10 Jan 2019 22:33:30 +0100 Subject: [PATCH 05/23] Added back missing namespace --- src/System.Memory/src/System/Buffers/Text/Base64.cs | 1 + src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs | 1 + src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs | 1 + 3 files changed, 3 insertions(+) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64.cs b/src/System.Memory/src/System/Buffers/Text/Base64.cs index 888576581e29..388fb204ff6f 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64.cs @@ -6,6 +6,7 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using Internal.Runtime.CompilerServices; namespace System.Buffers.Text { diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 3af47e8f434c..db7bd95644a6 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -6,6 +6,7 @@ using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using Internal.Runtime.CompilerServices; namespace System.Buffers.Text { diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 2c899e9c5b83..5d114cdcc7cc 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -6,6 +6,7 @@ using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using Internal.Runtime.CompilerServices; namespace System.Buffers.Text { From 40d216b65ed28f96dd79359156906fd9a94da0f2 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Thu, 10 Jan 2019 22:51:18 +0100 Subject: [PATCH 06/23] Unsafe.Add instead of Unsafe.Subtract Fixed build-failure (https://ci3.dot.net/job/dotnet_corefx/job/master/job/linux-musl-TGroup_netcoreapp+CGroup_Debug+AGroup_x64+TestOuter_false_prtest/8247/console) Seems like the internal Unsafe doesn't have a Subtract method, so use Add instead. --- src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 5d114cdcc7cc..766fd0ca3d7d 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -262,8 +262,8 @@ private static void Avx2Encode(ref byte src, ref byte dest, int sourceLength, in break; // Load at c-4, as required by enc_reshuffle - AssertRead>(ref Unsafe.Subtract(ref src, 4), ref srcStart, sourceLength); - str = Unsafe.As>(ref Unsafe.Subtract(ref src, 4)); + AssertRead>(ref Unsafe.Add(ref src, -4), ref srcStart, sourceLength); + str = Unsafe.As>(ref Unsafe.Add(ref src, -4)); } // Cast to ulong to avoid the overflow-check. Codegen for x86 is still good. 
From e7f00fb91bb20357868eb397b9f36166a382905b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Fri, 11 Jan 2019 09:46:31 +0100 Subject: [PATCH 07/23] Added THIRD-PARTY-NOTICES --- .../System/Buffers/Text/THIRD-PARTY-NOTICES | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 src/System.Memory/src/System/Buffers/Text/THIRD-PARTY-NOTICES diff --git a/src/System.Memory/src/System/Buffers/Text/THIRD-PARTY-NOTICES b/src/System.Memory/src/System/Buffers/Text/THIRD-PARTY-NOTICES new file mode 100644 index 000000000000..9a2d8092c805 --- /dev/null +++ b/src/System.Memory/src/System/Buffers/Text/THIRD-PARTY-NOTICES @@ -0,0 +1,41 @@ +.NET Core uses third-party libraries or other resources that may be +distributed under licenses different than the .NET Core software. + +In the event that we accidentally failed to list a required notice, please +bring it to our attention. Post an issue or email us: + + dotnet@microsoft.com + +The attached notices are provided for information only. + +License notice for vectorized base64 encoding / decoding +----------------------------------------------------------------------------- + +Copyright (c) 2005-2007, Nick Galbreath +Copyright (c) 2013-2017, Alfred Klomp +Copyright (c) 2015-2017, Wojciech Mula +Copyright (c) 2016-2017, Matthieu Darbois +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. From 60729584d747dfa5480113302607458902315e9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Fri, 11 Jan 2019 11:08:38 +0100 Subject: [PATCH 08/23] PR Feedback --- .../src/System/Buffers/Text/Base64.cs | 44 ++++++++++--------- .../src/System/Buffers/Text/Base64Decoder.cs | 6 ++- .../src/System/Buffers/Text/Base64Encoder.cs | 8 +++- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64.cs b/src/System.Memory/src/System/Buffers/Text/Base64.cs index 388fb204ff6f..1ed3db8d2232 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64.cs @@ -141,34 +141,38 @@ static Base64() } [Conditional("DEBUG")] - private static void AssertRead(ref byte src, ref byte srcStart, int srcLength) + private static unsafe void AssertRead(ref byte src, ref byte srcStart, int srcLength) { - int vectorElements = Unsafe.SizeOf(); - ref byte readEnd = ref Unsafe.Add(ref src, vectorElements); - ref byte srcEnd = ref Unsafe.Add(ref srcStart, srcLength + 1); - - bool isSafe = Unsafe.IsAddressLessThan(ref readEnd, ref srcEnd); - - if (!isSafe) + fixed (byte* pSrc = &src) + fixed (byte* pSrcStart = 
&srcStart) { - int srcIndex = Unsafe.ByteOffset(ref srcStart, ref src).ToInt32(); - throw new InvalidOperationException($"Read for {typeof(TVector)} is not within safe bounds. srcIndex: {srcIndex}, srcLength: {srcLength}"); + int vectorElements = Unsafe.SizeOf(); + byte* readEnd = pSrc + vectorElements; + byte* srcEnd = pSrcStart + srcLength; + + if (readEnd > srcEnd) + { + int srcIndex = (int)(pSrc - pSrcStart); + throw new InvalidOperationException($"Read for {typeof(TVector)} is not within safe bounds. srcIndex: {srcIndex}, srcLength: {srcLength}"); + } } } [Conditional("DEBUG")] - private static void AssertWrite(ref byte dest, ref byte destStart, int destLength) + private static unsafe void AssertWrite(ref byte dest, ref byte destStart, int destLength) { - int vectorElements = Unsafe.SizeOf(); - ref byte writeEnd = ref Unsafe.Add(ref dest, vectorElements); - ref byte destEnd = ref Unsafe.Add(ref destStart, destLength + 1); - - bool isSafe = Unsafe.IsAddressLessThan(ref writeEnd, ref destEnd); - - if (!isSafe) + fixed (byte* pDest = &dest) + fixed (byte* pDestStart = &destStart) { - int destIndex = Unsafe.ByteOffset(ref destStart, ref dest).ToInt32(); - throw new InvalidOperationException($"Write for {typeof(TVector)} is not within safe bounds. destIndex: {destIndex}, destLength: {destLength}"); + int vectorElements = Unsafe.SizeOf(); + byte* writeEnd = pDest + vectorElements; + byte* destEnd = pDestStart + destLength; + + if (writeEnd > destEnd) + { + int destIndex = (int)(pDest - pDestStart); + throw new InvalidOperationException($"Write for {typeof(TVector)} is not within safe bounds. 
destIndex: {destIndex}, destLength: {destLength}"); + } } } } diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index db7bd95644a6..627f0f185c5e 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -50,7 +50,7 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span int decodedLength = GetMaxDecodedFromUtf8Length(srcLength); // max. 2 padding chars - if (destLength + 2 < decodedLength) + if (destLength < decodedLength - 2) { // For overflow see comment below maxSrcLength = destLength / 3 * 4; @@ -372,6 +372,8 @@ private static void Avx2Decode(ref byte src, ref byte destBytes, int sourceLengt str = Avx2.PermuteVar8x32(@out, permuteVec).AsSByte(); AssertWrite>(ref destBytes, ref destStart, destLength); + // As has better CQ than WriteUnaligned + // https://github.com/dotnet/coreclr/issues/21132 Unsafe.As>(ref destBytes) = str; src = ref Unsafe.Add(ref src, 32); @@ -427,6 +429,8 @@ private static void Ssse3Decode(ref byte src, ref byte destBytes, int sourceLeng str = Ssse3.Shuffle(@out.AsSByte(), shuffleVec); AssertWrite>(ref destBytes, ref destStart, destLength); + // As has better CQ than WriteUnaligned + // https://github.com/dotnet/coreclr/issues/21132 Unsafe.As>(ref destBytes) = str; src = ref Unsafe.Add(ref src, 16); diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 766fd0ca3d7d..63162e478f4e 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -145,7 +145,7 @@ public static int GetMaxEncodedToUtf8Length(int length) if ((uint)length > MaximumEncodeLength) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.length); - return (((length + 2) / 3) * 4); + return ((length + 2) / 3) * 4; } /// @@ -217,7 
+217,7 @@ private static void Avx2Encode(ref byte src, ref byte dest, int sourceLength, in { ref byte srcStart = ref src; ref byte destStart = ref dest; - ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 28)); + ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 28)); // 28 = 32 - 4 // The JIT won't hoist these "constants", so help him Vector256 shuffleVec = s_avxEncodeShuffleVec; @@ -253,6 +253,8 @@ private static void Avx2Encode(ref byte src, ref byte dest, int sourceLength, in str = Avx2.Add(str, Avx2.Shuffle(lut, tmp)); AssertWrite>(ref dest, ref destStart, destLength); + // As has better CQ than WriteUnaligned + // https://github.com/dotnet/coreclr/issues/21132 Unsafe.As>(ref dest) = str; src = ref Unsafe.Add(ref src, 24); @@ -316,6 +318,8 @@ private static void Ssse3Encode(ref byte src, ref byte dest, int sourceLength, i str = Sse2.Add(str, Ssse3.Shuffle(lut, tmp)); AssertWrite>(ref dest, ref destStart, destLength); + // As has better CQ than WriteUnaligned + // https://github.com/dotnet/coreclr/issues/21132 Unsafe.As>(ref dest) = str; src = ref Unsafe.Add(ref src, 12); From f2067cc54048045ba825f777ff1b0b3f5d6ec9a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Fri, 11 Jan 2019 18:14:08 +0100 Subject: [PATCH 09/23] THIRD-PARTY-NOTICES in repo-base instead of in folder Cf. https://github.com/dotnet/corefx/pull/34529#issuecomment-453510246 --- THIRD-PARTY-NOTICES.TXT | 32 +++++++++++++++ .../System/Buffers/Text/THIRD-PARTY-NOTICES | 41 ------------------- 2 files changed, 32 insertions(+), 41 deletions(-) delete mode 100644 src/System.Memory/src/System/Buffers/Text/THIRD-PARTY-NOTICES diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index b25636f506b8..de86db916f79 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -332,3 +332,35 @@ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +License notice for vectorized base64 encoding / decoding +-------------------------------------------------------- + +Copyright (c) 2005-2007, Nick Galbreath +Copyright (c) 2013-2017, Alfred Klomp +Copyright (c) 2015-2017, Wojciech Mula +Copyright (c) 2016-2017, Matthieu Darbois +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/src/System.Memory/src/System/Buffers/Text/THIRD-PARTY-NOTICES b/src/System.Memory/src/System/Buffers/Text/THIRD-PARTY-NOTICES deleted file mode 100644 index 9a2d8092c805..000000000000 --- a/src/System.Memory/src/System/Buffers/Text/THIRD-PARTY-NOTICES +++ /dev/null @@ -1,41 +0,0 @@ -.NET Core uses third-party libraries or other resources that may be -distributed under licenses different than the .NET Core software. - -In the event that we accidentally failed to list a required notice, please -bring it to our attention. Post an issue or email us: - - dotnet@microsoft.com - -The attached notices are provided for information only. - -License notice for vectorized base64 encoding / decoding ------------------------------------------------------------------------------ - -Copyright (c) 2005-2007, Nick Galbreath -Copyright (c) 2013-2017, Alfred Klomp -Copyright (c) 2015-2017, Wojciech Mula -Copyright (c) 2016-2017, Matthieu Darbois -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -- Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -- Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. From 695480259787e5128539d8fb2d626f66eb04fe6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Fri, 11 Jan 2019 19:15:40 +0100 Subject: [PATCH 10/23] PR Feedback * https://github.com/dotnet/corefx/pull/34529#discussion_r247200659 * https://github.com/dotnet/corefx/pull/34529#discussion_r247214904 --- .../src/System/Buffers/Text/Base64Decoder.cs | 14 +++++++------- .../src/System/Buffers/Text/Base64Encoder.cs | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 627f0f185c5e..1b991ec00d1d 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -336,7 +336,7 @@ private static void Avx2Decode(ref byte src, ref byte destBytes, int sourceLengt ref byte destStart = ref destBytes; ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 45 + 1)); - // The JIT won't hoist these "constants", so help him + // The JIT won't hoist these "constants", so help it Vector256 lutHi = s_avxDecodeLutHi; Vector256 lutLo = s_avxDecodeLutLo; Vector256 lutShift = s_avxDecodeLutShift; @@ -367,9 +367,9 @@ private static void Avx2Decode(ref byte src, ref byte destBytes, int sourceLengt str = Avx2.Add(str, shift); Vector256 merge_ab_and_bc = Avx2.MultiplyAddAdjacent(str.AsByte(), shuffleConstant0); - Vector256 @out = 
Avx2.MultiplyAddAdjacent(merge_ab_and_bc, shuffleConstant1); - @out = Avx2.Shuffle(@out.AsSByte(), shuffleVec).AsInt32(); - str = Avx2.PermuteVar8x32(@out, permuteVec).AsSByte(); + Vector256 output = Avx2.MultiplyAddAdjacent(merge_ab_and_bc, shuffleConstant1); + output = Avx2.Shuffle(output.AsSByte(), shuffleVec).AsInt32(); + str = Avx2.PermuteVar8x32(output, permuteVec).AsSByte(); AssertWrite>(ref destBytes, ref destStart, destLength); // As has better CQ than WriteUnaligned @@ -396,7 +396,7 @@ private static void Ssse3Decode(ref byte src, ref byte destBytes, int sourceLeng ref byte destStart = ref destBytes; ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 24 + 1)); - // The JIT won't hoist these "constants", so help him + // The JIT won't hoist these "constants", so help it Vector128 lutHi = s_sseDecodeLutHi; Vector128 lutLo = s_sseDecodeLutLo; Vector128 lutShift = s_sseDecodeLutShift; @@ -425,8 +425,8 @@ private static void Ssse3Decode(ref byte src, ref byte destBytes, int sourceLeng str = Sse2.Add(str, shift); Vector128 merge_ab_and_bc = Ssse3.MultiplyAddAdjacent(str.AsByte(), shuffleConstant0); - Vector128 @out = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, shuffleConstant1); - str = Ssse3.Shuffle(@out.AsSByte(), shuffleVec); + Vector128 output = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, shuffleConstant1); + str = Ssse3.Shuffle(output.AsSByte(), shuffleVec); AssertWrite>(ref destBytes, ref destStart, destLength); // As has better CQ than WriteUnaligned diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 63162e478f4e..681880ec6ec7 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -219,7 +219,7 @@ private static void Avx2Encode(ref byte src, ref byte dest, int sourceLength, in ref byte destStart = ref dest; ref byte simdSrcEnd = ref Unsafe.Add(ref src, 
(IntPtr)((uint)sourceLength - 28)); // 28 = 32 - 4 - // The JIT won't hoist these "constants", so help him + // The JIT won't hoist these "constants", so help it Vector256 shuffleVec = s_avxEncodeShuffleVec; Vector256 shuffleConstant0 = Vector256.Create(0x0fc0fc00).AsSByte(); Vector256 shuffleConstant2 = Vector256.Create(0x003f03f0).AsSByte(); @@ -287,7 +287,7 @@ private static void Ssse3Encode(ref byte src, ref byte dest, int sourceLength, i src = ref Unsafe.Add(ref src, (IntPtr)sourceIndex); dest = ref Unsafe.Add(ref dest, (IntPtr)destIndex); - // The JIT won't hoist these "constants", so help him + // The JIT won't hoist these "constants", so help it Vector128 shuffleVec = s_sseEncodeShuffleVec; Vector128 shuffleConstant0 = Vector128.Create(0x0fc0fc00).AsSByte(); Vector128 shuffleConstant2 = Vector128.Create(0x003f03f0).AsSByte(); From 74e36e7bcc670150ff916519f848fe610dbab80e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Sun, 13 Jan 2019 22:05:39 +0100 Subject: [PATCH 11/23] Rewritten to use raw-pointers instead of GC-tracked refs Cf. 
https://github.com/dotnet/corefx/pull/34529#discussion_r247197669 --- .../src/System/Buffers/Text/Base64.cs | 40 +- .../src/System/Buffers/Text/Base64Decoder.cs | 518 +++++++++--------- .../src/System/Buffers/Text/Base64Encoder.cs | 374 ++++++------- 3 files changed, 461 insertions(+), 471 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64.cs b/src/System.Memory/src/System/Buffers/Text/Base64.cs index 1ed3db8d2232..69493f64b065 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64.cs @@ -141,38 +141,30 @@ static Base64() } [Conditional("DEBUG")] - private static unsafe void AssertRead(ref byte src, ref byte srcStart, int srcLength) + private static unsafe void AssertRead(byte* src, byte* srcStart, int srcLength) { - fixed (byte* pSrc = &src) - fixed (byte* pSrcStart = &srcStart) + int vectorElements = Unsafe.SizeOf(); + byte* readEnd = src + vectorElements; + byte* srcEnd = srcStart + srcLength; + + if (readEnd > srcEnd) { - int vectorElements = Unsafe.SizeOf(); - byte* readEnd = pSrc + vectorElements; - byte* srcEnd = pSrcStart + srcLength; - - if (readEnd > srcEnd) - { - int srcIndex = (int)(pSrc - pSrcStart); - throw new InvalidOperationException($"Read for {typeof(TVector)} is not within safe bounds. srcIndex: {srcIndex}, srcLength: {srcLength}"); - } + int srcIndex = (int)(src - srcStart); + throw new InvalidOperationException($"Read for {typeof(TVector)} is not within safe bounds. 
srcIndex: {srcIndex}, srcLength: {srcLength}"); } } [Conditional("DEBUG")] - private static unsafe void AssertWrite(ref byte dest, ref byte destStart, int destLength) + private static unsafe void AssertWrite(byte* dest, byte* destStart, int destLength) { - fixed (byte* pDest = &dest) - fixed (byte* pDestStart = &destStart) + int vectorElements = Unsafe.SizeOf(); + byte* writeEnd = dest + vectorElements; + byte* destEnd = destStart + destLength; + + if (writeEnd > destEnd) { - int vectorElements = Unsafe.SizeOf(); - byte* writeEnd = pDest + vectorElements; - byte* destEnd = pDestStart + destLength; - - if (writeEnd > destEnd) - { - int destIndex = (int)(pDest - pDestStart); - throw new InvalidOperationException($"Write for {typeof(TVector)} is not within safe bounds. destIndex: {destIndex}, destLength: {destLength}"); - } + int destIndex = (int)(dest - destStart); + throw new InvalidOperationException($"Write for {typeof(TVector)} is not within safe bounds. destIndex: {destIndex}, destLength: {destLength}"); } } } diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 1b991ec00d1d..1b34df5d981e 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -3,10 +3,14 @@ // See the LICENSE file in the project root for more information. using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; -using Internal.Runtime.CompilerServices; + +#if BIT64 +using nuint = System.UInt64; +#else +using nuint = System.UInt32; +#endif namespace System.Buffers.Text { @@ -23,7 +27,7 @@ public static partial class Base64 /// The output span which contains the result of the operation, i.e. the decoded binary data. /// The number of input bytes consumed during the operation. 
This can be used to slice the input for subsequent calls, if necessary. /// The number of bytes written into the output span. This can be used to slice the output for subsequent calls, if necessary. - /// True (default) when the input span contains the entire data to decode. + /// True (default) when the input span contains the entire data to decode. /// Set to false only if it is known that the input span contains partial data with more data to follow. /// It returns the OperationStatus enum values: /// - Done - on successful processing of the entire input span @@ -32,181 +36,191 @@ public static partial class Base64 /// - InvalidData - if the input contains bytes outside of the expected base 64 range, or if it contains invalid/more than two padding characters, /// or if the input is incomplete (i.e. not a multiple of 4) and isFinalBlock is true. /// - public static OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span bytes, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) + public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Span bytes, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) { - // PERF: use uint to avoid the sign-extensions - uint sourceIndex = 0; - uint destIndex = 0; - if (utf8.IsEmpty) - goto DoneExit; - - ref byte srcBytes = ref MemoryMarshal.GetReference(utf8); - ref byte destBytes = ref MemoryMarshal.GetReference(bytes); - - int srcLength = utf8.Length & ~0x3; // only decode input up to the closest multiple of 4. - int destLength = bytes.Length; - int maxSrcLength = srcLength; - int decodedLength = GetMaxDecodedFromUtf8Length(srcLength); - - // max. 
2 padding chars - if (destLength < decodedLength - 2) { - // For overflow see comment below - maxSrcLength = destLength / 3 * 4; + bytesConsumed = 0; + bytesWritten = 0; + return OperationStatus.Done; } - if (Avx2.IsSupported && maxSrcLength >= 45) + fixed (byte* srcBytes = utf8) + fixed (byte* destBytes = bytes) + fixed (sbyte* decodingMap = s_decodingMap) { - Avx2Decode(ref srcBytes, ref destBytes, maxSrcLength, destLength, ref sourceIndex, ref destIndex); + int srcLength = utf8.Length & ~0x3; // only decode input up to the closest multiple of 4. + int destLength = bytes.Length; + int maxSrcLength = srcLength; + int decodedLength = GetMaxDecodedFromUtf8Length(srcLength); - if (sourceIndex == srcLength) - goto DoneExit; - } - else if (Ssse3.IsSupported && maxSrcLength >= 24) - { - Ssse3Decode(ref srcBytes, ref destBytes, maxSrcLength, destLength, ref sourceIndex, ref destIndex); + // max. 2 padding chars + if (destLength < decodedLength - 2) + { + // For overflow see comment below + maxSrcLength = destLength / 3 * 4; + } - if (sourceIndex == srcLength) - goto DoneExit; - } + byte* src = srcBytes; + byte* dest = destBytes; + byte* srcEnd = srcBytes + (nuint)srcLength; + byte* srcMax = srcBytes + (nuint)maxSrcLength; - // Last bytes could have padding characters, so process them separately and treat them as valid only if isFinalBlock is true - // if isFinalBlock is false, padding characters are considered invalid - int skipLastChunk = isFinalBlock ? 
4 : 0; + if (maxSrcLength >= 24) + { + byte* end = srcMax - 45; + if (Avx2.IsSupported && (end >= src)) + { + Avx2Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); + + if (src == srcEnd) + goto DoneExit; + } + + end = srcMax - 24; + if (Ssse3.IsSupported && (end >= src)) + { + Ssse3Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); + + if (src == srcEnd) + goto DoneExit; + } + } - if (destLength >= decodedLength) - { - maxSrcLength = srcLength - skipLastChunk; - } - else - { - // This should never overflow since destLength here is less than int.MaxValue / 4 * 3 (i.e. 1610612733) - // Therefore, (destLength / 3) * 4 will always be less than 2147483641 - maxSrcLength = (destLength / 3) * 4; - } + // Last bytes could have padding characters, so process them separately and treat them as valid only if isFinalBlock is true + // if isFinalBlock is false, padding characters are considered invalid + bool isDestinationTooSmall; - ref sbyte decodingMap = ref s_decodingMap[0]; + if (destLength >= decodedLength) + { + isDestinationTooSmall = false; + maxSrcLength = isFinalBlock ? srcLength - 4 : srcLength; + } + else + { + // This should never overflow since destLength here is less than int.MaxValue / 4 * 3 (i.e. 
1610612733) + // Therefore, (destLength / 3) * 4 will always be less than 2147483641 + maxSrcLength = (destLength / 3) * 4; + isDestinationTooSmall = true; + } - // In order to elide the movsxd in the loop - if (sourceIndex < maxSrcLength) - { - do + srcMax = srcBytes + (nuint)maxSrcLength; + while (src < srcMax) { - int result = Decode(ref Unsafe.Add(ref srcBytes, (IntPtr)sourceIndex), ref decodingMap); + int result = Decode(src, decodingMap); if (result < 0) goto InvalidDataExit; - WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), result); - destIndex += 3; - sourceIndex += 4; + WriteThreeLowOrderBytes(dest, result); + src += 4; + dest += 3; } - while (sourceIndex < (uint)maxSrcLength); - } - if (maxSrcLength != srcLength - skipLastChunk) - goto DestinationTooSmallExit; + if (isDestinationTooSmall) + goto DestinationTooSmallExit; - // If input is less than 4 bytes, srcLength == sourceIndex == 0 - // If input is not a multiple of 4, sourceIndex == srcLength != 0 - if (sourceIndex == srcLength) - { - if (isFinalBlock) - goto InvalidDataExit; - goto NeedMoreDataExit; - } + // If input is less than 4 bytes, srcLength == sourceIndex == 0 + // If input is not a multiple of 4, sourceIndex == srcLength != 0 + if (src == srcEnd) + { + if (isFinalBlock) + goto InvalidDataExit; + goto NeedMoreDataExit; + } - // if isFinalBlock is false, we will never reach this point + // if isFinalBlock is false, we will never reach this point - // Handle last four bytes. There are 0, 1, 2 padding chars. - uint t0, t1, t2, t3; - t0 = Unsafe.Add(ref srcBytes, (IntPtr)(uint)(srcLength - 4)); - t1 = Unsafe.Add(ref srcBytes, (IntPtr)(uint)(srcLength - 3)); - t2 = Unsafe.Add(ref srcBytes, (IntPtr)(uint)(srcLength - 2)); - t3 = Unsafe.Add(ref srcBytes, (IntPtr)(uint)(srcLength - 1)); + // Handle last four bytes. There are 0, 1, 2 padding chars. 
+ uint t0 = srcEnd[-4]; + uint t1 = srcEnd[-3]; + uint t2 = srcEnd[-2]; + uint t3 = srcEnd[-1]; - int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); - int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); + int i0 = decodingMap[t0]; + int i1 = decodingMap[t1]; - i0 <<= 18; - i1 <<= 12; + i0 <<= 18; + i1 <<= 12; - i0 |= i1; + i0 |= i1; - if (t3 != EncodingPad) - { - int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); - int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3); + byte* destMax = destBytes + (nuint)destLength; - i2 <<= 6; + if (t3 != EncodingPad) + { + int i2 = decodingMap[t2]; + int i3 = decodingMap[t3]; - i0 |= i3; - i0 |= i2; + i2 <<= 6; - if (i0 < 0) - goto InvalidDataExit; - if (destIndex > destLength - 3) - goto DestinationTooSmallExit; + i0 |= i3; + i0 |= i2; - WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), i0); - destIndex += 3; - } - else if (t2 != EncodingPad) - { - int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); + if (i0 < 0) + goto InvalidDataExit; + if (dest + 3 > destMax) + goto DestinationTooSmallExit; - i2 <<= 6; + WriteThreeLowOrderBytes(dest, i0); + dest += 3; + } + else if (t2 != EncodingPad) + { + int i2 = decodingMap[t2]; - i0 |= i2; + i2 <<= 6; - if (i0 < 0) - goto InvalidDataExit; - if (destIndex > destLength - 2) - goto DestinationTooSmallExit; + i0 |= i2; - Unsafe.Add(ref destBytes, (IntPtr)destIndex) = (byte)(i0 >> 16); - Unsafe.Add(ref destBytes, (IntPtr)(destIndex + 1)) = (byte)(i0 >> 8); - destIndex += 2; - } - else - { - if (i0 < 0) - goto InvalidDataExit; - if (destIndex > destLength - 1) - goto DestinationTooSmallExit; + if (i0 < 0) + goto InvalidDataExit; + if (dest + 2 > destMax) + goto DestinationTooSmallExit; - Unsafe.Add(ref destBytes, (IntPtr)destIndex) = (byte)(i0 >> 16); - destIndex += 1; - } + dest[0] = (byte)(i0 >> 16); + dest[1] = (byte)(i0 >> 8); + dest += 2; + } + else + { + if (i0 < 0) + goto InvalidDataExit; + if (dest + 1 > destMax) + goto DestinationTooSmallExit; - sourceIndex += 4; + 
dest[0] = (byte)(i0 >> 16); + dest += 1; + } - if (srcLength != utf8.Length) - goto InvalidDataExit; + src += 4; - DoneExit: - bytesConsumed = (int)sourceIndex; - bytesWritten = (int)destIndex; - return OperationStatus.Done; + if (srcLength != utf8.Length) + goto InvalidDataExit; + + DoneExit: + bytesConsumed = (int)(src - srcBytes); + bytesWritten = (int)(dest - destBytes); + return OperationStatus.Done; - DestinationTooSmallExit: - if (srcLength != utf8.Length && isFinalBlock) - goto InvalidDataExit; // if input is not a multiple of 4, and there is no more data, return invalid data instead + DestinationTooSmallExit: + if (srcLength != utf8.Length && isFinalBlock) + goto InvalidDataExit; // if input is not a multiple of 4, and there is no more data, return invalid data instead - bytesConsumed = (int)sourceIndex; - bytesWritten = (int)destIndex; - return OperationStatus.DestinationTooSmall; + bytesConsumed = (int)(src - srcBytes); + bytesWritten = (int)(dest - destBytes); + return OperationStatus.DestinationTooSmall; - NeedMoreDataExit: - bytesConsumed = (int)sourceIndex; - bytesWritten = (int)destIndex; - return OperationStatus.NeedMoreData; + NeedMoreDataExit: + bytesConsumed = (int)(src - srcBytes); + bytesWritten = (int)(dest - destBytes); + return OperationStatus.NeedMoreData; - InvalidDataExit: - bytesConsumed = (int)sourceIndex; - bytesWritten = (int)destIndex; - return OperationStatus.InvalidData; + InvalidDataExit: + bytesConsumed = (int)(src - srcBytes); + bytesWritten = (int)(dest - destBytes); + return OperationStatus.InvalidData; + } } /// @@ -233,109 +247,103 @@ public static int GetMaxDecodedFromUtf8Length(int length) /// The number of bytes written into the buffer. 
/// It returns the OperationStatus enum values: /// - Done - on successful processing of the entire input span - /// - InvalidData - if the input contains bytes outside of the expected base 64 range, or if it contains invalid/more than two padding characters, + /// - InvalidData - if the input contains bytes outside of the expected base 64 range, or if it contains invalid/more than two padding characters, /// or if the input is incomplete (i.e. not a multiple of 4). /// It does not return DestinationTooSmall since that is not possible for base 64 decoding. - /// It does not return NeedMoreData since this method tramples the data in the buffer and + /// It does not return NeedMoreData since this method tramples the data in the buffer and /// hence can only be called once with all the data in the buffer. /// - public static OperationStatus DecodeFromUtf8InPlace(Span buffer, out int bytesWritten) + public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, out int bytesWritten) { - int bufferLength = buffer.Length; - uint sourceIndex = 0; - uint destIndex = 0; + fixed (byte* bufferBytes = buffer) + fixed (sbyte* decodingMap = s_decodingMap) + { + int bufferLength = buffer.Length; + uint sourceIndex = 0; + uint destIndex = 0; - // only decode input if it is a multiple of 4 - if (bufferLength != ((bufferLength >> 2) * 4)) - goto InvalidExit; - if (bufferLength == 0) - goto DoneExit; + // only decode input if it is a multiple of 4 + if (bufferLength != ((bufferLength >> 2) * 4)) + goto InvalidExit; + if (bufferLength == 0) + goto DoneExit; - ref byte bufferBytes = ref MemoryMarshal.GetReference(buffer); + while (sourceIndex < bufferLength - 4) + { + int result = Decode(bufferBytes + sourceIndex, decodingMap); + if (result < 0) + goto InvalidExit; + WriteThreeLowOrderBytes(bufferBytes + destIndex, result); + destIndex += 3; + sourceIndex += 4; + } - ref sbyte decodingMap = ref s_decodingMap[0]; + uint t0 = bufferBytes[bufferLength - 4]; + uint t1 = 
bufferBytes[bufferLength - 3]; + uint t2 = bufferBytes[bufferLength - 2]; + uint t3 = bufferBytes[bufferLength - 1]; - while (sourceIndex < bufferLength - 4) - { - int result = Decode(ref Unsafe.Add(ref bufferBytes, (IntPtr)sourceIndex), ref decodingMap); - if (result < 0) - goto InvalidExit; - WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, (IntPtr)destIndex), result); - destIndex += 3; - sourceIndex += 4; - } + int i0 = decodingMap[t0]; + int i1 = decodingMap[t1]; - uint t0, t1, t2, t3; - uint n = (uint)(bufferLength - 4); - t0 = Unsafe.Add(ref bufferBytes, (IntPtr)n); - t1 = Unsafe.Add(ref bufferBytes, (IntPtr)(n + 1)); - t2 = Unsafe.Add(ref bufferBytes, (IntPtr)(n + 2)); - t3 = Unsafe.Add(ref bufferBytes, (IntPtr)(n + 3)); + i0 <<= 18; + i1 <<= 12; - int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); - int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); + i0 |= i1; - i0 <<= 18; - i1 <<= 12; + if (t3 != EncodingPad) + { + int i2 = decodingMap[t2]; + int i3 = decodingMap[t3]; - i0 |= i1; + i2 <<= 6; - if (t3 != EncodingPad) - { - int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); - int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3); + i0 |= i3; + i0 |= i2; - i2 <<= 6; + if (i0 < 0) + goto InvalidExit; - i0 |= i3; - i0 |= i2; + WriteThreeLowOrderBytes(bufferBytes + destIndex, i0); + destIndex += 3; + } + else if (t2 != EncodingPad) + { + int i2 = decodingMap[t2]; - if (i0 < 0) - goto InvalidExit; + i2 <<= 6; - WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, (IntPtr)destIndex), i0); - destIndex += 3; - } - else if (t2 != EncodingPad) - { - int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); + i0 |= i2; - i2 <<= 6; + if (i0 < 0) + goto InvalidExit; - i0 |= i2; + bufferBytes[destIndex] = (byte)(i0 >> 16); + bufferBytes[destIndex + 1] = (byte)(i0 >> 8); + destIndex += 2; + } + else + { + if (i0 < 0) + goto InvalidExit; - if (i0 < 0) - goto InvalidExit; + bufferBytes[destIndex] = (byte)(i0 >> 16); + destIndex += 1; + } - Unsafe.Add(ref bufferBytes, 
(IntPtr)destIndex) = (byte)(i0 >> 16); - Unsafe.Add(ref bufferBytes, (IntPtr)(destIndex + 1)) = (byte)(i0 >> 8); - destIndex += 2; - } - else - { - if (i0 < 0) - goto InvalidExit; + DoneExit: + bytesWritten = (int)destIndex; + return OperationStatus.Done; - Unsafe.Add(ref bufferBytes, (IntPtr)destIndex) = (byte)(i0 >> 16); - destIndex += 1; + InvalidExit: + bytesWritten = (int)destIndex; + return OperationStatus.InvalidData; } - - DoneExit: - bytesWritten = (int)destIndex; - return OperationStatus.Done; - - InvalidExit: - bytesWritten = (int)destIndex; - return OperationStatus.InvalidData; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Avx2Decode(ref byte src, ref byte destBytes, int sourceLength, int destLength, ref uint sourceIndex, ref uint destIndex) + private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { - ref byte srcStart = ref src; - ref byte destStart = ref destBytes; - ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 45 + 1)); - // The JIT won't hoist these "constants", so help it Vector256 lutHi = s_avxDecodeLutHi; Vector256 lutLo = s_avxDecodeLutLo; @@ -346,11 +354,14 @@ private static void Avx2Decode(ref byte src, ref byte destBytes, int sourceLengt Vector256 shuffleVec = s_avxDecodeShuffleVec; Vector256 permuteVec = s_avxDecodePermuteVec; + byte* src = srcBytes; + byte* dest = destBytes; + //while (remaining >= 45) do { - AssertRead>(ref src, ref srcStart, sourceLength); - Vector256 str = Unsafe.As>(ref src); + AssertRead>(src, srcStart, sourceLength); + Vector256 str = Avx.LoadVector256(src).AsSByte(); Vector256 hiNibbles = Avx2.And(Avx2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F); Vector256 loNibbles = Avx2.And(str, mask2F); @@ -371,31 +382,21 @@ private static void Avx2Decode(ref byte src, ref byte destBytes, int sourceLengt output = Avx2.Shuffle(output.AsSByte(), 
shuffleVec).AsInt32(); str = Avx2.PermuteVar8x32(output, permuteVec).AsSByte(); - AssertWrite>(ref destBytes, ref destStart, destLength); - // As has better CQ than WriteUnaligned - // https://github.com/dotnet/coreclr/issues/21132 - Unsafe.As>(ref destBytes) = str; + AssertWrite>(dest, destStart, destLength); + Avx.Store(dest, str.AsByte()); - src = ref Unsafe.Add(ref src, 32); - destBytes = ref Unsafe.Add(ref destBytes, 24); + src += 32; + dest += 24; } - while (Unsafe.IsAddressLessThan(ref src, ref simdSrcEnd)); - - // Cast to ulong to avoid the overflow-check. Codegen for x86 is still good. - sourceIndex = (uint)(ulong)Unsafe.ByteOffset(ref srcStart, ref src); - destIndex = (uint)(ulong)Unsafe.ByteOffset(ref destStart, ref destBytes); + while (src <= srcEnd); - src = ref srcStart; - destBytes = ref destStart; + srcBytes = src; + destBytes = dest; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Ssse3Decode(ref byte src, ref byte destBytes, int sourceLength, int destLength, ref uint sourceIndex, ref uint destIndex) + private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { - ref byte srcStart = ref src; - ref byte destStart = ref destBytes; - ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 24 + 1)); - // The JIT won't hoist these "constants", so help it Vector128 lutHi = s_sseDecodeLutHi; Vector128 lutLo = s_sseDecodeLutLo; @@ -405,11 +406,14 @@ private static void Ssse3Decode(ref byte src, ref byte destBytes, int sourceLeng Vector128 shuffleConstant1 = Vector128.Create(0x00011000).AsInt16(); Vector128 shuffleVec = s_sseDecodeShuffleVec; + byte* src = srcBytes; + byte* dest = destBytes; + //while (remaining >= 24) do { - AssertRead>(ref src, ref srcStart, sourceLength); - Vector128 str = Unsafe.As>(ref src); + AssertRead>(src, srcStart, sourceLength); + Vector128 str = Sse2.LoadVector128(src).AsSByte(); 
Vector128 hiNibbles = Sse2.And(Sse2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F); Vector128 loNibbles = Sse2.And(str, mask2F); @@ -428,38 +432,30 @@ private static void Ssse3Decode(ref byte src, ref byte destBytes, int sourceLeng Vector128 output = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, shuffleConstant1); str = Ssse3.Shuffle(output.AsSByte(), shuffleVec); - AssertWrite>(ref destBytes, ref destStart, destLength); - // As has better CQ than WriteUnaligned - // https://github.com/dotnet/coreclr/issues/21132 - Unsafe.As>(ref destBytes) = str; + AssertWrite>(dest, destStart, destLength); + Sse2.Store(dest, str.AsByte()); - src = ref Unsafe.Add(ref src, 16); - destBytes = ref Unsafe.Add(ref destBytes, 12); + src += 16; + dest += 12; } - while (Unsafe.IsAddressLessThan(ref src, ref simdSrcEnd)); - - // Cast to ulong to avoid the overflow-check. Codegen for x86 is still good. - sourceIndex = (uint)(ulong)Unsafe.ByteOffset(ref srcStart, ref src); - destIndex = (uint)(ulong)Unsafe.ByteOffset(ref destStart, ref destBytes); + while (src <= srcEnd); - src = ref srcStart; - destBytes = ref destStart; + srcBytes = src; + destBytes = dest; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int Decode(ref byte encodedBytes, ref sbyte decodingMap) + private static unsafe int Decode(byte* encodedBytes, sbyte* decodingMap) { - uint t0, t1, t2, t3; - - t0 = encodedBytes; - t1 = Unsafe.Add(ref encodedBytes, 1); - t2 = Unsafe.Add(ref encodedBytes, 2); - t3 = Unsafe.Add(ref encodedBytes, 3); + nuint t0 = encodedBytes[0]; + nuint t1 = encodedBytes[1]; + nuint t2 = encodedBytes[2]; + nuint t3 = encodedBytes[3]; - int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); - int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); - int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); - int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3); + int i0 = decodingMap[t0]; + int i1 = decodingMap[t1]; + int i2 = decodingMap[t2]; + int i3 = decodingMap[t3]; i0 <<= 18; i1 <<= 12; @@ -473,11 
+469,11 @@ private static int Decode(ref byte encodedBytes, ref sbyte decodingMap) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void WriteThreeLowOrderBytes(ref byte destination, int value) + private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) { - destination = (byte)(value >> 16); - Unsafe.Add(ref destination, 1) = (byte)(value >> 8); - Unsafe.Add(ref destination, 2) = (byte)value; + destination[0] = (byte)(value >> 16); + destination[1] = (byte)(value >> 8); + destination[2] = (byte)(value); } // Pre-computing this table using a custom string(s_characters) and GenerateDecodingMapAndVerify (found in tests) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 681880ec6ec7..35518c231ede 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -3,11 +3,16 @@ // See the LICENSE file in the project root for more information. using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; +#if BIT64 +using nuint = System.UInt64; +#else +using nuint = System.UInt32; +#endif + namespace System.Buffers.Text { // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2 @@ -25,7 +30,7 @@ public static partial class Base64 /// The output span which contains the result of the operation, i.e. the UTF-8 encoded text in base 64. /// The number of input bytes consumed during the operation. This can be used to slice the input for subsequent calls, if necessary. /// The number of bytes written into the output span. This can be used to slice the output for subsequent calls, if necessary. - /// True (default) when the input span contains the entire data to encode. 
+ /// True (default) when the input span contains the entire data to encode. /// Set to false only if it is known that the input span contains partial data with more data to follow. /// It returns the OperationStatus enum values: /// - Done - on successful processing of the entire input span @@ -33,104 +38,105 @@ public static partial class Base64 /// - NeedMoreData - only if isFinalBlock is false, otherwise the output is padded if the input is not a multiple of 3 /// It does not return InvalidData since that is not possible for base 64 encoding. /// - public static OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span utf8, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) + public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span utf8, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) { - // PERF: use uint to avoid the sign-extensions - uint sourceIndex = 0; - uint destIndex = 0; - if (bytes.IsEmpty) - goto DoneExit; - - ref byte srcBytes = ref MemoryMarshal.GetReference(bytes); - ref byte destBytes = ref MemoryMarshal.GetReference(utf8); - - int srcLength = bytes.Length; - int destLength = utf8.Length; - int maxSrcLength = srcLength; - - if (srcLength <= MaximumEncodeLength && destLength >= GetMaxEncodedToUtf8Length(srcLength)) - { - maxSrcLength = srcLength; - } - else { - maxSrcLength = (destLength >> 2) * 3; + bytesConsumed = 0; + bytesWritten = 0; + return OperationStatus.Done; } - if (srcLength < 16) - goto Scalar; - - if (Avx2.IsSupported && maxSrcLength >= 32) + fixed (byte* srcBytes = bytes) + fixed (byte* destBytes = utf8) + fixed (byte* encodingMap = s_encodingMap) { - Avx2Encode(ref srcBytes, ref destBytes, maxSrcLength, destLength, ref sourceIndex, ref destIndex); - - if (sourceIndex == srcLength) - goto DoneExit; - } + int srcLength = bytes.Length; + int destLength = utf8.Length; + int maxSrcLength; - if (Ssse3.IsSupported && (maxSrcLength >= (int)sourceIndex + 16)) - { - Ssse3Encode(ref 
srcBytes, ref destBytes, maxSrcLength, destLength, ref sourceIndex, ref destIndex); + if (srcLength <= MaximumEncodeLength && destLength >= GetMaxEncodedToUtf8Length(srcLength)) + { + maxSrcLength = srcLength; + } + else + { + maxSrcLength = (destLength >> 2) * 3; + } - if (sourceIndex == srcLength) - goto DoneExit; - } + byte* src = srcBytes; + byte* dest = destBytes; + byte* srcEnd = srcBytes + (nuint)srcLength; + byte* srcMax = srcBytes + (nuint)maxSrcLength; - Scalar: - maxSrcLength -= 2; - uint result = 0; + if (maxSrcLength >= 16) + { + byte* end = srcMax - 32; + if (Avx2.IsSupported && (end >= src)) + { + Avx2Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); + + if (src == srcEnd) + goto DoneExit; + } + + end = srcMax - 16; + if (Ssse3.IsSupported && (end >= src)) + { + Ssse3Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); + + if (src == srcEnd) + goto DoneExit; + } + } - ref byte encodingMap = ref s_encodingMap[0]; + uint result = 0; - // In order to elide the movsxd in the loop - if (sourceIndex < maxSrcLength) - { - do + srcMax -= 2; + while (src < srcMax) { - result = Encode(ref Unsafe.Add(ref srcBytes, (IntPtr)sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), result); - destIndex += 4; - sourceIndex += 3; + result = Encode(src, encodingMap); + Unsafe.WriteUnaligned(dest, result); + src += 3; + dest += 4; } - while (sourceIndex < (uint)maxSrcLength); - } - if (maxSrcLength != srcLength - 2) - goto DestinationTooSmallExit; + if (srcMax + 2 != srcEnd) + goto DestinationTooSmallExit; - if (!isFinalBlock) - goto NeedMoreDataExit; + if (!isFinalBlock) + goto NeedMoreData; - if (sourceIndex == srcLength - 1) - { - result = EncodeAndPadTwo(ref Unsafe.Add(ref srcBytes, (IntPtr)sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), result); - destIndex += 4; - sourceIndex += 1; - } - else if 
(sourceIndex == srcLength - 2) - { - result = EncodeAndPadOne(ref Unsafe.Add(ref srcBytes, (IntPtr)sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, (IntPtr)destIndex), result); - destIndex += 4; - sourceIndex += 2; - } + if (src + 1 == srcEnd) + { + result = EncodeAndPadTwo(src, encodingMap); + Unsafe.WriteUnaligned(dest, result); + src += 1; + dest += 4; + } + else if (src + 2 == srcEnd) + { + result = EncodeAndPadOne(src, encodingMap); + Unsafe.WriteUnaligned(dest, result); + src += 2; + dest += 4; + } - DoneExit: - bytesConsumed = (int)sourceIndex; - bytesWritten = (int)destIndex; - return OperationStatus.Done; + DoneExit: + bytesConsumed = (int)(src - srcBytes); + bytesWritten = (int)(dest - destBytes); + return OperationStatus.Done; - NeedMoreDataExit: - bytesConsumed = (int)sourceIndex; - bytesWritten = (int)destIndex; - return OperationStatus.NeedMoreData; + DestinationTooSmallExit: + bytesConsumed = (int)(src - srcBytes); + bytesWritten = (int)(dest - destBytes); + return OperationStatus.DestinationTooSmall; - DestinationTooSmallExit: - bytesConsumed = (int)sourceIndex; - bytesWritten = (int)destIndex; - return OperationStatus.DestinationTooSmall; + NeedMoreData: + bytesConsumed = (int)(src - srcBytes); + bytesWritten = (int)(dest - destBytes); + return OperationStatus.NeedMoreData; + } } /// @@ -149,12 +155,12 @@ public static int GetMaxEncodedToUtf8Length(int length) } /// - /// Encode the span of binary data (in-place) into UTF-8 encoded text represented as base 64. + /// Encode the span of binary data (in-place) into UTF-8 encoded text represented as base 64. /// The encoded text output is larger than the binary data contained in the input (the operation inflates the data). /// - /// The input span which contains binary data that needs to be encoded. + /// The input span which contains binary data that needs to be encoded. /// It needs to be large enough to fit the result of the operation. 
- /// The amount of binary data contained within the buffer that needs to be encoded + /// The amount of binary data contained within the buffer that needs to be encoded /// (and needs to be smaller than the buffer length). /// The number of bytes written into the buffer. /// It returns the OperationStatus enum values: @@ -163,62 +169,59 @@ public static int GetMaxEncodedToUtf8Length(int length) /// It does not return NeedMoreData since this method tramples the data in the buffer and hence can only be called once with all the data in the buffer. /// It does not return InvalidData since that is not possible for base 64 encoding. /// - public static OperationStatus EncodeToUtf8InPlace(Span buffer, int dataLength, out int bytesWritten) + public static unsafe OperationStatus EncodeToUtf8InPlace(Span buffer, int dataLength, out int bytesWritten) { - int encodedLength = GetMaxEncodedToUtf8Length(dataLength); - if (buffer.Length < encodedLength) - goto FalseExit; - - int leftover = dataLength - (dataLength / 3) * 3; // how many bytes after packs of 3 + fixed (byte* bufferBytes = buffer) + fixed (byte* encodingMap = s_encodingMap) + { + int encodedLength = GetMaxEncodedToUtf8Length(dataLength); + if (buffer.Length < encodedLength) + goto FalseExit; - // PERF: use uint to avoid the sign-extensions - uint destinationIndex = (uint)(encodedLength - 4); - uint sourceIndex = (uint)(dataLength - leftover); - uint result = 0; + int leftover = dataLength - (dataLength / 3) * 3; // how many bytes after packs of 3 - ref byte encodingMap = ref s_encodingMap[0]; - ref byte bufferBytes = ref MemoryMarshal.GetReference(buffer); + // PERF: use nuint to avoid the sign-extensions + nuint destinationIndex = (nuint)(encodedLength - 4); + nuint sourceIndex = (nuint)(dataLength - leftover); + uint result = 0; - // encode last pack to avoid conditional in the main loop - if (leftover != 0) - { - if (leftover == 1) + // encode last pack to avoid conditional in the main loop + if (leftover != 0) { 
- result = EncodeAndPadTwo(ref Unsafe.Add(ref bufferBytes, (IntPtr)sourceIndex), ref encodingMap); + if (leftover == 1) + { + result = EncodeAndPadTwo(bufferBytes + sourceIndex, encodingMap); + } + else + { + result = EncodeAndPadOne(bufferBytes + sourceIndex, encodingMap); + } + + Unsafe.WriteUnaligned(bufferBytes + destinationIndex, result); + destinationIndex -= 4; } - else + + sourceIndex -= 3; + while ((int)sourceIndex >= 0) { - result = EncodeAndPadOne(ref Unsafe.Add(ref bufferBytes, (IntPtr)sourceIndex), ref encodingMap); + result = Encode(bufferBytes + sourceIndex, encodingMap); + Unsafe.WriteUnaligned(bufferBytes + destinationIndex, result); + destinationIndex -= 4; + sourceIndex -= 3; } - Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, (IntPtr)destinationIndex), result); - destinationIndex -= 4; - } + bytesWritten = encodedLength; + return OperationStatus.Done; - sourceIndex -= 3; - while ((int)sourceIndex >= 0) - { - result = Encode(ref Unsafe.Add(ref bufferBytes, (IntPtr)sourceIndex), ref encodingMap); - Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, (IntPtr)destinationIndex), result); - destinationIndex -= 4; - sourceIndex -= 3; + FalseExit: + bytesWritten = 0; + return OperationStatus.DestinationTooSmall; } - - bytesWritten = encodedLength; - return OperationStatus.Done; - - FalseExit: - bytesWritten = 0; - return OperationStatus.DestinationTooSmall; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Avx2Encode(ref byte src, ref byte dest, int sourceLength, int destLength, ref uint sourceIndex, ref uint destIndex) + private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { - ref byte srcStart = ref src; - ref byte destStart = ref dest; - ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 28)); // 28 = 32 - 4 - // The JIT won't hoist these "constants", so help it Vector256 shuffleVec = 
s_avxEncodeShuffleVec; Vector256 shuffleConstant0 = Vector256.Create(0x0fc0fc00).AsSByte(); @@ -229,13 +232,19 @@ private static void Avx2Encode(ref byte src, ref byte dest, int sourceLength, in Vector256 translationContant1 = Vector256.Create((sbyte)25); Vector256 lut = s_avxEncodeLut; + byte* src = srcBytes; + byte* dest = destBytes; + // first load is done at c-0 not to get a segfault - AssertRead>(ref src, ref srcStart, sourceLength); - Vector256 str = Unsafe.As>(ref src); + AssertRead>(src, srcStart, sourceLength); + Vector256 str = Avx.LoadVector256(src).AsSByte(); - // shift by 4 bytes, as required by enc_reshuffle + // shift by 4 bytes, as required by Reshuffle str = Avx2.PermuteVar8x32(str.AsInt32(), s_avxEncodePermuteVec).AsSByte(); + // Next loads are done at src-4, as required by Reshuffle, so shift it once + src -= 4; + while (true) { // Reshuffle @@ -252,41 +261,27 @@ private static void Avx2Encode(ref byte src, ref byte dest, int sourceLength, in Vector256 tmp = Avx2.Subtract(indices.AsSByte(), mask); str = Avx2.Add(str, Avx2.Shuffle(lut, tmp)); - AssertWrite>(ref dest, ref destStart, destLength); - // As has better CQ than WriteUnaligned - // https://github.com/dotnet/coreclr/issues/21132 - Unsafe.As>(ref dest) = str; + AssertWrite>(dest, destStart, destLength); + Avx.Store(dest, str.AsByte()); - src = ref Unsafe.Add(ref src, 24); - dest = ref Unsafe.Add(ref dest, 32); + src += 24; + dest += 32; - if (Unsafe.IsAddressGreaterThan(ref src, ref simdSrcEnd)) + if (src > srcEnd) break; - // Load at c-4, as required by enc_reshuffle - AssertRead>(ref Unsafe.Add(ref src, -4), ref srcStart, sourceLength); - str = Unsafe.As>(ref Unsafe.Add(ref src, -4)); + // Load at src-4, as required by Reshuffle (already shifted by -4) + AssertRead>(src, srcStart, sourceLength); + str = Avx.LoadVector256(src).AsSByte(); } - // Cast to ulong to avoid the overflow-check. Codegen for x86 is still good. 
- sourceIndex = (uint)(ulong)Unsafe.ByteOffset(ref srcStart, ref src); - destIndex = (uint)(ulong)Unsafe.ByteOffset(ref destStart, ref dest); - - src = ref srcStart; - dest = ref destStart; + srcBytes = src + 4; + destBytes = dest; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Ssse3Encode(ref byte src, ref byte dest, int sourceLength, int destLength, ref uint sourceIndex, ref uint destIndex) + private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { - ref byte srcStart = ref src; - ref byte destStart = ref dest; - ref byte simdSrcEnd = ref Unsafe.Add(ref src, (IntPtr)((uint)sourceLength - 16 + 1)); - - // Shift to workspace - src = ref Unsafe.Add(ref src, (IntPtr)sourceIndex); - dest = ref Unsafe.Add(ref dest, (IntPtr)destIndex); - // The JIT won't hoist these "constants", so help it Vector128 shuffleVec = s_sseEncodeShuffleVec; Vector128 shuffleConstant0 = Vector128.Create(0x0fc0fc00).AsSByte(); @@ -297,11 +292,14 @@ private static void Ssse3Encode(ref byte src, ref byte dest, int sourceLength, i Vector128 translationContant1 = Vector128.Create((sbyte)25); Vector128 lut = s_sseEncodeLut; + byte* src = srcBytes; + byte* dest = destBytes; + //while (remaining >= 16) - while (Unsafe.IsAddressLessThan(ref src, ref simdSrcEnd)) + do { - AssertRead>(ref src, ref srcStart, sourceLength); - Vector128 str = Unsafe.As>(ref src); + AssertRead>(src, srcStart, sourceLength); + Vector128 str = Sse2.LoadVector128(src).AsSByte(); // Reshuffle str = Ssse3.Shuffle(str, shuffleVec); @@ -317,57 +315,61 @@ private static void Ssse3Encode(ref byte src, ref byte dest, int sourceLength, i Vector128 tmp = Sse2.Subtract(indices.AsSByte(), mask); str = Sse2.Add(str, Ssse3.Shuffle(lut, tmp)); - AssertWrite>(ref dest, ref destStart, destLength); - // As has better CQ than WriteUnaligned - // https://github.com/dotnet/coreclr/issues/21132 - Unsafe.As>(ref dest) 
= str; + AssertWrite>(dest, destStart, destLength); + Sse2.Store(dest, str.AsByte()); - src = ref Unsafe.Add(ref src, 12); - dest = ref Unsafe.Add(ref dest, 16); + src += 12; + dest += 16; } + while (src <= srcEnd); - // Cast to ulong to avoid the overflow-check. Codegen for x86 is still good. - sourceIndex = (uint)(ulong)Unsafe.ByteOffset(ref srcStart, ref src); - destIndex = (uint)(ulong)Unsafe.ByteOffset(ref destStart, ref dest); - - src = ref srcStart; - dest = ref destStart; + srcBytes = src; + destBytes = dest; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static uint Encode(ref byte threeBytes, ref byte encodingMap) + private static unsafe uint Encode(byte* threeBytes, byte* encodingMap) { - uint i = (uint)((threeBytes << 16) | (Unsafe.Add(ref threeBytes, 1) << 8) | Unsafe.Add(ref threeBytes, 2)); + nuint t0 = threeBytes[0]; + nuint t1 = threeBytes[1]; + nuint t2 = threeBytes[2]; - uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); - uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); - uint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); - uint i3 = Unsafe.Add(ref encodingMap, (IntPtr)(i & 0x3F)); + nuint i = (t0 << 16) | (t1 << 8) | t2; - return i0 | (i1 << 8) | (i2 << 16) | (i3 << 24); + nuint i0 = encodingMap[i >> 18]; + nuint i1 = encodingMap[(i >> 12) & 0x3F]; + nuint i2 = encodingMap[(i >> 6) & 0x3F]; + nuint i3 = encodingMap[i & 0x3F]; + + return (uint)(i0 | (i1 << 8) | (i2 << 16) | (i3 << 24)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static uint EncodeAndPadOne(ref byte twoBytes, ref byte encodingMap) + private static unsafe uint EncodeAndPadOne(byte* twoBytes, byte* encodingMap) { - uint i = (uint)((twoBytes << 16) | (Unsafe.Add(ref twoBytes, 1) << 8)); + nuint t0 = twoBytes[0]; + nuint t1 = twoBytes[1]; + + nuint i = (t0 << 16) | (t1 << 8); - uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); - uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); - uint 
i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); + nuint i0 = encodingMap[i >> 18]; + nuint i1 = encodingMap[(i >> 12) & 0x3F]; + nuint i2 = encodingMap[(i >> 6) & 0x3F]; - return i0 | (i1 << 8) | (i2 << 16) | (EncodingPad << 24); + return (uint)(i0 | (i1 << 8) | (i2 << 16) | (EncodingPad << 24)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static uint EncodeAndPadTwo(ref byte oneByte, ref byte encodingMap) + private static unsafe uint EncodeAndPadTwo(byte* oneByte, byte* encodingMap) { - uint i = (uint)(oneByte << 8); + nuint t0 = oneByte[0]; + + nuint i = t0 << 8; - uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 10)); - uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 4) & 0x3F)); + nuint i0 = encodingMap[i >> 10]; + nuint i1 = encodingMap[(i >> 4) & 0x3F]; - return i0 | (i1 << 8) | (EncodingPad << 16) | (EncodingPad << 24); + return (uint)(i0 | (i1 << 8) | (EncodingPad << 16) | (EncodingPad << 24)); } // Pre-computing this table using a custom string(s_characters) and GenerateEncodingMapAndVerify (found in tests) From 294ecaa11ab4df1ab0392423e25af2bd3a7dfa4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Tue, 15 Jan 2019 18:52:10 +0100 Subject: [PATCH 12/23] Initialized the static fields directly (i.e. w/o cctor) Cf. 
https://github.com/dotnet/corefx/pull/34529#discussion_r247193419 --- .../src/System/Buffers/Text/Base64.cs | 130 ------------------ .../src/System/Buffers/Text/Base64Decoder.cs | 88 ++++++++++-- .../src/System/Buffers/Text/Base64Encoder.cs | 51 +++++-- 3 files changed, 117 insertions(+), 152 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64.cs b/src/System.Memory/src/System/Buffers/Text/Base64.cs index 69493f64b065..aa0acca6b876 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64.cs @@ -4,142 +4,12 @@ using System.Diagnostics; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; namespace System.Buffers.Text { - // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2 - // SSSE3 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3 - public static partial class Base64 { - static Base64() - { - if (Ssse3.IsSupported) - { - s_sseEncodeShuffleVec = Vector128.Create( - 1, 0, 2, 1, - 4, 3, 5, 4, - 7, 6, 8, 7, - 10, 9, 11, 10 - ); - - s_sseEncodeLut = Vector128.Create( - 65, 71, -4, -4, - -4, -4, -4, -4, - -4, -4, -4, -4, - -19, -16, 0, 0 - ); - - s_sseDecodeShuffleVec = Vector128.Create( - 2, 1, 0, 6, - 5, 4, 10, 9, - 8, 14, 13, 12, - -1, -1, -1, -1 - ); - - s_sseDecodeLutLo = Vector128.Create( - 0x15, 0x11, 0x11, 0x11, - 0x11, 0x11, 0x11, 0x11, - 0x11, 0x11, 0x13, 0x1A, - 0x1B, 0x1B, 0x1B, 0x1A - ); - - s_sseDecodeLutHi = Vector128.Create( - 0x10, 0x10, 0x01, 0x02, - 0x04, 0x08, 0x04, 0x08, - 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x10, 0x10 - ); - - s_sseDecodeLutShift = Vector128.Create( - 0, 16, 19, 4, - -65, -65, -71, -71, - 0, 0, 0, 0, - 0, 0, 0, 0 - ); - - s_sseDecodeMask2F = Vector128.Create((sbyte)0x2F); // ASCII: / - } - - if (Avx2.IsSupported) - { - 
s_avxEncodePermuteVec = Vector256.Create(0, 0, 1, 2, 3, 4, 5, 6); - - s_avxEncodeShuffleVec = Vector256.Create( - 5, 4, 6, 5, - 8, 7, 9, 8, - 11, 10, 12, 11, - 14, 13, 15, 14, - 1, 0, 2, 1, - 4, 3, 5, 4, - 7, 6, 8, 7, - 10, 9, 11, 10 - ); - - s_avxEncodeLut = Vector256.Create( - 65, 71, -4, -4, - -4, -4, -4, -4, - -4, -4, -4, -4, - -19, -16, 0, 0, - 65, 71, -4, -4, - -4, -4, -4, -4, - -4, -4, -4, -4, - -19, -16, 0, 0 - ); - - s_avxDecodeLutLo = Vector256.Create( - 0x15, 0x11, 0x11, 0x11, - 0x11, 0x11, 0x11, 0x11, - 0x11, 0x11, 0x13, 0x1A, - 0x1B, 0x1B, 0x1B, 0x1A, - 0x15, 0x11, 0x11, 0x11, - 0x11, 0x11, 0x11, 0x11, - 0x11, 0x11, 0x13, 0x1A, - 0x1B, 0x1B, 0x1B, 0x1A - ); - - s_avxDecodeLutHi = Vector256.Create( - 0x10, 0x10, 0x01, 0x02, - 0x04, 0x08, 0x04, 0x08, - 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x01, 0x02, - 0x04, 0x08, 0x04, 0x08, - 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x10, 0x10 - ); - - s_avxDecodeLutShift = Vector256.Create( - 0, 16, 19, 4, - -65, -65, -71, -71, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 16, 19, 4, - -65, -65, -71, -71, - 0, 0, 0, 0, - 0, 0, 0, 0 - ); - - s_avxDecodeShuffleVec = Vector256.Create( - 2, 1, 0, 6, - 5, 4, 10, 9, - 8, 14, 13, 12, - -1, -1, -1, -1, - 2, 1, 0, 6, - 5, 4, 10, 9, - 8, 14, 13, 12, - -1, -1, -1, -1 - ); - - s_avxDecodePermuteVec = Vector256.Create(0, 1, 2, 4, 5, 6, -1, -1); - - s_avxDecodeMask2F = Vector256.Create((sbyte)0x2F); // ASCII: / - } - } - [Conditional("DEBUG")] private static unsafe void AssertRead(byte* src, byte* srcStart, int srcLength) { diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 1b34df5d981e..ea29e6115306 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -496,17 +496,81 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, }; - private static readonly Vector128 s_sseDecodeShuffleVec; - private static readonly Vector128 s_sseDecodeLutLo; - private static readonly Vector128 s_sseDecodeLutHi; - private static readonly Vector128 s_sseDecodeLutShift; - private static readonly Vector128 s_sseDecodeMask2F; - - private static readonly Vector256 s_avxDecodeShuffleVec; - private static readonly Vector256 s_avxDecodePermuteVec; - private static readonly Vector256 s_avxDecodeLutLo; - private static readonly Vector256 s_avxDecodeLutHi; - private static readonly Vector256 s_avxDecodeLutShift; - private static readonly Vector256 s_avxDecodeMask2F; + private static readonly Vector128 s_sseDecodeShuffleVec = Ssse3.IsSupported ? Vector128.Create( + 2, 1, 0, 6, + 5, 4, 10, 9, + 8, 14, 13, 12, + -1, -1, -1, -1 + ) : default; + + private static readonly Vector128 s_sseDecodeLutLo = Sse3.IsSupported ? Vector128.Create( + 0x15, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, + 0x1B, 0x1B, 0x1B, 0x1A + ) : default; + + private static readonly Vector128 s_sseDecodeLutHi = Sse3.IsSupported ? Vector128.Create( + 0x10, 0x10, 0x01, 0x02, + 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10 + ) : default; + + private static readonly Vector128 s_sseDecodeLutShift = Sse3.IsSupported ? Vector128.Create( + 0, 16, 19, 4, + -65, -65, -71, -71, + 0, 0, 0, 0, + 0, 0, 0, 0 + ) : default; + + private static readonly Vector128 s_sseDecodeMask2F = Sse3.IsSupported ? Vector128.Create((sbyte)0x2F) : default; // ASCII: / + + private static readonly Vector256 s_avxDecodeShuffleVec = Avx2.IsSupported ? Vector256.Create( + 2, 1, 0, 6, + 5, 4, 10, 9, + 8, 14, 13, 12, + -1, -1, -1, -1, + 2, 1, 0, 6, + 5, 4, 10, 9, + 8, 14, 13, 12, + -1, -1, -1, -1 + ) : default; + + private static readonly Vector256 s_avxDecodePermuteVec = Avx2.IsSupported ? Vector256.Create(0, 1, 2, 4, 5, 6, -1, -1) : default; + private static readonly Vector256 s_avxDecodeMask2F = Avx2.IsSupported ? 
Vector256.Create((sbyte)0x2F) : default; // ASCII: / + + private static readonly Vector256 s_avxDecodeLutLo = Avx2.IsSupported ? Vector256.Create( + 0x15, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, + 0x1B, 0x1B, 0x1B, 0x1A, + 0x15, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, + 0x1B, 0x1B, 0x1B, 0x1A + ) : default; + + private static readonly Vector256 s_avxDecodeLutHi = Avx2.IsSupported ? Vector256.Create( + 0x10, 0x10, 0x01, 0x02, + 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x01, 0x02, + 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10 + ) : default; + + private static readonly Vector256 s_avxDecodeLutShift = Avx2.IsSupported ? Vector256.Create( + 0, 16, 19, 4, + -65, -65, -71, -71, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 16, 19, 4, + -65, -65, -71, -71, + 0, 0, 0, 0, + 0, 0, 0, 0 + ) : default; } } diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 35518c231ede..8c4fa104ec32 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -372,6 +372,10 @@ private static unsafe uint EncodeAndPadTwo(byte* oneByte, byte* encodingMap) return (uint)(i0 | (i1 << 8) | (EncodingPad << 16) | (EncodingPad << 24)); } + private const uint EncodingPad = '='; // '=', for padding + + private const int MaximumEncodeLength = (int.MaxValue / 4) * 3; // 1610612733 + // Pre-computing this table using a custom string(s_characters) and GenerateEncodingMapAndVerify (found in tests) private static readonly byte[] s_encodingMap = { 65, 66, 67, 68, 69, 70, 71, 72, //A..H @@ -384,15 +388,42 @@ private static unsafe uint EncodeAndPadTwo(byte* oneByte, byte* encodingMap) 52, 53, 54, 55, 56, 57, 43, 47 //4..9, +, / }; - private static readonly Vector128 s_sseEncodeShuffleVec; - private static readonly Vector128 
s_sseEncodeLut; - - private static readonly Vector256 s_avxEncodePermuteVec; - private static readonly Vector256 s_avxEncodeShuffleVec; - private static readonly Vector256 s_avxEncodeLut; - - private const byte EncodingPad = (byte)'='; // '=', for padding - - private const int MaximumEncodeLength = (int.MaxValue / 4) * 3; // 1610612733 + private static readonly Vector128 s_sseEncodeShuffleVec = Ssse3.IsSupported ? Vector128.Create( + 1, 0, 2, 1, + 4, 3, 5, 4, + 7, 6, 8, 7, + 10, 9, 11, 10 + ) : default; + + private static readonly Vector128 s_sseEncodeLut = Ssse3.IsSupported ? Vector128.Create( + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0 + ) : default; + + private static readonly Vector256 s_avxEncodePermuteVec = Avx2.IsSupported ? Vector256.Create(0, 0, 1, 2, 3, 4, 5, 6) : default; + + private static readonly Vector256 s_avxEncodeShuffleVec = Avx2.IsSupported ? Vector256.Create( + 5, 4, 6, 5, + 8, 7, 9, 8, + 11, 10, 12, 11, + 14, 13, 15, 14, + 1, 0, 2, 1, + 4, 3, 5, 4, + 7, 6, 8, 7, + 10, 9, 11, 10 + ) : default; + + private static readonly Vector256 s_avxEncodeLut = Avx2.IsSupported ? Vector256.Create( + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0, + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0 + ) : default; } } From 3016983a8662c989aa2b5766ce2cf6e24b4b88f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Wed, 16 Jan 2019 14:28:39 +0100 Subject: [PATCH 13/23] Added a test for decoding a (encoded) Guid The case with decoding encoded 16 bytes was not covered by tests, so a wrong code got committed before, resulting in DestinationTooSmall instead of the correct Done.
--- .../src/System/Buffers/Text/Base64Decoder.cs | 8 +++----- .../tests/Base64/Base64DecoderUnitTests.cs | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index ea29e6115306..1811701391d5 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -89,19 +89,17 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa // Last bytes could have padding characters, so process them separately and treat them as valid only if isFinalBlock is true // if isFinalBlock is false, padding characters are considered invalid - bool isDestinationTooSmall; + int skipLastChunk = isFinalBlock ? 4 : 0; if (destLength >= decodedLength) { - isDestinationTooSmall = false; - maxSrcLength = isFinalBlock ? srcLength - 4 : srcLength; + maxSrcLength = srcLength - skipLastChunk; } else { // This should never overflow since destLength here is less than int.MaxValue / 4 * 3 (i.e. 
1610612733) // Therefore, (destLength / 3) * 4 will always be less than 2147483641 maxSrcLength = (destLength / 3) * 4; - isDestinationTooSmall = true; } srcMax = srcBytes + (nuint)maxSrcLength; @@ -117,7 +115,7 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa dest += 3; } - if (isDestinationTooSmall) + if (maxSrcLength != srcLength - skipLastChunk) goto DestinationTooSmallExit; // If input is less than 4 bytes, srcLength == sourceIndex == 0 diff --git a/src/System.Memory/tests/Base64/Base64DecoderUnitTests.cs b/src/System.Memory/tests/Base64/Base64DecoderUnitTests.cs index 3c159ddee4cf..8d6de7a7a0a6 100644 --- a/src/System.Memory/tests/Base64/Base64DecoderUnitTests.cs +++ b/src/System.Memory/tests/Base64/Base64DecoderUnitTests.cs @@ -45,6 +45,20 @@ public void DecodeEmptySpan() Assert.True(Base64TestHelper.VerifyDecodingCorrectness(source.Length, decodedBytes.Length, source, decodedBytes)); } + [Fact] + public void DecodeGuid() + { + Span source = new byte[24]; + Span decodedBytes = Guid.NewGuid().ToByteArray(); + Base64.EncodeToUtf8(decodedBytes, source, out int _, out int _); + + Assert.Equal(OperationStatus.Done, + Base64.DecodeFromUtf8(source, decodedBytes, out int consumed, out int decodedByteCount)); + Assert.Equal(24, consumed); + Assert.Equal(16, decodedByteCount); + Assert.True(Base64TestHelper.VerifyDecodingCorrectness(source.Length, decodedBytes.Length, source, decodedBytes)); + } + [Fact] public void BasicDecodingWithFinalBlockFalse() { From 0343eda1a37147cfe82aa615180606d3672744bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Wed, 16 Jan 2019 18:31:44 +0100 Subject: [PATCH 14/23] EncodingMap / DecodingMap as byref instead of pointer So got rid of the `rep stosd` in the prolog. Cf. 
https://github.com/dotnet/corefx/pull/34529#discussion_r248075157 --- .../src/System/Buffers/Text/Base64Decoder.cs | 55 ++++++++++-------- .../src/System/Buffers/Text/Base64Encoder.cs | 56 +++++++++++-------- 2 files changed, 66 insertions(+), 45 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 1811701391d5..11639be3de58 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -3,8 +3,10 @@ // See the LICENSE file in the project root for more information. using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using Internal.Runtime.CompilerServices; #if BIT64 using nuint = System.UInt64; @@ -45,10 +47,12 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa return OperationStatus.Done; } - fixed (byte* srcBytes = utf8) - fixed (byte* destBytes = bytes) - fixed (sbyte* decodingMap = s_decodingMap) + fixed (byte* srcBytes = &MemoryMarshal.GetReference(utf8)) + fixed (byte* destBytes = &MemoryMarshal.GetReference(bytes)) { + // PERF: needs to be initialized here, for good codegen + ref sbyte decodingMap = ref s_decodingMap[0]; + int srcLength = utf8.Length & ~0x3; // only decode input up to the closest multiple of 4. 
int destLength = bytes.Length; int maxSrcLength = srcLength; @@ -105,7 +109,7 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa srcMax = srcBytes + (nuint)maxSrcLength; while (src < srcMax) { - int result = Decode(src, decodingMap); + int result = Decode(src, ref decodingMap); if (result < 0) goto InvalidDataExit; @@ -135,8 +139,8 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa uint t2 = srcEnd[-2]; uint t3 = srcEnd[-1]; - int i0 = decodingMap[t0]; - int i1 = decodingMap[t1]; + int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); + int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); i0 <<= 18; i1 <<= 12; @@ -147,8 +151,8 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa if (t3 != EncodingPad) { - int i2 = decodingMap[t2]; - int i3 = decodingMap[t3]; + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); + int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3); i2 <<= 6; @@ -165,7 +169,7 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa } else if (t2 != EncodingPad) { - int i2 = decodingMap[t2]; + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); i2 <<= 6; @@ -253,9 +257,16 @@ public static int GetMaxDecodedFromUtf8Length(int length) /// public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, out int bytesWritten) { - fixed (byte* bufferBytes = buffer) - fixed (sbyte* decodingMap = s_decodingMap) + if (buffer.IsEmpty) { + bytesWritten = 0; + return OperationStatus.Done; + } + + fixed (byte* bufferBytes = &MemoryMarshal.GetReference(buffer)) + { + ref sbyte decodingMap = ref s_decodingMap[0]; + int bufferLength = buffer.Length; uint sourceIndex = 0; uint destIndex = 0; @@ -268,7 +279,7 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, ou while (sourceIndex < bufferLength - 4) { - int result = Decode(bufferBytes + sourceIndex, decodingMap); + int result = Decode(bufferBytes + sourceIndex, ref decodingMap); if (result < 0) goto 
InvalidExit; WriteThreeLowOrderBytes(bufferBytes + destIndex, result); @@ -281,8 +292,8 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, ou uint t2 = bufferBytes[bufferLength - 2]; uint t3 = bufferBytes[bufferLength - 1]; - int i0 = decodingMap[t0]; - int i1 = decodingMap[t1]; + int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); + int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); i0 <<= 18; i1 <<= 12; @@ -291,8 +302,8 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, ou if (t3 != EncodingPad) { - int i2 = decodingMap[t2]; - int i3 = decodingMap[t3]; + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); + int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3); i2 <<= 6; @@ -307,7 +318,7 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, ou } else if (t2 != EncodingPad) { - int i2 = decodingMap[t2]; + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); i2 <<= 6; @@ -443,17 +454,17 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe int Decode(byte* encodedBytes, sbyte* decodingMap) + private static unsafe int Decode(byte* encodedBytes, ref sbyte decodingMap) { nuint t0 = encodedBytes[0]; nuint t1 = encodedBytes[1]; nuint t2 = encodedBytes[2]; nuint t3 = encodedBytes[3]; - int i0 = decodingMap[t0]; - int i1 = decodingMap[t1]; - int i2 = decodingMap[t2]; - int i3 = decodingMap[t3]; + int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); + int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); + int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2); + int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3); i0 <<= 18; i1 <<= 12; diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 8c4fa104ec32..4a5da6f6cd38 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ 
b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; @@ -47,10 +48,12 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span return OperationStatus.Done; } - fixed (byte* srcBytes = bytes) - fixed (byte* destBytes = utf8) - fixed (byte* encodingMap = s_encodingMap) + fixed (byte* srcBytes = &MemoryMarshal.GetReference(bytes)) + fixed (byte* destBytes = &MemoryMarshal.GetReference(utf8)) { + // PERF: needs to be initialized here, for good codegen + ref byte encodingMap = ref s_encodingMap[0]; + int srcLength = bytes.Length; int destLength = utf8.Length; int maxSrcLength; @@ -95,7 +98,7 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span srcMax -= 2; while (src < srcMax) { - result = Encode(src, encodingMap); + result = Encode(src, ref encodingMap); Unsafe.WriteUnaligned(dest, result); src += 3; dest += 4; @@ -109,14 +112,14 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span if (src + 1 == srcEnd) { - result = EncodeAndPadTwo(src, encodingMap); + result = EncodeAndPadTwo(src, ref encodingMap); Unsafe.WriteUnaligned(dest, result); src += 1; dest += 4; } else if (src + 2 == srcEnd) { - result = EncodeAndPadOne(src, encodingMap); + result = EncodeAndPadOne(src, ref encodingMap); Unsafe.WriteUnaligned(dest, result); src += 2; dest += 4; @@ -171,9 +174,16 @@ public static int GetMaxEncodedToUtf8Length(int length) /// public static unsafe OperationStatus EncodeToUtf8InPlace(Span buffer, int dataLength, out int bytesWritten) { - fixed (byte* bufferBytes = buffer) - fixed (byte* encodingMap = s_encodingMap) + if (buffer.IsEmpty) { + bytesWritten = 0; + return OperationStatus.Done; + } + + fixed (byte* bufferBytes = 
&MemoryMarshal.GetReference(buffer)) + { + ref byte encodingMap = ref s_encodingMap[0]; + int encodedLength = GetMaxEncodedToUtf8Length(dataLength); if (buffer.Length < encodedLength) goto FalseExit; @@ -190,11 +200,11 @@ public static unsafe OperationStatus EncodeToUtf8InPlace(Span buffer, int { if (leftover == 1) { - result = EncodeAndPadTwo(bufferBytes + sourceIndex, encodingMap); + result = EncodeAndPadTwo(bufferBytes + sourceIndex, ref encodingMap); } else { - result = EncodeAndPadOne(bufferBytes + sourceIndex, encodingMap); + result = EncodeAndPadOne(bufferBytes + sourceIndex, ref encodingMap); } Unsafe.WriteUnaligned(bufferBytes + destinationIndex, result); @@ -204,7 +214,7 @@ public static unsafe OperationStatus EncodeToUtf8InPlace(Span buffer, int sourceIndex -= 3; while ((int)sourceIndex >= 0) { - result = Encode(bufferBytes + sourceIndex, encodingMap); + result = Encode(bufferBytes + sourceIndex, ref encodingMap); Unsafe.WriteUnaligned(bufferBytes + destinationIndex, result); destinationIndex -= 4; sourceIndex -= 3; @@ -328,7 +338,7 @@ private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes, } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe uint Encode(byte* threeBytes, byte* encodingMap) + private static unsafe uint Encode(byte* threeBytes, ref byte encodingMap) { nuint t0 = threeBytes[0]; nuint t1 = threeBytes[1]; @@ -336,38 +346,38 @@ private static unsafe uint Encode(byte* threeBytes, byte* encodingMap) nuint i = (t0 << 16) | (t1 << 8) | t2; - nuint i0 = encodingMap[i >> 18]; - nuint i1 = encodingMap[(i >> 12) & 0x3F]; - nuint i2 = encodingMap[(i >> 6) & 0x3F]; - nuint i3 = encodingMap[i & 0x3F]; + nuint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); + nuint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); + nuint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); + nuint i3 = Unsafe.Add(ref encodingMap, (IntPtr)(i & 0x3F)); return (uint)(i0 | (i1 << 8) | (i2 << 16) | (i3 
<< 24)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe uint EncodeAndPadOne(byte* twoBytes, byte* encodingMap) + private static unsafe uint EncodeAndPadOne(byte* twoBytes, ref byte encodingMap) { nuint t0 = twoBytes[0]; nuint t1 = twoBytes[1]; nuint i = (t0 << 16) | (t1 << 8); - nuint i0 = encodingMap[i >> 18]; - nuint i1 = encodingMap[(i >> 12) & 0x3F]; - nuint i2 = encodingMap[(i >> 6) & 0x3F]; + nuint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); + nuint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); + nuint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); return (uint)(i0 | (i1 << 8) | (i2 << 16) | (EncodingPad << 24)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe uint EncodeAndPadTwo(byte* oneByte, byte* encodingMap) + private static unsafe uint EncodeAndPadTwo(byte* oneByte, ref byte encodingMap) { nuint t0 = oneByte[0]; nuint i = t0 << 8; - nuint i0 = encodingMap[i >> 10]; - nuint i1 = encodingMap[(i >> 4) & 0x3F]; + nuint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 10)); + nuint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 4) & 0x3F)); return (uint)(i0 | (i1 << 8) | (EncodingPad << 16) | (EncodingPad << 24)); } From a246816dd09d57afab71562ea18db1a86e593e5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Tue, 5 Mar 2019 17:30:05 +0100 Subject: [PATCH 15/23] PR Feedback * https://github.com/dotnet/corefx/pull/34529#discussion_r262165689 --- .../src/System/Buffers/Text/Base64Decoder.cs | 22 +++---- .../src/System/Buffers/Text/Base64Encoder.cs | 57 ++++++++----------- 2 files changed, 33 insertions(+), 46 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 11639be3de58..7579a43ea61a 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -8,12 +8,6 @@ using 
System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; -#if BIT64 -using nuint = System.UInt64; -#else -using nuint = System.UInt32; -#endif - namespace System.Buffers.Text { // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2 @@ -67,8 +61,8 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa byte* src = srcBytes; byte* dest = destBytes; - byte* srcEnd = srcBytes + (nuint)srcLength; - byte* srcMax = srcBytes + (nuint)maxSrcLength; + byte* srcEnd = srcBytes + (uint)srcLength; + byte* srcMax = srcBytes + (uint)maxSrcLength; if (maxSrcLength >= 24) { @@ -106,7 +100,7 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa maxSrcLength = (destLength / 3) * 4; } - srcMax = srcBytes + (nuint)maxSrcLength; + srcMax = srcBytes + (uint)maxSrcLength; while (src < srcMax) { int result = Decode(src, ref decodingMap); @@ -147,7 +141,7 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa i0 |= i1; - byte* destMax = destBytes + (nuint)destLength; + byte* destMax = destBytes + (uint)destLength; if (t3 != EncodingPad) { @@ -456,10 +450,10 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe int Decode(byte* encodedBytes, ref sbyte decodingMap) { - nuint t0 = encodedBytes[0]; - nuint t1 = encodedBytes[1]; - nuint t2 = encodedBytes[2]; - nuint t3 = encodedBytes[3]; + uint t0 = encodedBytes[0]; + uint t1 = encodedBytes[1]; + uint t2 = encodedBytes[2]; + uint t3 = encodedBytes[3]; int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0); int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1); diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 4a5da6f6cd38..a7b55f4ab043 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ 
b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -8,12 +8,6 @@ using System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; -#if BIT64 -using nuint = System.UInt64; -#else -using nuint = System.UInt32; -#endif - namespace System.Buffers.Text { // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2 @@ -69,8 +63,8 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span byte* src = srcBytes; byte* dest = destBytes; - byte* srcEnd = srcBytes + (nuint)srcLength; - byte* srcMax = srcBytes + (nuint)maxSrcLength; + byte* srcEnd = srcBytes + (uint)srcLength; + byte* srcMax = srcBytes + (uint)maxSrcLength; if (maxSrcLength >= 16) { @@ -190,9 +184,8 @@ public static unsafe OperationStatus EncodeToUtf8InPlace(Span buffer, int int leftover = dataLength - (dataLength / 3) * 3; // how many bytes after packs of 3 - // PERF: use nuint to avoid the sign-extensions - nuint destinationIndex = (nuint)(encodedLength - 4); - nuint sourceIndex = (nuint)(dataLength - leftover); + uint destinationIndex = (uint)(encodedLength - 4); + uint sourceIndex = (uint)(dataLength - leftover); uint result = 0; // encode last pack to avoid conditional in the main loop @@ -340,46 +333,46 @@ private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes, [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe uint Encode(byte* threeBytes, ref byte encodingMap) { - nuint t0 = threeBytes[0]; - nuint t1 = threeBytes[1]; - nuint t2 = threeBytes[2]; + uint t0 = threeBytes[0]; + uint t1 = threeBytes[1]; + uint t2 = threeBytes[2]; - nuint i = (t0 << 16) | (t1 << 8) | t2; + uint i = (t0 << 16) | (t1 << 8) | t2; - nuint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); - nuint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); - nuint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); - nuint i3 = Unsafe.Add(ref encodingMap, (IntPtr)(i & 
0x3F)); + uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); + uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); + uint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); + uint i3 = Unsafe.Add(ref encodingMap, (IntPtr)(i & 0x3F)); - return (uint)(i0 | (i1 << 8) | (i2 << 16) | (i3 << 24)); + return i0 | (i1 << 8) | (i2 << 16) | (i3 << 24); } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe uint EncodeAndPadOne(byte* twoBytes, ref byte encodingMap) { - nuint t0 = twoBytes[0]; - nuint t1 = twoBytes[1]; + uint t0 = twoBytes[0]; + uint t1 = twoBytes[1]; - nuint i = (t0 << 16) | (t1 << 8); + uint i = (t0 << 16) | (t1 << 8); - nuint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); - nuint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); - nuint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); + uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18)); + uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F)); + uint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F)); - return (uint)(i0 | (i1 << 8) | (i2 << 16) | (EncodingPad << 24)); + return i0 | (i1 << 8) | (i2 << 16) | (EncodingPad << 24); } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe uint EncodeAndPadTwo(byte* oneByte, ref byte encodingMap) { - nuint t0 = oneByte[0]; + uint t0 = oneByte[0]; - nuint i = t0 << 8; + uint i = t0 << 8; - nuint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 10)); - nuint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 4) & 0x3F)); + uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 10)); + uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 4) & 0x3F)); - return (uint)(i0 | (i1 << 8) | (EncodingPad << 16) | (EncodingPad << 24)); + return i0 | (i1 << 8) | (EncodingPad << 16) | (EncodingPad << 24); } private const uint EncodingPad = '='; // '=', for padding From d31b7e80d78e155602e9a89bc2800dfabf327338 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Sun, 10 Mar 2019 19:37:09 +0100 Subject: [PATCH 16/23] Debug.Fail instead throwing for the assertion Cf. https://github.com/dotnet/corefx/pull/34529#discussion_r263894301 --- src/System.Memory/src/System/Buffers/Text/Base64.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64.cs b/src/System.Memory/src/System/Buffers/Text/Base64.cs index aa0acca6b876..eb51ff416b75 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64.cs @@ -20,7 +20,7 @@ private static unsafe void AssertRead(byte* src, byte* srcStart, int sr if (readEnd > srcEnd) { int srcIndex = (int)(src - srcStart); - throw new InvalidOperationException($"Read for {typeof(TVector)} is not within safe bounds. srcIndex: {srcIndex}, srcLength: {srcLength}"); + Debug.Fail($"Read for {typeof(TVector)} is not within safe bounds. srcIndex: {srcIndex}, srcLength: {srcLength}"); } } @@ -34,7 +34,7 @@ private static unsafe void AssertWrite(byte* dest, byte* destStart, int if (writeEnd > destEnd) { int destIndex = (int)(dest - destStart); - throw new InvalidOperationException($"Write for {typeof(TVector)} is not within safe bounds. destIndex: {destIndex}, destLength: {destLength}"); + Debug.Fail($"Write for {typeof(TVector)} is not within safe bounds. 
destIndex: {destIndex}, destLength: {destLength}"); } } } From 5c0dbeebb81bb84b729e67eddbb2a97ae31d66d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Mon, 11 Mar 2019 21:56:58 +0100 Subject: [PATCH 17/23] ROSpan for static data --- .../src/System/Buffers/Text/Base64.cs | 7 +++ .../src/System/Buffers/Text/Base64Decoder.cs | 62 +++++++++++-------- .../src/System/Buffers/Text/Base64Encoder.cs | 37 ++++++----- 3 files changed, 66 insertions(+), 40 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64.cs b/src/System.Memory/src/System/Buffers/Text/Base64.cs index eb51ff416b75..34f3af433db3 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64.cs @@ -4,12 +4,19 @@ using System.Diagnostics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using Internal.Runtime.CompilerServices; namespace System.Buffers.Text { public static partial class Base64 { + private static TVector ReadVector(ReadOnlySpan data) + { + ref sbyte tmp = ref MemoryMarshal.GetReference(data); + return Unsafe.As(ref tmp); + } + [Conditional("DEBUG")] private static unsafe void AssertRead(byte* src, byte* srcStart, int srcLength) { diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 7579a43ea61a..f31be5755bf8 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -348,14 +348,14 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, ou private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { // The JIT won't hoist these "constants", so help it - Vector256 lutHi = s_avxDecodeLutHi; - Vector256 lutLo = s_avxDecodeLutLo; - Vector256 lutShift = s_avxDecodeLutShift; + Vector256 lutHi 
= ReadVector>(s_avxDecodeLutHi); + Vector256 lutLo = ReadVector>(s_avxDecodeLutLo); + Vector256 lutShift = ReadVector>(s_avxDecodeLutShift); Vector256 mask2F = s_avxDecodeMask2F; Vector256 shuffleConstant0 = Vector256.Create(0x01400140).AsSByte(); Vector256 shuffleConstant1 = Vector256.Create(0x00011000).AsInt16(); - Vector256 shuffleVec = s_avxDecodeShuffleVec; - Vector256 permuteVec = s_avxDecodePermuteVec; + Vector256 shuffleVec = ReadVector>(s_avxDecodeShuffleVec); + Vector256 permuteVec = ReadVector>(s_avxDecodePermuteVec).AsInt32(); byte* src = srcBytes; byte* dest = destBytes; @@ -401,13 +401,13 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { // The JIT won't hoist these "constants", so help it - Vector128 lutHi = s_sseDecodeLutHi; - Vector128 lutLo = s_sseDecodeLutLo; - Vector128 lutShift = s_sseDecodeLutShift; + Vector128 lutHi = ReadVector>(s_sseDecodeLutHi); + Vector128 lutLo = ReadVector>(s_sseDecodeLutLo); + Vector128 lutShift = ReadVector>(s_sseDecodeLutShift); Vector128 mask2F = s_sseDecodeMask2F; Vector128 shuffleConstant0 = Vector128.Create(0x01400140).AsSByte(); Vector128 shuffleConstant1 = Vector128.Create(0x00011000).AsInt16(); - Vector128 shuffleVec = s_sseDecodeShuffleVec; + Vector128 shuffleVec = ReadVector>(s_sseDecodeShuffleVec); byte* src = srcBytes; byte* dest = destBytes; @@ -499,37 +499,37 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, }; - private static readonly Vector128 s_sseDecodeShuffleVec = Ssse3.IsSupported ? 
Vector128.Create( + private static ReadOnlySpan s_sseDecodeShuffleVec => new sbyte[] { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1 - ) : default; + }; - private static readonly Vector128 s_sseDecodeLutLo = Sse3.IsSupported ? Vector128.Create( + private static ReadOnlySpan s_sseDecodeLutLo => new sbyte[] { 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A - ) : default; + }; - private static readonly Vector128 s_sseDecodeLutHi = Sse3.IsSupported ? Vector128.Create( + private static ReadOnlySpan s_sseDecodeLutHi => new sbyte[] { 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10 - ) : default; + }; - private static readonly Vector128 s_sseDecodeLutShift = Sse3.IsSupported ? Vector128.Create( + private static ReadOnlySpan s_sseDecodeLutShift => new sbyte[] { 0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0 - ) : default; + }; private static readonly Vector128 s_sseDecodeMask2F = Sse3.IsSupported ? Vector128.Create((sbyte)0x2F) : default; // ASCII: / - private static readonly Vector256 s_avxDecodeShuffleVec = Avx2.IsSupported ? Vector256.Create( + private static ReadOnlySpan s_avxDecodeShuffleVec => new sbyte[] { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, @@ -538,12 +538,22 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1 - ) : default; + }; + + private static ReadOnlySpan s_avxDecodePermuteVec => new sbyte[] { + 0, 0, 0, 0, + 1, 0, 0, 0, + 2, 0, 0, 0, + 4, 0, 0, 0, + 5, 0, 0, 0, + 6, 0, 0, 0, + -1, -1, -1, -1, + -1, -1, -1, -1 + }; - private static readonly Vector256 s_avxDecodePermuteVec = Avx2.IsSupported ? Vector256.Create(0, 1, 2, 4, 5, 6, -1, -1) : default; private static readonly Vector256 s_avxDecodeMask2F = Avx2.IsSupported ? Vector256.Create((sbyte)0x2F) : default; // ASCII: / - private static readonly Vector256 s_avxDecodeLutLo = Avx2.IsSupported ? 
Vector256.Create( + private static ReadOnlySpan s_avxDecodeLutLo => new sbyte[] { 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x13, 0x1A, @@ -552,9 +562,9 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A - ) : default; + }; - private static readonly Vector256 s_avxDecodeLutHi = Avx2.IsSupported ? Vector256.Create( + private static ReadOnlySpan s_avxDecodeLutHi => new sbyte[] { 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, 0x10, 0x10, 0x10, 0x10, @@ -563,9 +573,9 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) 0x04, 0x08, 0x04, 0x08, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10 - ) : default; + }; - private static readonly Vector256 s_avxDecodeLutShift = Avx2.IsSupported ? Vector256.Create( + private static ReadOnlySpan s_avxDecodeLutShift => new sbyte[] { 0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, @@ -574,6 +584,6 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0 - ) : default; + }; } } diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index a7b55f4ab043..66d0c7c51b76 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -226,14 +226,14 @@ public static unsafe OperationStatus EncodeToUtf8InPlace(Span buffer, int private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { // The JIT won't hoist these "constants", so help it - Vector256 shuffleVec = s_avxEncodeShuffleVec; + Vector256 shuffleVec = ReadVector>(s_avxEncodeShuffleVec); Vector256 shuffleConstant0 = Vector256.Create(0x0fc0fc00).AsSByte(); Vector256 shuffleConstant2 = 
Vector256.Create(0x003f03f0).AsSByte(); Vector256 shuffleConstant1 = Vector256.Create(0x04000040).AsUInt16(); Vector256 shuffleConstant3 = Vector256.Create(0x01000010).AsInt16(); Vector256 translationContant0 = Vector256.Create((byte)51); Vector256 translationContant1 = Vector256.Create((sbyte)25); - Vector256 lut = s_avxEncodeLut; + Vector256 lut = ReadVector>(s_avxEncodeLut); byte* src = srcBytes; byte* dest = destBytes; @@ -243,7 +243,7 @@ private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, b Vector256 str = Avx.LoadVector256(src).AsSByte(); // shift by 4 bytes, as required by Reshuffle - str = Avx2.PermuteVar8x32(str.AsInt32(), s_avxEncodePermuteVec).AsSByte(); + str = Avx2.PermuteVar8x32(str.AsInt32(), ReadVector>(s_avxEncodePermuteVec).AsInt32()).AsSByte(); // Next loads are done at src-4, as required by Reshuffle, so shift it once src -= 4; @@ -286,14 +286,14 @@ private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, b private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { // The JIT won't hoist these "constants", so help it - Vector128 shuffleVec = s_sseEncodeShuffleVec; + Vector128 shuffleVec = ReadVector>(s_sseEncodeShuffleVec); Vector128 shuffleConstant0 = Vector128.Create(0x0fc0fc00).AsSByte(); Vector128 shuffleConstant2 = Vector128.Create(0x003f03f0).AsSByte(); Vector128 shuffleConstant1 = Vector128.Create(0x04000040).AsUInt16(); Vector128 shuffleConstant3 = Vector128.Create(0x01000010).AsInt16(); Vector128 translationContant0 = Vector128.Create((byte)51); Vector128 translationContant1 = Vector128.Create((sbyte)25); - Vector128 lut = s_sseEncodeLut; + Vector128 lut = ReadVector>(s_sseEncodeLut); byte* src = srcBytes; byte* dest = destBytes; @@ -391,23 +391,32 @@ private static unsafe uint EncodeAndPadTwo(byte* oneByte, ref byte encodingMap) 52, 53, 54, 55, 56, 57, 43, 47 //4..9, +, / }; - private 
static readonly Vector128 s_sseEncodeShuffleVec = Ssse3.IsSupported ? Vector128.Create( + private static ReadOnlySpan s_sseEncodeShuffleVec => new sbyte[] { 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10 - ) : default; + }; - private static readonly Vector128 s_sseEncodeLut = Ssse3.IsSupported ? Vector128.Create( + private static ReadOnlySpan s_sseEncodeLut => new sbyte[] { 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0 - ) : default; + }; - private static readonly Vector256 s_avxEncodePermuteVec = Avx2.IsSupported ? Vector256.Create(0, 0, 1, 2, 3, 4, 5, 6) : default; + private static ReadOnlySpan s_avxEncodePermuteVec => new sbyte[] { + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 0, 0, 0, + 2, 0, 0, 0, + 3, 0, 0, 0, + 4, 0, 0, 0, + 5, 0, 0, 0, + 6, 0, 0, 0 + }; - private static readonly Vector256 s_avxEncodeShuffleVec = Avx2.IsSupported ? Vector256.Create( + private static ReadOnlySpan s_avxEncodeShuffleVec => new sbyte[] { 5, 4, 6, 5, 8, 7, 9, 8, 11, 10, 12, 11, @@ -416,9 +425,9 @@ private static unsafe uint EncodeAndPadTwo(byte* oneByte, ref byte encodingMap) 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10 - ) : default; + }; - private static readonly Vector256 s_avxEncodeLut = Avx2.IsSupported ? 
Vector256.Create( + private static ReadOnlySpan s_avxEncodeLut => new sbyte[] { 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, @@ -427,6 +436,6 @@ private static unsafe uint EncodeAndPadTwo(byte* oneByte, ref byte encodingMap) -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0 - ) : default; + }; } } From 31c4741ccb70007d393983ef38a37d99de715535 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Mon, 11 Mar 2019 22:08:12 +0100 Subject: [PATCH 18/23] ROS for lookup maps --- .../src/System/Buffers/Text/Base64Decoder.cs | 11 +++++------ .../src/System/Buffers/Text/Base64Encoder.cs | 9 +++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index f31be5755bf8..e521bc6b4554 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -44,9 +44,6 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa fixed (byte* srcBytes = &MemoryMarshal.GetReference(utf8)) fixed (byte* destBytes = &MemoryMarshal.GetReference(bytes)) { - // PERF: needs to be initialized here, for good codegen - ref sbyte decodingMap = ref s_decodingMap[0]; - int srcLength = utf8.Length & ~0x3; // only decode input up to the closest multiple of 4. 
int destLength = bytes.Length; int maxSrcLength = srcLength; @@ -100,7 +97,9 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa maxSrcLength = (destLength / 3) * 4; } + ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap); srcMax = srcBytes + (uint)maxSrcLength; + while (src < srcMax) { int result = Decode(src, ref decodingMap); @@ -259,8 +258,6 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, ou fixed (byte* bufferBytes = &MemoryMarshal.GetReference(buffer)) { - ref sbyte decodingMap = ref s_decodingMap[0]; - int bufferLength = buffer.Length; uint sourceIndex = 0; uint destIndex = 0; @@ -271,6 +268,8 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, ou if (bufferLength == 0) goto DoneExit; + ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap); + while (sourceIndex < bufferLength - 4) { int result = Decode(bufferBytes + sourceIndex, ref decodingMap); @@ -480,7 +479,7 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) } // Pre-computing this table using a custom string(s_characters) and GenerateDecodingMapAndVerify (found in tests) - private static readonly sbyte[] s_decodingMap = { + private static ReadOnlySpan s_decodingMap => new sbyte[] { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, //62 is placed at index 43 (for +), 63 at index 47 (for /) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 66d0c7c51b76..3765ed9ee63b 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -45,9 +45,6 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span fixed (byte* srcBytes = 
&MemoryMarshal.GetReference(bytes)) fixed (byte* destBytes = &MemoryMarshal.GetReference(utf8)) { - // PERF: needs to be initialized here, for good codegen - ref byte encodingMap = ref s_encodingMap[0]; - int srcLength = bytes.Length; int destLength = utf8.Length; int maxSrcLength; @@ -87,6 +84,7 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span } } + ref byte encodingMap = ref MemoryMarshal.GetReference(s_encodingMap); uint result = 0; srcMax -= 2; @@ -176,8 +174,6 @@ public static unsafe OperationStatus EncodeToUtf8InPlace(Span buffer, int fixed (byte* bufferBytes = &MemoryMarshal.GetReference(buffer)) { - ref byte encodingMap = ref s_encodingMap[0]; - int encodedLength = GetMaxEncodedToUtf8Length(dataLength); if (buffer.Length < encodedLength) goto FalseExit; @@ -187,6 +183,7 @@ public static unsafe OperationStatus EncodeToUtf8InPlace(Span buffer, int uint destinationIndex = (uint)(encodedLength - 4); uint sourceIndex = (uint)(dataLength - leftover); uint result = 0; + ref byte encodingMap = ref MemoryMarshal.GetReference(s_encodingMap); // encode last pack to avoid conditional in the main loop if (leftover != 0) @@ -380,7 +377,7 @@ private static unsafe uint EncodeAndPadTwo(byte* oneByte, ref byte encodingMap) private const int MaximumEncodeLength = (int.MaxValue / 4) * 3; // 1610612733 // Pre-computing this table using a custom string(s_characters) and GenerateEncodingMapAndVerify (found in tests) - private static readonly byte[] s_encodingMap = { + private static ReadOnlySpan s_encodingMap => new byte[] { 65, 66, 67, 68, 69, 70, 71, 72, //A..H 73, 74, 75, 76, 77, 78, 79, 80, //I..P 81, 82, 83, 84, 85, 86, 87, 88, //Q..X From c8b6cb3387ca856f52d246ad260172c8fe1d9dcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Sun, 26 May 2019 22:50:10 +0200 Subject: [PATCH 19/23] In decode avoided stack spill and hoisted zero-vector outside the loops Cf. 
https://github.com/dotnet/corefx/pull/34529#discussion_r287613894 --- .../src/System/Buffers/Text/Base64Decoder.cs | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index e521bc6b4554..37ef748ad174 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -350,11 +350,12 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b Vector256 lutHi = ReadVector>(s_avxDecodeLutHi); Vector256 lutLo = ReadVector>(s_avxDecodeLutLo); Vector256 lutShift = ReadVector>(s_avxDecodeLutShift); - Vector256 mask2F = s_avxDecodeMask2F; + Vector256 mask2F = ReadVector>(s_avxDecodeMask2F); Vector256 shuffleConstant0 = Vector256.Create(0x01400140).AsSByte(); Vector256 shuffleConstant1 = Vector256.Create(0x00011000).AsInt16(); Vector256 shuffleVec = ReadVector>(s_avxDecodeShuffleVec); Vector256 permuteVec = ReadVector>(s_avxDecodePermuteVec).AsInt32(); + Vector256 zero = Vector256.Zero; byte* src = srcBytes; byte* dest = destBytes; @@ -369,7 +370,6 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b Vector256 loNibbles = Avx2.And(str, mask2F); Vector256 hi = Avx2.Shuffle(lutHi, hiNibbles); Vector256 lo = Avx2.Shuffle(lutLo, loNibbles); - Vector256 zero = Vector256.Zero; // https://github.com/dotnet/coreclr/issues/21247 if (Avx2.MoveMask(Avx2.CompareGreaterThan(Avx2.And(lo, hi), zero)) != 0) @@ -403,10 +403,11 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, Vector128 lutHi = ReadVector>(s_sseDecodeLutHi); Vector128 lutLo = ReadVector>(s_sseDecodeLutLo); Vector128 lutShift = ReadVector>(s_sseDecodeLutShift); - Vector128 mask2F = s_sseDecodeMask2F; + Vector128 mask2F = ReadVector>(s_sseDecodeMask2F); Vector128 shuffleConstant0 = 
Vector128.Create(0x01400140).AsSByte(); Vector128 shuffleConstant1 = Vector128.Create(0x00011000).AsInt16(); Vector128 shuffleVec = ReadVector>(s_sseDecodeShuffleVec); + Vector128 zero = Vector128.Zero; byte* src = srcBytes; byte* dest = destBytes; @@ -421,7 +422,6 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, Vector128 loNibbles = Sse2.And(str, mask2F); Vector128 hi = Ssse3.Shuffle(lutHi, hiNibbles); Vector128 lo = Ssse3.Shuffle(lutLo, loNibbles); - Vector128 zero = Vector128.Zero; if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.And(lo, hi), zero)) != 0) break; @@ -526,7 +526,12 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) 0, 0, 0, 0 }; - private static readonly Vector128 s_sseDecodeMask2F = Sse3.IsSupported ? Vector128.Create((sbyte)0x2F) : default; // ASCII: / + private static ReadOnlySpan s_sseDecodeMask2F => new sbyte[] { // ASCII: / + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F + }; private static ReadOnlySpan s_avxDecodeShuffleVec => new sbyte[] { 2, 1, 0, 6, @@ -550,7 +555,16 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) -1, -1, -1, -1 }; - private static readonly Vector256 s_avxDecodeMask2F = Avx2.IsSupported ? Vector256.Create((sbyte)0x2F) : default; // ASCII: / + private static ReadOnlySpan s_avxDecodeMask2F => new sbyte[] { // ASCII: / + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F, + 0x2F, 0x2F, 0x2F, 0x2F + }; private static ReadOnlySpan s_avxDecodeLutLo => new sbyte[] { 0x15, 0x11, 0x11, 0x11, From d89692f584f2a3c22371a607e9118ef35b65e2fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Sun, 26 May 2019 23:08:38 +0200 Subject: [PATCH 20/23] Assert assumption about destLength Cf. 
https://github.com/dotnet/corefx/pull/34529#discussion_r287605561 --- src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index 37ef748ad174..ec458de57bd9 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; @@ -94,6 +95,7 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Spa { // This should never overflow since destLength here is less than int.MaxValue / 4 * 3 (i.e. 1610612733) // Therefore, (destLength / 3) * 4 will always be less than 2147483641 + Debug.Assert(destLength < (int.MaxValue / 4 * 3)); maxSrcLength = (destLength / 3) * 4; } From c8ee0a92d5cf545b55f93eeec31cf4e165067e73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Sun, 26 May 2019 23:38:49 +0200 Subject: [PATCH 21/23] Added comments from original source and some changes to variable names Cf. 
https://github.com/dotnet/corefx/pull/34529#discussion_r287606634 and https://github.com/dotnet/corefx/pull/34529#discussion_r287606714 --- .../src/System/Buffers/Text/Base64Decoder.cs | 163 +++++++++++++-- .../src/System/Buffers/Text/Base64Encoder.cs | 187 +++++++++++++++--- 2 files changed, 309 insertions(+), 41 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index ec458de57bd9..b7ca32e36507 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -348,15 +348,23 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span buffer, ou [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { + // If we have AVX2 support, pick off 32 bytes at a time for as long as we can, + // but make sure that we quit before seeing any == markers at the end of the + // string. Also, because we write 8 zeroes at the end of the output, ensure + // that there are at least 11 valid bytes of input data remaining to close the + // gap. 32 + 2 + 11 = 45 bytes. + + // See SSSE3-version below for an explanation of how the code works. 
+ // The JIT won't hoist these "constants", so help it Vector256 lutHi = ReadVector>(s_avxDecodeLutHi); Vector256 lutLo = ReadVector>(s_avxDecodeLutLo); Vector256 lutShift = ReadVector>(s_avxDecodeLutShift); Vector256 mask2F = ReadVector>(s_avxDecodeMask2F); - Vector256 shuffleConstant0 = Vector256.Create(0x01400140).AsSByte(); - Vector256 shuffleConstant1 = Vector256.Create(0x00011000).AsInt16(); - Vector256 shuffleVec = ReadVector>(s_avxDecodeShuffleVec); - Vector256 permuteVec = ReadVector>(s_avxDecodePermuteVec).AsInt32(); + Vector256 mergeConstant0 = Vector256.Create(0x01400140).AsSByte(); + Vector256 mergeConstant1 = Vector256.Create(0x00011000).AsInt16(); + Vector256 packBytesInLaneMask = ReadVector>(s_avxDecodePackBytesInLaneMask); + Vector256 packLanesControl = ReadVector>(s_avxDecodePackLanesControl).AsInt32(); Vector256 zero = Vector256.Zero; byte* src = srcBytes; @@ -381,10 +389,33 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b Vector256 shift = Avx2.Shuffle(lutShift, Avx2.Add(eq2F, hiNibbles)); str = Avx2.Add(str, shift); - Vector256 merge_ab_and_bc = Avx2.MultiplyAddAdjacent(str.AsByte(), shuffleConstant0); - Vector256 output = Avx2.MultiplyAddAdjacent(merge_ab_and_bc, shuffleConstant1); - output = Avx2.Shuffle(output.AsSByte(), shuffleVec).AsInt32(); - str = Avx2.PermuteVar8x32(output, permuteVec).AsSByte(); + // in, lower lane, bits, upper case are most significant bits, lower case are least significant bits: + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA + + Vector256 merge_ab_and_bc = Avx2.MultiplyAddAdjacent(str.AsByte(), mergeConstant0); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB + + Vector256 output = Avx2.MultiplyAddAdjacent(merge_ab_and_bc, mergeConstant1); + // 00000000 
JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc + + // Pack bytes together in each lane: + output = Avx2.Shuffle(output.AsSByte(), packBytesInLaneMask).AsInt32(); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa + + // Pack lanes + str = Avx2.PermuteVar8x32(output, packLanesControl).AsSByte(); AssertWrite>(dest, destStart, destLength); Avx.Store(dest, str.AsByte()); @@ -401,14 +432,86 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { + // If we have SSSE3 support, pick off 16 bytes at a time for as long as we can, + // but make sure that we quit before seeing any == markers at the end of the + // string. Also, because we write four zeroes at the end of the output, ensure + // that there are at least 6 valid bytes of input data remaining to close the + // gap. 16 + 2 + 6 = 24 bytes. + + // The input consists of six character sets in the Base64 alphabet, + // which we need to map back to the 6-bit values they represent. + // There are three ranges, two singles, and then there's the rest. 
+ // + // # From To Add Characters + // 1 [43] [62] +19 + + // 2 [47] [63] +16 / + // 3 [48..57] [52..61] +4 0..9 + // 4 [65..90] [0..25] -65 A..Z + // 5 [97..122] [26..51] -71 a..z + // (6) Everything else => invalid input + + // We will use LUTS for character validation & offset computation + // Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, + // this allows to mask with 0x2F instead of 0x0F and thus save one constant declaration (register and/or memory access) + + // For offsets: + // Perfect hash for lut = ((src>>4)&0x2F)+((src==0x2F)?0xFF:0x00) + // 0000 = garbage + // 0001 = / + // 0010 = + + // 0011 = 0-9 + // 0100 = A-Z + // 0101 = A-Z + // 0110 = a-z + // 0111 = a-z + // 1000 >= garbage + + // For validation, here's the table. + // A character is valid if and only if the AND of the 2 lookups equals 0: + + // hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 + // LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A + + // 0000 0X10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI + // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + + // 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US + // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + + // 0010 0x01 char ! " # $ % & ' ( ) * + , - . / + // andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00 + + // 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
+ // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02 + + // 0100 0x04 char @ A B C D E F G H I J K L M N O + // andlut 0x04 0x00 0x00 0x00 0X00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 + + // 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _ + // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 + + // 0110 0x04 char ` a b c d e f g h i j k l m n o + // andlut 0x04 0x00 0x00 0x00 0X00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 + // 0111 0X08 char p q r s t u v w x y z { | } ~ + // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 + + // 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // The JIT won't hoist these "constants", so help it Vector128 lutHi = ReadVector>(s_sseDecodeLutHi); Vector128 lutLo = ReadVector>(s_sseDecodeLutLo); Vector128 lutShift = ReadVector>(s_sseDecodeLutShift); Vector128 mask2F = ReadVector>(s_sseDecodeMask2F); - Vector128 shuffleConstant0 = Vector128.Create(0x01400140).AsSByte(); - Vector128 shuffleConstant1 = Vector128.Create(0x00011000).AsInt16(); - Vector128 shuffleVec = ReadVector>(s_sseDecodeShuffleVec); + Vector128 mergeConstant0 = 
Vector128.Create(0x01400140).AsSByte(); + Vector128 mergeConstant1 = Vector128.Create(0x00011000).AsInt16(); + Vector128 packBytesMask = ReadVector>(s_sseDecodePackBytesMask); Vector128 zero = Vector128.Zero; byte* src = srcBytes; @@ -420,21 +523,47 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, AssertRead>(src, srcStart, sourceLength); Vector128 str = Sse2.LoadVector128(src).AsSByte(); + // lookup Vector128 hiNibbles = Sse2.And(Sse2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F); Vector128 loNibbles = Sse2.And(str, mask2F); Vector128 hi = Ssse3.Shuffle(lutHi, hiNibbles); Vector128 lo = Ssse3.Shuffle(lutLo, loNibbles); + // Check for invalid input: if any "and" values from lo and hi are not zero, + // fall back on bytewise code to do error checking and reporting: if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.And(lo, hi), zero)) != 0) break; Vector128 eq2F = Sse2.CompareEqual(str, mask2F); Vector128 shift = Ssse3.Shuffle(lutShift, Sse2.Add(eq2F, hiNibbles)); + + // Now simply add the delta values to the input: str = Sse2.Add(str, shift); - Vector128 merge_ab_and_bc = Ssse3.MultiplyAddAdjacent(str.AsByte(), shuffleConstant0); - Vector128 output = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, shuffleConstant1); - str = Ssse3.Shuffle(output.AsSByte(), shuffleVec); + // in, bits, upper case are most significant bits, lower case are least significant bits + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA + + Vector128 merge_ab_and_bc = Ssse3.MultiplyAddAdjacent(str.AsByte(), mergeConstant0); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB + + Vector128 output = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, mergeConstant1); + // 00000000 JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 
00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc + + // Pack bytes together: + str = Ssse3.Shuffle(output.AsSByte(), packBytesMask); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa AssertWrite>(dest, destStart, destLength); Sse2.Store(dest, str.AsByte()); @@ -500,7 +629,7 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, }; - private static ReadOnlySpan s_sseDecodeShuffleVec => new sbyte[] { + private static ReadOnlySpan s_sseDecodePackBytesMask => new sbyte[] { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, @@ -535,7 +664,7 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) 0x2F, 0x2F, 0x2F, 0x2F }; - private static ReadOnlySpan s_avxDecodeShuffleVec => new sbyte[] { + private static ReadOnlySpan s_avxDecodePackBytesInLaneMask => new sbyte[] { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, @@ -546,7 +675,7 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) -1, -1, -1, -1 }; - private static ReadOnlySpan s_avxDecodePermuteVec => new sbyte[] { + private static ReadOnlySpan s_avxDecodePackLanesControl => new sbyte[] { 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs index 3765ed9ee63b..033978c003fa 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs @@ -222,14 +222,26 @@ public static unsafe OperationStatus EncodeToUtf8InPlace(Span buffer, int [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { + // If we have AVX2 
support, pick off 24 bytes at a time for as long as we can. + // But because we read 32 bytes at a time, ensure we have enough room to do a + // full 32-byte read without segfaulting. + + // translation from SSSE3 into AVX2 of procedure + // This one works with shifted (4 bytes) input in order to + // be able to work efficiently in the 2 128-bit lanes + + // srcBytes, bytes MSB to LSB: + // 0 0 0 0 x w v u t s r q p o n m + // l k j i h g f e d c b a 0 0 0 0 + // The JIT won't hoist these "constants", so help it Vector256 shuffleVec = ReadVector>(s_avxEncodeShuffleVec); - Vector256 shuffleConstant0 = Vector256.Create(0x0fc0fc00).AsSByte(); - Vector256 shuffleConstant2 = Vector256.Create(0x003f03f0).AsSByte(); - Vector256 shuffleConstant1 = Vector256.Create(0x04000040).AsUInt16(); - Vector256 shuffleConstant3 = Vector256.Create(0x01000010).AsInt16(); - Vector256 translationContant0 = Vector256.Create((byte)51); - Vector256 translationContant1 = Vector256.Create((sbyte)25); + Vector256 maskAC = Vector256.Create(0x0fc0fc00).AsSByte(); + Vector256 maskBB = Vector256.Create(0x003f03f0).AsSByte(); + Vector256 shiftAC = Vector256.Create(0x04000040).AsUInt16(); + Vector256 shiftBB = Vector256.Create(0x01000010).AsInt16(); + Vector256 const51 = Vector256.Create((byte)51); + Vector256 const25 = Vector256.Create((sbyte)25); Vector256 lut = ReadVector>(s_avxEncodeLut); byte* src = srcBytes; @@ -249,16 +261,88 @@ private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, b { // Reshuffle str = Avx2.Shuffle(str, shuffleVec); - Vector256 t0 = Avx2.And(str, shuffleConstant0); - Vector256 t2 = Avx2.And(str, shuffleConstant2); - Vector256 t1 = Avx2.MultiplyHigh(t0.AsUInt16(), shuffleConstant1); - Vector256 t3 = Avx2.MultiplyLow(t2.AsInt16(), shuffleConstant3); + // str, bytes MSB to LSB: + // w x v w + // t u s t + // q r p q + // n o m n + // k l j k + // h i g h + // e f d e + // b c a b + + Vector256 t0 = Avx2.And(str, maskAC); + // bits, upper case are most 
significant bits, lower case are least significant bits. + // 0000wwww XX000000 VVVVVV00 00000000 + // 0000tttt UU000000 SSSSSS00 00000000 + // 0000qqqq RR000000 PPPPPP00 00000000 + // 0000nnnn OO000000 MMMMMM00 00000000 + // 0000kkkk LL000000 JJJJJJ00 00000000 + // 0000hhhh II000000 GGGGGG00 00000000 + // 0000eeee FF000000 DDDDDD00 00000000 + // 0000bbbb CC000000 AAAAAA00 00000000 + + Vector256 t2 = Avx2.And(str, maskBB); + // 00000000 00xxxxxx 000000vv WWWW0000 + // 00000000 00uuuuuu 000000ss TTTT0000 + // 00000000 00rrrrrr 000000pp QQQQ0000 + // 00000000 00oooooo 000000mm NNNN0000 + // 00000000 00llllll 000000jj KKKK0000 + // 00000000 00iiiiii 000000gg HHHH0000 + // 00000000 00ffffff 000000dd EEEE0000 + // 00000000 00cccccc 000000aa BBBB0000 + + Vector256 t1 = Avx2.MultiplyHigh(t0.AsUInt16(), shiftAC); + // 00000000 00wwwwXX 00000000 00VVVVVV + // 00000000 00ttttUU 00000000 00SSSSSS + // 00000000 00qqqqRR 00000000 00PPPPPP + // 00000000 00nnnnOO 00000000 00MMMMMM + // 00000000 00kkkkLL 00000000 00JJJJJJ + // 00000000 00hhhhII 00000000 00GGGGGG + // 00000000 00eeeeFF 00000000 00DDDDDD + // 00000000 00bbbbCC 00000000 00AAAAAA + + Vector256 t3 = Avx2.MultiplyLow(t2.AsInt16(), shiftBB); + // 00xxxxxx 00000000 00vvWWWW 00000000 + // 00uuuuuu 00000000 00ssTTTT 00000000 + // 00rrrrrr 00000000 00ppQQQQ 00000000 + // 00oooooo 00000000 00mmNNNN 00000000 + // 00llllll 00000000 00jjKKKK 00000000 + // 00iiiiii 00000000 00ggHHHH 00000000 + // 00ffffff 00000000 00ddEEEE 00000000 + // 00cccccc 00000000 00aaBBBB 00000000 + str = Avx2.Or(t1.AsSByte(), t3.AsSByte()); + // 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV + // 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS + // 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP + // 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA // Translation - Vector256 indices = Avx2.SubtractSaturate(str.AsByte(), 
translationContant0); - Vector256 mask = Avx2.CompareGreaterThan(str, translationContant1); + // LUT contains Absolute offset for all ranges: + // Translate values 0..63 to the Base64 alphabet. There are five sets: + // # From To Abs Index Characters + // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ + // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz + // 2 [52..61] [48..57] -4 [2..11] 0123456789 + // 3 [62] [43] -19 12 + + // 4 [63] [47] -16 13 / + + // Create LUT indices from input: + // the index for range #0 is right, others are 1 less than expected: + Vector256 indices = Avx2.SubtractSaturate(str.AsByte(), const51); + + // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: + Vector256 mask = Avx2.CompareGreaterThan(str, const25); + + // subtract -1, so add 1 to indices for range #[1..4]. All indices are now correct: Vector256 tmp = Avx2.Subtract(indices.AsSByte(), mask); + + // Add offsets to input values: str = Avx2.Add(str, Avx2.Shuffle(lut, tmp)); AssertWrite>(dest, destStart, destLength); @@ -282,14 +366,21 @@ private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, b [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) { + // If we have SSSE3 support, pick off 12 bytes at a time for as long as we can. + // But because we read 16 bytes at a time, ensure we have enough room to do a + // full 16-byte read without segfaulting. 
+ + // srcBytes, bytes MSB to LSB: + // 0 0 0 0 l k j i h g f e d c b a + // The JIT won't hoist these "constants", so help it Vector128 shuffleVec = ReadVector>(s_sseEncodeShuffleVec); - Vector128 shuffleConstant0 = Vector128.Create(0x0fc0fc00).AsSByte(); - Vector128 shuffleConstant2 = Vector128.Create(0x003f03f0).AsSByte(); - Vector128 shuffleConstant1 = Vector128.Create(0x04000040).AsUInt16(); - Vector128 shuffleConstant3 = Vector128.Create(0x01000010).AsInt16(); - Vector128 translationContant0 = Vector128.Create((byte)51); - Vector128 translationContant1 = Vector128.Create((sbyte)25); + Vector128 maskAC = Vector128.Create(0x0fc0fc00).AsSByte(); + Vector128 maskBB = Vector128.Create(0x003f03f0).AsSByte(); + Vector128 shiftAC = Vector128.Create(0x04000040).AsUInt16(); + Vector128 shiftBB = Vector128.Create(0x01000010).AsInt16(); + Vector128 const51 = Vector128.Create((byte)51); + Vector128 const25 = Vector128.Create((sbyte)25); Vector128 lut = ReadVector>(s_sseEncodeLut); byte* src = srcBytes; @@ -303,16 +394,64 @@ private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes, // Reshuffle str = Ssse3.Shuffle(str, shuffleVec); - Vector128 t0 = Sse2.And(str, shuffleConstant0); - Vector128 t2 = Sse2.And(str, shuffleConstant2); - Vector128 t1 = Sse2.MultiplyHigh(t0.AsUInt16(), shuffleConstant1); - Vector128 t3 = Sse2.MultiplyLow(t2.AsInt16(), shuffleConstant3); + // str, bytes MSB to LSB: + // k l j k + // h i g h + // e f d e + // b c a b + + Vector128 t0 = Sse2.And(str, maskAC); + // bits, upper case are most significant bits, lower case are least significant bits + // 0000kkkk LL000000 JJJJJJ00 00000000 + // 0000hhhh II000000 GGGGGG00 00000000 + // 0000eeee FF000000 DDDDDD00 00000000 + // 0000bbbb CC000000 AAAAAA00 00000000 + + Vector128 t2 = Sse2.And(str, maskBB); + // 00000000 00llllll 000000jj KKKK0000 + // 00000000 00iiiiii 000000gg HHHH0000 + // 00000000 00ffffff 000000dd EEEE0000 + // 00000000 00cccccc 000000aa BBBB0000 + + Vector128 t1 = 
Sse2.MultiplyHigh(t0.AsUInt16(), shiftAC); + // 00000000 00kkkkLL 00000000 00JJJJJJ + // 00000000 00hhhhII 00000000 00GGGGGG + // 00000000 00eeeeFF 00000000 00DDDDDD + // 00000000 00bbbbCC 00000000 00AAAAAA + + Vector128 t3 = Sse2.MultiplyLow(t2.AsInt16(), shiftBB); + // 00llllll 00000000 00jjKKKK 00000000 + // 00iiiiii 00000000 00ggHHHH 00000000 + // 00ffffff 00000000 00ddEEEE 00000000 + // 00cccccc 00000000 00aaBBBB 00000000 + str = Sse2.Or(t1.AsSByte(), t3.AsSByte()); + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA // Translation - Vector128 indices = Sse2.SubtractSaturate(str.AsByte(), translationContant0); - Vector128 mask = Sse2.CompareGreaterThan(str, translationContant1); + // LUT contains Absolute offset for all ranges: + // Translate values 0..63 to the Base64 alphabet. There are five sets: + // # From To Abs Index Characters + // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ + // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz + // 2 [52..61] [48..57] -4 [2..11] 0123456789 + // 3 [62] [43] -19 12 + + // 4 [63] [47] -16 13 / + + // Create LUT indices from input: + // the index for range #0 is right, others are 1 less than expected: + Vector128 indices = Sse2.SubtractSaturate(str.AsByte(), const51); + + // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: + Vector128 mask = Sse2.CompareGreaterThan(str, const25); + + // subtract -1, so add 1 to indices for range #[1..4]. All indices are now correct: Vector128 tmp = Sse2.Subtract(indices.AsSByte(), mask); + + // Add offsets to input values: str = Sse2.Add(str, Ssse3.Shuffle(lut, tmp)); AssertWrite>(dest, destStart, destLength); From 8c53689df7de952018e3926978c1c07f5967cdc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Mon, 27 May 2019 17:14:48 +0200 Subject: [PATCH 22/23] Use TestZ instead of MoveMask in AVX2-path Cf. 
https://github.com/dotnet/corefx/pull/34529#discussion_r287825385 --- src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index b7ca32e36507..f96081230910 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -365,7 +365,6 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b Vector256 mergeConstant1 = Vector256.Create(0x00011000).AsInt16(); Vector256 packBytesInLaneMask = ReadVector>(s_avxDecodePackBytesInLaneMask); Vector256 packLanesControl = ReadVector>(s_avxDecodePackLanesControl).AsInt32(); - Vector256 zero = Vector256.Zero; byte* src = srcBytes; byte* dest = destBytes; @@ -381,8 +380,7 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b Vector256 hi = Avx2.Shuffle(lutHi, hiNibbles); Vector256 lo = Avx2.Shuffle(lutLo, loNibbles); - // https://github.com/dotnet/coreclr/issues/21247 - if (Avx2.MoveMask(Avx2.CompareGreaterThan(Avx2.And(lo, hi), zero)) != 0) + if (!Avx.TestZ(lo, hi)) break; Vector256 eq2F = Avx2.CompareEqual(str, mask2F); From cf4f5ce7ba3792f63967d5fe17f28ada84065129 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Mon, 27 May 2019 18:16:11 +0200 Subject: [PATCH 23/23] Fixed too complicated mask2F creation Improved the version done in c8b6cb3387ca856f52d246ad260172c8fe1d9dcd, so the static data isn't needed and code is more compact and readable. 
--- .../src/System/Buffers/Text/Base64Decoder.cs | 22 ++----------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs index f96081230910..ffb660aae7e8 100644 --- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs @@ -360,7 +360,7 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b Vector256 lutHi = ReadVector>(s_avxDecodeLutHi); Vector256 lutLo = ReadVector>(s_avxDecodeLutLo); Vector256 lutShift = ReadVector>(s_avxDecodeLutShift); - Vector256 mask2F = ReadVector>(s_avxDecodeMask2F); + Vector256 mask2F = Vector256.Create((sbyte)'/'); Vector256 mergeConstant0 = Vector256.Create(0x01400140).AsSByte(); Vector256 mergeConstant1 = Vector256.Create(0x00011000).AsInt16(); Vector256 packBytesInLaneMask = ReadVector>(s_avxDecodePackBytesInLaneMask); @@ -506,7 +506,7 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, Vector128 lutHi = ReadVector>(s_sseDecodeLutHi); Vector128 lutLo = ReadVector>(s_sseDecodeLutLo); Vector128 lutShift = ReadVector>(s_sseDecodeLutShift); - Vector128 mask2F = ReadVector>(s_sseDecodeMask2F); + Vector128 mask2F = Vector128.Create((sbyte)'/'); Vector128 mergeConstant0 = Vector128.Create(0x01400140).AsSByte(); Vector128 mergeConstant1 = Vector128.Create(0x00011000).AsInt16(); Vector128 packBytesMask = ReadVector>(s_sseDecodePackBytesMask); @@ -655,13 +655,6 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value) 0, 0, 0, 0 }; - private static ReadOnlySpan s_sseDecodeMask2F => new sbyte[] { // ASCII: / - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F - }; - private static ReadOnlySpan s_avxDecodePackBytesInLaneMask => new sbyte[] { 2, 1, 0, 6, 5, 4, 10, 9, @@ -684,17 +677,6 @@ private static unsafe 
void WriteThreeLowOrderBytes(byte* destination, int value) -1, -1, -1, -1 }; - private static ReadOnlySpan s_avxDecodeMask2F => new sbyte[] { // ASCII: / - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F, - 0x2F, 0x2F, 0x2F, 0x2F - }; - private static ReadOnlySpan s_avxDecodeLutLo => new sbyte[] { 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,