diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index 01511175f6f9..59d1274050f6 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -358,3 +358,35 @@ License for fastmod (https://github.com/lemire/fastmod) WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +License notice for vectorized base64 encoding +-------------------------------------------------------- + +Copyright (c) 2005-2007, Nick Galbreath +Copyright (c) 2013-2017, Alfred Klomp +Copyright (c) 2015-2017, Wojciech Mula +Copyright (c) 2016-2017, Matthieu Darbois +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs
index 074a39492cb9..7dd7e27fd6fe 100644
--- a/src/System.Private.CoreLib/shared/System/Convert.cs
+++ b/src/System.Private.CoreLib/shared/System/Convert.cs
@@ -7,6 +7,8 @@
 using System.Runtime.InteropServices;
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
 
 namespace System
 {
@@ -2492,19 +2494,169 @@ public static unsafe bool TryToBase64Chars(ReadOnlySpan bytes, Span
             }
         }
 
+
+        // Shuffle control: rearranges every 3-byte input group into the byte order 1, 0, 2, 1.
+        internal static readonly Vector128<byte> s_base64ShuffleMask = Vector128.Create((byte)
+            1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10);
+
+        // Offsets that turn a 6-bit index into its base64 ASCII code. ConvertToBase64ArraySsse3 picks
+        // entry 13 for 0..25 ('A'..'Z'), entry 0 for 26..51 ('a'..'z'), entries 1..10 for 52..61 ('0'..'9'),
+        // entry 11 for 62 ('+') and entry 12 for 63 ('/').
+        internal static readonly Vector128<byte> s_base64ShiftLut = Vector128.Create(
+            (sbyte)'a' - 26, (sbyte)'0' - 52,
+            (sbyte)'0' - 52, (sbyte)'0' - 52,
+            (sbyte)'0' - 52, (sbyte)'0' - 52,
+            (sbyte)'0' - 52, (sbyte)'0' - 52,
+            (sbyte)'0' - 52, (sbyte)'0' - 52,
+            (sbyte)'0' - 52, (sbyte)'+' - 62,
+            (sbyte)'/' - 63, (sbyte)'A', 0, 0).AsByte();
+
+        // Shuffle control: widens the low 8 bytes to 16-bit chars (0x80 selects a zero byte).
+        internal static readonly Vector128<byte> s_base64TwoBytesStringMaskLo = Vector128.Create(
+            0, 0x80, 1, 0x80,
+            2, 0x80, 3, 0x80,
+            4, 0x80, 5, 0x80,
+            6, 0x80, 7, 0x80);
+
+        // Based on "Base64 encoding with SIMD instructions" article by Wojciech Mula http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html (see THIRD-PARTY-NOTICES.TXT)
+        // The original code can be found here: https://github.com/WojciechMula/base64simd/blob/master/encode/encode.sse.cpp (and lookup_pshufb_improved as a lookup function)
+        /// <summary>
+        /// SSSE3 fast path for ConvertToBase64Array: encodes 12 input bytes into 16 base64 chars per iteration and
+        /// stops as soon as fewer than 16 input bytes remain, leaving the tail to the caller.
+        /// </summary>
+        /// <param name="outChars">Destination buffer; chars are written starting at index 0.</param>
+        /// <param name="inData">Source buffer; bytes are read starting at <paramref name="offset"/>.</param>
+        /// <param name="length">Number of bytes to encode, counted from <paramref name="offset"/>.</param>
+        /// <param name="offset">Index of the first byte in <paramref name="inData"/> to encode.</param>
+        /// <param name="insertLineBreaks">When true, "\r\n" is written between output lines.</param>
+        /// <returns>
+        /// i: index of the first input byte that was not encoded; j: number of chars written to
+        /// <paramref name="outChars"/>; charcount: chars written since the last line break (stays 0 when
+        /// <paramref name="insertLineBreaks"/> is false).
+        /// </returns>
+        private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(char* outChars, byte* inData, int length, int offset, bool insertLineBreaks)
+        {
+            int i = offset, j = 0, charcount = 0;
+            const int stride = 4 * 3;
+
+            byte* outputBytes = (byte*)outChars;
+
+            Vector128<byte> tt0 = Vector128.Create(0x0fc0fc00).AsByte();
+            Vector128<ushort> tt1 = Vector128.Create(0x04000040).AsUInt16();
+            Vector128<byte> tt2 = Vector128.Create(0x003f03f0).AsByte();
+            Vector128<ushort> tt3 = Vector128.Create(0x01000010).AsUInt16();
+            Vector128<byte> tt5 = Vector128.Create((byte)51);
+            Vector128<sbyte> tt7 = Vector128.Create((sbyte)26);
+            Vector128<byte> tt8 = Vector128.Create((byte)13);
+
+            // static readonly Vector128 field + assigning its value to a local variable is a C# pattern for `const __mX`
+            Vector128<byte> localShiftLut = s_base64ShiftLut;
+            Vector128<byte> localShuffleMask = s_base64ShuffleMask;
+            Vector128<byte> localTwoBytesStringMaskLo = s_base64TwoBytesStringMaskLo;
+
+            // Every iteration loads a whole 16-byte vector although it consumes at most 12 bytes, so only
+            // start an iteration while 16 readable bytes remain; otherwise the load runs past the input.
+            int lastVectorStart = offset + length - Vector128<byte>.Count;
+            for (; i <= lastVectorStart; i += stride)
+            {
+                // input = [xxxx|DDDC|CCBB|BAAA]
+                Vector128<byte> inputVector = Sse2.LoadVector128(inData + i);
+
+                // bytes from groups A, B and C are needed in separate 32-bit lanes
+                // in = [DDDD|CCCC|BBBB|AAAA]
+                //
+                //      an input triplet has layout
+                //      [????????|ccdddddd|bbbbcccc|aaaaaabb]
+                //        byte 3   byte 2   byte 1   byte 0    -- byte 3 comes from the next triplet
+                //
+                //      shuffling changes the order of bytes: 1, 0, 2, 1
+                //      [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
+                //           ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
+                //                  processed bits
+                inputVector = Ssse3.Shuffle(inputVector, localShuffleMask);
+
+                // unpacking
+
+                // t0 = [0000cccc|cc000000|aaaaaa00|00000000]
+                Vector128<byte> t0 = Sse2.And(inputVector, tt0);
+                // t1 = [00000000|00cccccc|00000000|00aaaaaa]
+                Vector128<byte> t1 = Sse2.MultiplyHigh(t0.AsUInt16(), tt1).AsByte();
+                // t2 = [00000000|00dddddd|000000bb|bbbb0000]
+                Vector128<byte> t2 = Sse2.And(inputVector, tt2);
+                // t3 = [00dddddd|00000000|00bbbbbb|00000000]
+                Vector128<byte> t3 = Sse2.MultiplyLow(t2.AsUInt16(), tt3).AsByte();
+                // indices = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
+                Vector128<byte> indices = Sse2.Or(t1, t3);
+
+                // lookup function "Single pshufb method" (lookup_pshufb_improved)
+                Vector128<byte> result = Sse2.SubtractSaturate(indices, tt5);
+                Vector128<sbyte> compareResult = Sse2.CompareGreaterThan(tt7, indices.AsSByte());
+                result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), tt8));
+                result = Ssse3.Shuffle(localShiftLut, result);
+                result = Sse2.Add(result, indices);
+                // end of lookup function
+
+                // save as two-bytes string, e.g.:
+                // 1,2,3,4,5..16 => 1,0,2,0,3,0..16,0
+                Sse2.Store(outputBytes + j, Ssse3.Shuffle(result, localTwoBytesStringMaskLo));
+                j += Vector128<byte>.Count;
+
+                // Do it for the second part of the vector (rotate it first in order to re-use localTwoBytesStringMaskLo)
+                result = Sse2.Shuffle(result.AsUInt32(), 0x4E /*_MM_SHUFFLE(1,0,3,2)*/).AsByte();
+                result = Ssse3.Shuffle(result, localTwoBytesStringMaskLo);
+
+                if (insertLineBreaks && (charcount += 16) >= base64LineBreakPosition)
+                {
+                    // Normally we save 32 bytes per iteration
+                    // but `insertLineBreaks` needs `\r\n` (4 bytes) between each 76*2=152 bytes. 152/32 = 4.75 (means not a multiple of 32)
+                    // so `\r\n` has to land in the middle of a Vector128:
+                    // only the first 8 bytes (4 chars) of the second half are stored, then `\r\n` is appended manually
+                    // and the last 4 chars are dropped; 'i' is moved back by 3 bytes so the next iteration encodes them again
+                    // NOTE(review): storing exactly 12 of the 16 chars presumes base64LineBreakPosition == 76 - confirm
+                    charcount = 0;
+                    Vector128<ulong> shuffleResult = result.AsUInt64();
+                    Sse2.StoreScalar((ulong*)(outputBytes + j), shuffleResult);
+                    j += Vector128<byte>.Count / 2;
+                    outputBytes[j++] = (byte)'\r';
+                    outputBytes[j++] = 0;
+                    outputBytes[j++] = (byte)'\n';
+                    outputBytes[j++] = 0;
+                    i -= stride / 4;
+                }
+                else
+                {
+                    Sse2.Store(outputBytes + j, result);
+                    j += Vector128<byte>.Count;
+                }
+            }
+            // The SIMD loop uses `j` to count bytes, the software fallback uses it to count chars
+            j /= 2;
+
+            return (i, j, charcount);
+        }
+
         private static unsafe int ConvertToBase64Array(char* outChars, byte* inData, int offset, int length, bool insertLineBreaks)
         {
+            int charcount = 0;
+            int i = offset;
+            int j = 0;
+
+            // `length` counts the bytes to encode starting at `offset` (see calcLength below), so the size check uses it directly.
+            if (Ssse3.IsSupported && length >= 36)
+            {
+                // Tuple is faster than passing i,j,charcount by ref.
+                // The SSSE3 implementation is moved to a separate method in order to avoid regression for smaller inputs.
+                // It always stops at least 4 bytes before the end of the input, so the loop below encodes the rest.
+                (i, j, charcount) = ConvertToBase64ArraySsse3(outChars, inData, length, offset, insertLineBreaks);
+            }
+
             int lengthmod3 = length % 3;
             int calcLength = offset + (length - lengthmod3);
-            int j = 0;
-            int charcount = 0;
             // Convert three bytes at a time to base64 notation. This will consume 4 chars.
-            int i;
 
             // get a pointer to the base64Table to avoid unnecessary range checking
             fixed (char* base64 = &base64Table[0])
             {
-                for (i = offset; i < calcLength; i += 3)
+                for (; i < calcLength; i += 3)
                 {
                     if (insertLineBreaks)
                     {