From 849f9951cf32323f24e8c31a36867eaf9884cf11 Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Tue, 22 Oct 2019 00:59:19 -0700 Subject: [PATCH 1/9] Use Sse2 intrinsics to make NeedsEscaping check faster for large JSON strings (#41845) * Use Sse2 intrinsics to make NeedsEscaping check faster for large strings. * Update the utf-8 bytes needsescaping and add tests. * Remove unnecessary bitwise OR and add more tests * Add more tests around surrogates, invalid strings, and characters > short.MaxValue. --- .../src/System.Text.Json.csproj | 1 + .../Json/Writer/JsonWriterHelper.Escaping.cs | 209 ++++++++-- .../tests/Utf8JsonWriterTests.cs | 390 ++++++++++++++++++ 3 files changed, 571 insertions(+), 29 deletions(-) diff --git a/src/System.Text.Json/src/System.Text.Json.csproj b/src/System.Text.Json/src/System.Text.Json.csproj index 23b7672de0bc..83f021b1c995 100644 --- a/src/System.Text.Json/src/System.Text.Json.csproj +++ b/src/System.Text.Json/src/System.Text.Json.csproj @@ -195,6 +195,7 @@ + diff --git a/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs b/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs index a41035bb51c6..5c6c27fb6346 100644 --- a/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs +++ b/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs @@ -5,10 +5,16 @@ using System.Buffers; using System.Buffers.Text; using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text.Encodings.Web; +#if BUILDING_INBOX_LIBRARY +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + namespace System.Text.Json { // TODO: Replace the escaping logic with publicly shipping APIs from https://github.com/dotnet/corefx/issues/33509 @@ -55,57 +61,202 @@ internal static partial class JsonWriterHelper [MethodImpl(MethodImplOptions.AggressiveInlining)] private static 
bool NeedsEscaping(char value) => value > LastAsciiCharacter || AllowList[value] == 0; - public static int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) +#if BUILDING_INBOX_LIBRARY + private static readonly Vector128 s_mask_UInt16_0x20 = Vector128.Create((short)0x20); // Space ' ' + + private static readonly Vector128 s_mask_UInt16_0x22 = Vector128.Create((short)0x22); // Quotation Mark '"' + private static readonly Vector128 s_mask_UInt16_0x26 = Vector128.Create((short)0x26); // Ampersand '&' + private static readonly Vector128 s_mask_UInt16_0x27 = Vector128.Create((short)0x27); // Apostrophe ''' + private static readonly Vector128 s_mask_UInt16_0x2B = Vector128.Create((short)0x2B); // Plus sign '+' + private static readonly Vector128 s_mask_UInt16_0x3C = Vector128.Create((short)0x3C); // Less Than Sign '<' + private static readonly Vector128 s_mask_UInt16_0x3E = Vector128.Create((short)0x3E); // Greater Than Sign '>' + private static readonly Vector128 s_mask_UInt16_0x5C = Vector128.Create((short)0x5C); // Reverse Solidus '\' + private static readonly Vector128 s_mask_UInt16_0x60 = Vector128.Create((short)0x60); // Grave Accent '`' + + private static readonly Vector128 s_mask_UInt16_0x7E = Vector128.Create((short)0x7E); // Tilde '~' + + private static readonly Vector128 s_mask_SByte_0x20 = Vector128.Create((sbyte)0x20); // Space ' ' + + private static readonly Vector128 s_mask_SByte_0x22 = Vector128.Create((sbyte)0x22); // Quotation Mark '"' + private static readonly Vector128 s_mask_SByte_0x26 = Vector128.Create((sbyte)0x26); // Ampersand '&' + private static readonly Vector128 s_mask_SByte_0x27 = Vector128.Create((sbyte)0x27); // Apostrophe ''' + private static readonly Vector128 s_mask_SByte_0x2B = Vector128.Create((sbyte)0x2B); // Plus sign '+' + private static readonly Vector128 s_mask_SByte_0x3C = Vector128.Create((sbyte)0x3C); // Less Than Sign '<' + private static readonly Vector128 s_mask_SByte_0x3E = Vector128.Create((sbyte)0x3E); // 
Greater Than Sign '>' + private static readonly Vector128 s_mask_SByte_0x5C = Vector128.Create((sbyte)0x5C); // Reverse Solidus '\' + private static readonly Vector128 s_mask_SByte_0x60 = Vector128.Create((sbyte)0x60); // Grave Accent '`' + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 CreateEscapingMask(Vector128 sourceValue) { - int idx; + Debug.Assert(Sse2.IsSupported); - if (encoder != null) - { - idx = encoder.FindFirstCharacterToEncodeUtf8(value); - goto Return; - } + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_UInt16_0x20); // Space ' ', anything in the control characters range + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x22)); // Quotation Mark '"' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x26)); // Ampersand '&' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x27)); // Apostrophe ''' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x2B)); // Plus sign '+' + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3C)); // Less Than Sign '<' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3E)); // Greater Than Sign '>' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x5C)); // Reverse Solidus '\' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x60)); // Grave Accent '`' + + mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_mask_UInt16_0x7E)); // Tilde '~', anything above the ASCII range + + return mask; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 CreateEscapingMask(Vector128 sourceValue) + { + Debug.Assert(Sse2.IsSupported); - for (idx = 0; idx < value.Length; idx++) + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_SByte_0x20); // Control characters, and anything above 0x7F since sbyte.MaxValue is 0x7F + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x22)); // 
Quotation Mark " + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x26)); // Ampersand & + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x27)); // Apostrophe ' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x2B)); // Plus sign + + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3C)); // Less Than Sign < + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3E)); // Greater Than Sign > + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x5C)); // Reverse Solidus \ + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x60)); // Grave Accent ` + + return mask; + } +#endif + + public static unsafe int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) + { + fixed (byte* ptr = value) { - if (NeedsEscaping(value[idx])) + int idx = 0; + + if (encoder != null) { + idx = encoder.FindFirstCharacterToEncodeUtf8(value); goto Return; } - } - idx = -1; // all characters allowed +#if BUILDING_INBOX_LIBRARY + if (Sse2.IsSupported) + { + sbyte* startingAddress = (sbyte*)ptr; + while (value.Length - 16 >= idx) + { + Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + value.Length - 16)); + + // Load the next 16 bytes. + Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + // Check if any of the 16 bytes need to be escaped. + Vector128 mask = CreateEscapingMask(sourceValue); + + int index = Sse2.MoveMask(mask.AsByte()); + // If index == 0, that means none of the 16 bytes needed to be escaped. + // TrailingZeroCount is relatively expensive, avoid it if possible. + if (index != 0) + { + // Found at least one byte that needs to be escaped, figure out the index of + // the first one found that needed to be escaped within the 16 bytes. 
+ Debug.Assert(index > 0 && index <= 65_535); + int tzc = BitOperations.TrailingZeroCount(index); + Debug.Assert(tzc >= 0 && tzc <= 16); + idx += tzc; + goto Return; + } + idx += 16; + startingAddress += 16; + } + + // Process the remaining bytes. + Debug.Assert(value.Length - idx < 16); + } +#endif + + for (; idx < value.Length; idx++) + { + Debug.Assert((ptr + idx) <= (ptr + value.Length)); + if (NeedsEscaping(*(ptr + idx))) + { + goto Return; + } + } - Return: - return idx; + idx = -1; // all characters allowed + + Return: + return idx; + } } public static unsafe int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) { - int idx; - - // Some implementations of JavascriptEncoder.FindFirstCharacterToEncode may not accept - // null pointers and gaurd against that. Hence, check up-front and fall down to return -1. - if (encoder != null && !value.IsEmpty) + fixed (char* ptr = value) { - fixed (char* ptr = value) + int idx = 0; + + // Some implementations of JavascriptEncoder.FindFirstCharacterToEncode may not accept + // null pointers and guard against that. Hence, check up-front and fall down to return -1. + if (encoder != null && !value.IsEmpty) { idx = encoder.FindFirstCharacterToEncode(ptr, value.Length); + goto Return; } - goto Return; - } - for (idx = 0; idx < value.Length; idx++) - { - if (NeedsEscaping(value[idx])) +#if BUILDING_INBOX_LIBRARY + if (Sse2.IsSupported) { - goto Return; + short* startingAddress = (short*)ptr; + while (value.Length - 8 >= idx) + { + Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + value.Length - 8)); + + // Load the next 8 characters. + Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + // Check if any of the 8 characters need to be escaped. + Vector128 mask = CreateEscapingMask(sourceValue); + + int index = Sse2.MoveMask(mask.AsByte()); + // If index == 0, that means none of the 8 characters needed to be escaped. + // TrailingZeroCount is relatively expensive, avoid it if possible. 
+ if (index != 0) + { + // Found at least one character that needs to be escaped, figure out the index of + // the first one found that needed to be escaped within the 8 characters. + Debug.Assert(index > 0 && index <= 65_535); + int tzc = BitOperations.TrailingZeroCount(index); + Debug.Assert(tzc % 2 == 0 && tzc >= 0 && tzc <= 16); + idx += tzc >> 1; + goto Return; + } + idx += 8; + startingAddress += 8; + } + + // Process the remaining characters. + Debug.Assert(value.Length - idx < 8); + } +#endif + + for (; idx < value.Length; idx++) + { + Debug.Assert((ptr + idx) <= (ptr + value.Length)); + if (NeedsEscaping(*(ptr + idx))) + { + goto Return; + } } - } - idx = -1; // all characters allowed + idx = -1; // All characters are allowed. - Return: - return idx; + Return: + return idx; + } } public static int GetMaxEscapedLength(int textLength, int firstIndexToEscape) diff --git a/src/System.Text.Json/tests/Utf8JsonWriterTests.cs b/src/System.Text.Json/tests/Utf8JsonWriterTests.cs index 1c56b3fe5b0d..2faa7690375d 100644 --- a/src/System.Text.Json/tests/Utf8JsonWriterTests.cs +++ b/src/System.Text.Json/tests/Utf8JsonWriterTests.cs @@ -178,6 +178,396 @@ public static void WritingStringsWithCustomEscaping() JsonTestHelper.AssertContents("\"\u2020\\\"\"", output); } + [Theory] + [MemberData(nameof(EscapingTestData))] + public void EscapingTestWhileWriting(char replacementChar, JavaScriptEncoder encoder, bool requiresEscaping) + { + var writerOptions = new JsonWriterOptions { Encoder = encoder }; + + { + ReadOnlyMemory written = WriteStringHelper(writerOptions, null); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, null); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteStringHelper(writerOptions, string.Empty); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, Array.Empty()); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + } + + 
var random = new Random(42); + for (int dataLength = 0; dataLength < 50; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(97, 123); + } + string baseStr = new string(str); + byte[] sourceUtf8 = Encoding.UTF8.GetBytes(baseStr); + + ReadOnlyMemory written = WriteStringHelper(writerOptions, baseStr); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + for (int i = 0; i < dataLength; i++) + { + char[] changed = baseStr.ToCharArray(); + changed[i] = replacementChar; + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + written = WriteStringHelper(writerOptions, newStr); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? (i + 1) : -1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? (i + 1) : -1, escapedIndex); // Account for the start quote + } + + if (dataLength != 0) + { + char[] changed = baseStr.ToCharArray(); + changed.AsSpan().Fill(replacementChar); + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + written = WriteStringHelper(writerOptions, newStr); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? 1 : -1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? 
1 : -1, escapedIndex); // Account for the start quote + } + } + } + + public static IEnumerable EscapingTestData + { + get + { + return new List + { + new object[] { 'a', null, false }, // ASCII not escaped + new object[] { '\u001F', null, true }, // control character within single byte range + new object[] { '\u2000', null, true }, // space character outside single byte range + new object[] { '\u00A2', null, true }, // non-ASCII but < 255 + new object[] { '\uA686', null, true }, // non-ASCII above short.MaxValue + new object[] { '\u6C49', null, true }, // non-ASCII from chinese alphabet - multibyte + new object[] { '"', null, true }, // ASCII but must always be escaped in JSON + new object[] { '\\', null, true }, // ASCII but must always be escaped in JSON + new object[] { '<', null, true }, // ASCII but escaped by default + new object[] { '>', null, true }, // ASCII but escaped by default + new object[] { '&', null, true }, // ASCII but escaped by default + new object[] { '`', null, true }, // ASCII but escaped by default + new object[] { '\'', null, true }, // ASCII but escaped by default + new object[] { '+', null, true }, // ASCII but escaped by default + + new object[] { 'a', JavaScriptEncoder.Default, false }, + new object[] { '\u001F', JavaScriptEncoder.Default, true }, + new object[] { '\u2000', JavaScriptEncoder.Default, true }, + new object[] { '\u00A2', JavaScriptEncoder.Default, true }, + new object[] { '\uA686', JavaScriptEncoder.Default, true }, + new object[] { '\u6C49', JavaScriptEncoder.Default, true }, + new object[] { '"', JavaScriptEncoder.Default, true }, + new object[] { '\\', JavaScriptEncoder.Default, true }, + new object[] { '<', JavaScriptEncoder.Default, true }, + new object[] { '>', JavaScriptEncoder.Default, true }, + new object[] { '&', JavaScriptEncoder.Default, true }, + new object[] { '`', JavaScriptEncoder.Default, true }, + new object[] { '\'', JavaScriptEncoder.Default, true }, + new object[] { '+', JavaScriptEncoder.Default, 
true }, + + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '&', 
JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + + new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u6C49', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + }; + } + } + + [Theory] + [MemberData(nameof(EscapingTestData_NonAscii))] + public unsafe void WriteString_NonAscii(char replacementChar, JavaScriptEncoder encoder, bool requiresEscaping) + { + var writerOptions = new JsonWriterOptions { Encoder = encoder }; + var random = new Random(42); + for (int dataLength = 1; dataLength < 50; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(0x2E9B, 0x2EF4); // CJK Radicals Supplement characters + } + string baseStr = new string(str); + byte[] sourceUtf8 = Encoding.UTF8.GetBytes(baseStr); + + ReadOnlyMemory 
written = WriteStringHelper(writerOptions, baseStr); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + for (int i = 0; i < dataLength; i++) + { + string source = baseStr.Insert(i, new string(replacementChar, 1)); + sourceUtf8 = Encoding.UTF8.GetBytes(source); + + written = WriteStringHelper(writerOptions, source); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + // Each CJK character expands to 3 utf-8 bytes. + Assert.Equal(requiresEscaping ? ((i * 3) + 1) : -1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + // Each CJK character expands to 3 utf-8 bytes. + Assert.Equal(requiresEscaping ? ((i * 3) + 1) : -1, escapedIndex); // Account for the start quote + } + } + } + + public static IEnumerable EscapingTestData_NonAscii + { + get + { + return new List + { + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\'', 
JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + + new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u6C49', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + }; + } + } + + [Theory] + [MemberData(nameof(JavaScriptEncoders))] + public void EscapingTestWhileWritingSurrogate(JavaScriptEncoder encoder) + { + char highSurrogate = '\uD801'; + char lowSurrogate = '\uDC37'; + var writerOptions = new JsonWriterOptions { Encoder = encoder }; + var random = new Random(42); + for (int dataLength = 2; dataLength < 50; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(97, 123); + } + string baseStr = new string(str); + byte[] sourceUtf8 = Encoding.UTF8.GetBytes(baseStr); + + ReadOnlyMemory written = WriteStringHelper(writerOptions, baseStr); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + 
Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + for (int i = 0; i < dataLength - 1; i++) + { + char[] changed = baseStr.ToCharArray(); + changed[i] = highSurrogate; + changed[i + 1] = lowSurrogate; + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + written = WriteStringHelper(writerOptions, newStr); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(i + 1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(i + 1, escapedIndex); // Account for the start quote + } + + { + char[] changed = baseStr.ToCharArray(); + + for (int i = 0; i < changed.Length - 1; i += 2) + { + changed[i] = highSurrogate; + changed[i + 1] = lowSurrogate; + } + + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + written = WriteStringHelper(writerOptions, newStr); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(1, escapedIndex); // Account for the start quote + } + } + } + + public static IEnumerable JavaScriptEncoders + { + get + { + return new List + { + new object[] { null }, + new object[] { JavaScriptEncoder.Default }, + new object[] { JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + new object[] { JavaScriptEncoder.Create(UnicodeRanges.All) }, + new object[] { JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + }; + } + } + + [Theory] + [MemberData(nameof(InvalidEscapingTestData))] + public unsafe void WriteStringInvalidCharacter(char replacementChar, JavaScriptEncoder encoder, bool requiresEscaping) + { + var writerOptions = new JsonWriterOptions { Encoder = encoder }; + var random = new Random(42); + for (int dataLength = 0; dataLength < 47; dataLength++) + { + 
char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(97, 123); + } + string baseStr = new string(str); + byte[] baseStrUtf8 = Encoding.UTF8.GetBytes(baseStr); + + for (int i = 0; i < dataLength; i++) + { + char[] changed = baseStr.ToCharArray(); + changed[i] = replacementChar; + string source = new string(changed); + byte[] sourceUtf8 = new byte[baseStrUtf8.Length]; + baseStrUtf8.AsSpan().CopyTo(sourceUtf8); + sourceUtf8[i] = 0xC3; // Invalid, first byte of a 2-byte utf-8 character + + ReadOnlyMemory written = WriteStringHelper(writerOptions, source); + // Some encoders don't escape replacement character + Assert.Equal(requiresEscaping ? i + 1 : -1, written.Span.IndexOf((byte)'\\')); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + // Some encoders don't escape replacement character + Assert.Equal(requiresEscaping ? i + 1 : -1, written.Span.IndexOf((byte)'\\')); // Account for the start quote + } + } + } + + public static IEnumerable InvalidEscapingTestData + { + get + { + return new List + { + new object[] { '\uD801', JavaScriptEncoder.Default, true }, // Invalid, high surrogate alone + new object[] { '\uDC01', JavaScriptEncoder.Default, true }, // Invalid, low surrogate alone + + new object[] { '\uD801', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uDC01', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + + new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + + new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + }; + } + } + + private static ReadOnlyMemory WriteStringHelper(JsonWriterOptions writerOptions, string str) + { + var output = new ArrayBufferWriter(); + using (var writer = new 
Utf8JsonWriter(output, writerOptions)) + { + writer.WriteStringValue(str); + } + return output.WrittenMemory; + } + + private static ReadOnlyMemory WriteUtf8StringHelper(JsonWriterOptions writerOptions, byte[] utf8str) + { + var output = new ArrayBufferWriter(); + using (var writer = new Utf8JsonWriter(output, writerOptions)) + { + writer.WriteStringValue(utf8str); + } + return output.WrittenMemory; + } + [Fact] public void WriteJsonWritesToIBWOnDemand_Dispose() { From 58ec075112d5175e723be610a8e4b25449970528 Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Tue, 22 Oct 2019 17:34:24 -0700 Subject: [PATCH 2/9] Optimize FindFirstCharToEncode for JavaScriptEncoder.Default and Relaxed using Sse2 intrinsics. (#41933) * Optimize FindFirstCharToEncode for JavaScriptEncoder.Default and Relaxed using Sse2 intrinsics. * Create an Sse2Helper and improve perf of TextEncoder and AllowedCharactersBitmap * Loop unroll FindFirstCharacterToEncode * Improve code coverage. * Add more tests for surrogate pairs and fix call to WillEncode. * Address PR feedback - remove some code duplication. * Move DefaultJavaScriptEncoder to separate file and override EncodeUtf8 with better caching. * Add default replacement character as a test. * Address nits. 
--- .../src/Configurations.props | 1 + .../src/System.Text.Encodings.Web.csproj | 11 +- .../Encodings/Web/DefaultJavaScriptEncoder.cs | 271 ++++++++++++++ .../Web/DefaultJavaScriptEncoderBasicLatin.cs | 289 ++++++++++++++ .../Text/Encodings/Web/JavaScriptEncoder.cs | 178 +-------- .../Encodings/Web/JavaScriptEncoderHelper.cs | 69 ++++ .../System/Text/Encodings/Web/Sse2Helper.cs | 127 +++++++ .../System/Text/Encodings/Web/TextEncoder.cs | 153 ++++++-- .../Web/UnsafeRelaxedJavaScriptEncoder.cs | 272 ++++++++++---- .../Text/Internal/AllowedCharactersBitmap.cs | 52 ++- .../tests/JavaScriptStringEncoderTests.cs | 352 +++++++++++++++++- 11 files changed, 1500 insertions(+), 275 deletions(-) create mode 100644 src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs create mode 100644 src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoderBasicLatin.cs create mode 100644 src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/JavaScriptEncoderHelper.cs create mode 100644 src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/Sse2Helper.cs diff --git a/src/System.Text.Encodings.Web/src/Configurations.props b/src/System.Text.Encodings.Web/src/Configurations.props index 6360871dacff..4f327a843e1b 100644 --- a/src/System.Text.Encodings.Web/src/Configurations.props +++ b/src/System.Text.Encodings.Web/src/Configurations.props @@ -1,6 +1,7 @@  + netcoreapp; netstandard2.1; netstandard; uap-Windows_NT; diff --git a/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj b/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj index 98050657653d..af978c992baf 100644 --- a/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj +++ b/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj @@ -3,12 +3,15 @@ {B7EDBF00-765A-48E8-B593-CD668288E274} System.Text.Encodings.Web true - 
netstandard-Debug;netstandard-Release;netstandard2.1-Debug;netstandard2.1-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release + netcoreapp-Debug;netcoreapp-Release;netstandard-Debug;netstandard-Release;netstandard2.1-Debug;netstandard2.1-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release + + + @@ -20,6 +23,9 @@ + + + System\Text\UnicodeDebug.cs @@ -37,4 +43,7 @@ + + + \ No newline at end of file diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs new file mode 100644 index 000000000000..ad6ffb9bc829 --- /dev/null +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs @@ -0,0 +1,271 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Buffers; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Text.Internal; +using System.Text.Unicode; + +#if NETCOREAPP +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + +namespace System.Text.Encodings.Web +{ + internal sealed class DefaultJavaScriptEncoder : JavaScriptEncoder + { + private readonly AllowedCharactersBitmap _allowedCharacters; + + private readonly int[] _asciiNeedsEscaping = new int[0x80]; + + public DefaultJavaScriptEncoder(TextEncoderSettings filter) + { + if (filter == null) + { + throw new ArgumentNullException(nameof(filter)); + } + + _allowedCharacters = filter.GetAllowedCharacters(); + + // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed + // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp) + _allowedCharacters.ForbidUndefinedCharacters(); + + // Forbid characters that are special in HTML. 
+ // Even though this is not an HTML encoder, + // it's unfortunately common for developers to + // forget to HTML-encode a string once it has been JS-encoded, + // so this offers extra protection. + DefaultHtmlEncoder.ForbidHtmlCharacters(_allowedCharacters); + + // '\' (U+005C REVERSE SOLIDUS) must always be escaped in Javascript / ECMAScript / JSON. + // '/' (U+002F SOLIDUS) is not Javascript / ECMAScript / JSON-sensitive so doesn't need to be escaped. + _allowedCharacters.ForbidCharacter('\\'); + + // '`' (U+0060 GRAVE ACCENT) is ECMAScript-sensitive (see ECMA-262). + _allowedCharacters.ForbidCharacter('`'); + + for (int i = 0; i < _asciiNeedsEscaping.Length; i++) + { + _asciiNeedsEscaping[i] = WillEncode(i) ? 1 : -1; + } + } + + public DefaultJavaScriptEncoder(params UnicodeRange[] allowedRanges) : this(new TextEncoderSettings(allowedRanges)) + { } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override bool WillEncode(int unicodeScalar) + { + if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)) + { + return true; + } + + Debug.Assert(unicodeScalar >= char.MinValue && unicodeScalar <= char.MaxValue); + + return !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) + { + if (text == null) + { + throw new ArgumentNullException(nameof(text)); + } + + return _allowedCharacters.FindFirstCharacterToEncode(text, textLength); + } + + public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf8Text) + { + fixed (byte* ptr = utf8Text) + { + int idx = 0; + +#if NETCOREAPP + if (Sse2.IsSupported) + { + sbyte* startingAddress = (sbyte*)ptr; + while (utf8Text.Length - 16 >= idx) + { + Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16)); + + // Load the next 16 bytes. 
+ Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + Vector128 mask = Sse2Helper.CreateAsciiMask(sourceValue); + int index = Sse2.MoveMask(mask); + + if (index != 0) + { + // At least one of the following 16 bytes is non-ASCII. + + int processNextSixteen = idx + 16; + Debug.Assert(processNextSixteen <= utf8Text.Length); + + while (idx < processNextSixteen) + { + Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length)); + + if (UnicodeUtility.IsAsciiCodePoint(ptr[idx])) + { + if (DoesAsciiNeedEncoding(ptr[idx]) == 1) + { + goto Return; + } + idx++; + } + else + { + OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar); + + Debug.Assert(nextScalarValue <= int.MaxValue); + if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue)) + { + goto Return; + } + + Debug.Assert(opStatus == OperationStatus.Done); + idx += utf8BytesConsumedForScalar; + } + } + } + else + { + if (DoesAsciiNeedEncoding(ptr[idx]) == 1 + + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1) + { + goto Return; + } + idx++; + } + startingAddress = (sbyte*)ptr + idx; + } + + // Process the remaining bytes. 
+ Debug.Assert(utf8Text.Length - idx < 16); + } +#endif + + while (idx < utf8Text.Length) + { + Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length)); + + if (UnicodeUtility.IsAsciiCodePoint(ptr[idx])) + { + if (DoesAsciiNeedEncoding(ptr[idx]) == 1) + { + goto Return; + } + idx++; + } + else + { + OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar); + + Debug.Assert(nextScalarValue <= int.MaxValue); + if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue)) + { + goto Return; + } + + Debug.Assert(opStatus == OperationStatus.Done); + idx += utf8BytesConsumedForScalar; + } + } + + idx = -1; // All bytes are allowed. + + Return: + return idx; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private int DoesAsciiNeedEncoding(byte value) + { + Debug.Assert(value <= 0x7F); + + int needsEscaping = _asciiNeedsEscaping[value]; + + Debug.Assert(needsEscaping == 1 || needsEscaping == -1); + + return needsEscaping; + } + + // The worst case encoding is 6 output chars per input char: [input] U+FFFF -> [output] "\uFFFF" + // We don't need to worry about astral code points since they're represented as encoded + // surrogate pairs in the output. + public override int MaxOutputCharactersPerInputCharacter => 12; // "\uFFFF\uFFFF" is the longest encoded form + + private static readonly char[] s_b = new char[] { '\\', 'b' }; + private static readonly char[] s_t = new char[] { '\\', 't' }; + private static readonly char[] s_n = new char[] { '\\', 'n' }; + private static readonly char[] s_f = new char[] { '\\', 'f' }; + private static readonly char[] s_r = new char[] { '\\', 'r' }; + private static readonly char[] s_back = new char[] { '\\', '\\' }; + + // Writes a scalar value as a JavaScript-escaped character (or sequence of characters). + // See ECMA-262, Sec. 7.8.4, and ECMA-404, Sec. 
9 + // http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.4 + // http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf + public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) + { + if (buffer == null) + { + throw new ArgumentNullException(nameof(buffer)); + } + // ECMA-262 allows encoding U+000B as "\v", but ECMA-404 does not. + // Both ECMA-262 and ECMA-404 allow encoding U+002F SOLIDUS as "\/" + // (in ECMA-262 this character is a NonEscape character); however, we + // don't encode SOLIDUS by default unless the caller has provided an + // explicit bitmap which does not contain it. In this case we'll assume + // that the caller didn't want a SOLIDUS written to the output at all, + // so it should be written using "\u002F" encoding. + // HTML-specific characters (including apostrophe and quotes) will + // be written out as numeric entities for defense-in-depth. + // See UnicodeEncoderBase ctor comments for more info. 
+ + if (!WillEncode(unicodeScalar)) + { + return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); + } + + char[] toCopy; + switch (unicodeScalar) + { + case '\b': toCopy = s_b; break; + case '\t': toCopy = s_t; break; + case '\n': toCopy = s_n; break; + case '\f': toCopy = s_f; break; + case '\r': toCopy = s_r; break; + case '\\': toCopy = s_back; break; + default: return JavaScriptEncoderHelper.TryWriteEncodedScalarAsNumericEntity(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); + } + return TryCopyCharacters(toCopy, buffer, bufferLength, out numberOfCharactersWritten); + } + } +} diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoderBasicLatin.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoderBasicLatin.cs new file mode 100644 index 000000000000..db25c3d6d63f --- /dev/null +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoderBasicLatin.cs @@ -0,0 +1,289 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Text.Internal; +using System.Text.Unicode; + +#if NETCOREAPP +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + +namespace System.Text.Encodings.Web +{ + internal sealed class DefaultJavaScriptEncoderBasicLatin : JavaScriptEncoder + { + internal static readonly DefaultJavaScriptEncoderBasicLatin s_singleton = new DefaultJavaScriptEncoderBasicLatin(); + + private DefaultJavaScriptEncoderBasicLatin() + { + var filter = new TextEncoderSettings(UnicodeRanges.BasicLatin); + + AllowedCharactersBitmap allowedCharacters = filter.GetAllowedCharacters(); + + // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed + // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp) + allowedCharacters.ForbidUndefinedCharacters(); + + // Forbid characters that are special in HTML. + // Even though this is not an HTML encoder, + // it's unfortunately common for developers to + // forget to HTML-encode a string once it has been JS-encoded, + // so this offers extra protection. + DefaultHtmlEncoder.ForbidHtmlCharacters(allowedCharacters); + + // '\' (U+005C REVERSE SOLIDUS) must always be escaped in Javascript / ECMAScript / JSON. + // '/' (U+002F SOLIDUS) is not Javascript / ECMAScript / JSON-sensitive so doesn't need to be escaped. + allowedCharacters.ForbidCharacter('\\'); + + // '`' (U+0060 GRAVE ACCENT) is ECMAScript-sensitive (see ECMA-262). + allowedCharacters.ForbidCharacter('`'); + +#if DEBUG + // Verify and ensure that the AllowList bit map matches the set of allowed characters using AllowedCharactersBitmap + for (int i = 0; i < AllowList.Length; i++) + { + char ch = (char)i; + Debug.Assert((allowedCharacters.IsCharacterAllowed(ch) ? 
1 : 0) == AllowList[ch]); + Debug.Assert(allowedCharacters.IsCharacterAllowed(ch) == !NeedsEscaping(ch)); + } + for (int i = AllowList.Length; i <= char.MaxValue; i++) + { + char ch = (char)i; + Debug.Assert(!allowedCharacters.IsCharacterAllowed(ch)); + Debug.Assert(NeedsEscaping(ch)); + } +#endif + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override bool WillEncode(int unicodeScalar) + { + if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)) + { + return true; + } + + Debug.Assert(unicodeScalar >= char.MinValue && unicodeScalar <= char.MaxValue); + + return NeedsEscaping((char)unicodeScalar); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) + { + if (text == null) + { + throw new ArgumentNullException(nameof(text)); + } + + int idx = 0; + +#if NETCOREAPP + if (Sse2.IsSupported) + { + short* startingAddress = (short*)text; + while (textLength - 8 >= idx) + { + Debug.Assert(startingAddress >= text && startingAddress <= (text + textLength - 8)); + + // Load the next 8 characters. + Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + // Check if any of the 8 characters need to be escaped. + Vector128 mask = Sse2Helper.CreateEscapingMask_DefaultJavaScriptEncoderBasicLatin(sourceValue); + + int index = Sse2.MoveMask(mask.AsByte()); + // If index == 0, that means none of the 8 characters needed to be escaped. + // TrailingZeroCount is relatively expensive, avoid it if possible. + if (index != 0) + { + // Found at least one character that needs to be escaped, figure out the index of + // the first one found that needed to be escaped within the 8 characters. + Debug.Assert(index > 0 && index <= 65_535); + int tzc = BitOperations.TrailingZeroCount(index); + Debug.Assert(tzc % 2 == 0 && tzc >= 0 && tzc <= 16); + idx += tzc >> 1; + goto Return; + } + idx += 8; + startingAddress += 8; + } + + // Process the remaining characters. 
+ Debug.Assert(textLength - idx < 8); + } +#endif + + for (; idx < textLength; idx++) + { + Debug.Assert((text + idx) <= (text + textLength)); + if (NeedsEscaping(*(text + idx))) + { + goto Return; + } + } + + idx = -1; // All characters are allowed. + + Return: + return idx; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf8Text) + { + fixed (byte* ptr = utf8Text) + { + int idx = 0; + +#if NETCOREAPP + if (Sse2.IsSupported) + { + sbyte* startingAddress = (sbyte*)ptr; + while (utf8Text.Length - 16 >= idx) + { + Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16)); + + // Load the next 16 bytes. + Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + // Check if any of the 16 bytes need to be escaped. + Vector128 mask = Sse2Helper.CreateEscapingMask_DefaultJavaScriptEncoderBasicLatin(sourceValue); + + int index = Sse2.MoveMask(mask); + // If index == 0, that means none of the 16 bytes needed to be escaped. + // TrailingZeroCount is relatively expensive, avoid it if possible. + if (index != 0) + { + // Found at least one byte that needs to be escaped, figure out the index of + // the first one found that needed to be escaped within the 16 bytes. + int tzc = BitOperations.TrailingZeroCount(index); + Debug.Assert(tzc >= 0 && tzc <= 16); + idx += tzc; + goto Return; + } + idx += 16; + startingAddress += 16; + } + + // Process the remaining bytes. + Debug.Assert(utf8Text.Length - idx < 16); + } +#endif + + for (; idx < utf8Text.Length; idx++) + { + Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length)); + if (NeedsEscaping(*(ptr + idx))) + { + goto Return; + } + } + + idx = -1; // All bytes are allowed. 
+ + Return: + return idx; + } + } + + // The worst case encoding is 6 output chars per input char: [input] U+FFFF -> [output] "\uFFFF" + // We don't need to worry about astral code points since they're represented as encoded + // surrogate pairs in the output. + public override int MaxOutputCharactersPerInputCharacter => 12; // "\uFFFF\uFFFF" is the longest encoded form + + private static readonly char[] s_b = new char[] { '\\', 'b' }; + private static readonly char[] s_t = new char[] { '\\', 't' }; + private static readonly char[] s_n = new char[] { '\\', 'n' }; + private static readonly char[] s_f = new char[] { '\\', 'f' }; + private static readonly char[] s_r = new char[] { '\\', 'r' }; + private static readonly char[] s_back = new char[] { '\\', '\\' }; + + // Writes a scalar value as a JavaScript-escaped character (or sequence of characters). + // See ECMA-262, Sec. 7.8.4, and ECMA-404, Sec. 9 + // http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.4 + // http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf + public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) + { + if (buffer == null) + { + throw new ArgumentNullException(nameof(buffer)); + } + // ECMA-262 allows encoding U+000B as "\v", but ECMA-404 does not. + // Both ECMA-262 and ECMA-404 allow encoding U+002F SOLIDUS as "\/" + // (in ECMA-262 this character is a NonEscape character); however, we + // don't encode SOLIDUS by default unless the caller has provided an + // explicit bitmap which does not contain it. In this case we'll assume + // that the caller didn't want a SOLIDUS written to the output at all, + // so it should be written using "\u002F" encoding. + // HTML-specific characters (including apostrophe and quotes) will + // be written out as numeric entities for defense-in-depth. + // See UnicodeEncoderBase ctor comments for more info. 
+ + if (!WillEncode(unicodeScalar)) + { + return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); + } + + char[] toCopy; + switch (unicodeScalar) + { + case '\b': + toCopy = s_b; + break; + case '\t': + toCopy = s_t; + break; + case '\n': + toCopy = s_n; + break; + case '\f': + toCopy = s_f; + break; + case '\r': + toCopy = s_r; + break; + case '\\': + toCopy = s_back; + break; + default: + return JavaScriptEncoderHelper.TryWriteEncodedScalarAsNumericEntity(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); + } + return TryCopyCharacters(toCopy, buffer, bufferLength, out numberOfCharactersWritten); + } + + private static ReadOnlySpan AllowList => new byte[byte.MaxValue + 1] + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // U+0000..U+000F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // U+0010..U+001F + 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, // U+0020..U+002F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, // U+0030..U+003F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // U+0040..U+004F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // U+0050..U+005F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // U+0060..U+006F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // U+0070..U+007F + + // Also include the ranges from U+0080 to U+00FF for performance to avoid UTF8 code from checking boundary. 
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // U+00F0..U+00FF + }; + + public const int LastAsciiCharacter = 0x7F; + + private static bool NeedsEscaping(byte value) => AllowList[value] == 0; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool NeedsEscaping(char value) => value > LastAsciiCharacter || AllowList[value] == 0; + } +} diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/JavaScriptEncoder.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/JavaScriptEncoder.cs index 4c1ecc1cad21..8f20dc04e9db 100644 --- a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/JavaScriptEncoder.cs +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/JavaScriptEncoder.cs @@ -2,10 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.ComponentModel; -using System.Diagnostics; -using System.Runtime.CompilerServices; -using System.Text.Internal; using System.Text.Unicode; namespace System.Text.Encodings.Web @@ -18,10 +14,7 @@ public abstract class JavaScriptEncoder : TextEncoder /// /// Returns a default built-in instance of . /// - public static JavaScriptEncoder Default - { - get { return DefaultJavaScriptEncoder.Singleton; } - } + public static JavaScriptEncoder Default => DefaultJavaScriptEncoderBasicLatin.s_singleton; /// /// Returns a built-in instance of that is less strict about what gets encoded. 
@@ -40,11 +33,8 @@ public static JavaScriptEncoder Default /// /// Unlike the , this encoder instance allows some other characters to go through unescaped (for example, '+'), and hence must be used cautiously. /// - /// - public static JavaScriptEncoder UnsafeRelaxedJsonEscaping - { - get { return UnsafeRelaxedJavaScriptEncoder.s_singleton; } - } + /// + public static JavaScriptEncoder UnsafeRelaxedJsonEscaping => UnsafeRelaxedJavaScriptEncoder.s_singleton; /// /// Creates a new instance of JavaScriptEncoder with provided settings. @@ -67,166 +57,4 @@ public static JavaScriptEncoder Create(params UnicodeRange[] allowedRanges) return new DefaultJavaScriptEncoder(allowedRanges); } } - - internal sealed class DefaultJavaScriptEncoder : JavaScriptEncoder - { - private AllowedCharactersBitmap _allowedCharacters; - - internal static readonly DefaultJavaScriptEncoder Singleton = new DefaultJavaScriptEncoder(new TextEncoderSettings(UnicodeRanges.BasicLatin)); - - public DefaultJavaScriptEncoder(TextEncoderSettings filter) - { - if (filter == null) - { - throw new ArgumentNullException(nameof(filter)); - } - - _allowedCharacters = filter.GetAllowedCharacters(); - - // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed - // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp) - _allowedCharacters.ForbidUndefinedCharacters(); - - // Forbid characters that are special in HTML. - // Even though this is a not HTML encoder, - // it's unfortunately common for developers to - // forget to HTML-encode a string once it has been JS-encoded, - // so this offers extra protection. - DefaultHtmlEncoder.ForbidHtmlCharacters(_allowedCharacters); - - // '\' (U+005C REVERSE SOLIDUS) must always be escaped in Javascript / ECMAScript / JSON. - // '/' (U+002F SOLIDUS) is not Javascript / ECMAScript / JSON-sensitive so doesn't need to be escaped. 
- _allowedCharacters.ForbidCharacter('\\'); - - // '`' (U+0060 GRAVE ACCENT) is ECMAScript-sensitive (see ECMA-262). - _allowedCharacters.ForbidCharacter('`'); - } - - public DefaultJavaScriptEncoder(params UnicodeRange[] allowedRanges) : this(new TextEncoderSettings(allowedRanges)) - { } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public override bool WillEncode(int unicodeScalar) - { - if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)) return true; - return !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public unsafe override int FindFirstCharacterToEncode(char* text, int textLength) - { - if (text == null) - { - throw new ArgumentNullException(nameof(text)); - } - - return _allowedCharacters.FindFirstCharacterToEncode(text, textLength); - } - - // The worst case encoding is 6 output chars per input char: [input] U+FFFF -> [output] "\uFFFF" - // We don't need to worry about astral code points since they're represented as encoded - // surrogate pairs in the output. - public override int MaxOutputCharactersPerInputCharacter - { - get { return 12; } // "\uFFFF\uFFFF" is the longest encoded form - } - - static readonly char[] s_b = new char[] { '\\', 'b' }; - static readonly char[] s_t = new char[] { '\\', 't' }; - static readonly char[] s_n = new char[] { '\\', 'n' }; - static readonly char[] s_f = new char[] { '\\', 'f' }; - static readonly char[] s_r = new char[] { '\\', 'r' }; - static readonly char[] s_back = new char[] { '\\', '\\' }; - - // Writes a scalar value as a JavaScript-escaped character (or sequence of characters). - // See ECMA-262, Sec. 7.8.4, and ECMA-404, Sec. 
9 - // http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.4 - // http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf - public unsafe override bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) - { - if (buffer == null) - { - throw new ArgumentNullException(nameof(buffer)); - } - // ECMA-262 allows encoding U+000B as "\v", but ECMA-404 does not. - // Both ECMA-262 and ECMA-404 allow encoding U+002F SOLIDUS as "\/" - // (in ECMA-262 this character is a NonEscape character); however, we - // don't encode SOLIDUS by default unless the caller has provided an - // explicit bitmap which does not contain it. In this case we'll assume - // that the caller didn't want a SOLIDUS written to the output at all, - // so it should be written using "\u002F" encoding. - // HTML-specific characters (including apostrophe and quotes) will - // be written out as numeric entities for defense-in-depth. - // See UnicodeEncoderBase ctor comments for more info. - - if (!WillEncode(unicodeScalar)) { return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); } - - char[] toCopy; - switch (unicodeScalar) - { - case '\b': toCopy = s_b; break; - case '\t': toCopy = s_t; break; - case '\n': toCopy = s_n; break; - case '\f': toCopy = s_f; break; - case '\r': toCopy = s_r; break; - case '\\': toCopy = s_back; break; - default: return TryWriteEncodedScalarAsNumericEntity(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); - } - return TryCopyCharacters(toCopy, buffer, bufferLength, out numberOfCharactersWritten); - } - - private static unsafe bool TryWriteEncodedScalarAsNumericEntity(int unicodeScalar, char* buffer, int length, out int numberOfCharactersWritten) - { - Debug.Assert(buffer != null && length >= 0); - - if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)) - { - // Convert this back to UTF-16 and write out both characters. 
- char leadingSurrogate, trailingSurrogate; - UnicodeHelpers.GetUtf16SurrogatePairFromAstralScalarValue(unicodeScalar, out leadingSurrogate, out trailingSurrogate); - int leadingSurrogateCharactersWritten; - if (TryWriteEncodedSingleCharacter(leadingSurrogate, buffer, length, out leadingSurrogateCharactersWritten) && - TryWriteEncodedSingleCharacter(trailingSurrogate, buffer + leadingSurrogateCharactersWritten, length - leadingSurrogateCharactersWritten, out numberOfCharactersWritten) - ) - { - numberOfCharactersWritten += leadingSurrogateCharactersWritten; - return true; - } - else - { - numberOfCharactersWritten = 0; - return false; - } - } - else - { - // This is only a single character. - return TryWriteEncodedSingleCharacter(unicodeScalar, buffer, length, out numberOfCharactersWritten); - } - } - - // Writes an encoded scalar value (in the BMP) as a JavaScript-escaped character. - private static unsafe bool TryWriteEncodedSingleCharacter(int unicodeScalar, char* buffer, int length, out int numberOfCharactersWritten) - { - Debug.Assert(buffer != null && length >= 0); - Debug.Assert(!UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar), "The incoming value should've been in the BMP."); - - if (length < 6) - { - numberOfCharactersWritten = 0; - return false; - } - - // Encode this as 6 chars "\uFFFF". 
- *buffer = '\\'; buffer++; - *buffer = 'u'; buffer++; - *buffer = HexUtil.Int32LsbToHexDigit(unicodeScalar >> 12); buffer++; - *buffer = HexUtil.Int32LsbToHexDigit((int)((unicodeScalar >> 8) & 0xFU)); buffer++; - *buffer = HexUtil.Int32LsbToHexDigit((int)((unicodeScalar >> 4) & 0xFU)); buffer++; - *buffer = HexUtil.Int32LsbToHexDigit((int)(unicodeScalar & 0xFU)); buffer++; - - numberOfCharactersWritten = 6; - return true; - } - } } diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/JavaScriptEncoderHelper.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/JavaScriptEncoderHelper.cs new file mode 100644 index 000000000000..7c8bffa4f54a --- /dev/null +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/JavaScriptEncoderHelper.cs @@ -0,0 +1,69 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Text.Unicode; + +namespace System.Text.Encodings.Web +{ + internal static class JavaScriptEncoderHelper + { + public static unsafe bool TryWriteEncodedScalarAsNumericEntity(int unicodeScalar, char* buffer, int length, out int numberOfCharactersWritten) + { + Debug.Assert(buffer != null && length >= 0); + + if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)) + { + // Convert this back to UTF-16 and write out both characters. 
+ UnicodeHelpers.GetUtf16SurrogatePairFromAstralScalarValue(unicodeScalar, out char leadingSurrogate, out char trailingSurrogate); + if (TryWriteEncodedSingleCharacter(leadingSurrogate, buffer, length, out int leadingSurrogateCharactersWritten) && + TryWriteEncodedSingleCharacter(trailingSurrogate, buffer + leadingSurrogateCharactersWritten, length - leadingSurrogateCharactersWritten, out numberOfCharactersWritten) + ) + { + numberOfCharactersWritten += leadingSurrogateCharactersWritten; + return true; + } + else + { + numberOfCharactersWritten = 0; + return false; + } + } + else + { + // This is only a single character. + return TryWriteEncodedSingleCharacter(unicodeScalar, buffer, length, out numberOfCharactersWritten); + } + } + + // Writes an encoded scalar value (in the BMP) as a JavaScript-escaped character. + private static unsafe bool TryWriteEncodedSingleCharacter(int unicodeScalar, char* buffer, int length, out int numberOfCharactersWritten) + { + Debug.Assert(buffer != null && length >= 0); + Debug.Assert(!UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar), "The incoming value should've been in the BMP."); + + if (length < 6) + { + numberOfCharactersWritten = 0; + return false; + } + + // Encode this as 6 chars "\uFFFF". 
+ *buffer = '\\'; + buffer++; + *buffer = 'u'; + buffer++; + *buffer = HexUtil.Int32LsbToHexDigit(unicodeScalar >> 12); + buffer++; + *buffer = HexUtil.Int32LsbToHexDigit((int)((unicodeScalar >> 8) & 0xFU)); + buffer++; + *buffer = HexUtil.Int32LsbToHexDigit((int)((unicodeScalar >> 4) & 0xFU)); + buffer++; + *buffer = HexUtil.Int32LsbToHexDigit((int)(unicodeScalar & 0xFU)); + + numberOfCharactersWritten = 6; + return true; + } + } +} diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/Sse2Helper.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/Sse2Helper.cs new file mode 100644 index 000000000000..6caebd3e10a6 --- /dev/null +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/Sse2Helper.cs @@ -0,0 +1,127 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace System.Text.Encodings.Web +{ + internal static class Sse2Helper + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(Vector128 sourceValue) + { + Debug.Assert(Sse2.IsSupported); + + // Space ' ', anything in the control characters range, and anything above short.MaxValue but less than or equal char.MaxValue + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_UInt16_0x20); + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x22)); // Quotation Mark '"' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x5C)); // Reverse Solidus '\' + + return mask; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(Vector128 sourceValue) + { + 
Debug.Assert(Sse2.IsSupported); + + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_SByte_0x20); // Control characters (0x00..0x1F), and bytes 0x80..0xFF, which are negative when reinterpreted as sbyte (sbyte.MaxValue is 0x7F). NOTE(review): DEL (0x7F) is not flagged by this comparison - confirm that is intended. + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x22)); // Quotation Mark " + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x5C)); // Reverse Solidus \ + + return mask; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 CreateEscapingMask_DefaultJavaScriptEncoderBasicLatin(Vector128 sourceValue) + { + Debug.Assert(Sse2.IsSupported); + + Vector128 mask = CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue); + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x26)); // Ampersand '&' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x27)); // Apostrophe ''' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x2B)); // Plus sign '+' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3C)); // Less Than Sign '<' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3E)); // Greater Than Sign '>' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x60)); // Grave Accent '`' + + mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_mask_UInt16_0x7E)); // Anything greater than Tilde '~': DEL (0x7F) and anything above the ASCII range up to short.MaxValue + + return mask; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 CreateEscapingMask_DefaultJavaScriptEncoderBasicLatin(Vector128 sourceValue) + { + Debug.Assert(Sse2.IsSupported); + + Vector128 mask = CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue); + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x26)); // Ampersand & + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x27)); // Apostrophe ' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x2B)); // Plus sign + + mask = Sse2.Or(mask, 
Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3C)); // Less Than Sign < + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3E)); // Greater Than Sign > + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x60)); // Grave Access ` + + return mask; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 CreateAsciiMask(Vector128 sourceValue) + { + Debug.Assert(Sse2.IsSupported); + + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_UInt16_0x00); // Null, anything above short.MaxValue but less than or equal char.MaxValue + mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_mask_UInt16_0x7E)); // Tilde '~', anything above the ASCII range + + return mask; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 CreateAsciiMask(Vector128 sourceValue) + { + Debug.Assert(Sse2.IsSupported); + + // Null, anything above sbyte.MaxValue but less than or equal byte.MaxValue (i.e. anything above the ASCII range) + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_SByte_0x00); + return mask; + } + + private static readonly Vector128 s_mask_UInt16_0x00 = Vector128.Zero; // Null + + private static readonly Vector128 s_mask_UInt16_0x20 = Vector128.Create((short)0x20); // Space ' ' + + private static readonly Vector128 s_mask_UInt16_0x22 = Vector128.Create((short)0x22); // Quotation Mark '"' + private static readonly Vector128 s_mask_UInt16_0x26 = Vector128.Create((short)0x26); // Ampersand '&' + private static readonly Vector128 s_mask_UInt16_0x27 = Vector128.Create((short)0x27); // Apostrophe ''' + private static readonly Vector128 s_mask_UInt16_0x2B = Vector128.Create((short)0x2B); // Plus sign '+' + private static readonly Vector128 s_mask_UInt16_0x3C = Vector128.Create((short)0x3C); // Less Than Sign '<' + private static readonly Vector128 s_mask_UInt16_0x3E = Vector128.Create((short)0x3E); // Greater Than Sign '>' + private static readonly Vector128 
s_mask_UInt16_0x5C = Vector128.Create((short)0x5C); // Reverse Solidus '\' + private static readonly Vector128 s_mask_UInt16_0x60 = Vector128.Create((short)0x60); // Grave Access '`' + + private static readonly Vector128 s_mask_UInt16_0x7E = Vector128.Create((short)0x7E); // Tilde '~' + + private static readonly Vector128 s_mask_SByte_0x00 = Vector128.Zero; // Null + + private static readonly Vector128 s_mask_SByte_0x20 = Vector128.Create((sbyte)0x20); // Space ' ' + + private static readonly Vector128 s_mask_SByte_0x22 = Vector128.Create((sbyte)0x22); // Quotation Mark '"' + private static readonly Vector128 s_mask_SByte_0x26 = Vector128.Create((sbyte)0x26); // Ampersand '&' + private static readonly Vector128 s_mask_SByte_0x27 = Vector128.Create((sbyte)0x27); // Apostrophe ''' + private static readonly Vector128 s_mask_SByte_0x2B = Vector128.Create((sbyte)0x2B); // Plus sign '+' + private static readonly Vector128 s_mask_SByte_0x3C = Vector128.Create((sbyte)0x3C); // Less Than Sign '<' + private static readonly Vector128 s_mask_SByte_0x3E = Vector128.Create((sbyte)0x3E); // Greater Than Sign '>' + private static readonly Vector128 s_mask_SByte_0x5C = Vector128.Create((sbyte)0x5C); // Reverse Solidus '\' + private static readonly Vector128 s_mask_SByte_0x60 = Vector128.Create((sbyte)0x60); // Grave Access '`' + } +} diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs index bdd09f24593b..7f3b47fcec1d 100644 --- a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs @@ -10,6 +10,11 @@ using System.Runtime.InteropServices; using System.Text.Unicode; +#if NETCOREAPP +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + namespace System.Text.Encodings.Web { /// @@ -24,6 +29,8 @@ public abstract class TextEncoder // Fast cache for 
Ascii private byte[][] _asciiEscape = new byte[0x80][]; + private readonly int[] _asciiNeedsEscaping = new int[0x80]; + // Keep a reference to Array.Empty as this is used as a singleton for comparisons // and there is no guarantee that Array.Empty() will always be the same instance. private static readonly byte[] s_noEscape = Array.Empty(); @@ -693,46 +700,132 @@ private unsafe int FindFirstCharacterToEncode(ReadOnlySpan text) /// current encoder instance, or -1 if no data in requires escaping. /// [EditorBrowsable(EditorBrowsableState.Never)] - public virtual int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf8Text) + public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf8Text) { - int originalUtf8TextLength = utf8Text.Length; - // Loop through the input text, terminating when we see ill-formed UTF-8 or when we decode a scalar value // that must be encoded. If we see either of these things then we'll return its index in the original // input sequence. If we consume the entire text without seeing either of these, return -1 to indicate // that the text can be copied as-is without escaping. - int i = 0; - while (i < utf8Text.Length) + fixed (byte* ptr = utf8Text) { - byte value = utf8Text[i]; - if (UnicodeUtility.IsAsciiCodePoint(value)) + int idx = 0; + +#if NETCOREAPP + if (Sse2.IsSupported) { - if (!ReferenceEquals(GetAsciiEncoding(value), s_noEscape)) + sbyte* startingAddress = (sbyte*)ptr; + while (utf8Text.Length - 16 >= idx) { - return originalUtf8TextLength - utf8Text.Length + i; + Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16)); + + // Load the next 16 bytes. + Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + Vector128 mask = Sse2Helper.CreateAsciiMask(sourceValue); + int index = Sse2.MoveMask(mask); + + if (index != 0) + { + // At least one of the following 16 bytes is non-ASCII. 
+ + int processNextSixteen = idx + 16; + Debug.Assert(processNextSixteen <= utf8Text.Length); + + while (idx < processNextSixteen) + { + Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length)); + + if (UnicodeUtility.IsAsciiCodePoint(ptr[idx])) + { + if (DoesAsciiNeedEncoding(ptr[idx]) == 1) + { + goto Return; + } + idx++; + } + else + { + OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar); + + Debug.Assert(nextScalarValue <= int.MaxValue); + if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue)) + { + goto Return; + } + + Debug.Assert(opStatus == OperationStatus.Done); + idx += utf8BytesConsumedForScalar; + } + } + } + else + { + if (DoesAsciiNeedEncoding(ptr[idx]) == 1 + + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1 + || DoesAsciiNeedEncoding(ptr[++idx]) == 1) + { + goto Return; + } + idx++; + } + startingAddress = (sbyte*)ptr + idx; } - i++; + // Process the remaining bytes. 
+ Debug.Assert(utf8Text.Length - idx < 16); } - else +#endif + + while (idx < utf8Text.Length) { - if (i > 0) - { - utf8Text = utf8Text.Slice(i); - } + Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length)); - if (UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text, out uint nextScalarValue, out int bytesConsumedThisIteration) != OperationStatus.Done - || WillEncode((int)nextScalarValue)) + if (UnicodeUtility.IsAsciiCodePoint(ptr[idx])) { - return originalUtf8TextLength - utf8Text.Length; + if (DoesAsciiNeedEncoding(ptr[idx]) == 1) + { + goto Return; + } + idx++; } + else + { + OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar); - i = bytesConsumedThisIteration; + Debug.Assert(nextScalarValue <= int.MaxValue); + if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue)) + { + goto Return; + } + + Debug.Assert(opStatus == OperationStatus.Done); + idx += utf8BytesConsumedForScalar; + } } - } - return -1; // no input data needs to be escaped + idx = -1; // All bytes are allowed. + + Return: + return idx; + } } /// @@ -813,10 +906,26 @@ private byte[] GetAsciiEncoding(byte value) _asciiEscape[value] = encoding; } } - return encoding; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private int DoesAsciiNeedEncoding(byte value) + { + Debug.Assert(value <= 0x7F); + + int needsEscaping = _asciiNeedsEscaping[value]; + + Debug.Assert(needsEscaping == 0 || needsEscaping == 1 || needsEscaping == -1); + + if (needsEscaping == 0) + { + needsEscaping = WillEncode(value) ? 
1 : -1; + _asciiNeedsEscaping[value] = needsEscaping; + } + return needsEscaping; + } + private static void ThrowArgumentException_MaxOutputCharsPerInputChar() { throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly."); diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs index 7226d1760d07..bab848b5ad90 100644 --- a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs @@ -2,25 +2,29 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Buffers; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Text.Internal; using System.Text.Unicode; +#if NETCOREAPP +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + namespace System.Text.Encodings.Web { internal sealed class UnsafeRelaxedJavaScriptEncoder : JavaScriptEncoder { private readonly AllowedCharactersBitmap _allowedCharacters; - internal static readonly UnsafeRelaxedJavaScriptEncoder s_singleton = new UnsafeRelaxedJavaScriptEncoder(new TextEncoderSettings(UnicodeRanges.All)); + internal static readonly UnsafeRelaxedJavaScriptEncoder s_singleton = new UnsafeRelaxedJavaScriptEncoder(); - private UnsafeRelaxedJavaScriptEncoder(TextEncoderSettings filter) + private UnsafeRelaxedJavaScriptEncoder() { - if (filter == null) - { - throw new ArgumentNullException(nameof(filter)); - } + var filter = new TextEncoderSettings(UnicodeRanges.All); _allowedCharacters = filter.GetAllowedCharacters(); @@ -44,18 +48,209 @@ public override bool WillEncode(int unicodeScalar) return true; } + Debug.Assert(unicodeScalar >= 
char.MinValue && unicodeScalar <= char.MaxValue); + return !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public unsafe override int FindFirstCharacterToEncode(char* text, int textLength) + public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) { if (text == null) { throw new ArgumentNullException(nameof(text)); } - return _allowedCharacters.FindFirstCharacterToEncode(text, textLength); + int idx = 0; + +#if NETCOREAPP + if (Sse2.IsSupported) + { + short* startingAddress = (short*)text; + while (textLength - 8 >= idx) + { + Debug.Assert(startingAddress >= text && startingAddress <= (text + textLength - 8)); + + // Load the next 8 characters. + Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + Vector128 mask = Sse2Helper.CreateAsciiMask(sourceValue); + int index = Sse2.MoveMask(mask.AsByte()); + + if (index != 0) + { + // At least one of the following 8 characters is non-ASCII. + int processNextEight = idx + 8; + Debug.Assert(processNextEight <= textLength); + for (; idx < processNextEight; idx++) + { + Debug.Assert((text + idx) <= (text + textLength)); + if (!_allowedCharacters.IsCharacterAllowed(*(text + idx))) + { + goto Return; + } + } + startingAddress += 8; + } + else + { + // Check if any of the 8 characters need to be escaped. + mask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue); + + index = Sse2.MoveMask(mask.AsByte()); + // If index == 0, that means none of the 8 characters needed to be escaped. + // TrailingZeroCount is relatively expensive, avoid it if possible. + if (index != 0) + { + // Found at least one character that needs to be escaped, figure out the index of + // the first one found that needed to be escaped within the 8 characters. 
+ Debug.Assert(index > 0 && index <= 65_535); + int tzc = BitOperations.TrailingZeroCount(index); + Debug.Assert(tzc % 2 == 0 && tzc >= 0 && tzc <= 16); + idx += tzc >> 1; + goto Return; + } + idx += 8; + startingAddress += 8; + } + } + + // Process the remaining characters. + Debug.Assert(textLength - idx < 8); + } +#endif + + for (; idx < textLength; idx++) + { + Debug.Assert((text + idx) <= (text + textLength)); + if (!_allowedCharacters.IsCharacterAllowed(*(text + idx))) + { + goto Return; + } + } + + idx = -1; // All characters are allowed. + + Return: + return idx; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf8Text) + { + fixed (byte* ptr = utf8Text) + { + int idx = 0; + +#if NETCOREAPP + if (Sse2.IsSupported) + { + sbyte* startingAddress = (sbyte*)ptr; + while (utf8Text.Length - 16 >= idx) + { + Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16)); + + // Load the next 16 bytes. + Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + Vector128 mask = Sse2Helper.CreateAsciiMask(sourceValue); + int index = Sse2.MoveMask(mask); + + if (index != 0) + { + // At least one of the following 16 bytes is non-ASCII. 
+ + int processNextSixteen = idx + 16; + Debug.Assert(processNextSixteen <= utf8Text.Length); + + while (idx < processNextSixteen) + { + Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length)); + + if (UnicodeUtility.IsAsciiCodePoint(ptr[idx])) + { + if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx])) + { + goto Return; + } + idx++; + } + else + { + OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar); + + Debug.Assert(nextScalarValue <= int.MaxValue); + if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue)) + { + goto Return; + } + + Debug.Assert(opStatus == OperationStatus.Done); + idx += utf8BytesConsumedForScalar; + } + } + startingAddress = (sbyte*)ptr + idx; + } + else + { + // Check if any of the 16 bytes need to be escaped. + mask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue); + + index = Sse2.MoveMask(mask); + // If index == 0, that means none of the 16 bytes needed to be escaped. + // TrailingZeroCount is relatively expensive, avoid it if possible. + if (index != 0) + { + // Found at least one byte that needs to be escaped, figure out the index of + // the first one found that needed to be escaped within the 16 bytes. + Debug.Assert(index > 0 && index <= 65_535); + int tzc = BitOperations.TrailingZeroCount(index); + Debug.Assert(tzc >= 0 && tzc <= 16); + idx += tzc; + goto Return; + } + idx += 16; + startingAddress += 16; + } + } + + // Process the remaining bytes. 
+ Debug.Assert(utf8Text.Length - idx < 16); + } +#endif + + while (idx < utf8Text.Length) + { + Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length)); + + if (UnicodeUtility.IsAsciiCodePoint(ptr[idx])) + { + if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx])) + { + goto Return; + } + idx++; + } + else + { + OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar); + + Debug.Assert(nextScalarValue <= int.MaxValue); + if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue)) + { + goto Return; + } + + Debug.Assert(opStatus == OperationStatus.Done); + idx += utf8BytesConsumedForScalar; + } + } + + idx = -1; // All bytes are allowed. + + Return: + return idx; + } } // The worst case encoding is 6 output chars per input char: [input] U+FFFF -> [output] "\uFFFF" @@ -75,7 +270,7 @@ public unsafe override int FindFirstCharacterToEncode(char* text, int textLength // See ECMA-262, Sec. 7.8.4, and ECMA-404, Sec. 
9 // http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.4 // http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf - public unsafe override bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) + public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) { if (buffer == null) { @@ -122,66 +317,9 @@ public unsafe override bool TryEncodeUnicodeScalar(int unicodeScalar, char* buff toCopy = s_back; break; default: - return TryWriteEncodedScalarAsNumericEntity(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); + return JavaScriptEncoderHelper.TryWriteEncodedScalarAsNumericEntity(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); } return TryCopyCharacters(toCopy, buffer, bufferLength, out numberOfCharactersWritten); } - - private static unsafe bool TryWriteEncodedScalarAsNumericEntity(int unicodeScalar, char* buffer, int length, out int numberOfCharactersWritten) - { - Debug.Assert(buffer != null && length >= 0); - - if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)) - { - // Convert this back to UTF-16 and write out both characters. - UnicodeHelpers.GetUtf16SurrogatePairFromAstralScalarValue(unicodeScalar, out char leadingSurrogate, out char trailingSurrogate); - if (TryWriteEncodedSingleCharacter(leadingSurrogate, buffer, length, out int leadingSurrogateCharactersWritten) && - TryWriteEncodedSingleCharacter(trailingSurrogate, buffer + leadingSurrogateCharactersWritten, length - leadingSurrogateCharactersWritten, out numberOfCharactersWritten) - ) - { - numberOfCharactersWritten += leadingSurrogateCharactersWritten; - return true; - } - else - { - numberOfCharactersWritten = 0; - return false; - } - } - else - { - // This is only a single character. 
- return TryWriteEncodedSingleCharacter(unicodeScalar, buffer, length, out numberOfCharactersWritten); - } - } - - // Writes an encoded scalar value (in the BMP) as a JavaScript-escaped character. - private static unsafe bool TryWriteEncodedSingleCharacter(int unicodeScalar, char* buffer, int length, out int numberOfCharactersWritten) - { - Debug.Assert(buffer != null && length >= 0); - Debug.Assert(!UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar), "The incoming value should've been in the BMP."); - - if (length < 6) - { - numberOfCharactersWritten = 0; - return false; - } - - // Encode this as 6 chars "\uFFFF". - *buffer = '\\'; - buffer++; - *buffer = 'u'; - buffer++; - *buffer = HexUtil.Int32LsbToHexDigit(unicodeScalar >> 12); - buffer++; - *buffer = HexUtil.Int32LsbToHexDigit((int)((unicodeScalar >> 8) & 0xFU)); - buffer++; - *buffer = HexUtil.Int32LsbToHexDigit((int)((unicodeScalar >> 4) & 0xFU)); - buffer++; - *buffer = HexUtil.Int32LsbToHexDigit((int)(unicodeScalar & 0xFU)); - - numberOfCharactersWritten = 6; - return true; - } } } diff --git a/src/System.Text.Encodings.Web/src/System/Text/Internal/AllowedCharactersBitmap.cs b/src/System.Text.Encodings.Web/src/System/Text/Internal/AllowedCharactersBitmap.cs index cb752838a190..75c6d8ffc6f1 100644 --- a/src/System.Text.Encodings.Web/src/System/Text/Internal/AllowedCharactersBitmap.cs +++ b/src/System.Text.Encodings.Web/src/System/Text/Internal/AllowedCharactersBitmap.cs @@ -73,30 +73,64 @@ public AllowedCharactersBitmap Clone() // Determines whether the given character can be returned unencoded. public bool IsCharacterAllowed(char character) { - int codePoint = character; - int index = codePoint >> 5; - int offset = codePoint & 0x1F; - return ((_allowedCharacters[index] >> offset) & 0x1U) != 0; + return IsUnicodeScalarAllowed(character); } // Determines whether the given character can be returned unencoded. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] public bool IsUnicodeScalarAllowed(int unicodeScalar) { + Debug.Assert(unicodeScalar < 0x10000); int index = unicodeScalar >> 5; int offset = unicodeScalar & 0x1F; - return ((_allowedCharacters[index] >> offset) & 0x1U) != 0; + return (_allowedCharacters[index] & (0x1U << offset)) != 0; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] public unsafe int FindFirstCharacterToEncode(char* text, int textLength) { - for (int i = 0; i < textLength; i++) + int i = 0; + + while (i <= textLength - 8) + { + if (!IsCharacterAllowed(text[i]) + || !IsCharacterAllowed(text[++i]) + || !IsCharacterAllowed(text[++i]) + || !IsCharacterAllowed(text[++i]) + || !IsCharacterAllowed(text[++i]) + || !IsCharacterAllowed(text[++i]) + || !IsCharacterAllowed(text[++i]) + || !IsCharacterAllowed(text[++i])) + { + goto Return; + } + i++; + } + + while (i <= textLength - 4) + { + if (!IsCharacterAllowed(text[i]) + || !IsCharacterAllowed(text[++i]) + || !IsCharacterAllowed(text[++i]) + || !IsCharacterAllowed(text[++i])) + { + goto Return; + } + i++; + } + + while (i < textLength) { if (!IsCharacterAllowed(text[i])) - { return i; } + { + goto Return; + } + i++; } - return -1; + + i = -1; + + Return: + return i; } } } diff --git a/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs b/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs index c2b00605608b..062cb942b613 100644 --- a/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs +++ b/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs @@ -4,6 +4,7 @@ using System; using System.Buffers; +using System.Collections.Generic; using System.Globalization; using System.IO; using System.Linq; @@ -15,6 +16,354 @@ namespace Microsoft.Framework.WebEncoders { public partial class JavaScriptStringEncoderTests { + [Fact] + public unsafe void NullPtrThrows() + { + Assert.Throws(() => JavaScriptEncoder.Default.FindFirstCharacterToEncode(null, 
0)); + Assert.Throws(() => JavaScriptEncoder.UnsafeRelaxedJsonEscaping.FindFirstCharacterToEncode(null, 0)); + Assert.Throws(() => JavaScriptEncoder.Create(UnicodeRanges.All).FindFirstCharacterToEncode(null, 0)); + + Assert.Throws(() => JavaScriptEncoder.Default.TryEncodeUnicodeScalar('a', null, 0, out _)); + Assert.Throws(() => JavaScriptEncoder.UnsafeRelaxedJsonEscaping.TryEncodeUnicodeScalar('a', null, 0, out _)); + Assert.Throws(() => JavaScriptEncoder.Create(UnicodeRanges.All).TryEncodeUnicodeScalar('a', null, 0, out _)); + + Assert.Throws(() => JavaScriptEncoder.Create((TextEncoderSettings)null)); + Assert.Throws(() => JavaScriptEncoder.Create((UnicodeRange)null)); + } + + [Theory] + [MemberData(nameof(EscapingTestData))] + public unsafe void FindFirstCharacterToEncode(char replacementChar, JavaScriptEncoder encoder, bool requiresEscaping) + { + Assert.Equal(-1, encoder.FindFirstCharacterToEncodeUtf8(default)); + fixed (char* ptr = string.Empty) + { + Assert.Equal(-1, encoder.FindFirstCharacterToEncode(ptr, 0)); + } + + var random = new Random(42); + for (int dataLength = 0; dataLength < 50; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(97, 123); + } + string baseStr = new string(str); + byte[] sourceUtf8 = Encoding.UTF8.GetBytes(baseStr); + + Assert.Equal(-1, encoder.FindFirstCharacterToEncodeUtf8(sourceUtf8)); + fixed (char* ptr = baseStr) + { + Assert.Equal(-1, encoder.FindFirstCharacterToEncode(ptr, baseStr.Length)); + } + + for (int i = 0; i < dataLength; i++) + { + char[] changed = baseStr.ToCharArray(); + changed[i] = replacementChar; + string source = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(source); + + Assert.Equal(requiresEscaping ? i : -1, encoder.FindFirstCharacterToEncodeUtf8(sourceUtf8)); + fixed (char* ptr = source) + { + Assert.Equal(requiresEscaping ? 
i : -1, encoder.FindFirstCharacterToEncode(ptr, source.Length)); + } + } + + if (dataLength != 0) + { + char[] changed = baseStr.ToCharArray(); + changed.AsSpan().Fill(replacementChar); + string source = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(source); + + Assert.Equal(requiresEscaping ? 0 : -1, encoder.FindFirstCharacterToEncodeUtf8(sourceUtf8)); + fixed (char* ptr = source) + { + Assert.Equal(requiresEscaping ? 0 : -1, encoder.FindFirstCharacterToEncode(ptr, source.Length)); + } + } + } + } + + public static IEnumerable EscapingTestData + { + get + { + return new List + { + new object[] { 'a', JavaScriptEncoder.Default, false }, // ASCII not escaped + new object[] { '\u001F', JavaScriptEncoder.Default, true }, // control character within single byte range + new object[] { '\u2000', JavaScriptEncoder.Default, true }, // space character outside single byte range + new object[] { '\u00A2', JavaScriptEncoder.Default, true }, // non-ASCII but < 255 + new object[] { '\uA686', JavaScriptEncoder.Default, true }, // non-ASCII above short.MaxValue + new object[] { '\u6C49', JavaScriptEncoder.Default, true }, // non-ASCII from chinese alphabet - multibyte + new object[] { '"', JavaScriptEncoder.Default, true }, // ASCII but must always be escaped in JSON + new object[] { '\\', JavaScriptEncoder.Default, true }, // ASCII but must always be escaped in JSON + new object[] { '<', JavaScriptEncoder.Default, true }, // ASCII but escaped by default + new object[] { '>', JavaScriptEncoder.Default, true }, // ASCII but escaped by default + new object[] { '&', JavaScriptEncoder.Default, true }, // ASCII but escaped by default + new object[] { '`', JavaScriptEncoder.Default, true }, // ASCII but escaped by default + new object[] { '\'', JavaScriptEncoder.Default, true }, // ASCII but escaped by default + new object[] { '+', JavaScriptEncoder.Default, true }, // ASCII but escaped by default + new object[] { '\uFFFD', JavaScriptEncoder.Default, true }, // Default 
replacement character + + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\uFFFD', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new 
object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\uFFFD', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + + new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u6C49', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uFFFD', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + }; + } + } + + [Theory] + [MemberData(nameof(EscapingTestData_NonAscii))] + public unsafe void FindFirstCharacterToEncode_NonAscii(char replacementChar, JavaScriptEncoder encoder, bool requiresEscaping) + { + var random = new Random(42); + for (int dataLength = 1; dataLength < 50; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) 
+ { + str[i] = (char)random.Next(0x2E9B, 0x2EF4); // CJK Radicals Supplement characters + } + string baseStr = new string(str); + byte[] sourceUtf8 = Encoding.UTF8.GetBytes(baseStr); + + Assert.Equal(-1, encoder.FindFirstCharacterToEncodeUtf8(sourceUtf8)); + fixed (char* ptr = baseStr) + { + Assert.Equal(-1, encoder.FindFirstCharacterToEncode(ptr, baseStr.Length)); + } + + for (int i = 0; i < dataLength; i++) + { + string source = baseStr.Insert(i, new string(replacementChar, 1)); + sourceUtf8 = Encoding.UTF8.GetBytes(source); + + Assert.Equal(requiresEscaping ? i * 3 : -1, encoder.FindFirstCharacterToEncodeUtf8(sourceUtf8)); // Each CJK character expands to 3 utf-8 bytes. + fixed (char* ptr = source) + { + Assert.Equal(requiresEscaping ? i : -1, encoder.FindFirstCharacterToEncode(ptr, source.Length)); + } + } + } + } + + public static IEnumerable EscapingTestData_NonAscii + { + get + { + return new List + { + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true 
}, + new object[] { '\uFFFD', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + + new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u6C49', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uFFFD', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + }; + } + } + + [Theory] + [MemberData(nameof(JavaScriptEncoders))] + public unsafe void EscapingTestWhileWritingSurrogate(JavaScriptEncoder encoder) + { + char highSurrogate = '\uD801'; + char lowSurrogate = '\uDC37'; + var random = new Random(42); + for (int dataLength = 2; dataLength < 50; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(97, 123); + } + string baseStr = new string(str); + byte[] sourceUtf8 = Encoding.UTF8.GetBytes(baseStr); + + Assert.Equal(-1, encoder.FindFirstCharacterToEncodeUtf8(sourceUtf8)); + fixed (char* ptr = baseStr) + { + Assert.Equal(-1, encoder.FindFirstCharacterToEncode(ptr, baseStr.Length)); + } + + for (int i = 0; i < 
dataLength - 1; i++) + { + char[] changed = baseStr.ToCharArray(); + changed[i] = highSurrogate; + changed[i + 1] = lowSurrogate; + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + Assert.Equal(i, encoder.FindFirstCharacterToEncodeUtf8(sourceUtf8)); + fixed (char* ptr = newStr) + { + Assert.Equal(i, encoder.FindFirstCharacterToEncode(ptr, newStr.Length)); + } + } + + { + char[] changed = baseStr.ToCharArray(); + + for (int i = 0; i < changed.Length - 1; i += 2) + { + changed[i] = highSurrogate; + changed[i + 1] = lowSurrogate; + } + + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + Assert.Equal(0, encoder.FindFirstCharacterToEncodeUtf8(sourceUtf8)); + fixed (char* ptr = newStr) + { + Assert.Equal(0, encoder.FindFirstCharacterToEncode(ptr, newStr.Length)); + } + } + } + } + + public static IEnumerable JavaScriptEncoders + { + get + { + return new List + { + new object[] { JavaScriptEncoder.Default }, + new object[] { JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + new object[] { JavaScriptEncoder.Create(UnicodeRanges.All) }, + new object[] { JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + }; + } + } + + [Theory] + [MemberData(nameof(InvalidEscapingTestData))] + public unsafe void InvalidFindFirstCharacterToEncode(char replacementChar, JavaScriptEncoder encoder) + { + var random = new Random(42); + for (int dataLength = 0; dataLength < 47; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(97, 123); + } + string baseStr = new string(str); + byte[] baseStrUtf8 = Encoding.UTF8.GetBytes(baseStr); + + for (int i = 0; i < dataLength; i++) + { + char[] changed = baseStr.ToCharArray(); + changed[i] = replacementChar; + string source = new string(changed); + byte[] sourceUtf8 = new byte[baseStrUtf8.Length]; + baseStrUtf8.AsSpan().CopyTo(sourceUtf8); + sourceUtf8[i] = 0xC3; // Invalid, first byte of a 2-byte utf-8 
character + + Assert.Equal(i, encoder.FindFirstCharacterToEncodeUtf8(sourceUtf8)); + fixed (char* ptr = source) + { + Assert.Equal(i, encoder.FindFirstCharacterToEncode(ptr, source.Length)); + } + } + } + } + + public static IEnumerable InvalidEscapingTestData + { + get + { + return new List + { + new object[] { '\uD801', JavaScriptEncoder.Default }, // Invalid, high surrogate alone + new object[] { '\uDC01', JavaScriptEncoder.Default }, // Invalid, low surrogate alone + + new object[] { '\uD801', JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + new object[] { '\uDC01', JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + + new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.All) }, + new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.All) }, + + new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + }; + } + } + [Fact] public void TestSurrogate() { @@ -198,7 +547,8 @@ public void Default_EquivalentToBasicLatin() } [Fact] - public void JavaScriptStringEncode_AllRangesAllowed_StillEncodesForbiddenChars_Simple_Escaping() { + public void JavaScriptStringEncode_AllRangesAllowed_StillEncodesForbiddenChars_Simple_Escaping() + { // The following two calls could be simply InlineData to the Theory below // Unfortunately, the xUnit logger fails to escape the inputs when logging the test results, // and so the suite fails despite all tests passing. From 8eec62257c28d5d6aa138318b4cb9d66c2adfb05 Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Wed, 23 Oct 2019 01:09:34 -0700 Subject: [PATCH 3/9] When encoder is null, use JavaScriptEncoder.Default to check for NeedsEscaping (#42023) * When encoder is null, use JavaScriptEncoder.Default to check for NeedsEscaping. * Remove unnecessary unsafe keyword and add comment to using directive. * Address feedback. * Remove gotos and move the IsEmpty check outside the fixed block. 
--- .../src/System.Text.Json.csproj | 1 - .../Json/Writer/JsonWriterHelper.Escaping.cs | 206 +----------------- 2 files changed, 9 insertions(+), 198 deletions(-) diff --git a/src/System.Text.Json/src/System.Text.Json.csproj b/src/System.Text.Json/src/System.Text.Json.csproj index 83f021b1c995..23b7672de0bc 100644 --- a/src/System.Text.Json/src/System.Text.Json.csproj +++ b/src/System.Text.Json/src/System.Text.Json.csproj @@ -195,7 +195,6 @@ - diff --git a/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs b/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs index 5c6c27fb6346..2a5101310d65 100644 --- a/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs +++ b/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs @@ -5,16 +5,9 @@ using System.Buffers; using System.Buffers.Text; using System.Diagnostics; -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; +using System.Runtime.CompilerServices; // Do not remove. 
Needed for Int32LsbToHexDigit when !BUILDING_INBOX_LIBRARY using System.Text.Encodings.Web; -#if BUILDING_INBOX_LIBRARY -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif - namespace System.Text.Json { // TODO: Replace the escaping logic with publicly shipping APIs from https://github.com/dotnet/corefx/issues/33509 @@ -58,204 +51,23 @@ internal static partial class JsonWriterHelper private static bool NeedsEscapingNoBoundsCheck(char value) => AllowList[value] == 0; - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool NeedsEscaping(char value) => value > LastAsciiCharacter || AllowList[value] == 0; - -#if BUILDING_INBOX_LIBRARY - private static readonly Vector128 s_mask_UInt16_0x20 = Vector128.Create((short)0x20); // Space ' ' - - private static readonly Vector128 s_mask_UInt16_0x22 = Vector128.Create((short)0x22); // Quotation Mark '"' - private static readonly Vector128 s_mask_UInt16_0x26 = Vector128.Create((short)0x26); // Ampersand '&' - private static readonly Vector128 s_mask_UInt16_0x27 = Vector128.Create((short)0x27); // Apostrophe ''' - private static readonly Vector128 s_mask_UInt16_0x2B = Vector128.Create((short)0x2B); // Plus sign '+' - private static readonly Vector128 s_mask_UInt16_0x3C = Vector128.Create((short)0x3C); // Less Than Sign '<' - private static readonly Vector128 s_mask_UInt16_0x3E = Vector128.Create((short)0x3E); // Greater Than Sign '>' - private static readonly Vector128 s_mask_UInt16_0x5C = Vector128.Create((short)0x5C); // Reverse Solidus '\' - private static readonly Vector128 s_mask_UInt16_0x60 = Vector128.Create((short)0x60); // Grave Access '`' - - private static readonly Vector128 s_mask_UInt16_0x7E = Vector128.Create((short)0x7E); // Tilde '~' - - private static readonly Vector128 s_mask_SByte_0x20 = Vector128.Create((sbyte)0x20); // Space ' ' - - private static readonly Vector128 s_mask_SByte_0x22 = Vector128.Create((sbyte)0x22); // Quotation Mark '"' - private static readonly 
Vector128 s_mask_SByte_0x26 = Vector128.Create((sbyte)0x26); // Ampersand '&' - private static readonly Vector128 s_mask_SByte_0x27 = Vector128.Create((sbyte)0x27); // Apostrophe ''' - private static readonly Vector128 s_mask_SByte_0x2B = Vector128.Create((sbyte)0x2B); // Plus sign '+' - private static readonly Vector128 s_mask_SByte_0x3C = Vector128.Create((sbyte)0x3C); // Less Than Sign '<' - private static readonly Vector128 s_mask_SByte_0x3E = Vector128.Create((sbyte)0x3E); // Greater Than Sign '>' - private static readonly Vector128 s_mask_SByte_0x5C = Vector128.Create((sbyte)0x5C); // Reverse Solidus '\' - private static readonly Vector128 s_mask_SByte_0x60 = Vector128.Create((sbyte)0x60); // Grave Access '`' - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector128 CreateEscapingMask(Vector128 sourceValue) - { - Debug.Assert(Sse2.IsSupported); - - Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_UInt16_0x20); // Space ' ', anything in the control characters range - - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x22)); // Quotation Mark '"' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x26)); // Ampersand '&' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x27)); // Apostrophe ''' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x2B)); // Plus sign '+' - - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3C)); // Less Than Sign '<' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3E)); // Greater Than Sign '>' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x5C)); // Reverse Solidus '\' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x60)); // Grave Access '`' - - mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_mask_UInt16_0x7E)); // Tilde '~', anything above the ASCII range - - return mask; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] 
- private static Vector128 CreateEscapingMask(Vector128 sourceValue) + public static int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) { - Debug.Assert(Sse2.IsSupported); - - Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_SByte_0x20); // Control characters, and anything above 0x7E since sbyte.MaxValue is 0x7E - - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x22)); // Quotation Mark " - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x26)); // Ampersand & - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x27)); // Apostrophe ' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x2B)); // Plus sign + - - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3C)); // Less Than Sign < - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3E)); // Greater Than Sign > - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x5C)); // Reverse Solidus \ - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x60)); // Grave Access ` - - return mask; + return (encoder ?? JavaScriptEncoder.Default).FindFirstCharacterToEncodeUtf8(value); } -#endif - public static unsafe int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) + public static unsafe int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) { - fixed (byte* ptr = value) + // Some implementations of JavaScriptEncoder.FindFirstCharacterToEncode may not accept + // null pointers and gaurd against that. Hence, check up-front to return -1. + if (value.IsEmpty) { - int idx = 0; - - if (encoder != null) - { - idx = encoder.FindFirstCharacterToEncodeUtf8(value); - goto Return; - } - -#if BUILDING_INBOX_LIBRARY - if (Sse2.IsSupported) - { - sbyte* startingAddress = (sbyte*)ptr; - while (value.Length - 16 >= idx) - { - Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + value.Length - 16)); - - // Load the next 16 bytes. 
- Vector128 sourceValue = Sse2.LoadVector128(startingAddress); - - // Check if any of the 16 bytes need to be escaped. - Vector128 mask = CreateEscapingMask(sourceValue); - - int index = Sse2.MoveMask(mask.AsByte()); - // If index == 0, that means none of the 16 bytes needed to be escaped. - // TrailingZeroCount is relatively expensive, avoid it if possible. - if (index != 0) - { - // Found at least one byte that needs to be escaped, figure out the index of - // the first one found that needed to be escaped within the 16 bytes. - Debug.Assert(index > 0 && index <= 65_535); - int tzc = BitOperations.TrailingZeroCount(index); - Debug.Assert(tzc >= 0 && tzc <= 16); - idx += tzc; - goto Return; - } - idx += 16; - startingAddress += 16; - } - - // Process the remaining characters. - Debug.Assert(value.Length - idx < 16); - } -#endif - - for (; idx < value.Length; idx++) - { - Debug.Assert((ptr + idx) <= (ptr + value.Length)); - if (NeedsEscaping(*(ptr + idx))) - { - goto Return; - } - } - - idx = -1; // all characters allowed - - Return: - return idx; + return -1; } - } - public static unsafe int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) - { fixed (char* ptr = value) { - int idx = 0; - - // Some implementations of JavascriptEncoder.FindFirstCharacterToEncode may not accept - // null pointers and gaurd against that. Hence, check up-front and fall down to return -1. - if (encoder != null && !value.IsEmpty) - { - idx = encoder.FindFirstCharacterToEncode(ptr, value.Length); - goto Return; - } - -#if BUILDING_INBOX_LIBRARY - if (Sse2.IsSupported) - { - short* startingAddress = (short*)ptr; - while (value.Length - 8 >= idx) - { - Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + value.Length - 8)); - - // Load the next 8 characters. - Vector128 sourceValue = Sse2.LoadVector128(startingAddress); - - // Check if any of the 8 characters need to be escaped. 
- Vector128 mask = CreateEscapingMask(sourceValue); - - int index = Sse2.MoveMask(mask.AsByte()); - // If index == 0, that means none of the 8 characters needed to be escaped. - // TrailingZeroCount is relatively expensive, avoid it if possible. - if (index != 0) - { - // Found at least one character that needs to be escaped, figure out the index of - // the first one found that needed to be escaped within the 8 characters. - Debug.Assert(index > 0 && index <= 65_535); - int tzc = BitOperations.TrailingZeroCount(index); - Debug.Assert(tzc % 2 == 0 && tzc >= 0 && tzc <= 16); - idx += tzc >> 1; - goto Return; - } - idx += 8; - startingAddress += 8; - } - - // Process the remaining characters. - Debug.Assert(value.Length - idx < 8); - } -#endif - - for (; idx < value.Length; idx++) - { - Debug.Assert((ptr + idx) <= (ptr + value.Length)); - if (NeedsEscaping(*(ptr + idx))) - { - goto Return; - } - } - - idx = -1; // All characters are allowed. - - Return: - return idx; + return (encoder ?? JavaScriptEncoder.Default).FindFirstCharacterToEncode(ptr, value.Length); } } From 28a5bea6877c69bf2c3d53c7bb76cd868efcbb09 Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Wed, 23 Oct 2019 01:55:00 -0700 Subject: [PATCH 4/9] Add necessary using directive in tests. 
--- .../tests/JavaScriptStringEncoderTests.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs b/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs index 062cb942b613..58319834ebed 100644 --- a/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs +++ b/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs @@ -8,6 +8,7 @@ using System.Globalization; using System.IO; using System.Linq; +using System.Text; using System.Text.Encodings.Web; using System.Text.Unicode; using Xunit; From de7b0f31924509fd1a78bcf703c8bfae97caa3e8 Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Wed, 23 Oct 2019 02:13:32 -0700 Subject: [PATCH 5/9] Move using directive within ifdef to make it clear when it's used. --- .../src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs b/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs index 2a5101310d65..afba857013d1 100644 --- a/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs +++ b/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs @@ -5,9 +5,12 @@ using System.Buffers; using System.Buffers.Text; using System.Diagnostics; -using System.Runtime.CompilerServices; // Do not remove. Needed for Int32LsbToHexDigit when !BUILDING_INBOX_LIBRARY using System.Text.Encodings.Web; +#if !BUILDING_INBOX_LIBRARY +using System.Runtime.CompilerServices; +#endif + namespace System.Text.Json { // TODO: Replace the escaping logic with publicly shipping APIs from https://github.com/dotnet/corefx/issues/33509 From 891e315fc83aa56671e7f560ea6c875fbe517c4f Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Wed, 23 Oct 2019 04:06:16 -0700 Subject: [PATCH 6/9] Use a custom constant for net core app rather than one used by the SDK.
--- .../src/System.Text.Encodings.Web.csproj | 1 + .../System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs | 4 ++-- .../Encodings/Web/DefaultJavaScriptEncoderBasicLatin.cs | 6 +++--- .../src/System/Text/Encodings/Web/TextEncoder.cs | 4 ++-- .../Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs | 6 +++--- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj b/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj index af978c992baf..ede309349679 100644 --- a/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj +++ b/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj @@ -4,6 +4,7 @@ System.Text.Encodings.Web true netcoreapp-Debug;netcoreapp-Release;netstandard-Debug;netstandard-Release;netstandard2.1-Debug;netstandard2.1-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release + $(DefineConstants);BUILDING_INBOX_LIBRARY diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs index ad6ffb9bc829..e3adc3cc6144 100644 --- a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs @@ -8,7 +8,7 @@ using System.Text.Internal; using System.Text.Unicode; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif @@ -87,7 +87,7 @@ public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf { int idx = 0; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY if (Sse2.IsSupported) { sbyte* startingAddress = (sbyte*)ptr; diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoderBasicLatin.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoderBasicLatin.cs index 
db25c3d6d63f..de1c466e308b 100644 --- a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoderBasicLatin.cs +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoderBasicLatin.cs @@ -7,7 +7,7 @@ using System.Text.Internal; using System.Text.Unicode; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY using System.Numerics; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -83,7 +83,7 @@ public override unsafe int FindFirstCharacterToEncode(char* text, int textLength int idx = 0; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY if (Sse2.IsSupported) { short* startingAddress = (short*)text; @@ -141,7 +141,7 @@ public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf { int idx = 0; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY if (Sse2.IsSupported) { sbyte* startingAddress = (sbyte*)ptr; diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs index 7f3b47fcec1d..eb2b50129ad4 100644 --- a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs @@ -10,7 +10,7 @@ using System.Runtime.InteropServices; using System.Text.Unicode; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif @@ -711,7 +711,7 @@ public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf8 { int idx = 0; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY if (Sse2.IsSupported) { sbyte* startingAddress = (sbyte*)ptr; diff --git a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs index bab848b5ad90..a47afe1bf021 100644 --- 
a/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs +++ b/src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs @@ -8,7 +8,7 @@ using System.Text.Internal; using System.Text.Unicode; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY using System.Numerics; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -63,7 +63,7 @@ public override unsafe int FindFirstCharacterToEncode(char* text, int textLength int idx = 0; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY if (Sse2.IsSupported) { short* startingAddress = (short*)text; @@ -142,7 +142,7 @@ public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf { int idx = 0; -#if NETCOREAPP +#if BUILDING_INBOX_LIBRARY if (Sse2.IsSupported) { sbyte* startingAddress = (sbyte*)ptr; From 7b71e1a5219965af2ed84789b08713219b1ac5b5 Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Wed, 23 Oct 2019 04:20:28 -0700 Subject: [PATCH 7/9] Add more tests for custom text encoder case. 
--- .../tests/JavaScriptStringEncoderTests.cs | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs b/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs index 58319834ebed..c66044f04717 100644 --- a/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs +++ b/src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs @@ -5,11 +5,14 @@ using System; using System.Buffers; using System.Collections.Generic; +using System.Diagnostics; using System.Globalization; using System.IO; using System.Linq; +using System.Runtime.CompilerServices; using System.Text; using System.Text.Encodings.Web; +using System.Text.Internal; using System.Text.Unicode; using Xunit; @@ -158,6 +161,22 @@ public static IEnumerable EscapingTestData new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, new object[] { '\uFFFD', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + + new object[] { 'a', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '\u001F', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\u2000', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\u00A2', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '\uA686', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '\u6C49', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '"', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\\', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '<', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '>', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '&', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '`', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\'', new 
MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '+', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\uFFFD', new MyCustomEncoder(UnicodeRanges.All), false }, }; } } @@ -234,6 +253,22 @@ public static IEnumerable EscapingTestData_NonAscii new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, new object[] { '\uFFFD', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + + new object[] { 'a', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '\u001F', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\u2000', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\u00A2', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '\uA686', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '\u6C49', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '"', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\\', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '<', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '>', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '&', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '`', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\'', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '+', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\uFFFD', new MyCustomEncoder(UnicodeRanges.All), false }, }; } } @@ -307,6 +342,7 @@ public static IEnumerable JavaScriptEncoders new object[] { JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, new object[] { JavaScriptEncoder.Create(UnicodeRanges.All) }, new object[] { JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + new object[] { new MyCustomEncoder(UnicodeRanges.BasicLatin) }, }; } } @@ -361,10 +397,89 @@ public static IEnumerable InvalidEscapingTestData 
new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + + new object[] { '\uD801', new MyCustomEncoder(UnicodeRanges.BasicLatin) }, + new object[] { '\uDC01', new MyCustomEncoder(UnicodeRanges.BasicLatin) }, }; } } + internal sealed class MyCustomEncoder : JavaScriptEncoder + { + private readonly AllowedCharactersBitmap _allowedCharacters; + + public MyCustomEncoder(TextEncoderSettings filter) + { + if (filter == null) + { + throw new ArgumentNullException(nameof(filter)); + } + + _allowedCharacters = filter.GetAllowedCharacters(); + + // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed + // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp) + _allowedCharacters.ForbidUndefinedCharacters(); + + // Forbid characters that are special in HTML. + // Even though this is not an HTML encoder, + // it's unfortunately common for developers to + // forget to HTML-encode a string once it has been JS-encoded, + // so this offers extra protection. + ForbidHtmlCharacters(_allowedCharacters); + + // '\' (U+005C REVERSE SOLIDUS) must always be escaped in Javascript / ECMAScript / JSON. + // '/' (U+002F SOLIDUS) is not Javascript / ECMAScript / JSON-sensitive so doesn't need to be escaped. + _allowedCharacters.ForbidCharacter('\\'); + + // '`' (U+0060 GRAVE ACCENT) is ECMAScript-sensitive (see ECMA-262).
+ _allowedCharacters.ForbidCharacter('`'); + } + + internal static void ForbidHtmlCharacters(AllowedCharactersBitmap allowedCharacters) + { + allowedCharacters.ForbidCharacter('<'); + allowedCharacters.ForbidCharacter('>'); + allowedCharacters.ForbidCharacter('&'); + allowedCharacters.ForbidCharacter('\''); // can be used to escape attributes + allowedCharacters.ForbidCharacter('\"'); // can be used to escape attributes + allowedCharacters.ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks + } + + public MyCustomEncoder(params UnicodeRange[] allowedRanges) : this(new TextEncoderSettings(allowedRanges)) + { } + + public override int MaxOutputCharactersPerInputCharacter => 12; // "\uFFFF\uFFFF" is the longest encoded form + + public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) + { + throw new NotImplementedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) + { + if (text == null) + { + throw new ArgumentNullException(nameof(text)); + } + + return _allowedCharacters.FindFirstCharacterToEncode(text, textLength); + } + + public override bool WillEncode(int unicodeScalar) + { + if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)) + { + return true; + } + + Debug.Assert(unicodeScalar >= char.MinValue && unicodeScalar <= char.MaxValue); + + return !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar); + } + } + [Fact] public void TestSurrogate() { From 7ae6bf13700b60f7eaa7fbd5286445e4bb54bd56 Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Wed, 23 Oct 2019 12:56:37 -0700 Subject: [PATCH 8/9] Fix typo in comment gaurd -> guard --- .../src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs b/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs index afba857013d1..a6e0c8c9e797 100644 --- a/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs +++ b/src/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs @@ -62,7 +62,7 @@ public static int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder enco public static unsafe int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) { // Some implementations of JavaScriptEncoder.FindFirstCharacterToEncode may not accept - // null pointers and gaurd against that. Hence, check up-front to return -1. + // null pointers and guard against that. Hence, check up-front to return -1. if (value.IsEmpty) { return -1; From 7ffd8df28cb4efd4b212600c90693d8949f2985d Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Wed, 23 Oct 2019 15:18:14 -0700 Subject: [PATCH 9/9] Update the S.T.E.W configurations to explicitly target a versioned TFM (nc3.0). 
--- src/System.Text.Encodings.Web/src/Configurations.props | 2 +- .../src/System.Text.Encodings.Web.csproj | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/System.Text.Encodings.Web/src/Configurations.props b/src/System.Text.Encodings.Web/src/Configurations.props index 4f327a843e1b..36cef612f8ef 100644 --- a/src/System.Text.Encodings.Web/src/Configurations.props +++ b/src/System.Text.Encodings.Web/src/Configurations.props @@ -1,7 +1,7 @@  - netcoreapp; + netcoreapp3.0; netstandard2.1; netstandard; uap-Windows_NT; diff --git a/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj b/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj index ede309349679..b4796cc3ab9d 100644 --- a/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj +++ b/src/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj @@ -3,7 +3,7 @@ {B7EDBF00-765A-48E8-B593-CD668288E274} System.Text.Encodings.Web true - netcoreapp-Debug;netcoreapp-Release;netstandard-Debug;netstandard-Release;netstandard2.1-Debug;netstandard2.1-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release + netcoreapp3.0-Debug;netcoreapp3.0-Release;netstandard-Debug;netstandard-Release;netstandard2.1-Debug;netstandard2.1-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release $(DefineConstants);BUILDING_INBOX_LIBRARY