This repository was archived by the owner on Jan 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4.9k
[release/3.1] Use Sse2 intrinsics to optimize NeedsEscaping/FindFirstCharToEncode for all built-in JavaScriptEncoders #42030
Closed
Closed
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
849f995
Use Sse2 intrinsics to make NeedsEscaping check faster for large JSO…
ahsonkhan 58ec075
Optimize FindFirstCharToEncode for JavaScriptEncoder.Default and Rela…
ahsonkhan 8eec622
When encoder is null, use JavaScriptEncoder.Default to check for Need…
ahsonkhan 28a5bea
Add necessary using directive in tests.
ahsonkhan de7b0f3
Move using directive within ifdef to make it clear when its used.
ahsonkhan 891e315
Use a custom constant for net core app rather than one used by the SDK.
ahsonkhan 7b71e1a
Add more tests for custom text encoder case.
ahsonkhan 7ae6bf1
Fix typo in comment gaurd -> guard
ahsonkhan 7ffd8df
Update the S.T.E.W configurations to explicitly target a versioned TF…
ahsonkhan File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
271 changes: 271 additions & 0 deletions
271
src/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,271 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
| // See the LICENSE file in the project root for more information. | ||
|
|
||
| using System.Buffers; | ||
| using System.Diagnostics; | ||
| using System.Runtime.CompilerServices; | ||
| using System.Text.Internal; | ||
| using System.Text.Unicode; | ||
|
|
||
| #if BUILDING_INBOX_LIBRARY | ||
| using System.Runtime.Intrinsics; | ||
| using System.Runtime.Intrinsics.X86; | ||
| #endif | ||
|
|
||
| namespace System.Text.Encodings.Web | ||
| { | ||
| internal sealed class DefaultJavaScriptEncoder : JavaScriptEncoder | ||
| { | ||
| private readonly AllowedCharactersBitmap _allowedCharacters; | ||
|
|
||
| private readonly int[] _asciiNeedsEscaping = new int[0x80]; | ||
|
|
||
| public DefaultJavaScriptEncoder(TextEncoderSettings filter) | ||
| { | ||
| if (filter == null) | ||
| { | ||
| throw new ArgumentNullException(nameof(filter)); | ||
| } | ||
|
|
||
| _allowedCharacters = filter.GetAllowedCharacters(); | ||
|
|
||
| // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed | ||
| // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp) | ||
| _allowedCharacters.ForbidUndefinedCharacters(); | ||
|
|
||
| // Forbid characters that are special in HTML. | ||
| // Even though this is not an HTML encoder, | ||
| // it's unfortunately common for developers to | ||
| // forget to HTML-encode a string once it has been JS-encoded, | ||
| // so this offers extra protection. | ||
| DefaultHtmlEncoder.ForbidHtmlCharacters(_allowedCharacters); | ||
|
|
||
| // '\' (U+005C REVERSE SOLIDUS) must always be escaped in Javascript / ECMAScript / JSON. | ||
| // '/' (U+002F SOLIDUS) is not Javascript / ECMAScript / JSON-sensitive so doesn't need to be escaped. | ||
| _allowedCharacters.ForbidCharacter('\\'); | ||
|
|
||
| // '`' (U+0060 GRAVE ACCENT) is ECMAScript-sensitive (see ECMA-262). | ||
| _allowedCharacters.ForbidCharacter('`'); | ||
|
|
||
| for (int i = 0; i < _asciiNeedsEscaping.Length; i++) | ||
| { | ||
| _asciiNeedsEscaping[i] = WillEncode(i) ? 1 : -1; | ||
| } | ||
| } | ||
|
|
||
| public DefaultJavaScriptEncoder(params UnicodeRange[] allowedRanges) : this(new TextEncoderSettings(allowedRanges)) | ||
| { } | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| public override bool WillEncode(int unicodeScalar) | ||
| { | ||
| if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)) | ||
| { | ||
| return true; | ||
| } | ||
|
|
||
| Debug.Assert(unicodeScalar >= char.MinValue && unicodeScalar <= char.MaxValue); | ||
|
|
||
| return !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar); | ||
| } | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) | ||
| { | ||
| if (text == null) | ||
| { | ||
| throw new ArgumentNullException(nameof(text)); | ||
| } | ||
|
|
||
| return _allowedCharacters.FindFirstCharacterToEncode(text, textLength); | ||
| } | ||
|
|
||
| public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text) | ||
| { | ||
| fixed (byte* ptr = utf8Text) | ||
| { | ||
| int idx = 0; | ||
|
|
||
| #if BUILDING_INBOX_LIBRARY | ||
| if (Sse2.IsSupported) | ||
| { | ||
| sbyte* startingAddress = (sbyte*)ptr; | ||
| while (utf8Text.Length - 16 >= idx) | ||
| { | ||
| Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16)); | ||
|
|
||
| // Load the next 16 bytes. | ||
| Vector128<sbyte> sourceValue = Sse2.LoadVector128(startingAddress); | ||
|
|
||
| Vector128<sbyte> mask = Sse2Helper.CreateAsciiMask(sourceValue); | ||
| int index = Sse2.MoveMask(mask); | ||
|
|
||
| if (index != 0) | ||
| { | ||
| // At least one of the following 16 bytes is non-ASCII. | ||
|
|
||
| int processNextSixteen = idx + 16; | ||
| Debug.Assert(processNextSixteen <= utf8Text.Length); | ||
|
|
||
| while (idx < processNextSixteen) | ||
| { | ||
| Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length)); | ||
|
|
||
| if (UnicodeUtility.IsAsciiCodePoint(ptr[idx])) | ||
| { | ||
| if (DoesAsciiNeedEncoding(ptr[idx]) == 1) | ||
| { | ||
| goto Return; | ||
| } | ||
| idx++; | ||
| } | ||
| else | ||
| { | ||
| OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar); | ||
|
|
||
| Debug.Assert(nextScalarValue <= int.MaxValue); | ||
| if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue)) | ||
| { | ||
| goto Return; | ||
| } | ||
|
|
||
| Debug.Assert(opStatus == OperationStatus.Done); | ||
| idx += utf8BytesConsumedForScalar; | ||
| } | ||
| } | ||
| } | ||
| else | ||
| { | ||
| if (DoesAsciiNeedEncoding(ptr[idx]) == 1 | ||
|
|
||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
|
|
||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
|
|
||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
|
|
||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1 | ||
| || DoesAsciiNeedEncoding(ptr[++idx]) == 1) | ||
| { | ||
| goto Return; | ||
| } | ||
| idx++; | ||
| } | ||
| startingAddress = (sbyte*)ptr + idx; | ||
| } | ||
|
|
||
| // Process the remaining bytes. | ||
| Debug.Assert(utf8Text.Length - idx < 16); | ||
| } | ||
| #endif | ||
|
|
||
| while (idx < utf8Text.Length) | ||
| { | ||
| Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length)); | ||
|
|
||
| if (UnicodeUtility.IsAsciiCodePoint(ptr[idx])) | ||
| { | ||
| if (DoesAsciiNeedEncoding(ptr[idx]) == 1) | ||
| { | ||
| goto Return; | ||
| } | ||
| idx++; | ||
| } | ||
| else | ||
| { | ||
| OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar); | ||
|
|
||
| Debug.Assert(nextScalarValue <= int.MaxValue); | ||
| if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue)) | ||
| { | ||
| goto Return; | ||
| } | ||
|
|
||
| Debug.Assert(opStatus == OperationStatus.Done); | ||
| idx += utf8BytesConsumedForScalar; | ||
| } | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does it make sense to add assert after this line idx == utf8Text.Length?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The while loop condition already guards for
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Addressed this nit in #42064 |
||
|
|
||
| idx = -1; // All bytes are allowed. | ||
|
|
||
| Return: | ||
| return idx; | ||
| } | ||
| } | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| private int DoesAsciiNeedEncoding(byte value) | ||
| { | ||
| Debug.Assert(value <= 0x7F); | ||
|
|
||
| int needsEscaping = _asciiNeedsEscaping[value]; | ||
|
|
||
| Debug.Assert(needsEscaping == 1 || needsEscaping == -1); | ||
|
|
||
| return needsEscaping; | ||
| } | ||
|
|
||
| // A single BMP char encodes to at most 6 output chars: [input] U+FFFF -> [output] "\uFFFF". | ||
| // The value reported below is 12 rather than 6 because astral code points are represented as | ||
| // encoded surrogate pairs in the output, i.e. two consecutive 6-char escapes. | ||
| public override int MaxOutputCharactersPerInputCharacter => 12; // "\uFFFF\uFFFF" is the longest encoded form | ||
|
|
||
| private static readonly char[] s_b = new char[] { '\\', 'b' }; | ||
| private static readonly char[] s_t = new char[] { '\\', 't' }; | ||
| private static readonly char[] s_n = new char[] { '\\', 'n' }; | ||
| private static readonly char[] s_f = new char[] { '\\', 'f' }; | ||
| private static readonly char[] s_r = new char[] { '\\', 'r' }; | ||
| private static readonly char[] s_back = new char[] { '\\', '\\' }; | ||
|
|
||
| // Writes a scalar value as a JavaScript-escaped character (or sequence of characters). | ||
| // See ECMA-262, Sec. 7.8.4, and ECMA-404, Sec. 9 | ||
| // http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.4 | ||
| // http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf | ||
| public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) | ||
| { | ||
| if (buffer == null) | ||
| { | ||
| throw new ArgumentNullException(nameof(buffer)); | ||
| } | ||
| // ECMA-262 allows encoding U+000B as "\v", but ECMA-404 does not. | ||
| // Both ECMA-262 and ECMA-404 allow encoding U+002F SOLIDUS as "\/" | ||
| // (in ECMA-262 this character is a NonEscape character); however, we | ||
| // don't encode SOLIDUS by default unless the caller has provided an | ||
| // explicit bitmap which does not contain it. In this case we'll assume | ||
| // that the caller didn't want a SOLIDUS written to the output at all, | ||
| // so it should be written using "\u002F" encoding. | ||
| // HTML-specific characters (including apostrophe and quotes) will | ||
| // be written out as numeric entities for defense-in-depth. | ||
| // See UnicodeEncoderBase ctor comments for more info. | ||
|
|
||
| if (!WillEncode(unicodeScalar)) | ||
| { | ||
| return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); | ||
| } | ||
|
|
||
| char[] toCopy; | ||
| switch (unicodeScalar) | ||
| { | ||
| case '\b': toCopy = s_b; break; | ||
| case '\t': toCopy = s_t; break; | ||
| case '\n': toCopy = s_n; break; | ||
| case '\f': toCopy = s_f; break; | ||
| case '\r': toCopy = s_r; break; | ||
| case '\\': toCopy = s_back; break; | ||
| default: return JavaScriptEncoderHelper.TryWriteEncodedScalarAsNumericEntity(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); | ||
| } | ||
| return TryCopyCharacters(toCopy, buffer, bufferLength, out numberOfCharactersWritten); | ||
| } | ||
| } | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Most of the changes in this file are copy-paste from
JavaScriptEncoder.cs into a separate file and don't change the existing logic.