Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/System.Text.Encodings.Web/src/Configurations.props
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
<Project DefaultTargets="Build">
<PropertyGroup>
<BuildConfigurations>
netcoreapp3.0;
netstandard2.1;
netstandard;
uap-Windows_NT;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
<ProjectGuid>{B7EDBF00-765A-48E8-B593-CD668288E274}</ProjectGuid>
<RootNamespace>System.Text.Encodings.Web</RootNamespace>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<Configurations>netstandard-Debug;netstandard-Release;netstandard2.1-Debug;netstandard2.1-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release</Configurations>
<Configurations>netcoreapp3.0-Debug;netcoreapp3.0-Release;netstandard-Debug;netstandard-Release;netstandard2.1-Debug;netstandard2.1-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release</Configurations>
<DefineConstants Condition="'$(TargetsNetCoreApp)' == 'true'">$(DefineConstants);BUILDING_INBOX_LIBRARY</DefineConstants>
</PropertyGroup>
<ItemGroup>
<Compile Include="System\Text\Encodings\Web\DefaultJavaScriptEncoder.cs" />
<Compile Include="System\Text\Encodings\Web\DefaultJavaScriptEncoderBasicLatin.cs" />
<Compile Include="System\Text\Encodings\Web\HexUtil.cs" />
<Compile Include="System\Text\Encodings\Web\HtmlEncoder.cs" />
<Compile Include="System\Text\Encodings\Web\JavaScriptEncoder.cs" />
<Compile Include="System\Text\Encodings\Web\JavaScriptEncoderHelper.cs" />
<Compile Include="System\Text\Encodings\Web\TextEncoder.cs" />
<Compile Include="System\Text\Encodings\Web\TextEncoderSettings.cs" />
<Compile Include="System\Text\Encodings\Web\UnsafeRelaxedJavaScriptEncoder.cs" />
Expand All @@ -20,6 +24,9 @@
<Compile Include="System\Text\Unicode\UnicodeRanges.cs" />
<Compile Include="System\Text\Unicode\UnicodeRanges.generated.cs" />
</ItemGroup>
<ItemGroup Condition="'$(TargetsNetCoreApp)' == 'true'">
<Compile Include="System\Text\Encodings\Web\Sse2Helper.cs" />
</ItemGroup>
<ItemGroup>
<Compile Include="$(CommonPath)\CoreLib\System\Text\UnicodeDebug.cs">
<Link>System\Text\UnicodeDebug.cs</Link>
Expand All @@ -37,4 +44,7 @@
<Reference Include="System.Runtime.Extensions" />
<Reference Include="System.Threading" />
</ItemGroup>
<ItemGroup Condition="'$(TargetsNetCoreApp)' == 'true'">
<Reference Include="System.Runtime.Intrinsics" />
</ItemGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Buffers;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text.Internal;
using System.Text.Unicode;

#if BUILDING_INBOX_LIBRARY
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif

namespace System.Text.Encodings.Web
{
internal sealed class DefaultJavaScriptEncoder : JavaScriptEncoder
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of the changes in this file were copied from JavaScriptEncoder.cs into a separate file and do not change the existing logic.

{
// Bitmap of the characters this encoder leaves unescaped. The constructor obtains it from the
// caller's TextEncoderSettings and then applies the Forbid* calls to it.
private readonly AllowedCharactersBitmap _allowedCharacters;

// Lookup table indexed by ASCII value (0x00-0x7F). The constructor fills every slot with
// 1 when WillEncode reports that the value must be escaped, and -1 otherwise.
private readonly int[] _asciiNeedsEscaping = new int[0x80];

/// <summary>
/// Creates an encoder that leaves unescaped only the characters permitted by
/// <paramref name="filter"/>, minus the characters this encoder always forbids.
/// </summary>
/// <param name="filter">Settings that describe the characters the caller wants to allow.</param>
/// <exception cref="ArgumentNullException"><paramref name="filter"/> is null.</exception>
public DefaultJavaScriptEncoder(TextEncoderSettings filter)
{
    if (filter == null)
    {
        throw new ArgumentNullException(nameof(filter));
    }

    _allowedCharacters = filter.GetAllowedCharacters();

    // Code points that aren't mapped to characters, or that are otherwise always disallowed,
    // are never let through (categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp).
    _allowedCharacters.ForbidUndefinedCharacters();

    // Characters that are special in HTML are forbidden as well. This is not an HTML encoder,
    // but developers commonly forget to HTML-encode a string after it has been JS-encoded,
    // so this gives an extra layer of protection.
    DefaultHtmlEncoder.ForbidHtmlCharacters(_allowedCharacters);

    // '\' (U+005C REVERSE SOLIDUS) must always be escaped in Javascript / ECMAScript / JSON.
    // '/' (U+002F SOLIDUS) is not Javascript / ECMAScript / JSON-sensitive, so it is left alone.
    _allowedCharacters.ForbidCharacter('\\');

    // '`' (U+0060 GRAVE ACCENT) is ECMAScript-sensitive (see ECMA-262).
    _allowedCharacters.ForbidCharacter('`');

    // Cache the WillEncode answer for every ASCII value: 1 = must be escaped, -1 = allowed.
    for (int asciiValue = 0; asciiValue < _asciiNeedsEscaping.Length; asciiValue++)
    {
        if (WillEncode(asciiValue))
        {
            _asciiNeedsEscaping[asciiValue] = 1;
        }
        else
        {
            _asciiNeedsEscaping[asciiValue] = -1;
        }
    }
}

/// <summary>
/// Creates an encoder from a list of Unicode ranges by wrapping them in a new
/// <see cref="TextEncoderSettings"/> and forwarding to the settings-based constructor.
/// </summary>
/// <param name="allowedRanges">The Unicode ranges the caller wants to allow.</param>
public DefaultJavaScriptEncoder(params UnicodeRange[] allowedRanges)
    : this(new TextEncoderSettings(allowedRanges))
{
}

/// <summary>
/// Reports whether this encoder escapes the given scalar value.
/// </summary>
/// <param name="unicodeScalar">The Unicode scalar value to test.</param>
/// <returns>
/// <c>true</c> for every supplementary code point and for any other value that the
/// allowed-characters bitmap does not permit; otherwise <c>false</c>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override bool WillEncode(int unicodeScalar)
{
    if (!UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar))
    {
        // Anything that is not supplementary is expected to fit in a single UTF-16 code unit.
        Debug.Assert(unicodeScalar >= char.MinValue && unicodeScalar <= char.MaxValue);

        bool isAllowed = _allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar);
        return !isAllowed;
    }

    // Supplementary code points are always escaped.
    return true;
}

/// <summary>
/// Scans a UTF-16 buffer by delegating to the allowed-characters bitmap's
/// <c>FindFirstCharacterToEncode</c>.
/// </summary>
/// <param name="text">Pointer to the first char of the text to scan.</param>
/// <param name="textLength">Length value passed through to the bitmap scan.</param>
/// <returns>The value reported by the bitmap scan.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override unsafe int FindFirstCharacterToEncode(char* text, int textLength)
{
    if (text != null)
    {
        return _allowedCharacters.FindFirstCharacterToEncode(text, textLength);
    }

    throw new ArgumentNullException(nameof(text));
}

// Scans UTF-8 text and returns the index of the first byte at which scanning stops: either an
// ASCII byte flagged in _asciiNeedsEscaping, the start of a multi-byte sequence that WillEncode
// rejects, or the start of a sequence that does not decode with OperationStatus.Done.
// Returns -1 when the whole input is scanned without hitting any of those.
public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
fixed (byte* ptr = utf8Text)
{
int idx = 0;

#if BUILDING_INBOX_LIBRARY
// Vectorized path (only compiled when BUILDING_INBOX_LIBRARY is defined): examines 16 bytes per
// iteration for as long as at least 16 bytes remain past idx.
if (Sse2.IsSupported)
{
sbyte* startingAddress = (sbyte*)ptr;
while (utf8Text.Length - 16 >= idx)
{
Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

// Load the next 16 bytes.
Vector128<sbyte> sourceValue = Sse2.LoadVector128(startingAddress);

// NOTE(review): Sse2Helper.CreateAsciiMask is defined elsewhere; judging by the branch comment
// below, a non-zero MoveMask result means the chunk contains a non-ASCII byte - confirm.
Vector128<sbyte> mask = Sse2Helper.CreateAsciiMask(sourceValue);
int index = Sse2.MoveMask(mask);

if (index != 0)
{
// At least one of the following 16 bytes is non-ASCII.

// Walk this chunk one scalar at a time. Decoding slices from utf8Text itself, so a multi-byte
// sequence that begins inside the chunk may move idx beyond processNextSixteen.
int processNextSixteen = idx + 16;
Debug.Assert(processNextSixteen <= utf8Text.Length);

while (idx < processNextSixteen)
{
Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
{
if (DoesAsciiNeedEncoding(ptr[idx]) == 1)
{
goto Return;
}
idx++;
}
else
{
OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

Debug.Assert(nextScalarValue <= int.MaxValue);
// A decode result other than Done stops the scan at idx, exactly like a scalar that must be encoded.
if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
{
goto Return;
}

Debug.Assert(opStatus == OperationStatus.Done);
idx += utf8BytesConsumedForScalar;
}
}
}
else
{
// Unrolled table lookup for the 16 bytes of the chunk. Each ++idx runs before its lookup, so when
// the || chain short-circuits idx already points at the byte that needs encoding. If all 16
// lookups pass, idx sits on the last byte of the chunk and the idx++ below steps past it.
if (DoesAsciiNeedEncoding(ptr[idx]) == 1

|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1

|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1

|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1

|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1
|| DoesAsciiNeedEncoding(ptr[++idx]) == 1)
{
goto Return;
}
idx++;
}
// Recompute the load address from idx, which either branch above has advanced.
startingAddress = (sbyte*)ptr + idx;
}

// Process the remaining bytes.
Debug.Assert(utf8Text.Length - idx < 16);
}
#endif

// Scalar path: handles whatever the vectorized loop left over, or the entire input when that
// loop is not compiled in or did not run.
while (idx < utf8Text.Length)
{
Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
{
if (DoesAsciiNeedEncoding(ptr[idx]) == 1)
{
goto Return;
}
idx++;
}
else
{
OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

Debug.Assert(nextScalarValue <= int.MaxValue);
if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
{
goto Return;
}

Debug.Assert(opStatus == OperationStatus.Done);
idx += utf8BytesConsumedForScalar;
}
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does it make sense to add assert after this line idx == utf8Text.Length?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The while loop condition already guards for idx < utf8Text.Length, so adding the assert would be primarily to make sure idx isn't greater than utf8Text.Length. Sure, we can add it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed this nit in #42064


// Reached only when the loops above finished without jumping to Return.
idx = -1; // All bytes are allowed.

Return:
return idx;
}
}

/// <summary>
/// Looks up the cached escaping decision for an ASCII byte in <c>_asciiNeedsEscaping</c>.
/// </summary>
/// <param name="value">An ASCII byte; values above 0x7F trip a debug assertion.</param>
/// <returns>The table entry for <paramref name="value"/>, asserted to be either 1 or -1.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private int DoesAsciiNeedEncoding(byte value)
{
    Debug.Assert(value <= 0x7F);

    int tableEntry = _asciiNeedsEscaping[value];

    // The table only ever holds these two sentinel values.
    Debug.Assert(tableEntry == -1 || tableEntry == 1);

    return tableEntry;
}

// A single UTF-16 code unit escapes to at most 6 output chars: [input] U+FFFF -> [output] "\uFFFF".
// Astral code points are represented as encoded surrogate pairs in the output, i.e. two such
// escapes back to back, which is why the reported maximum is 12 rather than 6.
public override int MaxOutputCharactersPerInputCharacter
{
    get { return 12; } // "\uFFFF\uFFFF" is the longest encoded form
}

// Two-character escape sequences that TryEncodeUnicodeScalar copies to the output verbatim.
private static readonly char[] s_b = { '\\', 'b' };       // backspace
private static readonly char[] s_t = { '\\', 't' };       // horizontal tab
private static readonly char[] s_n = { '\\', 'n' };       // line feed
private static readonly char[] s_f = { '\\', 'f' };       // form feed
private static readonly char[] s_r = { '\\', 'r' };       // carriage return
private static readonly char[] s_back = { '\\', '\\' };   // reverse solidus

// Writes a scalar value as a JavaScript-escaped character (or sequence of characters).
// See ECMA-262, Sec. 7.8.4, and ECMA-404, Sec. 9
// http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.4
// http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
/// <summary>
/// Writes <paramref name="unicodeScalar"/> to <paramref name="buffer"/>, escaping it when
/// <see cref="WillEncode"/> says it must be escaped.
/// </summary>
/// <param name="unicodeScalar">The scalar value to write.</param>
/// <param name="buffer">Destination buffer.</param>
/// <param name="bufferLength">Length value passed through to the write helpers.</param>
/// <param name="numberOfCharactersWritten">Set by whichever write helper handles the scalar.</param>
/// <returns>The result of the write helper that handled the scalar.</returns>
/// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten)
{
    if (buffer == null)
    {
        throw new ArgumentNullException(nameof(buffer));
    }

    // Notes on the escape forms chosen below:
    //  - ECMA-262 allows encoding U+000B as "\v", but ECMA-404 does not, so "\v" is never emitted.
    //  - Both ECMA-262 and ECMA-404 allow encoding U+002F SOLIDUS as "\/" (in ECMA-262 it is a
    //    NonEscape character). SOLIDUS is not encoded by default; it only reaches the escaping
    //    path when the caller supplied an explicit bitmap that excludes it. In that case we
    //    assume the caller did not want a SOLIDUS in the output at all, so it is written using
    //    the "\u002F" encoding rather than "\/".
    //  - HTML-specific characters (including apostrophe and quotes) have no short escape here;
    //    for defense-in-depth they take the default case of the switch below.

    if (WillEncode(unicodeScalar))
    {
        switch (unicodeScalar)
        {
            case '\b':
                return TryCopyCharacters(s_b, buffer, bufferLength, out numberOfCharactersWritten);
            case '\t':
                return TryCopyCharacters(s_t, buffer, bufferLength, out numberOfCharactersWritten);
            case '\n':
                return TryCopyCharacters(s_n, buffer, bufferLength, out numberOfCharactersWritten);
            case '\f':
                return TryCopyCharacters(s_f, buffer, bufferLength, out numberOfCharactersWritten);
            case '\r':
                return TryCopyCharacters(s_r, buffer, bufferLength, out numberOfCharactersWritten);
            case '\\':
                return TryCopyCharacters(s_back, buffer, bufferLength, out numberOfCharactersWritten);
            default:
                // Everything without a short escape is handed to the shared helper.
                return JavaScriptEncoderHelper.TryWriteEncodedScalarAsNumericEntity(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten);
        }
    }

    // Allowed scalars are written through unchanged.
    return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten);
}
}
}
Loading