Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions Microsoft.ML.sln
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.AutoML.SourceG
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TorchSharp", "src\Microsoft.ML.TorchSharp\Microsoft.ML.TorchSharp.csproj", "{FF0BD187-4451-4A3B-934B-2AE3454896E2}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Tokenizers", "src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj", "{BBC3A950-BD68-45AC-9DBD-A8F4D8847745}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Tokenizers.Tests", "test\Microsoft.ML.Tokenizers.Tests\Microsoft.ML.Tokenizers.Tests.csproj", "{C3D82402-F207-4F19-8C57-5AF0FBAF9682}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -747,6 +751,22 @@ Global
{FF0BD187-4451-4A3B-934B-2AE3454896E2}.Release|Any CPU.Build.0 = Release|Any CPU
{FF0BD187-4451-4A3B-934B-2AE3454896E2}.Release|x64.ActiveCfg = Release|Any CPU
{FF0BD187-4451-4A3B-934B-2AE3454896E2}.Release|x64.Build.0 = Release|Any CPU
{BBC3A950-BD68-45AC-9DBD-A8F4D8847745}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BBC3A950-BD68-45AC-9DBD-A8F4D8847745}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BBC3A950-BD68-45AC-9DBD-A8F4D8847745}.Debug|x64.ActiveCfg = Debug|Any CPU
{BBC3A950-BD68-45AC-9DBD-A8F4D8847745}.Debug|x64.Build.0 = Debug|Any CPU
{BBC3A950-BD68-45AC-9DBD-A8F4D8847745}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BBC3A950-BD68-45AC-9DBD-A8F4D8847745}.Release|Any CPU.Build.0 = Release|Any CPU
{BBC3A950-BD68-45AC-9DBD-A8F4D8847745}.Release|x64.ActiveCfg = Release|Any CPU
{BBC3A950-BD68-45AC-9DBD-A8F4D8847745}.Release|x64.Build.0 = Release|Any CPU
{C3D82402-F207-4F19-8C57-5AF0FBAF9682}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C3D82402-F207-4F19-8C57-5AF0FBAF9682}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C3D82402-F207-4F19-8C57-5AF0FBAF9682}.Debug|x64.ActiveCfg = Debug|Any CPU
{C3D82402-F207-4F19-8C57-5AF0FBAF9682}.Debug|x64.Build.0 = Debug|Any CPU
{C3D82402-F207-4F19-8C57-5AF0FBAF9682}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C3D82402-F207-4F19-8C57-5AF0FBAF9682}.Release|Any CPU.Build.0 = Release|Any CPU
{C3D82402-F207-4F19-8C57-5AF0FBAF9682}.Release|x64.ActiveCfg = Release|Any CPU
{C3D82402-F207-4F19-8C57-5AF0FBAF9682}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -825,6 +845,8 @@ Global
{A3E9F25F-2718-4FF9-A35A-54C232A847AB} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{C804B990-390E-41D7-8FF1-6774495D70E2} = {7F13E156-3EBA-4021-84A5-CD56BA72F99E}
{FF0BD187-4451-4A3B-934B-2AE3454896E2} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{BBC3A950-BD68-45AC-9DBD-A8F4D8847745} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{C3D82402-F207-4F19-8C57-5AF0FBAF9682} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
Expand Down
21 changes: 20 additions & 1 deletion THIRD-PARTY-NOTICES.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,23 @@ INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PA
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
DEALINGS IN THE SOFTWARE.

License notice for HuggingFace Tokenizers
------------------------------------------

https://github.com/huggingface/tokenizers/blob/main/LICENSE

Copyright 2004 HuggingFace Tokenizers

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
6 changes: 6 additions & 0 deletions src/Microsoft.ML.SearchSpace/Parameter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,9 @@ public bool Remove(string key)
return (_value as IDictionary<string, Parameter>).Remove(key);
}

/// <summary>
/// <inheritdoc/>
/// </summary>
public bool Equals(Parameter other)
{
//Check whether the compared object is null.
Expand All @@ -418,6 +421,9 @@ public bool Equals(Parameter other)
return thisJson == otherJson;
}

/// <summary>
/// <inheritdoc/>
/// </summary>
public override int GetHashCode()
{
var thisJson = JsonSerializer.Serialize(this);
Expand Down
91 changes: 91 additions & 0 deletions src/Microsoft.ML.Tokenizers/AddedToken.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Text;

namespace Microsoft.ML.Tokenizers
{
/// <summary>
/// Represents a token added by the user on top of the existing Model vocabulary.
/// An AddedToken can be configured to specify the behavior it should have in various situations,
/// such as:
///  - Whether it should only match single words
///  - Whether to include any WhiteSpace on its left or right
/// </summary>
/// <remarks>
/// Equality and hashing consider only <see cref="Content"/>; the remaining options do not take part
/// in the comparison, so two tokens with the same content but different options compare equal.
/// </remarks>
public struct AddedToken : IEquatable<AddedToken>
{
    /// <summary>
    /// Gets or sets the content of the added token
    /// </summary>
    public string Content { get; set; }

    /// <summary>
    /// Gets or sets whether this token must be a single word or can break words
    /// </summary>
    internal bool SingleWord { get; set; }

    /// <summary>
    /// Gets or sets whether this token should strip WhiteSpaces on its left
    /// </summary>
    internal bool LeftStrip { get; set; }

    /// <summary>
    /// Gets or sets whether this token should strip WhiteSpaces on its right
    /// </summary>
    internal bool RightStrip { get; set; }

    /// <summary>
    /// Gets or sets whether this token should be normalized
    /// </summary>
    internal bool Normalized { get; set; }

    /// <summary>
    /// Gets or sets whether this token is special
    /// </summary>
    internal bool Special { get; set; }

    /// <summary>
    /// Create a new AddedToken object with empty content, every option switched off,
    /// and normalization switched on.
    /// </summary>
    public AddedToken()
    {
        Content = "";
        SingleWord = LeftStrip = RightStrip = Special = false;
        Normalized = true;
    }

    /// <summary>
    /// Create a new AddedToken object from the given content, specifying if it is intended to be a
    /// special token. Special tokens are not normalized by default.
    /// </summary>
    /// <param name="content">The content of the added token. A null value is stored as an empty string.</param>
    /// <param name="special">Indicate whether this token is special.</param>
    public AddedToken(string content, bool special = false) : this()
    {
        Content = content ?? "";
        Special = special;
        Normalized = !special;
    }

    /// <summary>
    /// Determines whether two token instances are equal. Only the content is compared.
    /// </summary>
    /// <param name="other">The token to compare with the current token.</param>
    /// <returns>true if both tokens have the same content; otherwise, false.</returns>
    public bool Equals(AddedToken other) => Content == other.Content;

    /// <summary>
    /// Determines whether the given object is an AddedToken equal to the current token.
    /// </summary>
    /// <param name="obj">The object to compare with the current token.</param>
    /// <returns>true if <paramref name="obj"/> is an AddedToken with the same content; otherwise, false.</returns>
    // Without this override, boxed comparisons use ValueType.Equals, which compares every field and
    // therefore disagrees with the content-only Equals(AddedToken) above.
    public override bool Equals(object? obj) => obj is AddedToken other && Equals(other);

    /// <summary>
    /// Returns the hash code for the current token.
    /// </summary>
    /// <returns>The hash code of the content, or 0 when the content is null.</returns>
    // We only want to hash on the content. AddedToken cannot be added multiple times with different options.
    // default(AddedToken) bypasses the constructors and leaves Content null, so guard the dereference.
    public override int GetHashCode() => Content is null ? 0 : Content.GetHashCode();

    /// <summary>
    /// Defines an implicit conversion of a string object to AddedToken.
    /// </summary>
    /// <param name="token">The content of the token to create.</param>
    public static implicit operator AddedToken(string token) => new AddedToken(token);
}
}
13 changes: 13 additions & 0 deletions src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<Project Sdk="Microsoft.NET.Sdk">
<Import Project="$(RepoRoot)eng/pkg/Pack.props" />

<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<Nullable>enable</Nullable>
<PackageDescription>Microsoft.ML.Tokenizers contains the implementation of the tokenization used in the NLP transforms.</PackageDescription>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="System.Text.Json" Version="$(SystemTextJsonVersion)" />
</ItemGroup>
</Project>
Loading