Commit fa68003
Address the feedback
1 parent 35e2cbc

16 files changed, +212 -79 lines

src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj

Lines changed: 9 additions & 1 deletion
@@ -2,11 +2,19 @@
   <Import Project="$(RepoRoot)eng/pkg/Pack.props" />
 
   <PropertyGroup>
-    <TargetFramework>netstandard2.0</TargetFramework>
+    <TargetFrameworks>netstandard2.0;net8.0</TargetFrameworks>
     <Nullable>enable</Nullable>
     <PackageDescription>Microsoft.ML.Tokenizers contains the implmentation of the tokenization used in the NLP transforms.</PackageDescription>
   </PropertyGroup>
 
+  <ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
+    <Compile Remove="Utils/Helpers.netcoreapp.cs" />
+  </ItemGroup>
+
+  <ItemGroup Condition="'$(TargetFramework)' != 'netstandard2.0'">
+    <Compile Remove="Utils/Helpers.netfx.cs" />
+  </ItemGroup>
+
   <ItemGroup>
     <PackageReference Include="System.Text.Json" Version="$(SystemTextJsonVersion)" />
   </ItemGroup>
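The two Utils files named in the conditional Compile items are not part of this commit page. As a hedged sketch of the usual per-TFM source split, assuming the (offset, length)-style signatures implied by the Helpers call sites in the Tiktoken.cs hunk below:

    // Utils/Helpers.netfx.cs (sketch, not the actual file) -- compiled only for
    // netstandard2.0, where the span-based parsing overloads are unavailable.
    internal static class Helpers
    {
        internal static byte[] FromBase64String(string s, int offset, int length)
            => System.Convert.FromBase64String(s.Substring(offset, length));

        internal static bool TryParseInt32(string s, int offset, out int result)
            => int.TryParse(s.Substring(offset), out result);
    }

    // Utils/Helpers.netcoreapp.cs (sketch) -- the net8.0 flavor can avoid the
    // intermediate strings, e.g. int.TryParse(s.AsSpan(offset), out result) and
    // Convert.TryFromBase64Chars(s.AsSpan(offset, length), bytes, out bytesWritten).

Each target framework compiles exactly one of the two files, so both can declare the same internal surface without #if blocks.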

src/Microsoft.ML.Tokenizers/Model/BPE.cs

Lines changed: 5 additions & 5 deletions
@@ -36,7 +36,7 @@ public string? UnknownToken
 
         if (value is null)
         {
-            if (VocabReverse.TryGetValue(0, out string v))
+            if (VocabReverse.TryGetValue(0, out string? v))
             {
                 VocabReverse.Remove(0);
                 if (Vocab.TryGetValue(v, out int id))
@@ -103,7 +103,7 @@ public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, st
             VocabReverse.Add(kvp.Value, kvp.Key);
         }
 
-        if (unknownToken is null && VocabReverse.TryGetValue(0, out string unkToken))
+        if (unknownToken is null && VocabReverse.TryGetValue(0, out string? unkToken))
         {
             unknownToken = unkToken;
         }
@@ -187,7 +187,7 @@ public override IReadOnlyList<Token> Tokenize(string sequence)
     /// <returns>The mapped token of the Id.</returns>
     public override string? IdToToken(int id, bool skipSpecialTokens = false)
     {
-        if (VocabReverse.TryGetValue(id, out string value))
+        if (VocabReverse.TryGetValue(id, out string? value))
         {
             return value;
         }
@@ -253,7 +253,7 @@ public override string[] Save(string path, string? prefix = null)
     }
 
     /// Read the given files to extract the vocab and merges
-    internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadFile(string? vocab, string? merges)
+    internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadFile(string vocab, string? merges)
     {
         Dictionary<string, int>? dic;
         using (Stream stream = File.OpenRead(vocab))
@@ -320,7 +320,7 @@ internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadFile(strin
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     internal string CharToString(char c)
     {
-        if (_charToString.TryGetValue(c, out string v))
+        if (_charToString.TryGetValue(c, out string? v))
         {
             return v;
         }
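These out-parameter annotations are the standard consequence of <Nullable>enable</Nullable> in the csproj: Dictionary<TKey, TValue>.TryGetValue marks its out parameter [MaybeNullWhen(false)], so the receiving local must be declared nullable to compile warning-free. A minimal illustration with hypothetical values:

    using System.Collections.Generic;

    Dictionary<int, string> vocabReverse = new() { [0] = "<unk>" };

    // Under nullable reference types, `out string v` here warns: the value may be
    // null when TryGetValue returns false. Declaring it `out string? v` is clean,
    // and inside the success branch the compiler still treats v as non-null.
    if (vocabReverse.TryGetValue(0, out string? v))
    {
        System.Console.WriteLine(v.Length); // no null warning: guarded by the branch
    }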

src/Microsoft.ML.Tokenizers/Model/BpeTrainer.cs

Lines changed: 9 additions & 4 deletions
@@ -83,7 +83,12 @@ public BpeTrainer(
         MinFrequency = minFrequency;
         VocabSize = vocabSize;
         Progress = progress;
-        SpecialTokens = new List<AddedToken>(specialTokens);
+
+        if (specialTokens is not null)
+        {
+            SpecialTokens = new List<AddedToken>(specialTokens);
+        }
+
         LimitAlphabet = limitAlphabet;
         InitialAlphabet = initialAlphabet;
         ContinuingSubwordPrefix = continuingSubwordPrefix;
@@ -172,7 +177,7 @@ private void ComputeAlphabet(Dictionary<string, int> wc, Dictionary<string, int>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     internal string CharToString(char c)
     {
-        if (_charToString.TryGetValue(c, out string v))
+        if (_charToString.TryGetValue(c, out string? v))
         {
             return v;
         }
@@ -259,7 +264,7 @@ internal string CharToString(char c)
         // Then update counts
         int count = counts[i];
 
-        if (!whereToUpdate.TryGetValue(curPair, out HashSet<int> h))
+        if (!whereToUpdate.TryGetValue(curPair, out HashSet<int>? h))
         {
             h = new HashSet<int>();
             whereToUpdate[curPair] = h;
@@ -398,7 +403,7 @@ internal string CharToString(char c)
 
         if (change > 0)
         {
-            if (!whereToUpdate.TryGetValue(p, out HashSet<int> h))
+            if (!whereToUpdate.TryGetValue(p, out HashSet<int>? h))
             {
                 h = new();
                 whereToUpdate[p] = h;
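The null guard in the constructor matters because the List<T> copy constructor throws rather than tolerating a null source; with the check, a null specialTokens simply leaves SpecialTokens at its default. For example (using string in place of AddedToken for a self-contained snippet):

    using System.Collections.Generic;

    IEnumerable<string>? specialTokens = null;

    // Old behavior: throws ArgumentNullException for a null source.
    // List<string> tokens = new List<string>(specialTokens);

    // New behavior: only copy when a source was actually provided.
    List<string>? tokens = specialTokens is not null
        ? new List<string>(specialTokens)
        : null;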

src/Microsoft.ML.Tokenizers/Model/Cache.cs

Lines changed: 4 additions & 4 deletions
@@ -9,7 +9,7 @@
 
 namespace Microsoft.ML.Tokenizers
 {
-    internal sealed class Cache<TKey, TValue>
+    internal sealed class Cache<TKey, TValue> where TKey : notnull
     {
         internal Cache() : this(Bpe.DefaultCacheCapacity) { }
 
@@ -39,13 +39,13 @@ internal void Clear()
 
     internal List<TValue> GetValues(IEnumerable<TKey> keys)
     {
-        List<TValue>? values = new();
+        List<TValue> values = new();
         _cacheLock.EnterReadLock();
         try
         {
             foreach (TKey key in keys)
             {
-                if (Map.TryGetValue(key, out TValue value))
+                if (Map.TryGetValue(key, out TValue? value))
                 {
                     values.Add(value);
                 }
@@ -61,7 +61,7 @@ internal List<TValue> GetValues(IEnumerable<TKey> keys)
         _cacheLock.EnterReadLock();
         try
         {
-            if (Map.TryGetValue(key, out TValue value))
+            if (Map.TryGetValue(key, out TValue? value))
            {
                 return value;
             }
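The new `where TKey : notnull` constraint mirrors the constraint on Dictionary<TKey, TValue> itself; without it, an unconstrained TKey flowing into the inner dictionary warns under nullable reference types. Sketch:

    using System.Collections.Generic;

    // Dictionary<TKey, TValue> declares `where TKey : notnull`, so a wrapper
    // whose type parameter feeds it needs the same constraint to stay warning-free.
    internal sealed class Cache<TKey, TValue> where TKey : notnull
    {
        internal Dictionary<TKey, TValue> Map { get; } = new();
    }

    // A nullable key type is now flagged at the call site:
    // var bad = new Cache<string?, int>();  // warning CS8714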

src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs

Lines changed: 6 additions & 2 deletions
@@ -429,7 +429,7 @@ private Dictionary<string, int> GetVocabulary(Stream vocabularyStream)
         using StreamReader reader = new StreamReader(mergeStream);
         while (reader.Peek() >= 0)
         {
-            splitContents.Add(reader.ReadLine());
+            splitContents.Add(reader.ReadLine()!);
         }
     }
     catch (Exception e)
@@ -761,7 +761,11 @@ public void AddFromStream(Stream stream)
 
         while (reader.Peek() >= 0)
         {
-            string line = reader.ReadLine();
+            string? line = reader.ReadLine();
+            if (line is null)
+            {
+                continue;
+            }
 
             var splitLine = line.Trim().Split(' ');
             if (splitLine.Length != 2)

src/Microsoft.ML.Tokenizers/Model/Model.cs

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ public abstract class Model
     /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
     /// <param name="accumulatedIds">The list of accumulated tokenized Ids.</param>
     /// <returns>True if the operation succeeded, false otherwise.</returns>
-    public virtual bool TokenizeToIds(string sequence, bool isSpecialToken, List<int> accumulatedIds)
+    public virtual bool TokenizeToIds(string sequence, bool isSpecialToken, IList<int> accumulatedIds)
     {
         if (accumulatedIds is null)
         {

src/Microsoft.ML.Tokenizers/Model/Tiktoken.cs

Lines changed: 12 additions & 13 deletions
@@ -120,22 +120,21 @@ internal static (Dictionary<byte[], int>, Dictionary<string, int>, IReadOnlyDict
     {
         while (!reader.EndOfStream)
         {
-            string line = reader.ReadLine();
+            string? line = reader.ReadLine();
             if (string.IsNullOrWhiteSpace(line))
             {
                 continue;
             }
 
-            var tokens = line.Split(' ');
-            if (tokens.Length != 2)
+            int spaceIndex = line.IndexOf(' ');
+            if (spaceIndex <= 0 || spaceIndex >= line.Length - 1 || line.IndexOf(' ', spaceIndex + 1) >= 0)
             {
                 throw new FormatException($"Invalid format in the BPE encoder file stream");
             }
 
-            byte[] tokenBytes = Convert.FromBase64String(tokens[0]);
-            int rank = 0;
+            byte[] tokenBytes = Helpers.FromBase64String(line, 0, spaceIndex);
 
-            if (int.TryParse(tokens[1], out rank))
+            if (Helpers.TryParseInt32(line, spaceIndex + 1, out int rank))
             {
                 encoder[tokenBytes] = rank;
                 decoder[rank] = tokenBytes;
@@ -146,7 +145,7 @@ internal static (Dictionary<byte[], int>, Dictionary<string, int>, IReadOnlyDict
             }
             else
             {
-                throw new FormatException($"Can't parse {tokens[1]} to integer");
+                throw new FormatException($"Can't parse {line.Substring(spaceIndex)} to integer");
             }
         }
     }
@@ -242,7 +241,7 @@ public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialTok
     /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
     /// <param name="accumulatedIds">The list of accumulated Ids.</param>
     /// <returns>True if the operation succeeded, false otherwise.</returns>
-    public override bool TokenizeToIds(string sequence, bool isSpecialToken, List<int> accumulatedIds)
+    public override bool TokenizeToIds(string sequence, bool isSpecialToken, IList<int> accumulatedIds)
     {
         if (string.IsNullOrEmpty(sequence))
         {
@@ -320,7 +319,7 @@ public override bool TokenizeToIds(string sequence, bool isSpecialToken, List<in
         }
 
         int[] idsToCache = BytePairEncoder.BytePairEncode(Encoding.UTF8.GetBytes(token), _encoder);
-        _cache.Add(token, idsToCache.ToArray());
+        _cache.Add(token, idsToCache);
 
         if (idsToCache.Length == 1)
         {
@@ -338,12 +337,12 @@ public override bool TokenizeToIds(string sequence, bool isSpecialToken, List<in
     /// <returns>The mapped token of the Id.</returns>
     public override string? IdToToken(int id, bool skipSpecialTokens = false)
     {
-        if (!skipSpecialTokens && _specialTokensDecoder is not null && _specialTokensDecoder.TryGetValue(id, out string token))
+        if (!skipSpecialTokens && _specialTokensDecoder is not null && _specialTokensDecoder.TryGetValue(id, out string? token))
         {
             return token;
         }
 
-        if (_decoder.TryGetValue(id, out byte[] tokenBytes))
+        if (_decoder.TryGetValue(id, out byte[]? tokenBytes))
        {
             return Encoding.UTF8.GetString(tokenBytes);
         }
@@ -363,11 +362,11 @@ public override bool TokenizeToIds(string sequence, bool isSpecialToken, List<in
 
         foreach (int id in ids)
         {
-            if (_decoder.TryGetValue(id, out byte[] tokenBytes))
+            if (_decoder.TryGetValue(id, out byte[]? tokenBytes))
             {
                 utf8Bytes.AddRange(tokenBytes);
             }
-            else if (useSpecialTokens && _specialTokensDecoder!.TryGetValue(id, out string token))
+            else if (useSpecialTokens && _specialTokensDecoder!.TryGetValue(id, out string? token))
             {
                 utf8Bytes.AddRange(Encoding.UTF8.GetBytes(token));
             }
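The rewritten loop parses each "base64-token rank" line in a single pass instead of allocating a string[] via Split, and the index checks enforce exactly one interior space. A standalone sketch of that validation (illustrative, with a hypothetical helper name):

    // Returns true only for lines of the form "token rank":
    // one space, non-empty text on both sides of it.
    static bool TryLocateSeparator(string line, out int spaceIndex)
    {
        spaceIndex = line.IndexOf(' ');
        return spaceIndex > 0                          // token part is non-empty
            && spaceIndex < line.Length - 1            // rank part is non-empty
            && line.IndexOf(' ', spaceIndex + 1) < 0;  // no second space
    }

    // TryLocateSeparator("IA== 0", out _)   -> true  (token "IA==", rank 0)
    // TryLocateSeparator(" IA==0", out _)   -> false (empty token part)
    // TryLocateSeparator("IA== 0 x", out _) -> false (more than one space)

Helpers.FromBase64String and Helpers.TryParseInt32 then decode the two halves directly from the line, with the span-based net8.0 flavor avoiding intermediate strings (see the csproj sketch above).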

src/Microsoft.ML.Tokenizers/PreTokenizer/PreTokenizer.cs

Lines changed: 50 additions & 15 deletions
@@ -5,6 +5,7 @@
 using System;
 using System.Collections;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Text.RegularExpressions;
 
 namespace Microsoft.ML.Tokenizers
@@ -75,53 +76,87 @@ public abstract class PreTokenizer
         public abstract IEnumerable<Split> PreTokenize(string sentence, bool skipSpecialTokens = false);
     }
 
-    internal readonly struct RegexSplitEnumerable : IEnumerable<Split>
+    internal sealed class RegexSplitEnumerable : IEnumerable<Split>
     {
-        private readonly MatchCollection _matches;
+        private readonly static Dictionary<string, Regex> _regexCache = new(StringComparer.Ordinal);
+        private readonly Regex _regex;
+        private readonly string _sentence;
 
         public RegexSplitEnumerable(string sentence, string pattern)
         {
-            _matches = Regex.Matches(sentence, pattern);
+            Debug.Assert(sentence is not null);
+            Debug.Assert(pattern is not null);
+
+            Regex? regex;
+            lock (_regexCache)
+            {
+                if (!_regexCache.TryGetValue(pattern!, out regex))
+                {
+                    regex = new Regex(pattern, RegexOptions.Compiled);
+                    _regexCache[pattern!] = regex;
+                }
+            }
+
+            _regex = regex;
+            _sentence = sentence!;
         }
 
-        public IEnumerator<Split> GetEnumerator() => new RegexSplitEnumerator(_matches);
+        public IEnumerator<Split> GetEnumerator() => new RegexSplitEnumerator(_regex, _sentence);
 
-        IEnumerator IEnumerable.GetEnumerator() => new RegexSplitEnumerator(_matches);
+        IEnumerator IEnumerable.GetEnumerator() => new RegexSplitEnumerator(_regex, _sentence);
 
-        private struct RegexSplitEnumerator : IEnumerator<Split>
+        private sealed class RegexSplitEnumerator : IEnumerator<Split>
         {
             private Split _current = default;
-            private int _matchIndex = 0;
-            private readonly MatchCollection _matches;
+            private readonly Regex _regex;
+            private Match? _tokenMatch;
+            private readonly string _sentence;
 
-            public RegexSplitEnumerator(MatchCollection matches) => _matches = matches;
+            public RegexSplitEnumerator(Regex regex, string sentence)
+            {
+                Debug.Assert(sentence is not null);
+                Debug.Assert(regex is not null);
+
+                _regex = regex!;
+                _sentence = sentence!;
+            }
 
             public Split Current => _current;
 
             object IEnumerator.Current => _current;
 
             public bool MoveNext()
             {
-                if (_matchIndex >= _matches.Count)
+                if (_tokenMatch is null)
+                {
+                    _tokenMatch = _regex.Match(_sentence);
+                }
+                else if (!_tokenMatch.Success)
                 {
                     return false;
                 }
+                else
+                {
+                    _tokenMatch = _tokenMatch.NextMatch();
+                }
 
-                var match = _matches[_matchIndex++];
-                _current = new Split(match.Value, (match.Index, match.Index + match.Length));
+                if (!_tokenMatch.Success)
+                {
+                    return false;
+                }
+
+                _current = new Split(_tokenMatch.Value, (_tokenMatch.Index, _tokenMatch.Index + _tokenMatch.Length));
                 return true;
             }
 
             public void Reset()
             {
-                _matchIndex = 0;
+                _tokenMatch = null;
             }
 
             public void Dispose()
             {
             }
         }
     }
-
-
 }
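The enumerator was also switched from eagerly materializing matches to streaming them with Regex.Match/NextMatch: the old MoveNext consulted MatchCollection.Count, which forces every match in the input to be computed before the first Split is yielded. A minimal sketch of the streaming idiom, outside the tokenizer types:

    using System.Collections.Generic;
    using System.Text.RegularExpressions;

    // Yields one match at a time; nothing past the current match is computed.
    static IEnumerable<(string Value, int Offset, int End)> StreamMatches(Regex regex, string text)
    {
        for (Match m = regex.Match(text); m.Success; m = m.NextMatch())
        {
            yield return (m.Value, m.Index, m.Index + m.Length);
        }
    }

    // StreamMatches(new Regex(@"\S+", RegexOptions.Compiled), "a bb ccc")
    // yields ("a", 0, 1), ("bb", 2, 4), ("ccc", 5, 8).

Caching one compiled Regex per pattern under a lock amortizes the one-time cost of RegexOptions.Compiled across tokenizer instances that reuse the same pattern.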
