@@ -95,6 +95,7 @@ public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, st
9595
9696 ( Dictionary < string , int > ? vocab1 , Vec < ( string , string ) > merges ) = ReadFile ( vocabFile , mergesFile ) ;
9797 Vocab = vocab1 ?? new Dictionary < string , int > ( ) ;
98+ Cache = new Cache < string , Word > ( ) ;
9899
99100 VocabReverse = new ( ) ;
100101
@@ -146,23 +147,33 @@ public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, st
/// <summary>
/// Tokenize a sequence string to a list of tokens.
/// </summary>
/// <param name="sequence">The sequence to tokenize.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <returns>The list of tokens generated from the sequence tokenization.</returns>
public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialToken = false)
{
    // An empty input produces the shared immutable empty list; everything else
    // goes through the cache-aware tokenization path.
    return sequence.Length == 0 ? EmptyTokensList : TokenizeWithCache(sequence);
}
161161
162- Word word = MergeWord ( sequence ) ;
/// <summary>
/// Tokenize a split sequence string to a list of Ids and add them to the accumulatedIds list.
/// </summary>
/// <param name="sequence">The sequence to split.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="accumulatedIds">The list of accumulated tokenized Ids.</param>
public override void TokenizeToIds(string sequence, bool isSpecialToken, IList<int> accumulatedIds)
{
    // Delegates to the cache-aware path; the symbol count it returns is not needed here.
    TokenizeToIdsWithCache(sequence, accumulatedIds);
}
/// <summary>
/// Get the number of tokens that the input sequence will be encoded to.
/// </summary>
/// <param name="sequence">The text to tokenize.</param>
/// <param name="isSpecialToken">Indicate if the token is special token.</param>
/// <returns>The number of tokens that the input sequence will be encoded to.</returns>
public override int CountTokens(string sequence, bool isSpecialToken)
{
    // Passing a null accumulator makes the helper count symbols without collecting ids.
    return TokenizeToIdsWithCache(sequence, null);
}
166177
167178 /// <summary>
168179 /// Map the token to tokenized Id.
@@ -195,14 +206,6 @@ public override IReadOnlyList<Token> Tokenize(string sequence)
195206 return null ;
196207 }
197208
198- /// <summary>
199- /// Map the tokenized Id to the token.
200- /// </summary>
201- /// <param name="id">The Id to map to the token.</param>
202- /// <param name="skipSpecialTokens">Indicate if want to skip the special tokens during the decoding.</param>
203- /// <returns>The mapped token of the Id.</returns>
204- public override string ? IdToString ( int id , bool skipSpecialTokens = false ) => throw new NotImplementedException ( ) ;
205-
206209 /// <summary>
207210 /// Gets the dictionary mapping tokens to Ids.
208211 /// </summary>
@@ -332,7 +335,7 @@ internal string CharToString(char c)
332335
333336 internal Word MergeWord ( string w )
334337 {
335- Word word = Word . WithCapacity ( ( int ) w . Length ) ;
338+ Word word = Word . WithCapacity ( w . Length ) ;
336339 ( int Id , int Len ) ? unk = null ;
337340 int i = 0 ;
338341
@@ -344,7 +347,7 @@ internal Word MergeWord(string w)
344347 if ( Char . IsHighSurrogate ( w [ i ] ) && i < w . Length - 1 && Char . IsLowSurrogate ( w [ i + 1 ] ) )
345348 {
346349 length = 2 ;
347- s = w . Substring ( i , ( int ) length ) ;
350+ s = w . Substring ( i , length ) ;
348351 }
349352 else
350353 {
@@ -403,7 +406,7 @@ internal Word MergeWord(string w)
403406 }
404407 }
405408
406- i += ( int ) length ;
409+ i += length ;
407410 }
408411
409412 if ( unk . HasValue )
@@ -415,45 +418,59 @@ internal Word MergeWord(string w)
415418 return word ;
416419 }
417420
/// <summary>
/// Convert the symbols of a merged word back into Token objects using the reverse vocabulary.
/// </summary>
/// <param name="word">The merged word whose symbols are materialized as tokens.</param>
/// <returns>The list of tokens corresponding to the word's symbols.</returns>
internal List<Token> WordToTokens(ref Word word)
{
    return word.ToTokens(VocabReverse);
}
422+
/// <summary>
/// Tokenize a sequence into Token objects, consulting the cache when one is configured.
/// </summary>
/// <param name="sequence">The sequence to tokenize.</param>
/// <returns>The tokens produced by BPE-merging the sequence.</returns>
internal List<Token> TokenizeWithCache(string sequence)
{
    // Fast path: a previously merged word can be reused directly.
    if (Cache is not null && Cache.TryGet(sequence, out Word word))
    {
        return WordToTokens(ref word);
    }

    // Cache miss (or no cache): merge now and remember the result when possible.
    word = MergeWord(sequence);
    Cache?.Set(sequence, word);

    return WordToTokens(ref word);
}
430443
/// <summary>
/// Report the number of symbols in a merged word and, optionally, append their Ids.
/// </summary>
/// <param name="word">The merged word to read symbols from.</param>
/// <param name="accumulatedIds">Destination for the symbol Ids, or null to only count.</param>
/// <returns>The number of symbols in the word.</returns>
internal int WordToIds(ref Word word, IList<int>? accumulatedIds)
{
    // Counting-only callers (e.g. CountTokens) pass null and skip id materialization.
    if (accumulatedIds is null)
    {
        return word.SymbolsCount;
    }

    word.PopulateIds(accumulatedIds);
    return word.SymbolsCount;
}
453+
/// <summary>
/// Tokenize a sequence to Ids (appended to accumulatedIds when non-null), consulting the cache.
/// </summary>
/// <param name="sequence">The sequence to tokenize.</param>
/// <param name="accumulatedIds">Destination for the Ids, or null to only count tokens.</param>
/// <returns>The number of tokens the sequence encodes to.</returns>
internal int TokenizeToIdsWithCache(string sequence, IList<int>? accumulatedIds)
{
    // Fast path: reuse a previously merged word when the cache has one.
    if (Cache is not null && Cache.TryGet(sequence, out Word word))
    {
        return WordToIds(ref word, accumulatedIds);
    }

    // Cache miss (or no cache): merge now and remember the result when possible.
    word = MergeWord(sequence);
    Cache?.Set(sequence, word);

    return WordToIds(ref word, accumulatedIds);
}
458475
459476 internal static readonly List < Token > EmptyTokensList = new ( ) ;
0 commit comments