From 1a700c6b7d11accb56405733a8f9f0bd15606841 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Wed, 8 Jan 2025 23:51:05 -0500 Subject: [PATCH] Some tweaks to the Microsoft.ML.Tokenizers PACKAGE.md --- src/Microsoft.ML.Tokenizers/PACKAGE.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/Microsoft.ML.Tokenizers/PACKAGE.md b/src/Microsoft.ML.Tokenizers/PACKAGE.md index 58b0feaebe..d408379c64 100644 --- a/src/Microsoft.ML.Tokenizers/PACKAGE.md +++ b/src/Microsoft.ML.Tokenizers/PACKAGE.md @@ -1,6 +1,6 @@ ## About -Microsoft.ML.Tokenizers supports various the implementation of the tokenization used in the NLP transforms. +Microsoft.ML.Tokenizers provides an abstraction for tokenizers as well as implementations of common tokenization algorithms. ## Key Features @@ -15,28 +15,28 @@ Microsoft.ML.Tokenizers supports various the implementation of the tokenization ```c# using Microsoft.ML.Tokenizers; -using System.Net.Http; using System.IO; +using System.Net.Http; // // Using Tiktoken Tokenizer // -// initialize the tokenizer for `gpt-4` model -Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4"); +// Initialize the tokenizer for the `gpt-4o` model. This instance should be cached for all subsequent use. +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); string source = "Text tokenization is the process of splitting a string into a list of tokens."; Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}"); -// print: Tokens: 16 +// prints: Tokens: 16 var trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string processedText, out _); Console.WriteLine($"5 tokens from end: {processedText.Substring(trimIndex)}"); -// 5 tokens from end: a list of tokens. +// prints: 5 tokens from end: a list of tokens. 
trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _); Console.WriteLine($"5 tokens from start: {processedText.Substring(0, trimIndex)}"); -// 5 tokens from start: Text tokenization is the +// prints: 5 tokens from start: Text tokenization is the IReadOnlyList<int> ids = tokenizer.EncodeToIds(source); Console.WriteLine(string.Join(", ", ids)); @@ -46,20 +46,21 @@ Console.WriteLine(string.Join(", ", ids)); // Using Llama Tokenizer // -// Open stream of remote Llama tokenizer model data file +// Open a stream to the remote Llama tokenizer model data file. using HttpClient httpClient = new(); const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model"; using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl); -// Create the Llama tokenizer using the remote stream +// Create the Llama tokenizer using the remote stream. This should be cached for all subsequent use. Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream); + string input = "Hello, world!"; ids = llamaTokenizer.EncodeToIds(input); Console.WriteLine(string.Join(", ", ids)); // prints: 1, 15043, 29892, 3186, 29991 Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}"); -// print: Tokens: 5 +// prints: Tokens: 5 ``` ## Main Types