From 1a700c6b7d11accb56405733a8f9f0bd15606841 Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Wed, 8 Jan 2025 23:51:05 -0500
Subject: [PATCH] Some tweaks to the Microsoft.ML.Tokenizers PACKAGE.md

---
 src/Microsoft.ML.Tokenizers/PACKAGE.md | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/src/Microsoft.ML.Tokenizers/PACKAGE.md b/src/Microsoft.ML.Tokenizers/PACKAGE.md
index 58b0feaebe..d408379c64 100644
--- a/src/Microsoft.ML.Tokenizers/PACKAGE.md
+++ b/src/Microsoft.ML.Tokenizers/PACKAGE.md
@@ -1,6 +1,6 @@
 ## About
 
-Microsoft.ML.Tokenizers supports various the implementation of the tokenization used in the NLP transforms.
+Microsoft.ML.Tokenizers provides an abstraction for tokenizers as well as implementations of common tokenization algorithms.
 
 ## Key Features
 
@@ -15,28 +15,28 @@ Microsoft.ML.Tokenizers supports various the implementation of the tokenization
 
 ```c#
 using Microsoft.ML.Tokenizers;
-using System.Net.Http;
 using System.IO;
+using System.Net.Http;
 
 //
 // Using Tiktoken Tokenizer
 //
 
-// initialize the tokenizer for `gpt-4` model
-Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
+// Initialize the tokenizer for the `gpt-4o` model. This instance should be cached for all subsequent use.
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
 
 string source = "Text tokenization is the process of splitting a string into a list of tokens.";
 
 Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}");
-// print: Tokens: 16
+// prints: Tokens: 16
 
 var trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string processedText, out _);
 Console.WriteLine($"5 tokens from end: {processedText.Substring(trimIndex)}");
-// 5 tokens from end:  a list of tokens.
+// prints: 5 tokens from end:  a list of tokens.
 
 trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _);
 Console.WriteLine($"5 tokens from start: {processedText.Substring(0, trimIndex)}");
-// 5 tokens from start: Text tokenization is the
+// prints: 5 tokens from start: Text tokenization is the
 
 IReadOnlyList<int> ids = tokenizer.EncodeToIds(source);
 Console.WriteLine(string.Join(", ", ids));
@@ -46,20 +46,21 @@ Console.WriteLine(string.Join(", ", ids));
 // Using Llama Tokenizer
 //
 
-// Open stream of remote Llama tokenizer model data file
+// Open a stream to the remote Llama tokenizer model data file.
 using HttpClient httpClient = new();
 const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model";
 using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl);
 
-// Create the Llama tokenizer using the remote stream
+// Create the Llama tokenizer using the remote stream. This tokenizer instance should be cached for all subsequent use.
 Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream);
+
 string input = "Hello, world!";
 ids = llamaTokenizer.EncodeToIds(input);
 Console.WriteLine(string.Join(", ", ids));
 // prints: 1, 15043, 29892, 3186, 29991
 
 Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}");
-// print: Tokens: 5
+// prints: Tokens: 5
 ```
 
 ## Main Types