From 632dbe19fef117c262c218d6766cac3d46be041a Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Thu, 29 Aug 2024 13:12:48 -0700
Subject: [PATCH 1/4] add embedding

---
 .../Phi3Mini/AutoGenSample.cs                 | 22 ++++++++++++++--
 .../Microsoft.ML.GenAI.Samples/Program.cs     |  2 +-
 .../Pipeline/CausalLMPipeline.cs              | 25 +++++++++++++++++++
 3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
index 392aec674d..7bcdf380aa 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
@@ -27,13 +27,31 @@ public static async Task RunAsync()
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
         var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device, quantizeToInt8: false);
+        var question = @"write a C# program to calculate the factorial of a number";
+        var embeddingForQuery = pipeline.GenerateEmbeddingFromLastTokenPool(question);
 
         // agent
         var agent = new Phi3Agent(pipeline, "assistant")
             .RegisterPrintMessage();
-        var question = @"write a C# program to calculate the factorial of a number";
 
         // chat with the assistant
-        await agent.SendAsync(question);
+        var reply = await agent.SendAsync(question);
+
+
+        var replyContent = reply.GetContent() ?? throw new Exception("reply content is null");
+        var replyEmbedding = pipeline.GenerateEmbeddingFromLastTokenPool("""
+            What a sunny day! Time to travel.
+            """);
+
+        // compare the question embedding with the embedding of an unrelated sentence
+        // the similarity is the dot product of the two L2-normalized embeddings
+        var similarity = 0f;
+        foreach (var (q, r) in embeddingForQuery.Zip(replyEmbedding))
+        {
+            similarity += q * r;
+        }
+
+        Console.WriteLine($"The similarity between the question and the unrelated sentence is {similarity * 100}");
     }
 }
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
index 1560bad306..5e4355e595 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
@@ -1,4 +1,4 @@
 // See https://aka.ms/new-console-template for more information
 using Microsoft.ML.GenAI.Samples.Phi3Mini;
 
-await SemanticKernelSample.RunChatCompletionSample();
+await AutoGenSample.RunAsync();
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
index 7ecb64f761..7e7cb5f0f1 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
@@ -32,6 +32,8 @@ string Generate(
         float topP = CausalLMPipeline.Defaults.TopP,
         string[]? stopSequences = CausalLMPipeline.Defaults.StopSequence);
 
+    float[] GenerateEmbeddingFromLastTokenPool(string prompt);
+
     IEnumerable<string> GenerateStreaming(
         string prompt,
         int maxLen = CausalLMPipeline.Defaults.MaxLen,
@@ -281,4 +283,27 @@ protected torch.Tensor SampleTopP(torch.Tensor logits, float topP)
         nextToken = torch.gather(probsIndex, dim: -1, index: nextToken);
         return nextToken;
     }
+
+    public float[] GenerateEmbeddingFromLastTokenPool(string prompt)
+    {
+        using var scope = NewDisposeScope();
+        using var noGrad = torch.no_grad();
+        var inputIds = this.Tokenizer.EncodeToIds(prompt);
+        var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: this.Device).unsqueeze(0);
+        var attentionMask = torch.ones_like(inputTensor, device: this.Device);
+        var cache = new DynamicKVCache();
+        var input = new CausalLMModelInput(inputTensor, attentionMask, pastKeyValuesLength: 0)
+        {
+            OverrideCache = cache,
+        };
+        var output = this.Model.forward(input);
+        var lastTokenHiddenState = output.LastHiddenState[0, ^0];
+
+        // shape of lastTokenHiddenState: [hidden_size]
+        // L2 norm
+        var norm = lastTokenHiddenState.norm();
+        var normalized = lastTokenHiddenState / norm;
+
+        return normalized.to_type(ScalarType.Float32).data<float>().ToArray();
+    }
 }
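Review note on PATCH 1/4: `GenerateEmbeddingFromLastTokenPool` divides the last token's hidden state by its L2 norm, so the dot product computed in the sample is exactly the cosine similarity of the two texts. Note also that `output.LastHiddenState[0, ^0]` indexes one past the last element (`^0` means "length" in C# index-from-end syntax and throws at runtime); PATCH 2/4 below corrects it to `^1`, which selects the last element. A minimal sketch of the comparison as a reusable helper; the `CosineSimilarity` name is illustrative and not part of this series:

    // Assumes both vectors come from GenerateEmbeddingFromLastTokenPool,
    // which L2-normalizes its output to unit length.
    static float CosineSimilarity(float[] a, float[] b)
    {
        if (a.Length != b.Length)
            throw new ArgumentException("embeddings must have the same dimension");

        var dot = 0f;
        for (var i = 0; i < a.Length; i++)
        {
            dot += a[i] * b[i]; // accumulate the dot product
        }

        return dot; // |a| = |b| = 1, so no division by the magnitudes is needed
    }

Because both vectors have unit length, the result already lies in [-1, 1]; the sample's `similarity * 100` merely rescales it for display.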
- """); - - // compare the similarity between the question and the reply - // the similarity is calculated by the dot product of the embeddings - - var similarity = 0f; - foreach (var (q, r) in embeddingForQuery.Zip(replyEmbedding)) - { - similarity += q * r; - } - - Console.WriteLine($"The similarity between the question and the reply is {similarity * 100}"); + await agent.SendAsync(question); } } diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs index a6f445b643..03c4dbdac0 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs @@ -1,4 +1,7 @@ -using Microsoft.ML.GenAI.Phi.Extension; +using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Phi; +using Microsoft.ML.GenAI.Phi.Extension; +using Microsoft.ML.Tokenizers; using Microsoft.SemanticKernel; using Microsoft.SemanticKernel.ChatCompletion; using TorchSharp; @@ -20,8 +23,9 @@ public static async Task RunChatCompletionSample() torch.manual_seed(1); torch.set_default_dtype(defaultType); var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct"; - var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device); - + var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder); + var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true); + var pipeline = new CausalLMPipeline(tokenizer, model, device); var kernel = Kernel.CreateBuilder() .AddGenAIChatCompletion(pipeline) @@ -49,8 +53,9 @@ public static async Task RunTextGenerationSample() torch.manual_seed(1); torch.set_default_dtype(defaultType); var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct"; - var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device); - + var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder); + var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true); + var pipeline = new CausalLMPipeline(tokenizer, model, device); var kernel = Kernel.CreateBuilder() .AddGenAITextGeneration(pipeline) diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs deleted file mode 100644 index 33819a8df4..0000000000 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs +++ /dev/null @@ -1,103 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Microsoft.ML.GenAI.Core; -using Microsoft.ML.GenAI.Phi; -using Tensorboard; -using static TorchSharp.torch; -using TorchSharp; -using Microsoft.ML.GenAI.Core.Extension; -using System.Text.Json; -using Microsoft.ML.Tokenizers; - -namespace Microsoft.ML.GenAI.Samples.Phi3Mini; - -internal static class Utils -{ - public static ICausalLMPipeline LoadPhi3Mini4KFromFolder( - string weightFolder, - string configName = "config.json", - string device = "cuda", - int modelSizeOnCudaInGB = 55, - int modelSizeOnMemoryInGB = 64, - int modelSizeOnDiskInGB = 200, - bool quantizeToInt8 = false, - bool quantizeToInt4 = false) - { - Console.WriteLine("Loading Phi3 from huggingface model weight folder"); - torch.set_default_device("meta"); - var configPath = System.IO.Path.Combine(weightFolder, configName); - var config = JsonSerializer.Deserialize(System.IO.File.ReadAllText(configPath)) 
?? throw new ArgumentNullException(nameof(configPath)); - var timer = System.Diagnostics.Stopwatch.StartNew(); - var model = new Phi3ForCasualLM(config); - var tokenzierPath = System.IO.Path.Combine(weightFolder, "tokenizer.model"); - var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenzierPath); - - if (quantizeToInt8) - { - model.ToInt8QuantizeModule(); - } - else if (quantizeToInt4) - { - model.ToInt4QuantizeModule(); - } - - var deviceSizeMap = new Dictionary - { - ["cuda"] = modelSizeOnCudaInGB * 1L * 1024 * 1024 * 1024, - ["cpu"] = modelSizeOnMemoryInGB * 1L * 1024 * 1024 * 1024, - ["disk"] = modelSizeOnDiskInGB * 1L * 1024 * 1024 * 1024, - }; - - var deviceMap = model.InferDeviceMapForEachLayer( - devices: ["cuda", "cpu", "disk"], - deviceSizeMapInByte: deviceSizeMap); - - var deviceMapJson = JsonSerializer.Serialize(deviceMap, new JsonSerializerOptions { WriteIndented = true }); - Console.WriteLine($"Device map:"); - Console.WriteLine(deviceMapJson); - - // load weight - torch.set_default_device("cpu"); - - Console.WriteLine("Start loading"); - timer = System.Diagnostics.Stopwatch.StartNew(); - model = new Phi3ForCasualLM(config); - timer.Stop(); - Console.WriteLine($"Phi3 model created in {timer.ElapsedMilliseconds / 1000} s"); - - timer = System.Diagnostics.Stopwatch.StartNew(); - model.LoadSafeTensors(weightFolder); - timer.Stop(); - Console.WriteLine($"Phi3 weight loaded in {timer.ElapsedMilliseconds / 1000} s"); - - if (quantizeToInt8 || quantizeToInt4) - { - timer = System.Diagnostics.Stopwatch.StartNew(); - Console.WriteLine("Start quantizing if needed"); - if (quantizeToInt8) - { - model.ToInt8QuantizeModule(); - } - else if (quantizeToInt4) - { - model.ToInt4QuantizeModule(); - } - Console.WriteLine("Quantizing done"); - timer.Stop(); - Console.WriteLine($"Quantizing done in {timer.ElapsedMilliseconds / 1000} s"); - } - - timer = System.Diagnostics.Stopwatch.StartNew(); - Console.WriteLine($"Start loading to device: {device}"); - model = model.ToDynamicLoadingModel(deviceMap, "cuda"); - timer.Stop(); - Console.WriteLine($"Phi3 loaded to device: {device} in {timer.ElapsedMilliseconds / 1000} s"); - var pipeline = new CausalLMPipeline(tokenizer, model, device); - torch.set_default_device(device); - - return pipeline; - } -} diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs index 7e7cb5f0f1..c8f2fe7db8 100644 --- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs +++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs @@ -32,6 +32,9 @@ string Generate( float topP = CausalLMPipeline.Defaults.TopP, string[]? stopSequences = CausalLMPipeline.Defaults.StopSequence); + /// + /// Generate the embedding(last hidden state of the last token) for the prompt. The embedding is normalized by L2 norm. 
+ /// float[] GenerateEmbeddingFromLastTokenPool(string prompt); IEnumerable GenerateStreaming( @@ -297,7 +300,7 @@ public float[] GenerateEmbeddingFromLastTokenPool(string prompt) OverrideCache = cache, }; var output = this.Model.forward(input); - var lastTokenHiddenState = output.LastHiddenState[0, ^0]; + var lastTokenHiddenState = output.LastHiddenState[0, ^1]; // shape of lastTokenHiddenState: [hidden_size] // L2 norm diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs index c67741377e..a5840b242a 100644 --- a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs +++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs @@ -9,6 +9,7 @@ using System.Text.Json; using System.Threading.Tasks; using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Core.Extension; using Microsoft.ML.GenAI.Phi.Module; using TorchSharp; using TorchSharp.Modules; @@ -66,6 +67,55 @@ public static Phi3ForCasualLM FromPretrained( return phi; } + public static Phi3ForCasualLM FromPretrained( + string modelFolder, + string configName = "config.json", + string checkPointName = "model.safetensors.index.json", + bool quantizeToInt8 = false, + bool quantizeToInt4 = false, + int layersOnTargetDevice = -1, + ScalarType torchDtype = ScalarType.BFloat16, + string targetDevice = "cuda") + { + if (layersOnTargetDevice == -1 && quantizeToInt4 == false && quantizeToInt8 == false) + { + return FromPretrained(modelFolder, configName, checkPointName, torchDtype, targetDevice); + } + + var originalDefaultDevice = torch.get_default_device(); + torch.set_default_device("meta"); + var config = Path.Join(modelFolder, configName); + var modelConfig = JsonSerializer.Deserialize(File.ReadAllText(config)) ?? throw new ArgumentNullException(nameof(config)); + modelConfig.DType = torchDtype; + var model = new Phi3ForCasualLM(modelConfig); + + if (quantizeToInt8) + { + model.ToInt8QuantizeModule(); + } + else if (quantizeToInt4) + { + model.ToInt4QuantizeModule(); + } + + var deviceMap = model.InferDeviceMapForEachLayer( + [ + KeyValuePair.Create(targetDevice, layersOnTargetDevice), + KeyValuePair.Create("cpu", -1) + ]); + + torch.set_default_device("cpu"); + model = new Phi3ForCasualLM(modelConfig); + + model.LoadSafeTensors(modelFolder, checkPointName); + + model = model.ToDynamicLoadingModel(deviceMap, targetDevice); + + torch.set_default_device(originalDefaultDevice); + + return model; + } + public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json") { this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: false, useTqdm: false); From 328f85987f4ac72608bd74e6cc9e024c72605fff Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Thu, 29 Aug 2024 14:00:51 -0700 Subject: [PATCH 3/4] fix bug --- .../Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs | 3 ++- .../Phi3Mini/SemanticKernelSample.cs | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs index 393ad4f33f..20f2dd4418 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs @@ -27,7 +27,8 @@ public static async Task RunAsync() torch.manual_seed(1); torch.set_default_dtype(defaultType); var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct"; - var tokenizer = 
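Review note on PATCH 2/4: the new `Phi3ForCasualLM.FromPretrained` overload replaces the sample-local `Utils.LoadPhi3Mini4KFromFolder` helper. `layersOnTargetDevice: -1` keeps every transformer layer on the target device; any other value splits the layers between the target device and the CPU according to the inferred device map. The meta-device construction exists only to size the (optionally quantized) layers for that map; note that, as written here, the overload does not re-apply int8/int4 quantization to the CPU model it builds afterwards to load the weights, whereas the deleted Utils helper did. A minimal usage sketch follows, assuming the pipeline's generic parameters are LlamaTokenizer and Phi3ForCasualLM as in the samples; the weight-folder path is illustrative, and the tokenizer path already incorporates the fix that PATCH 3/4 applies to the samples:

    // using Microsoft.ML.GenAI.Core; using Microsoft.ML.GenAI.Phi;
    // using Microsoft.ML.Tokenizers; using System.IO;
    var weightFolder = @"C:\models\Phi-3-mini-4k-instruct"; // illustrative path
    var tokenizer = Phi3TokenizerHelper.FromPretrained(
        Path.Combine(weightFolder, "tokenizer.model"));

    var model = Phi3ForCasualLM.FromPretrained(
        weightFolder,
        configName: "config.json",
        quantizeToInt8: true,      // int8-quantize the weights to cut memory use
        layersOnTargetDevice: -1,  // -1 = place the whole model on the target device
        targetDevice: "cuda");

    var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, "cuda");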
From 328f85987f4ac72608bd74e6cc9e024c72605fff Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Thu, 29 Aug 2024 14:00:51 -0700
Subject: [PATCH 3/4] fix bug

---
 .../Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs | 3 ++-
 .../Phi3Mini/SemanticKernelSample.cs                     | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
index 393ad4f33f..20f2dd4418 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
@@ -27,7 +27,8 @@ public static async Task RunAsync()
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
         var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
         var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
         var question = @"write a C# program to calculate the factorial of a number";
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
index 03c4dbdac0..8ba882618b 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
@@ -23,7 +23,8 @@ public static async Task RunChatCompletionSample()
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
         var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
         var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
@@ -53,7 +54,8 @@ public static async Task RunTextGenerationSample()
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
         var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
         var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);

From 15448e10a533c9c6d1466d76d0274c7b92e1cafe Mon Sep 17 00:00:00 2001
From: Xiaoyun Zhang
Date: Thu, 29 Aug 2024 14:06:34 -0700
Subject: [PATCH 4/4] Update CausalLMPipeline.cs

---
 src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
index c8f2fe7db8..33e0bab19c 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
@@ -294,11 +294,7 @@ public float[] GenerateEmbeddingFromLastTokenPool(string prompt)
         var inputIds = this.Tokenizer.EncodeToIds(prompt);
         var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: this.Device).unsqueeze(0);
         var attentionMask = torch.ones_like(inputTensor, device: this.Device);
-        var cache = new DynamicKVCache();
-        var input = new CausalLMModelInput(inputTensor, attentionMask, pastKeyValuesLength: 0)
-        {
-            OverrideCache = cache,
-        };
+        var input = new CausalLMModelInput(inputTensor, attentionMask, pastKeyValuesLength: 0);
         var output = this.Model.forward(input);
         var lastTokenHiddenState = output.LastHiddenState[0, ^1];
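Review note on PATCH 4/4: dropping the `DynamicKVCache` from the embedding path is sound; a KV cache only pays off across incremental decoding steps, and computing an embedding takes a single forward pass over the whole prompt. An end-to-end sketch of the API as it stands after the series (paths illustrative; `using System.Linq;` assumed for `Zip` and `Sum`):

    var weightFolder = @"C:\models\Phi-3-mini-4k-instruct"; // illustrative path
    var tokenizer = Phi3TokenizerHelper.FromPretrained(Path.Combine(weightFolder, "tokenizer.model"));
    var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
    var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, "cuda");

    var a = pipeline.GenerateEmbeddingFromLastTokenPool("write a C# program to calculate the factorial of a number");
    var b = pipeline.GenerateEmbeddingFromLastTokenPool("What a sunny day! Time to travel.");

    // Both embeddings are unit length, so the dot product is the cosine
    // similarity; unrelated texts should score noticeably lower than paraphrases.
    var similarity = a.Zip(b, (x, y) => x * y).Sum();
    Console.WriteLine($"cosine similarity: {similarity}");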