From 632dbe19fef117c262c218d6766cac3d46be041a Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Thu, 29 Aug 2024 13:12:48 -0700
Subject: [PATCH 1/4] add embedding

---
 .../Phi3Mini/AutoGenSample.cs                 | 22 ++++++++++++++--
 .../Microsoft.ML.GenAI.Samples/Program.cs     |  2 +-
 .../Pipeline/CausalLMPipeline.cs              | 25 +++++++++++++++++++
 3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
index 392aec674d..7bcdf380aa 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
@@ -27,13 +27,31 @@ public static async Task RunAsync()
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
         var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device, quantizeToInt8: false);
+        var question = @"write a C# program to calculate the factorial of a number";
+        var embeddingForQuery = pipeline.GenerateEmbeddingFromLastTokenPool(question);
 
         // agent
         var agent = new Phi3Agent(pipeline, "assistant")
             .RegisterPrintMessage();
-        var question = @"write a C# program to calculate the factorial of a number";
 
         // chat with the assistant
-        await agent.SendAsync(question);
+        var reply = await agent.SendAsync(question);
+
+
+        var replyContent = reply.GetContent() ?? throw new Exception("reply content is null");
+        var replyEmbedding = pipeline.GenerateEmbeddingFromLastTokenPool("""
+            What a sunny day! Time to travel.
+            """);
+
+        // compare the question embedding with the embedding of an unrelated sentence
+        // the similarity is the dot product of the two L2-normalized embeddings
+        var similarity = 0f;
+        foreach (var (q, r) in embeddingForQuery.Zip(replyEmbedding))
+        {
+            similarity += q * r;
+        }
+
+        Console.WriteLine($"The similarity between the question and the unrelated sentence is {similarity * 100}");
     }
 }
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
index 1560bad306..5e4355e595 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
@@ -1,4 +1,4 @@
 // See https://aka.ms/new-console-template for more information
 using Microsoft.ML.GenAI.Samples.Phi3Mini;
 
-await SemanticKernelSample.RunChatCompletionSample();
+await AutoGenSample.RunAsync();
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
index 7ecb64f761..7e7cb5f0f1 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
@@ -32,6 +32,8 @@ string Generate(
         float topP = CausalLMPipeline.Defaults.TopP,
         string[]? stopSequences = CausalLMPipeline.Defaults.StopSequence);
 
+    float[] GenerateEmbeddingFromLastTokenPool(string prompt);
+
     IEnumerable<string> GenerateStreaming(
         string prompt,
         int maxLen = CausalLMPipeline.Defaults.MaxLen,
@@ -281,4 +283,27 @@ protected torch.Tensor SampleTopP(torch.Tensor logits, float topP)
         nextToken = torch.gather(probsIndex, dim: -1, index: nextToken);
         return nextToken;
     }
+
+    public float[] GenerateEmbeddingFromLastTokenPool(string prompt)
+    {
+        using var scope = NewDisposeScope();
+        using var noGrad = torch.no_grad();
+        var inputIds = this.Tokenizer.EncodeToIds(prompt);
+        var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: this.Device).unsqueeze(0);
+        var attentionMask = torch.ones_like(inputTensor, device: this.Device);
+        var cache = new DynamicKVCache();
+        var input = new CausalLMModelInput(inputTensor, attentionMask, pastKeyValuesLength: 0)
+        {
+            OverrideCache = cache,
+        };
+        var output = this.Model.forward(input);
+        var lastTokenHiddenState = output.LastHiddenState[0, ^0];
+
+        // shape of lastTokenHiddenState: [hidden_size]
+        // L2 norm
+        var norm = lastTokenHiddenState.norm();
+        var normalized = lastTokenHiddenState / norm;
+
+        return normalized.to_type(ScalarType.Float32).data<float>().ToArray();
+    }
 }
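Review note on PATCH 1/4: `GenerateEmbeddingFromLastTokenPool` divides the last token's hidden state by its L2 norm, so the dot product computed in the sample is exactly the cosine similarity of the two texts. Note also that `output.LastHiddenState[0, ^0]` indexes one past the last element (`^0` means "length" in C# index-from-end syntax and throws at runtime); PATCH 2/4 below corrects it to `^1`, which selects the last element. A minimal sketch of the comparison as a reusable helper; the `CosineSimilarity` name is illustrative and not part of this series:

    // Assumes both vectors come from GenerateEmbeddingFromLastTokenPool,
    // which L2-normalizes its output to unit length.
    static float CosineSimilarity(float[] a, float[] b)
    {
        if (a.Length != b.Length)
            throw new ArgumentException("embeddings must have the same dimension");

        var dot = 0f;
        for (var i = 0; i < a.Length; i++)
        {
            dot += a[i] * b[i]; // accumulate the dot product
        }

        return dot; // |a| = |b| = 1, so no division by the magnitudes is needed
    }

Because both vectors have unit length, the result already lies in [-1, 1]; the sample's `similarity * 100` merely rescales it for display.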
- """); - - // compare the similarity between the question and the reply - // the similarity is calculated by the dot product of the embeddings - - var similarity = 0f; - foreach (var (q, r) in embeddingForQuery.Zip(replyEmbedding)) - { - similarity += q * r; - } - - Console.WriteLine($"The similarity between the question and the reply is {similarity * 100}"); + await agent.SendAsync(question); } } diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs index a6f445b643..03c4dbdac0 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs @@ -1,4 +1,7 @@ -using Microsoft.ML.GenAI.Phi.Extension; +using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Phi; +using Microsoft.ML.GenAI.Phi.Extension; +using Microsoft.ML.Tokenizers; using Microsoft.SemanticKernel; using Microsoft.SemanticKernel.ChatCompletion; using TorchSharp; @@ -20,8 +23,9 @@ public static async Task RunChatCompletionSample() torch.manual_seed(1); torch.set_default_dtype(defaultType); var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct"; - var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device); - + var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder); + var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true); + var pipeline = new CausalLMPipeline(tokenizer, model, device); var kernel = Kernel.CreateBuilder() .AddGenAIChatCompletion(pipeline) @@ -49,8 +53,9 @@ public static async Task RunTextGenerationSample() torch.manual_seed(1); torch.set_default_dtype(defaultType); var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct"; - var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device); - + var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder); + var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true); + var pipeline = new CausalLMPipeline(tokenizer, model, device); var kernel = Kernel.CreateBuilder() .AddGenAITextGeneration(pipeline) diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs deleted file mode 100644 index 33819a8df4..0000000000 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs +++ /dev/null @@ -1,103 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Microsoft.ML.GenAI.Core; -using Microsoft.ML.GenAI.Phi; -using Tensorboard; -using static TorchSharp.torch; -using TorchSharp; -using Microsoft.ML.GenAI.Core.Extension; -using System.Text.Json; -using Microsoft.ML.Tokenizers; - -namespace Microsoft.ML.GenAI.Samples.Phi3Mini; - -internal static class Utils -{ - public static ICausalLMPipeline LoadPhi3Mini4KFromFolder( - string weightFolder, - string configName = "config.json", - string device = "cuda", - int modelSizeOnCudaInGB = 55, - int modelSizeOnMemoryInGB = 64, - int modelSizeOnDiskInGB = 200, - bool quantizeToInt8 = false, - bool quantizeToInt4 = false) - { - Console.WriteLine("Loading Phi3 from huggingface model weight folder"); - torch.set_default_device("meta"); - var configPath = System.IO.Path.Combine(weightFolder, configName); - var config = JsonSerializer.Deserialize(System.IO.File.ReadAllText(configPath)) 
?? throw new ArgumentNullException(nameof(configPath)); - var timer = System.Diagnostics.Stopwatch.StartNew(); - var model = new Phi3ForCasualLM(config); - var tokenzierPath = System.IO.Path.Combine(weightFolder, "tokenizer.model"); - var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenzierPath); - - if (quantizeToInt8) - { - model.ToInt8QuantizeModule(); - } - else if (quantizeToInt4) - { - model.ToInt4QuantizeModule(); - } - - var deviceSizeMap = new Dictionary - { - ["cuda"] = modelSizeOnCudaInGB * 1L * 1024 * 1024 * 1024, - ["cpu"] = modelSizeOnMemoryInGB * 1L * 1024 * 1024 * 1024, - ["disk"] = modelSizeOnDiskInGB * 1L * 1024 * 1024 * 1024, - }; - - var deviceMap = model.InferDeviceMapForEachLayer( - devices: ["cuda", "cpu", "disk"], - deviceSizeMapInByte: deviceSizeMap); - - var deviceMapJson = JsonSerializer.Serialize(deviceMap, new JsonSerializerOptions { WriteIndented = true }); - Console.WriteLine($"Device map:"); - Console.WriteLine(deviceMapJson); - - // load weight - torch.set_default_device("cpu"); - - Console.WriteLine("Start loading"); - timer = System.Diagnostics.Stopwatch.StartNew(); - model = new Phi3ForCasualLM(config); - timer.Stop(); - Console.WriteLine($"Phi3 model created in {timer.ElapsedMilliseconds / 1000} s"); - - timer = System.Diagnostics.Stopwatch.StartNew(); - model.LoadSafeTensors(weightFolder); - timer.Stop(); - Console.WriteLine($"Phi3 weight loaded in {timer.ElapsedMilliseconds / 1000} s"); - - if (quantizeToInt8 || quantizeToInt4) - { - timer = System.Diagnostics.Stopwatch.StartNew(); - Console.WriteLine("Start quantizing if needed"); - if (quantizeToInt8) - { - model.ToInt8QuantizeModule(); - } - else if (quantizeToInt4) - { - model.ToInt4QuantizeModule(); - } - Console.WriteLine("Quantizing done"); - timer.Stop(); - Console.WriteLine($"Quantizing done in {timer.ElapsedMilliseconds / 1000} s"); - } - - timer = System.Diagnostics.Stopwatch.StartNew(); - Console.WriteLine($"Start loading to device: {device}"); - model = model.ToDynamicLoadingModel(deviceMap, "cuda"); - timer.Stop(); - Console.WriteLine($"Phi3 loaded to device: {device} in {timer.ElapsedMilliseconds / 1000} s"); - var pipeline = new CausalLMPipeline(tokenizer, model, device); - torch.set_default_device(device); - - return pipeline; - } -} diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs index 7e7cb5f0f1..c8f2fe7db8 100644 --- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs +++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs @@ -32,6 +32,9 @@ string Generate( float topP = CausalLMPipeline.Defaults.TopP, string[]? stopSequences = CausalLMPipeline.Defaults.StopSequence); + /// + /// Generate the embedding(last hidden state of the last token) for the prompt. The embedding is normalized by L2 norm. 
+ /// float[] GenerateEmbeddingFromLastTokenPool(string prompt); IEnumerable GenerateStreaming( @@ -297,7 +300,7 @@ public float[] GenerateEmbeddingFromLastTokenPool(string prompt) OverrideCache = cache, }; var output = this.Model.forward(input); - var lastTokenHiddenState = output.LastHiddenState[0, ^0]; + var lastTokenHiddenState = output.LastHiddenState[0, ^1]; // shape of lastTokenHiddenState: [hidden_size] // L2 norm diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs index c67741377e..a5840b242a 100644 --- a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs +++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs @@ -9,6 +9,7 @@ using System.Text.Json; using System.Threading.Tasks; using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Core.Extension; using Microsoft.ML.GenAI.Phi.Module; using TorchSharp; using TorchSharp.Modules; @@ -66,6 +67,55 @@ public static Phi3ForCasualLM FromPretrained( return phi; } + public static Phi3ForCasualLM FromPretrained( + string modelFolder, + string configName = "config.json", + string checkPointName = "model.safetensors.index.json", + bool quantizeToInt8 = false, + bool quantizeToInt4 = false, + int layersOnTargetDevice = -1, + ScalarType torchDtype = ScalarType.BFloat16, + string targetDevice = "cuda") + { + if (layersOnTargetDevice == -1 && quantizeToInt4 == false && quantizeToInt8 == false) + { + return FromPretrained(modelFolder, configName, checkPointName, torchDtype, targetDevice); + } + + var originalDefaultDevice = torch.get_default_device(); + torch.set_default_device("meta"); + var config = Path.Join(modelFolder, configName); + var modelConfig = JsonSerializer.Deserialize(File.ReadAllText(config)) ?? throw new ArgumentNullException(nameof(config)); + modelConfig.DType = torchDtype; + var model = new Phi3ForCasualLM(modelConfig); + + if (quantizeToInt8) + { + model.ToInt8QuantizeModule(); + } + else if (quantizeToInt4) + { + model.ToInt4QuantizeModule(); + } + + var deviceMap = model.InferDeviceMapForEachLayer( + [ + KeyValuePair.Create(targetDevice, layersOnTargetDevice), + KeyValuePair.Create("cpu", -1) + ]); + + torch.set_default_device("cpu"); + model = new Phi3ForCasualLM(modelConfig); + + model.LoadSafeTensors(modelFolder, checkPointName); + + model = model.ToDynamicLoadingModel(deviceMap, targetDevice); + + torch.set_default_device(originalDefaultDevice); + + return model; + } + public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json") { this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: false, useTqdm: false); From 328f85987f4ac72608bd74e6cc9e024c72605fff Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Thu, 29 Aug 2024 14:00:51 -0700 Subject: [PATCH 3/4] fix bug --- .../Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs | 3 ++- .../Phi3Mini/SemanticKernelSample.cs | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs index 393ad4f33f..20f2dd4418 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs @@ -27,7 +27,8 @@ public static async Task RunAsync() torch.manual_seed(1); torch.set_default_dtype(defaultType); var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct"; - var tokenizer = 
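Review note on PATCH 2/4: the new `Phi3ForCasualLM.FromPretrained` overload replaces the sample-local `Utils.LoadPhi3Mini4KFromFolder` helper. `layersOnTargetDevice: -1` keeps every transformer layer on the target device; any other value splits the layers between the target device and the CPU according to the inferred device map. The meta-device construction exists only to size the (optionally quantized) layers for that map; note that, as written here, the overload does not re-apply int8/int4 quantization to the CPU model it builds afterwards to load the weights, whereas the deleted Utils helper did. A minimal usage sketch follows, assuming the pipeline's generic parameters are LlamaTokenizer and Phi3ForCasualLM as in the samples; the weight-folder path is illustrative, and the tokenizer path already incorporates the fix that PATCH 3/4 applies to the samples:

    // using Microsoft.ML.GenAI.Core; using Microsoft.ML.GenAI.Phi;
    // using Microsoft.ML.Tokenizers; using System.IO;
    var weightFolder = @"C:\models\Phi-3-mini-4k-instruct"; // illustrative path
    var tokenizer = Phi3TokenizerHelper.FromPretrained(
        Path.Combine(weightFolder, "tokenizer.model"));

    var model = Phi3ForCasualLM.FromPretrained(
        weightFolder,
        configName: "config.json",
        quantizeToInt8: true,      // int8-quantize the weights to cut memory use
        layersOnTargetDevice: -1,  // -1 = place the whole model on the target device
        targetDevice: "cuda");

    var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, "cuda");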
From 328f85987f4ac72608bd74e6cc9e024c72605fff Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Thu, 29 Aug 2024 14:00:51 -0700
Subject: [PATCH 3/4] fix bug

---
 .../Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs | 3 ++-
 .../Phi3Mini/SemanticKernelSample.cs                     | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
index 393ad4f33f..20f2dd4418 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
@@ -27,7 +27,8 @@ public static async Task RunAsync()
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
         var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
         var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
         var question = @"write a C# program to calculate the factorial of a number";
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
index 03c4dbdac0..8ba882618b 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
@@ -23,7 +23,8 @@ public static async Task RunChatCompletionSample()
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
         var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
         var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
@@ -53,7 +54,8 @@ public static async Task RunTextGenerationSample()
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var tokenizer = Phi3TokenizerHelper.FromPretrained(weightFolder);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
         var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
         var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);

From 15448e10a533c9c6d1466d76d0274c7b92e1cafe Mon Sep 17 00:00:00 2001
From: Xiaoyun Zhang
Date: Thu, 29 Aug 2024 14:06:34 -0700
Subject: [PATCH 4/4] Update CausalLMPipeline.cs

---
 src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
index c8f2fe7db8..33e0bab19c 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
@@ -294,11 +294,7 @@ public float[] GenerateEmbeddingFromLastTokenPool(string prompt)
         var inputIds = this.Tokenizer.EncodeToIds(prompt);
         var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: this.Device).unsqueeze(0);
         var attentionMask = torch.ones_like(inputTensor, device: this.Device);
-        var cache = new DynamicKVCache();
-        var input = new CausalLMModelInput(inputTensor, attentionMask, pastKeyValuesLength: 0)
-        {
-            OverrideCache = cache,
-        };
+        var input = new CausalLMModelInput(inputTensor, attentionMask, pastKeyValuesLength: 0);
         var output = this.Model.forward(input);
         var lastTokenHiddenState = output.LastHiddenState[0, ^1];
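Review note on PATCH 4/4: dropping the `DynamicKVCache` from the embedding path is sound; a KV cache only pays off across incremental decoding steps, and computing an embedding takes a single forward pass over the whole prompt. An end-to-end sketch of the API as it stands after the series (paths illustrative; `using System.Linq;` assumed for `Zip` and `Sum`):

    var weightFolder = @"C:\models\Phi-3-mini-4k-instruct"; // illustrative path
    var tokenizer = Phi3TokenizerHelper.FromPretrained(Path.Combine(weightFolder, "tokenizer.model"));
    var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
    var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, "cuda");

    var a = pipeline.GenerateEmbeddingFromLastTokenPool("write a C# program to calculate the factorial of a number");
    var b = pipeline.GenerateEmbeddingFromLastTokenPool("What a sunny day! Time to travel.");

    // Both embeddings are unit length, so the dot product is the cosine
    // similarity; unrelated texts should score noticeably lower than paraphrases.
    var similarity = a.Zip(b, (x, y) => x * y).Sum();
    Console.WriteLine($"cosine similarity: {similarity}");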