From 280c9985d6baec099af1e10272493198d7d5389b Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang <xiaoyuz@microsoft.com>
Date: Tue, 18 Jun 2024 09:33:17 -0700
Subject: [PATCH 01/12] add GenAI.Core project

---
 Microsoft.ML.sln                              |  33 ++-
 eng/Versions.props                            |   4 +-
 .../Extension/ModuleExtension.cs              | 274 ++++++++++++++++++
 .../Extension/TensorExtension.cs              |  33 +++
 .../Microsoft.ML.GenAI.Core.csproj            |  22 ++
 .../Module/DynamicLoadingModule.cs            |  50 ++++
 .../Module/IDynamicLoadModule.cs              |  15 +
 .../Module/IQuantizeModule.cs                 |   6 +
 .../Pipeline/CasualLMModelInput.cs            |  44 +++
 .../Pipeline/CasualLMModelOutput.cs           |  30 ++
 .../Pipeline/CausalLMPipeline.cs              | 141 +++++++++
 .../Utility/AttentionMaskConverter.cs         | 175 +++++++++++
 src/Microsoft.ML.GenAI.Core/Utility/Cache.cs  |  85 ++++++
 src/Microsoft.ML.GenAI.Core/Utils.cs          | 159 ++++++++++
 14 files changed, 1058 insertions(+), 13 deletions(-)
 create mode 100644 src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Extension/TensorExtension.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
 create mode 100644 src/Microsoft.ML.GenAI.Core/Module/DynamicLoadingModule.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Module/IDynamicLoadModule.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Utility/AttentionMaskConverter.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Utility/Cache.cs
 create mode 100644 src/Microsoft.ML.GenAI.Core/Utils.cs

diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln
index 5763a903b4..1fa8823763 100644
--- a/Microsoft.ML.sln
+++ b/Microsoft.ML.sln
@@ -172,9 +172,11 @@ Project("{6EC3EE1D-3C4E-46DD-8F32-0CC8E7565705}") = "Microsoft.ML.FSharp.Tests",
 EndProject
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Data.Analysis.PerformanceTests", "test\Microsoft.Data.Analysis.PerformanceTests\Microsoft.Data.Analysis.PerformanceTests.csproj", "{FB8A8823-CC6C-4C2F-8539-05FBFB7C91CD}"
 EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.TorchSharp.Tests", "test\Microsoft.ML.TorchSharp.Tests\Microsoft.ML.TorchSharp.Tests.csproj", "{AB8D68F1-6C3E-41FD-B0EC-A093E009341D}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TorchSharp.Tests", "test\Microsoft.ML.TorchSharp.Tests\Microsoft.ML.TorchSharp.Tests.csproj", "{AB8D68F1-6C3E-41FD-B0EC-A093E009341D}"
 EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.TensorFlow.Tests", "test\Microsoft.ML.TensorFlow.Tests\Microsoft.ML.TensorFlow.Tests.csproj", "{763FF013-8309-4680-A769-B54E7BB99612}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TensorFlow.Tests", "test\Microsoft.ML.TensorFlow.Tests\Microsoft.ML.TensorFlow.Tests.csproj", "{763FF013-8309-4680-A769-B54E7BB99612}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Core", "src\Microsoft.ML.GenAI.Core\Microsoft.ML.GenAI.Core.csproj", "{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -512,6 +514,14 @@ Global
 		{C8E1772B-DFD9-4A4D-830D-6AAB1C668BB3}.Release|Any CPU.Build.0 = Release|Any CPU
 		{C8E1772B-DFD9-4A4D-830D-6AAB1C668BB3}.Release|x64.ActiveCfg = Release|Any CPU
 		{C8E1772B-DFD9-4A4D-830D-6AAB1C668BB3}.Release|x64.Build.0 = Release|Any CPU
+		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Debug|x64.Build.0 = Debug|Any CPU
+		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Release|Any CPU.Build.0 = Release|Any CPU
+		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Release|x64.ActiveCfg = Release|Any CPU
+		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Release|x64.Build.0 = Release|Any CPU
 		{9222FC9D-599A-49A5-B685-08CC9A5C81D7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
 		{9222FC9D-599A-49A5-B685-08CC9A5C81D7}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{9222FC9D-599A-49A5-B685-08CC9A5C81D7}.Debug|x64.ActiveCfg = Debug|Any CPU
@@ -820,14 +830,14 @@ Global
 		{763FF013-8309-4680-A769-B54E7BB99612}.Release|Any CPU.Build.0 = Release|Any CPU
 		{763FF013-8309-4680-A769-B54E7BB99612}.Release|x64.ActiveCfg = Release|Any CPU
 		{763FF013-8309-4680-A769-B54E7BB99612}.Release|x64.Build.0 = Release|Any CPU
-		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Debug|x64.ActiveCfg = Debug|Any CPU
-		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Debug|x64.Build.0 = Debug|Any CPU
-		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Release|Any CPU.Build.0 = Release|Any CPU
-		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Release|x64.ActiveCfg = Release|Any CPU
-		{39E89702-1A46-4D5B-BA50-530D11309B5E}.Release|x64.Build.0 = Release|Any CPU
+		{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Debug|x64.Build.0 = Debug|Any CPU
+		{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Release|Any CPU.Build.0 = Release|Any CPU
+		{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Release|x64.ActiveCfg = Release|Any CPU
+		{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Release|x64.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
@@ -874,6 +884,7 @@ Global
 		{11A5210E-2EA7-42F1-80DB-827762E9C781} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
 		{38ED61F4-FA22-4DE9-B0C4-91F327F4EE31} = {DA452A53-2E94-4433-B08C-041EDEC729E6}
 		{C8E1772B-DFD9-4A4D-830D-6AAB1C668BB3} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+		{39E89702-1A46-4D5B-BA50-530D11309B5E} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
 		{9222FC9D-599A-49A5-B685-08CC9A5C81D7} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
 		{6C29AA9B-054B-4762-BEA5-D305B932AA80} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
 		{4805129D-78C8-46D4-9519-0AD9B0574D6D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
@@ -913,7 +924,7 @@ Global
 		{FB8A8823-CC6C-4C2F-8539-05FBFB7C91CD} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
 		{AB8D68F1-6C3E-41FD-B0EC-A093E009341D} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
 		{763FF013-8309-4680-A769-B54E7BB99612} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
-		{39E89702-1A46-4D5B-BA50-530D11309B5E} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+		{DB2CA055-8ABD-4E3E-8089-5B64C3415E85} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
 		SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
diff --git a/eng/Versions.props b/eng/Versions.props
index b1d4979662..95200a2599 100644
--- a/eng/Versions.props
+++ b/eng/Versions.props
@@ -63,8 +63,8 @@
     <TensorflowDotNETVersion>0.20.1</TensorflowDotNETVersion>
     <TensorFlowMajorVersion>2</TensorFlowMajorVersion>
     <TensorFlowVersion>2.3.1</TensorFlowVersion>
-    <TorchSharpVersion>0.101.5</TorchSharpVersion>
-    <LibTorchVersion>2.1.0.1</LibTorchVersion>
+    <TorchSharpVersion>0.102.5</TorchSharpVersion>
+    <LibTorchVersion>2.2.1.1</LibTorchVersion>
     <!-- Build/infrastructure Dependencies -->
     <CodecovVersion>1.12.4</CodecovVersion>
     <CoverletCollectorVersion>3.1.2</CoverletCollectorVersion>
diff --git a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
new file mode 100644
index 0000000000..3a4c012446
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
@@ -0,0 +1,274 @@
+﻿using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Core.Extension;
+using Phi.Module;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Core.Extension;
+
+internal static class ModuleExtension
+{
+    /// <summary>
+    /// Total size in bytes of all tensors in the module's <c>state_dict()</c>.
+    /// </summary>
+    /// <param name="model">The module whose tensors are measured.</param>
+    /// <returns>Sum over the entries of element count times element size.</returns>
+    public static long GetSizeInBytes(this nn.Module model)
+    {
+        return model.state_dict().Aggregate(
+            0L,
+            (total, entry) => total + entry.Value.numel() * entry.Value.element_size());
+    }
+
+    /// <summary>
+    /// Collects the size in bytes of every <see cref="IDynamicLoadModule"/> nested under <paramref name="model"/>.
+    /// </summary>
+    /// <param name="model">The module whose children are walked recursively.</param>
+    /// <returns>
+    /// A map from the dot-separated child path of each dynamic-load layer to its <see cref="GetSizeInBytes"/>
+    /// value; empty when the module has no children or no dynamic-load descendants.
+    /// </returns>
+    public static Dictionary<string, long> GetSizeForEachDynamicLayerInBytes(this nn.Module model)
+    {
+        var sizes = new Dictionary<string, long>();
+
+        // named_children() is walked exactly once; a module without children just yields an empty map.
+        foreach (var (name, child) in model.named_children())
+        {
+            if (child is IDynamicLoadModule)
+            {
+                sizes[name] = child.GetSizeInBytes();
+                continue;
+            }
+
+            // Not a dynamic-load layer itself: recurse and prefix the nested paths with this child's name.
+            foreach (var (nestedPath, nestedSize) in child.GetSizeForEachDynamicLayerInBytes())
+            {
+                sizes[name + "." + nestedPath] = nestedSize;
+            }
+        }
+
+        return sizes;
+    }
+
+    /// <summary>
+    /// Calls <c>Quantize()</c> on every <see cref="IQuantizeModule"/> child; other children are searched recursively.
+    /// </summary>
+    public static void ToQuantizedModule<T>(
+        this T model)
+        where T : nn.Module
+    {
+        foreach (var (_, child) in model.named_children())
+        {
+            switch (child)
+            {
+                case IQuantizeModule quantizable: quantizable.Quantize(); break;
+                default: child.ToQuantizedModule(); break;
+            }
+        }
+    }
+
+    /// <summary>
+    /// Places each <see cref="IDynamicLoadModule"/> child on its mapped device and installs callbacks that move it to and from <paramref name="targetDevice"/>.
+    /// </summary>
+    /// <param name="model">The module to distribute; it is modified and returned.</param>
+    /// <param name="deviceMap">Layer path (child names joined with '.') to device string; when empty, the whole module is moved to <paramref name="targetDevice"/>.</param>
+    /// <param name="targetDevice">Device string that the installed <c>LoadToDeviceFunc</c> moves a layer to.</param>
+    /// <exception cref="KeyNotFoundException">A dynamic-load child has no entry in a non-empty <paramref name="deviceMap"/>.</exception>
+    public static T ToDynamicLoadingModel<T>(
+        this T model,
+        Dictionary<string, string> deviceMap,
+        string targetDevice)
+        where T : nn.Module
+    {
+        if (deviceMap.Count == 0)
+        {
+            model.to(new Device(targetDevice));
+            return model;
+        }
+
+        foreach (var (key, value) in model.named_children())
+        {
+            if (value is IDynamicLoadModule dynamicModule)
+            {
+                var device = deviceMap[key];
+                if (device != targetDevice)
+                {
+                    dynamicModule.LoadToDeviceFunc = module => module.to(new Device(targetDevice));
+                    dynamicModule.UnloadFromDeviceFunc = module => module.to(new Device(device));
+                }
+
+                value.to(new Device(device));
+                continue;
+            }
+
+            // Keys are identifiers, so the "<child>." prefix is matched ordinally rather than with culture rules.
+            var prefix = key + ".";
+            var childrenDeviceMap = deviceMap.Where(x => x.Key.StartsWith(prefix, StringComparison.Ordinal))
+                .ToDictionary(x => x.Key.Substring(prefix.Length), x => x.Value);
+            value.ToDynamicLoadingModel(childrenDeviceMap, targetDevice);
+        }
+
+        return model;
+    }
+
+    /// <summary>
+    /// Infer the device for each dynamic-load layer in the model. Devices are filled in the order of the devices list,
+    /// larger layers first, until the space left on a device drops below twice the size of the largest layer.
+    /// </summary>
+    /// <param name="model">the model whose dynamic-load layers are placed</param>
+    /// <param name="devices">a list of device ids (e.g. ["cuda:0", "cpu", "disk"])</param>
+    /// <param name="deviceSizeMapInByte">a map where the key is the device id (e.g. "cuda:0") and the value is the memory size in bytes of the device</param>
+    /// <returns>a map from layer name to device id; layers that were not placed are absent, and the map is empty when there are no dynamic-load layers</returns>
+    public static Dictionary<string, string> InferDeviceMapForEachLayer(
+        this nn.Module model,
+        string[] devices,
+        Dictionary<string, long> deviceSizeMapInByte)
+    {
+        var layerSizeMap = model.GetSizeForEachDynamicLayerInBytes();
+        // Max() throws on an empty sequence, so a model without dynamic-load layers needs no reserve at all.
+        var sizeToRemainOnEachDevice = layerSizeMap.Count == 0 ? 0 : 2 * layerSizeMap.Max(x => x.Value);
+        var deviceMap = new Dictionary<string, string>();
+        foreach (var device in devices)
+        {
+            long size = deviceSizeMapInByte[device];
+            // larger layer fit first; materialized so that deviceMap can be updated while iterating
+            var remainingLayers = layerSizeMap.Where(x => !deviceMap.ContainsKey(x.Key)).OrderByDescending(x => x.Value).ToList();
+            foreach (var (key, value) in remainingLayers)
+            {
+                if (size >= value)
+                {
+                    deviceMap[key] = device;
+                    size -= value;
+                }
+
+                if (size < sizeToRemainOnEachDevice)
+                {
+                    break;
+                }
+            }
+        }
+
+        return deviceMap;
+    }
+
+    /// <summary>
+    /// Encodes <paramref name="prompt"/>, runs the pipeline's tensor-based <c>Generate</c> and decodes the resulting ids.
+    /// NOTE(review): <paramref name="bos"/> and <paramref name="eos"/> are not read by this method - confirm intent.
+    /// </summary>
+    public static string? Generate(
+        this CausalLMPipeline pipeline,
+        string prompt,
+        int maxLen = 128,
+        float temperature = 0.7f,
+        float topP = 0.9f,
+        string[]? stopSequences = null,
+        int eosId = 0,
+        string device = "cpu",
+        bool bos = true,
+        bool eos = false,
+        bool echo = false)
+    {
+        using var newScope = NewDisposeScope();
+        var inputIds = pipeline.Tokenizer.EncodeToIds(prompt);
+        var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: device).unsqueeze(0);
+        var attentionMask = torch.ones_like(inputTensor);
+
+        // stop token ids: [[eosId], [stopSequence1], ...]; stop strings that encode to no tokens are dropped
+        List<int[]> stopTokenIds = [[eosId]];
+        if (stopSequences != null)
+        {
+            stopTokenIds.AddRange(stopSequences.Select(x => pipeline.Tokenizer.EncodeToIds(x).ToArray()).Where(ids => ids.Length > 0));
+        }
+        (var token, var _) = pipeline.Generate(inputTensor, attentionMask, temperature: temperature, maxLen: maxLen, topP: topP, stopTokenSequence: stopTokenIds.ToArray(), echo: echo);
+        // Copy the ids to the CPU before reading them; presumably data<T>() reads CPU memory only - TODO confirm.
+        var tokenIds = token[0].to_type(ScalarType.Int32).cpu().data<int>().ToArray();
+        return pipeline.Tokenizer.Decode(tokenIds);
+    }
+
+
+    /// <summary>
+    /// Builds a numbered preview of the module's <c>state_dict()</c>: one line per entry, in ordinal
+    /// case-insensitive key order, holding the text produced by the tensor's own <c>Peek</c>.
+    /// </summary>
+    public static string Peek(this nn.Module model)
+    {
+        var preview = new StringBuilder();
+        var index = 0;
+        var entries = model.state_dict().OrderBy(x => x.Key, StringComparer.OrdinalIgnoreCase);
+        foreach (var entry in entries)
+        {
+            preview.AppendLine($"{index}: {entry.Value.Peek(entry.Key)}");
+            index++;
+        }
+
+        return preview.ToString();
+    }
+
+    /// <summary>
+    /// Lists the shape of every tensor in the module's <c>state_dict()</c>, one numbered line per entry in
+    /// ordinal case-insensitive key order, formatted as <c>{index}: {key} shape: [d0, d1, ...]</c>.
+    /// </summary>
+    public static string PeekShape(this nn.Module model)
+    {
+        var lines = model.state_dict()
+            .OrderBy(x => x.Key, StringComparer.OrdinalIgnoreCase)
+            .Select((entry, index) => $"{index}: {entry.Key} shape: [{string.Join(", ", entry.Value.shape)}]");
+
+        var report = new StringBuilder();
+        foreach (var line in lines)
+        {
+            report.AppendLine(line);
+        }
+
+        return report.ToString();
+    }
+
+    /// <summary>
+    /// Loads the tensors stored in the file at <paramref name="location"/> into the same-named entries of <paramref name="dict"/>.
+    /// </summary>
+    public static void LoadStateDict(this Dictionary<string, Tensor> dict, string location)
+    {
+        using FileStream stream = File.OpenRead(location);
+        using BinaryReader reader = new BinaryReader(stream);
+        var num = reader.Decode();
+        for (long i = 0; i < num; i++)
+        {
+            var key = reader.ReadString();
+            Tensor tensor = dict[key];
+            var originalType = tensor.dtype;
+            // BFloat16 entries are loaded through a Float32 tensor and converted back afterwards.
+            if (originalType == ScalarType.BFloat16)
+            {
+                tensor = tensor.to_type(ScalarType.Float32);
+            }
+
+            TensorExtensionMethods.Load(ref tensor!, reader, skip: false);
+            dict[key] = tensor!.to_type(originalType);
+        }
+    }
+
+    /// <summary>Reads a variable-length integer: 7 payload bits per byte, lowest group first, 0x80 set while more bytes follow.</summary>
+    /// <exception cref="InvalidDataException">The encoding is longer than the 10 bytes a 64-bit value can need.</exception>
+    public static long Decode(this BinaryReader reader)
+    {
+        long result = 0L;
+        for (var shift = 0; shift < 64; shift += 7)
+        {
+            long current = reader.ReadByte();
+            result |= (current & 0x7F) << shift;
+            if ((current & 0x80) == 0L)
+            {
+                return result;
+            }
+        }
+
+        // C# masks a long shift count to 6 bits, so an 11th byte would silently corrupt the low bits.
+        throw new InvalidDataException("Variable-length integer is longer than 10 bytes.");
+    }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Extension/TensorExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/TensorExtension.cs
new file mode 100644
index 0000000000..d621e3cb43
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Extension/TensorExtension.cs
@@ -0,0 +1,33 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Core.Extension;
+
+internal static class TensorExtension
+{
+    /// <summary>
+    /// Builds a one-line fingerprint of a tensor, prefixed with <paramref name="id"/>: an index-weighted average
+    /// (elements weighted by the square root of their flat index, divided by the plain sum), the dtype and the shape.
+    /// </summary>
+    /// <param name="n">Not read by the current implementation.</param>
+    public static string Peek(this Tensor tensor, string id, int n = 10)
+    {
+        // The temporaries below can be as large as the tensor itself; release them when the method returns.
+        using var scope = NewDisposeScope();
+        var dType = tensor.dtype;
+        // if type is fp16, convert to fp32
+        if (dType == ScalarType.Float16)
+        {
+            tensor = tensor.to_type(ScalarType.Float32);
+        }
+        var tensor1D = tensor.cpu().reshape(-1);
+        var shapeString = string.Join(',', tensor.shape);
+        var tensorIndex = torch.arange(tensor1D.shape[0], dtype: ScalarType.Float32).to(tensor1D.device).sqrt();
+        var avg = ((tensor1D * tensorIndex).sum() / tensor1D.sum()).round(4);
+        return $"{id}: sum: {avg.ToSingle()}  dType: {dType} shape: [{shapeString}]";
+    }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
new file mode 100644
index 0000000000..3346b40caa
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
@@ -0,0 +1,22 @@
+﻿<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>net8.0</TargetFramework>
+    <IsPackable>false</IsPackable>
+    <Nullable>enable</Nullable>
+    <LangVersion>preview</LangVersion>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="System.Memory" Version="$(SystemMemoryVersion)" />
+    <PackageReference Include="TorchSharp" Version="$(TorchSharpVersion)" />
+    <PackageReference Include="libtorch-cpu-win-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Windows'))" PrivateAssets="all" />
+    <PackageReference Include="libtorch-cpu-linux-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Linux'))" PrivateAssets="all" />
+    <PackageReference Include="libtorch-cpu-osx-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('OSX'))" PrivateAssets="all" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
+  </ItemGroup>
+
+</Project>
diff --git a/src/Microsoft.ML.GenAI.Core/Module/DynamicLoadingModule.cs b/src/Microsoft.ML.GenAI.Core/Module/DynamicLoadingModule.cs
new file mode 100644
index 0000000000..42f3442f95
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Module/DynamicLoadingModule.cs
@@ -0,0 +1,50 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Phi.Module;
+
+/// <summary>
+/// Wraps a module and invokes the optional load/unload callbacks around every forward call.
+/// </summary>
+public class DynamicLoadingModule<T, T1, TResult> : torch.nn.Module<T1, TResult>, IDynamicLoadModule
+    where T : nn.Module<T1, TResult>
+    where T1 : Tensor
+{
+    private readonly T _model;
+
+    public DynamicLoadingModule(T model)
+        : base(model.GetName())
+    {
+        this._model = model;
+        this.RegisterComponents();
+    }
+
+    public static DynamicLoadingModule<T, T1, TResult> CreateFromModel(T model)
+        => new DynamicLoadingModule<T, T1, TResult>(model);
+
+    /// <summary>Invoked with this module right before the wrapped model runs; skipped when null.</summary>
+    public Action<nn.Module>? LoadToDeviceFunc { get; set; }
+    /// <summary>Invoked with this module after the wrapped model has run, even if it threw; skipped when null.</summary>
+    public Action<nn.Module>? UnloadFromDeviceFunc { get; set; }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+    public override TResult forward(T1 input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+    {
+        LoadToDeviceFunc?.Invoke(this);
+        try
+        {
+            return this._model.forward(input);
+        }
+        finally
+        {
+            // Unload even when forward throws so the module does not stay on the load device.
+            UnloadFromDeviceFunc?.Invoke(this);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Module/IDynamicLoadModule.cs b/src/Microsoft.ML.GenAI.Core/Module/IDynamicLoadModule.cs
new file mode 100644
index 0000000000..25ba6cbc95
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Module/IDynamicLoadModule.cs
@@ -0,0 +1,15 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using static TorchSharp.torch;
+
+namespace Phi.Module;
+
+/// <summary>A module exposing optional callbacks for loading it to, and unloading it from, a device.</summary>
+public interface IDynamicLoadModule
+{
+    Action<nn.Module>? LoadToDeviceFunc { get; set; }
+    Action<nn.Module>? UnloadFromDeviceFunc { get; set; }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs b/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs
new file mode 100644
index 0000000000..2cb065c7f7
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs
@@ -0,0 +1,6 @@
+﻿namespace Phi.Module;
+
+public interface IQuantizeModule
+{
+    void Quantize(); // takes no arguments and returns nothing; the effect is defined by the implementer
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs
new file mode 100644
index 0000000000..ef16b3078b
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs
@@ -0,0 +1,44 @@
+﻿using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Core;
+
+public class CasualLMModelInput
+{
+    public CasualLMModelInput(
+        Tensor inputIds,
+        Tensor? attentionMask = null,
+        Tensor? positionIds = null,
+        int pastKeyValuesLength = 0,
+        Tensor? inputsEmbeds = null,
+        bool useCache = false,
+        bool outputAttentions = false,
+        bool outputHiddenStates = false)
+    {
+        InputIds = inputIds;
+        AttentionMask = attentionMask;
+        PositionIds = positionIds;
+        PastKeyValuesLength = pastKeyValuesLength;
+        InputEmbeddings = inputsEmbeds;
+        UseCache = useCache;
+        OutputAttentions = outputAttentions;
+        OutputHiddenStates = outputHiddenStates;
+    }
+    /// <summary>Input token ids; the only required constructor argument.</summary>
+    public Tensor InputIds { get; set; }
+    /// <summary>Optional attention mask; null unless supplied.</summary>
+    public Tensor? AttentionMask { get; set; }
+    /// <summary>Optional position ids; null unless supplied.</summary>
+    public Tensor? PositionIds { get; set; }
+    /// <summary>Optional key/value cache; not a constructor argument, so null until assigned.</summary>
+    public IKVCache? OverrideCache { get; set; }
+    /// <summary>Set from <c>pastKeyValuesLength</c>; 0 by default.</summary>
+    public int PastKeyValuesLength { get; set; }
+    /// <summary>Set from the <c>inputsEmbeds</c> constructor argument; null unless supplied.</summary>
+    public Tensor? InputEmbeddings { get; set; }
+    /// <summary>Set from <c>useCache</c>; false by default.</summary>
+    public bool UseCache { get; set; }
+    /// <summary>Set from <c>outputAttentions</c>; false by default.</summary>
+    public bool OutputAttentions { get; set; }
+    /// <summary>Set from <c>outputHiddenStates</c>; false by default.</summary>
+    public bool OutputHiddenStates { get; set; }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs
new file mode 100644
index 0000000000..1ac56f9e31
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs
@@ -0,0 +1,30 @@
+﻿using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Core;
+
+public class CasualLMModelOutput
+{
+    public CasualLMModelOutput(
+        Tensor lastHiddenState,
+        Tensor logits,
+        Tensor[]? allHiddenStates = null,
+        Tensor[]? attentions = null,
+        IKVCache? cache = null)
+    {
+        Logits = logits;
+        LastHiddenState = lastHiddenState;
+        AllHiddenStates = allHiddenStates;
+        Attentions = attentions;
+        Cache = cache;
+    }
+    /// <summary>Logits produced by the model; required.</summary>
+    public Tensor Logits { get; set; }
+    /// <summary>Last hidden state produced by the model; required.</summary>
+    public Tensor LastHiddenState { get; set; }
+    /// <summary>Optional array of hidden states; null unless supplied.</summary>
+    public Tensor[]? AllHiddenStates { get; set; }
+    /// <summary>Optional array of attention tensors; null unless supplied.</summary>
+    public Tensor[]? Attentions { get; set; }
+    /// <summary>Optional key/value cache; null unless supplied.</summary>
+    public IKVCache? Cache { get; set; }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
new file mode 100644
index 0000000000..3d8309ac4b
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
@@ -0,0 +1,141 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.ML.Tokenizers;
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Core;
+
+/// <summary>
+/// Strongly typed variant of the non-generic <c>CausalLMPipeline</c>; it adds no members of its own.
+/// </summary>
+public class CausalLMPipeline<TTokenizer, TModel> : CausalLMPipeline
+    where TTokenizer : Tokenizer
+    where TModel : nn.Module<CasualLMModelInput, CasualLMModelOutput>
+{
+    public CausalLMPipeline(TTokenizer tokenizer, TModel model, string device = "cpu")
+        : base(tokenizer, model, device)
+    {
+    }
+}
+
+public class CausalLMPipeline
+{
+    /// <summary>Creates a pipeline over the given tokenizer and model.</summary>
+    public CausalLMPipeline(Tokenizer tokenizer, nn.Module<CasualLMModelInput, CasualLMModelOutput> model, string device = "cpu")
+    {
+        Tokenizer = tokenizer;
+        Model = model;
+        // The device string is converted to a Device by the assignment.
+        Device = device;
+    }
+
+    /// <summary>Tokenizer passed to the constructor.</summary>
+    public Tokenizer Tokenizer { get; }
+    /// <summary>Model passed to the constructor.</summary>
+    public nn.Module<CasualLMModelInput, CasualLMModelOutput> Model { get; }
+    /// <summary>Device built from the constructor's device string ("cpu" by default).</summary>
+    public Device Device { get; }
+
+    /// <summary>
+    /// Generates up to <paramref name="maxLen"/> tokens after the prompt, stopping early once every sequence in the
+    /// batch ends with one of <paramref name="stopTokenSequence"/>. A positive temperature selects top-p sampling.
+    /// </summary>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxLen"/> is negative.</exception>
+    public virtual (
+        Tensor, // output token ids [batch_size, sequence_length]
+        Tensor // output logits [batch_size, sequence_length, vocab_size]
+    ) Generate(
+        Tensor inputIds, // input token ids [batch_size, sequence_length]
+        Tensor attentionMask, // attention mask [batch_size, sequence_length]
+        int[][] stopTokenSequence,
+        float temperature = 0.7f,
+        float topP = 0.9f,
+        int maxLen = 128,
+        bool echo = false)
+    {
+        ArgumentOutOfRangeException.ThrowIfNegative(maxLen);
+
+        using var newScope = NewDisposeScope();
+        var batch = inputIds.shape[0];
+        var device = inputIds.device;
+        var promptLength = (int)inputIds.shape[1];
+        var totalLen = promptLength + maxLen;
+
+        using (var noGrad = torch.no_grad())
+        {
+            var prevPos = 0;
+            var eosReached = torch.tensor(new bool[batch], device: device);
+            torch.Tensor? logits = default;
+            var cache = new DynamicKVCache();
+            if (promptLength == totalLen)
+            {
+                var input = new CasualLMModelInput(inputIds, attentionMask, pastKeyValuesLength: 0)
+                {
+                    OverrideCache = cache,
+                };
+                logits = this.Model.forward(input).Logits;
+            }
+            for (var curPos = promptLength; curPos != totalLen; curPos++)
+            {
+                // Only the positions [prevPos, curPos) are passed; prevPos is reported as the past length.
+                var input = new CasualLMModelInput(inputIds[.., prevPos..curPos], attentionMask[.., prevPos..curPos], pastKeyValuesLength: prevPos)
+                {
+                    OverrideCache = cache,
+                };
+                logits = this.Model.forward(input).Logits;
+                var lastLogits = logits[.., -1];
+                var nextToken = temperature > 0
+                    ? this.SampleTopP(torch.softmax(lastLogits / temperature, dim: -1), topP)
+                    : torch.argmax(lastLogits, dim: -1);
+                nextToken = nextToken.reshape(-1);
+                inputIds = torch.cat([inputIds, nextToken.unsqueeze(1)], dim: -1);
+                attentionMask = torch.cat([attentionMask, attentionMask.new_ones(attentionMask.shape[0], 1)], dim: -1);
+                foreach (var stopSequence in stopTokenSequence)
+                {
+                    // An empty sequence would match trivially; one longer than the tokens so far cannot match.
+                    if (stopSequence.Length == 0 || stopSequence.Length > inputIds.shape[1])
+                    {
+                        continue;
+                    }
+
+                    // determine if the last n tokens are the stop sequence
+                    var lastN = inputIds[.., ^stopSequence.Length..];
+                    var lastNMatch = lastN == torch.tensor(stopSequence, device: device);
+                    eosReached |= lastNMatch.all(dim: -1);
+                }
+                if (eosReached.all().item<bool>())
+                {
+                    break;
+                }
+
+                prevPos = curPos;
+            }
+
+            if (echo)
+            {
+                // return entire inputIds and logits
+                return (inputIds.MoveToOuterDisposeScope(), logits!.MoveToOuterDisposeScope());
+            }
+
+            // return [batch_size, promptLength..] and [batch_size, promptLength.., vocab_size]
+            // NOTE(review): logits holds only the last forward pass, so this slice may be empty - confirm intent.
+            return (inputIds[.., promptLength..].MoveToOuterDisposeScope(), logits![.., promptLength..].MoveToOuterDisposeScope());
+        }
+    }
+
+    /// <summary>Top-p sampling of one index per row; despite its name, <paramref name="logits"/> is used as probabilities.</summary>
+    protected torch.Tensor SampleTopP(torch.Tensor logits, float topP)
+    {
+        (var sortedProbs, var sortedIndex) = torch.sort(logits, dim: -1, descending: true);
+        // Zero every entry whose preceding cumulative mass already exceeds topP, then renormalize.
+        var outsideNucleus = torch.cumsum(sortedProbs, dim: -1) - sortedProbs > topP;
+        sortedProbs[outsideNucleus] = 0f;
+        sortedProbs /= sortedProbs.sum(dim: -1, keepdim: true);
+        var sampled = torch.multinomial(sortedProbs, num_samples: 1);
+        return torch.gather(sortedIndex, dim: -1, index: sampled);
+    }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Utility/AttentionMaskConverter.cs b/src/Microsoft.ML.GenAI.Core/Utility/AttentionMaskConverter.cs
new file mode 100644
index 0000000000..abd15c6c6f
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Utility/AttentionMaskConverter.cs
@@ -0,0 +1,175 @@
+﻿using static TorchSharp.torch.nn;
+using static TorchSharp.torch;
+using TorchSharp.Modules;
+using TorchSharp;
+using System.Threading.Tasks;
+using System;
+
+namespace Microsoft.ML.GenAI.Core;
+
+public class AttentionMaskConverter
+{
+    private readonly bool _isCasual;
+    private readonly int? _slidingWindow;
+
+    public AttentionMaskConverter(bool isCausal, int? slidingWindow)
+    {
+        this._isCasual = isCausal;
+        this._slidingWindow = slidingWindow;
+    }
+
+    /// <summary>
+    /// Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
+    /// key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
+    /// causal, a causal mask will be added.
+    /// </summary>
+    /// <param name="attentionMask2d">The 2D attention mask; its first dimension is taken as the batch size.</param>
+    /// <param name="queryLength">The query length, used as the target length of the produced mask.</param>
+    /// <param name="dType">The data type of the produced mask; its minimum finite value is the bias of masked positions.</param>
+    /// <param name="keyValueLength">The key/value length; an <see cref="ArgumentException"/> is thrown if it is null when a causal mask is built.</param>
+    /// <returns>The expanded 4D attention mask, merged with the causal mask when one is built.</returns>
+    public Tensor To4D(
+        Tensor attentionMask2d,
+        int queryLength,
+        ScalarType dType,
+        int? keyValueLength = null)
+    {
+        long[] inputShape = [attentionMask2d.shape[0], queryLength];
+
+        // create causal mask
+        // [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        Tensor? casual4dMask = null;
+        if ((inputShape[^1] > 1 || this._slidingWindow is not null) && this._isCasual)
+        {
+            if (keyValueLength is null)
+            {
+                throw new ArgumentException("key_value_length should be provided when attention_mask is causal");
+            }
+
+            var pastKeyValuesLength = keyValueLength.Value - queryLength;
+            casual4dMask = MakeCasualMask(inputShape, dType, attentionMask2d.device, pastKeyValuesLength, this._slidingWindow);
+        }
+        else if (this._slidingWindow is not null)
+        {
+            throw new NotImplementedException("Sliding window is not supported for non-causal masks");
+        }
+
+        var expandedAttnMask = ExpandMask(attentionMask2d, dType, queryLength).to(attentionMask2d.device);
+        if (casual4dMask is not null)
+        {
+            var min = torch.finfo(dType).min;
+            expandedAttnMask = casual4dMask.masked_fill(expandedAttnMask.to(ScalarType.Bool), min);
+        }
+
+        return expandedAttnMask;
+    }
+
+    public Tensor? ToCasual4D(
+        int batchSize,
+        int queryLength,
+        int keyValueLength,
+        ScalarType dType,
+        Device device)
+    {
+        if (!_isCasual)
+        {
+            throw new ArgumentException("This is not a causal mask");
+        }
+
+        long[] inputShape = [batchSize, queryLength];
+        var pastKeyValueLength = keyValueLength - queryLength;
+
+        // Build the causal mask of shape [bsz, 1, queryLength, keyValueLength]. For a single-token query
+        // without a sliding window no mask is built and null is returned.
+        Tensor? causal4DMask = null;
+        if (queryLength > 1 || this._slidingWindow is not null)
+        {
+            causal4DMask = MakeCasualMask(inputShape, dType, device, pastKeyValueLength, this._slidingWindow);
+        }
+
+        return causal4DMask;
+    }
+
+    public static Tensor MakeCasualMask(
+        long[] inputIdsShape,
+        ScalarType dType,
+        Device device,
+        int pastKeyValuesLength = 0,
+        int? slidingWindow = null)
+    {
+        // Builds the causal mask: 0 where a query position may attend, finfo(dType).min everywhere else.
+        var bsz = inputIdsShape[0];
+        var tgtLen = inputIdsShape[1];
+        var min = torch.finfo(dType).min;
+        var mask = torch.full([tgtLen, tgtLen], min, dtype: dType, device: device);
+        var maskCondition = torch.arange(tgtLen, device: device);
+        mask.masked_fill_(maskCondition < (maskCondition + 1).view(tgtLen, 1), 0);
+        mask = mask.to(dType);
+
+        // positions already in the KV cache are visible to every query, so prepend zero columns for them
+        if (pastKeyValuesLength > 0)
+        {
+            mask = torch.cat([torch.zeros([tgtLen, pastKeyValuesLength], dtype: dType, device: device), mask], dim: -1);
+        }
+
+        // keys more than `window` positions behind the query are masked; the helper must match mask's widened shape
+        if (slidingWindow is int window)
+        {
+            var diagonal = pastKeyValuesLength - window - 1;
+            var contextMask = torch.tril(torch.ones_like(mask, dtype: ScalarType.Bool), diagonal: diagonal);
+            mask = mask.masked_fill(contextMask, min);
+        }
+
+        // add batch and head axes and expand to [bsz, 1, tgtLen, tgtLen + pastKeyValuesLength]
+        return mask.unsqueeze(0).unsqueeze(0).expand(bsz, 1, tgtLen, tgtLen + pastKeyValuesLength);
+    }
+
+    /// <summary>Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`.</summary>
+    /// <param name="attentionMask">The attention mask should be 2D.</param>
+    /// <param name="inputShape">The input shape should be a tuple that defines `(batch_size, query_length)`.</param>
+    /// <param name="dType">The data type of the mask tensor.</param>
+    /// <param name="device">The device to place the mask tensor.</param>
+    /// <param name="pastKeyValuesLength">The length of past key values in cache.</param>
+    /// <param name="slidingWindow">The sliding window size.</param>
+    /// <returns>The <see cref="To4D"/> result when a mask is given; otherwise the <see cref="ToCasual4D"/> result, which may be null.</returns>
+    /// <exception cref="ArgumentException">Thrown when <paramref name="attentionMask"/> is given but is not 2D.</exception>
+    public static Tensor? Create4DCausalAttentionMask(
+        Tensor? attentionMask,
+        long[] inputShape,
+        ScalarType dType,
+        Device device,
+        int pastKeyValuesLength = 0,
+        int? slidingWindow = null)
+    {
+        var converter = new AttentionMaskConverter(isCausal: true, slidingWindow: slidingWindow);
+        var batchSize = (int)inputShape[0];
+        var queryLength = (int)inputShape[1];
+        var keyValueLength = pastKeyValuesLength + queryLength;
+        if (attentionMask is not null)
+        {
+            if (attentionMask.ndim != 2)
+            {
+                throw new ArgumentException("Attention mask should be 2D");
+            }
+            return converter.To4D(attentionMask, (int)inputShape[1], dType, keyValueLength);
+        }
+
+        return converter.ToCasual4D(batchSize, queryLength, keyValueLength, dType, device);
+    }
+
+    public static Tensor ExpandMask(
+        Tensor mask,
+        ScalarType dType,
+        int? tgtLen = null)
+    {
+        var bsz = (int)mask.shape[0];
+        var srcLen = (int)mask.shape[1];
+        tgtLen ??= srcLen;
+        // broadcast [bsz, srcLen] to [bsz, 1, tgtLen, srcLen], then invert so that entries that were 0 become 1
+        var expandedMask = mask.unsqueeze(1).unsqueeze(1).expand(bsz, 1, tgtLen.Value, srcLen).to(dType);
+        var invertedMask = 1.0 - expandedMask;
+        var min = torch.finfo(dType).min;
+        // every non-zero entry of the inverted mask gets the dtype minimum; the remaining entries stay 0
+        return invertedMask.masked_fill(invertedMask.to(ScalarType.Bool), min);
+    }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Utility/Cache.cs b/src/Microsoft.ML.GenAI.Core/Utility/Cache.cs
new file mode 100644
index 0000000000..026c2cdff3
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Utility/Cache.cs
@@ -0,0 +1,85 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Core;
+
+public interface IKVCache : IDictionary<int, (Tensor, Tensor)>, IDisposable
+{
+    public (Tensor, Tensor) UpdateKVCache(Tensor key, Tensor value, int layerIndex);
+
+    public int GetSeqLen(int layerIndex = 0);
+
+    public int? GetMaxLength();
+
+    public int GetUsableLength(int newSeqLen, int layerIndex = 0);
+}
+
+public class DynamicKVCache : Dictionary<int, (Tensor, Tensor)>, IKVCache
+{
+    private readonly DisposeScope _disposeScope = NewDisposeScope();
+    public DynamicKVCache()
+    {
+    }
+
+    public (Tensor, Tensor) UpdateKVCache(Tensor key, Tensor value, int layerIndex)
+    {
+        // Appends key/value to the layer's cached pair along dim -2 (or stores them as the first entry) inside the cache's own scope.
+        if (this.TryGetValue(layerIndex, out var cached))
+        {
+            var (oldKey, oldValue) = cached;
+            oldKey.DetachFromDisposeScope();
+            oldValue.DetachFromDisposeScope();
+
+            var newKey = torch.cat([oldKey, key], -2).MoveToOtherDisposeScope(this._disposeScope);
+            var newValue = torch.cat([oldValue, value], -2).MoveToOtherDisposeScope(this._disposeScope);
+            // the old tensors were detached from their scope above, so they are released explicitly here
+            oldKey.Dispose();
+            oldValue.Dispose();
+
+            this[layerIndex] = (newKey, newValue);
+        }
+        else
+        {
+            this.Add(layerIndex, (key.MoveToOtherDisposeScope(this._disposeScope), value.MoveToOtherDisposeScope(this._disposeScope)));
+        }
+        return this[layerIndex];
+    }
+
+    public int GetSeqLen(int layerIndex = 0)
+    {
+        if (this.TryGetValue(layerIndex, out var kv))
+        {
+            return kv.Item1.IntShape()[^2];
+        }
+
+        return 0;
+    }
+
+    public int? GetMaxLength()
+    {
+        return null;
+    }
+
+    public int GetUsableLength(int newSeqLen, int layerIndex = 0)
+    {
+        var maxLength = this.GetMaxLength();
+        var previousSeqLen = this.GetSeqLen(layerIndex);
+        // With a size limit, only as many cached tokens stay usable as leave room for the newSeqLen incoming ones.
+        if (maxLength.HasValue && previousSeqLen + newSeqLen > maxLength.Value)
+        {
+            return maxLength.Value - newSeqLen;
+        }
+
+        return previousSeqLen;
+    }
+
+    public void Dispose()
+    {
+        this._disposeScope.Dispose();
+    }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Utils.cs b/src/Microsoft.ML.GenAI.Core/Utils.cs
new file mode 100644
index 0000000000..84f0062951
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Utils.cs
@@ -0,0 +1,159 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using TorchSharp.Modules;
+using TorchSharp;
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+public static class Utils
+{
+    public static Tensor ApplyRotaryEmbeddings(Tensor input, Tensor freqsComplex)
+    {
+        // Separate the last dimension pairs of two values, representing the real and imaginary parts of the complex number
+        // Two consecutive values will become a single complex number
+        // (B, Seq_Len, H, Head_Dim) -> (B, Seq_Len, H, Head_Dim/2)
+        var inputComplex = input.to_type(ScalarType.Float32).reshape(input.shape[0], input.shape[1], input.shape[2], -1, 2).view_as_complex();
+        freqsComplex = freqsComplex.to(input.device);
+
+        // Reshape the freqs_complex tensor to match the shape of the x_complex tensor. So we need to add the batch dimension and the head dimension
+        // (Seq_Len, Head_Dim/2) --> (1, Seq_Len, 1, Head_Dim/2)
+        var freqsComplexReshaped = freqsComplex.unsqueeze(0).unsqueeze(2);
+
+        // Multiply each complex number in the x_complex tensor by the corresponding complex number in the freqs_complex tensor
+        // Which results in the rotation of the complex number as shown in the Figure 1 of the paper
+        // (B, Seq_Len, H, Head_Dim/2) * (1, Seq_Len, 1, Head_Dim/2) = (B, Seq_Len, H, Head_Dim/2)
+        var rotatedComplex = inputComplex * freqsComplexReshaped;
+        // Console.WriteLine(rotated_complex.mean().ToSingle());
+
+        // Convert the complex number back to the real number
+        // (B, Seq_Len, H, Head_Dim/2) -> (B, Seq_Len, H, Head_Dim/2, 2)
+        var rotated = rotatedComplex.view_as_real();
+
+        // (B, Seq_Len, H, Head_Dim/2, 2) -> (B, Seq_Len, H, Head_Dim)
+        var rotatedReshaped = rotated.reshape(rotated.shape[0], rotated.shape[1], rotated.shape[2], -1);
+
+        return rotatedReshaped.type_as(input);
+    }
+
+    public static Tensor PrecomputeThetaPosFrequencies(int headDim, int seqLen, string device, float theta = 10000.0f)
+    {
+        // As written in the paragraph 3.2.2 of the paper
+        // >> In order to generalize our results in 2D to any xi ∈ Rd where **d is even**, [...]
+        if (headDim % 2 != 0)
+        {
+            throw new ArgumentException("Dimension must be divisible by 2", nameof(headDim));
+        }
+
+        // Build the theta parameter
+        // According to the formula theta_i = 10000^(-2(i-1)/dim) for i = [1, 2, ... dim/2]
+        // Shape: (Head_Dim / 2)
+        var thetaNumerator = torch.arange(0, headDim, 2).to(torch.float32).to(device);
+        // Shape: (Head_Dim / 2)
+        var thetaInput = torch.pow(theta, -1.0f * (thetaNumerator / headDim)).to(device); // (Dim / 2)
+        // Construct the positions (the "m" parameter)
+        // Shape: (Seq_Len)
+        var m = torch.arange(seqLen, device: device);
+        // Multiply each theta by each position using the outer product.
+        // Shape: (Seq_Len) outer_product* (Head_Dim / 2) -> (Seq_Len, Head_Dim / 2)
+        var thetaPositionFrequencies = torch.outer(m, thetaInput).to(torch.float32).to(device);
+
+        // We can compute complex numbers in the polar form c = R * exp(m * theta), where R = 1 as follows:
+        // (Seq_Len, Head_Dim / 2) -> (Seq_Len, Head_Dim / 2)
+        var freqsComplex = torch.polar(torch.ones_like(thetaPositionFrequencies), thetaPositionFrequencies);
+
+        return freqsComplex;
+    }
+
+    // python
+    // def rotate_half(x):
+    // """Rotates half the hidden dims of the input."""
+    // x1 = x[..., : x.shape[-1] // 2]
+    // x2 = x[..., x.shape[-1] // 2 :]
+    // return torch.cat((-x2, x1), dim=-1)
+    public static Tensor RotateHalf(Tensor x)
+    {
+        var x1 = x[.., .., .., ..(int)(x.shape[^1] / 2)];
+        var x2 = x[.., .., .., (int)(x.shape[^1] / 2)..];
+        // NOTE: four index ranges are used above, so unlike the Python version (ellipsis) x is expected to be 4-D
+        return torch.cat([-x2, x1], dim: -1);
+    }
+
+    public static (Tensor, Tensor) ApplyRotaryPosEmb(Tensor q, Tensor k, Tensor cos, Tensor sin, Tensor? positionIds = null, int unsqueezeDim = 1)
+    {
+        // The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+        // sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+        // that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+        // k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+        // cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+        // the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+
+        if (positionIds is not null)
+        {
+            cos = cos[positionIds!].unsqueeze(unsqueezeDim);
+            sin = sin[positionIds!].unsqueeze(unsqueezeDim);
+        }
+        else
+        {
+            cos = cos.unsqueeze(unsqueezeDim);
+            sin = sin.unsqueeze(unsqueezeDim);
+        }
+
+        var qEmbed = q * cos;
+        qEmbed += RotateHalf(q) * sin;
+
+        var kEmbed = k * cos;
+        kEmbed += RotateHalf(k) * sin;
+        // var kEmbed = (k * cos) + (RotateHalf(k) * sin);
+        return (qEmbed, kEmbed);
+    }
+
+    public static Module<Tensor, Tensor> GetActivation(string actFn)
+    {
+        return actFn switch // maps an activation name (exact, case-sensitive match) to a new module instance
+        {
+            "silu" => nn.SiLU(),
+            "relu" => nn.ReLU(),
+            "gelu" => nn.GELU(),
+            "tanh" => nn.Tanh(),
+            "swish" => nn.SiLU(), // "swish" is accepted as an alias of "silu"
+            _ => throw new ArgumentException($"Invalid activation function: {actFn}", nameof(actFn)),
+        };
+    }
+
+
+    public static Tensor Phi2RepeatKV(Tensor x, int nRep)
+    {
+        var batchSize = x.shape[0];
+        var seqLen = x.shape[1];
+        var nKVHeads = x.shape[2];
+        var headDim = x.shape[3];
+        if (nRep == 1)
+        {
+            return x;
+        }
+        // Repeat each KV head nRep times: [batch, seqLen, nKVHeads, headDim] -> [batch, seqLen, nKVHeads * nRep, headDim].
+        return x.unsqueeze(3)
+                .expand(batchSize, seqLen, nKVHeads, nRep, headDim)
+                .reshape(batchSize, seqLen, nKVHeads * nRep, headDim); // expand() is non-contiguous, so view() cannot merge the two axes
+    }
+
+    public static Tensor Phi3RepeatKV(Tensor x, int nRep)
+    {
+        var batchSize = x.shape[0];
+        var nKVHeads = x.shape[1];
+        var seqLen = x.shape[2];
+        var headDim = x.shape[3];
+        if (nRep == 1)
+        {
+            return x;
+        }
+        // Repeat each KV head nRep times: [batch, nKVHeads, seqLen, headDim] -> [batch, nKVHeads * nRep, seqLen, headDim].
+        return x.unsqueeze(2) // the repeat axis must sit right after the KV-head axis to match the expand below
+                .expand(batchSize, nKVHeads, nRep, seqLen, headDim)
+                .reshape(batchSize, nKVHeads * nRep, seqLen, headDim); // expand() is non-contiguous, so view() cannot merge the two axes
+    }
+
+}

From be7690112acc5be954820eb0f89aab37a3ef543f Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang <xiaoyuz@microsoft.com>
Date: Tue, 18 Jun 2024 09:42:32 -0700
Subject: [PATCH 02/12] fix format

---
 .../Extension/ModuleExtension.cs                   |  9 ++++++---
 .../Module/DynamicLoadingModule.cs                 |  8 ++++++--
 .../Module/IDynamicLoadModule.cs                   |  8 ++++++--
 .../Module/IQuantizeModule.cs                      |  6 +++++-
 .../Pipeline/CasualLMModelInput.cs                 |  6 +++++-
 .../Pipeline/CasualLMModelOutput.cs                |  6 +++++-
 .../Pipeline/CausalLMPipeline.cs                   |  6 +++++-
 .../Utility/AttentionMaskConverter.cs              | 14 +++++++++-----
 src/Microsoft.ML.GenAI.Core/Utility/Cache.cs       |  6 +++++-
 src/Microsoft.ML.GenAI.Core/Utils.cs               |  9 +++++++--
 10 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
index 3a4c012446..fa5849a08d 100644
--- a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
+++ b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
@@ -1,11 +1,14 @@
-﻿using Microsoft.ML.GenAI.Core;
-using Microsoft.ML.GenAI.Core.Extension;
-using Phi.Module;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
 using System.Text;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Core.Extension;
 using TorchSharp;
 using static TorchSharp.torch;
 
diff --git a/src/Microsoft.ML.GenAI.Core/Module/DynamicLoadingModule.cs b/src/Microsoft.ML.GenAI.Core/Module/DynamicLoadingModule.cs
index 42f3442f95..49b8b46477 100644
--- a/src/Microsoft.ML.GenAI.Core/Module/DynamicLoadingModule.cs
+++ b/src/Microsoft.ML.GenAI.Core/Module/DynamicLoadingModule.cs
@@ -1,4 +1,8 @@
-﻿using System;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
@@ -6,7 +10,7 @@
 using TorchSharp;
 using static TorchSharp.torch;
 
-namespace Phi.Module;
+namespace Microsoft.ML.GenAI.Core;
 
 public class DynamicLoadingModule<T, T1, TResult> : torch.nn.Module<T1, TResult>, IDynamicLoadModule
     where T : nn.Module<T1, TResult>
diff --git a/src/Microsoft.ML.GenAI.Core/Module/IDynamicLoadModule.cs b/src/Microsoft.ML.GenAI.Core/Module/IDynamicLoadModule.cs
index 25ba6cbc95..d215d68bb3 100644
--- a/src/Microsoft.ML.GenAI.Core/Module/IDynamicLoadModule.cs
+++ b/src/Microsoft.ML.GenAI.Core/Module/IDynamicLoadModule.cs
@@ -1,11 +1,15 @@
-﻿using System;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
 using static TorchSharp.torch;
 
-namespace Phi.Module;
+namespace Microsoft.ML.GenAI.Core;
 
 public interface IDynamicLoadModule
 {
diff --git a/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs b/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs
index 2cb065c7f7..164936f3d7 100644
--- a/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs
+++ b/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs
@@ -1,4 +1,8 @@
-﻿namespace Phi.Module;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.GenAI.Core;
 
 public interface IQuantizeModule
 {
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs
index ef16b3078b..31b7530b88 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs
@@ -1,4 +1,8 @@
-﻿using static TorchSharp.torch;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using static TorchSharp.torch;
 
 namespace Microsoft.ML.GenAI.Core;
 
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs
index 1ac56f9e31..10dde68852 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs
@@ -1,4 +1,8 @@
-﻿using static TorchSharp.torch;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using static TorchSharp.torch;
 
 namespace Microsoft.ML.GenAI.Core;
 
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
index 3d8309ac4b..fa61f7b43a 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
@@ -1,4 +1,8 @@
-﻿using System;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
diff --git a/src/Microsoft.ML.GenAI.Core/Utility/AttentionMaskConverter.cs b/src/Microsoft.ML.GenAI.Core/Utility/AttentionMaskConverter.cs
index abd15c6c6f..b292c3d731 100644
--- a/src/Microsoft.ML.GenAI.Core/Utility/AttentionMaskConverter.cs
+++ b/src/Microsoft.ML.GenAI.Core/Utility/AttentionMaskConverter.cs
@@ -1,9 +1,13 @@
-﻿using static TorchSharp.torch.nn;
-using static TorchSharp.torch;
-using TorchSharp.Modules;
-using TorchSharp;
-using System.Threading.Tasks;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
 using System;
+using System.Threading.Tasks;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
 
 namespace Microsoft.ML.GenAI.Core;
 
diff --git a/src/Microsoft.ML.GenAI.Core/Utility/Cache.cs b/src/Microsoft.ML.GenAI.Core/Utility/Cache.cs
index 026c2cdff3..4647cfd122 100644
--- a/src/Microsoft.ML.GenAI.Core/Utility/Cache.cs
+++ b/src/Microsoft.ML.GenAI.Core/Utility/Cache.cs
@@ -1,4 +1,8 @@
-﻿using System;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
diff --git a/src/Microsoft.ML.GenAI.Core/Utils.cs b/src/Microsoft.ML.GenAI.Core/Utils.cs
index 84f0062951..2f46e7d43d 100644
--- a/src/Microsoft.ML.GenAI.Core/Utils.cs
+++ b/src/Microsoft.ML.GenAI.Core/Utils.cs
@@ -1,13 +1,18 @@
-﻿using System;
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
-using TorchSharp.Modules;
 using TorchSharp;
+using TorchSharp.Modules;
 using static TorchSharp.torch;
 using static TorchSharp.torch.nn;
 
+namespace Microsoft.ML.GenAI.Core;
 public static class Utils
 {
     public static Tensor ApplyRotaryEmbeddings(Tensor input, Tensor freqsComplex)

From f115479b384a2558b95e39712531dbaaea93afe9 Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang <xiaoyuz@microsoft.com>
Date: Tue, 18 Jun 2024 09:47:37 -0700
Subject: [PATCH 03/12] move causalml generate method to
 causalmlpipelineextension

---
 .../Extension/CausalMLPipelineExtension.cs    | 51 +++++++++++++++++++
 .../Extension/ModuleExtension.cs              | 35 -------------
 2 files changed, 51 insertions(+), 35 deletions(-)
 create mode 100644 src/Microsoft.ML.GenAI.Core/Extension/CausalMLPipelineExtension.cs

diff --git a/src/Microsoft.ML.GenAI.Core/Extension/CausalMLPipelineExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/CausalMLPipelineExtension.cs
new file mode 100644
index 0000000000..4bb2f64a66
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Extension/CausalMLPipelineExtension.cs
@@ -0,0 +1,51 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using static TorchSharp.torch;
+using TorchSharp;
+
+namespace Microsoft.ML.GenAI.Core.Extension;
+
+public static class CausalMLPipelineExtension
+{
+
+    public static string? Generate(
+        this CausalLMPipeline pipeline,
+        string prompt,
+        int maxLen = 128,
+        float temperature = 0.7f,
+        float topP = 0.9f,
+        string[]? stopSequences = null,
+        int eosId = 0,
+        string device = "cpu",
+        bool bos = true,
+        bool eos = false,
+        bool echo = false)
+    {
+        using var newScope = NewDisposeScope();
+        var inputIds = pipeline.Tokenizer.EncodeToIds(prompt);
+        var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: device).unsqueeze(0);
+        var attentionMask = torch.ones_like(inputTensor);
+
+        // set up stop token ids
+        // stop token ids: [[eosId], [stopSequence1], [stopSequence2], ...]
+        // generation stops once the generated tokens end with any one of these sequences (a whole sequence, not a single token of it)
+        List<int[]> stopTokenIds = [[eosId]];
+        if (stopSequences != null)
+        {
+            stopTokenIds.AddRange(stopSequences.Select(x => pipeline.Tokenizer.EncodeToIds(x).ToArray()));
+        }
+
+        (var token, var _) = pipeline.Generate(inputTensor, attentionMask, temperature: temperature, maxLen: maxLen, topP: topP, stopTokenSequence: stopTokenIds.ToArray(), echo: echo);
+
+        var tokenIds = token[0].to_type(ScalarType.Int32).data<int>().ToArray();
+
+        return pipeline.Tokenizer.Decode(tokenIds);
+    }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
index fa5849a08d..e7b253d325 100644
--- a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
+++ b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
@@ -160,41 +160,6 @@ public static Dictionary<string, string> InferDeviceMapForEachLayer(
         return deviceMap;
     }
 
-    public static string? Generate(
-        this CausalLMPipeline pipeline,
-        string prompt,
-        int maxLen = 128,
-        float temperature = 0.7f,
-        float topP = 0.9f,
-        string[]? stopSequences = null,
-        int eosId = 0,
-        string device = "cpu",
-        bool bos = true,
-        bool eos = false,
-        bool echo = false)
-    {
-        using var newScope = NewDisposeScope();
-        var inputIds = pipeline.Tokenizer.EncodeToIds(prompt);
-        var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: device).unsqueeze(0);
-        var attentionMask = torch.ones_like(inputTensor);
-
-        // set up stop token ids
-        // stop token ids: [[eosId], [stopSequence1], [stopSequence2], ...]
-        // when causal language model generates tokens, it will stop when it generates any token in stopSequences
-        List<int[]> stopTokenIds = [[eosId]];
-        if (stopSequences != null)
-        {
-            stopTokenIds.AddRange(stopSequences.Select(x => pipeline.Tokenizer.EncodeToIds(x).ToArray()));
-        }
-
-        (var token, var _) = pipeline.Generate(inputTensor, attentionMask, temperature: temperature, maxLen: maxLen, topP: topP, stopTokenSequence: stopTokenIds.ToArray(), echo: echo);
-
-        var tokenIds = token[0].to_type(ScalarType.Int32).data<int>().ToArray();
-
-        return pipeline.Tokenizer.Decode(tokenIds);
-    }
-
-
     public static string Peek(this nn.Module model)
     {
         var sb = new StringBuilder();

From 487b7d4bb1d75229173d1259f2fda98c09a2ed89 Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang <xiaoyuz@microsoft.com>
Date: Wed, 19 Jun 2024 09:01:09 -0700
Subject: [PATCH 04/12] fix build error from api breakchange in torchsharp

---
 src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs        | 2 +-
 .../NasBert/Optimizers/BaseOptimizer.cs                       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs b/src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs
index f8f8167cce..8568239fc5 100644
--- a/src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs
+++ b/src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs
@@ -35,7 +35,7 @@ public class ConvModule : Module<Tensor, Tensor>
         public ConvModule(int inChannel, int outChannel, int kernelSize, int stride = 1, int padding = 0, int dilation = 1, bool bias = true, bool useRelu = true)
             : base(nameof(ConvModule))
         {
-            this.conv = nn.Conv2d(inputChannel: inChannel, outputChannel: outChannel, kernelSize: kernelSize, stride: stride, padding: padding, dilation: dilation, bias: bias);
+            this.conv = nn.Conv2d(in_channels: inChannel, out_channels: outChannel, kernelSize: kernelSize, stride: stride, padding: padding, dilation: dilation, bias: bias);
             this.useRelu = useRelu;
             if (this.useRelu)
             {
diff --git a/src/Microsoft.ML.TorchSharp/NasBert/Optimizers/BaseOptimizer.cs b/src/Microsoft.ML.TorchSharp/NasBert/Optimizers/BaseOptimizer.cs
index ff94553b93..b7c0595094 100644
--- a/src/Microsoft.ML.TorchSharp/NasBert/Optimizers/BaseOptimizer.cs
+++ b/src/Microsoft.ML.TorchSharp/NasBert/Optimizers/BaseOptimizer.cs
@@ -63,7 +63,7 @@ public void Step()
         public double GetGradNorm()
         {
             return Math.Sqrt(Parameters
-                .Select(p => p.grad())
+                .Select(p => p.grad)
                 .Where(grad => grad.IsNotNull())      // parameters unused have no gradient
                 .Select(grad => grad.square().sum().ToDouble())
                 .Sum());
@@ -82,7 +82,7 @@ public void MultiplyGrads(double c)
         {
             foreach (var p in Parameters)
             {
-                using var grad = p.grad();
+                using var grad = p.grad;
                 if (grad.IsNotNull())
                 {
                     grad.mul_(c);

From 4607227ab1802e5df97777dbbff1f51cf9d6c4b5 Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang <xiaoyuz@microsoft.com>
Date: Wed, 19 Jun 2024 10:21:52 -0700
Subject: [PATCH 05/12] update package reference

---
 .../Microsoft.ML.TorchSharp.Tests.csproj                    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj b/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
index 4d7de3e37e..0091475ff4 100644
--- a/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
+++ b/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
@@ -25,10 +25,10 @@
   </ItemGroup>
 
   <ItemGroup Condition="'$(TargetArchitecture)' == 'x64'">
-    <PackageReference Include="libtorch-cpu-win-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Windows')) AND '$(TargetArchitecture)' == 'x64'" />
-    <!-- <PackageReference Include="TorchSharp-cuda-windows" Version="$(TorchSharpVersion)" Condition="$([MSBuild]::IsOSPlatform('Windows'))" />    -->
+    <PackageReference include="TorchSharp-cpu" Version="$(TorchSharpVersion)" />
+    <!-- <PackageReference Include="libtorch-cpu-win-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Windows')) AND '$(TargetArchitecture)' == 'x64'" />
     <PackageReference Include="libtorch-cpu-linux-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Linux')) AND '$(TargetArchitecture)' == 'x64'" />
-    <PackageReference Include="libtorch-cpu-osx-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('OSX')) AND '$(TargetArchitecture)' == 'x64'" />
+    <PackageReference Include="libtorch-cpu-osx-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('OSX')) AND '$(TargetArchitecture)' == 'x64'" /> -->
 
     <PackageReference Include="MathNet.Numerics.Signed" Version="5.0.0" />
   </ItemGroup>

From 2ab68b8023cbb8410f7bcdb7db466b1b25dd24a0 Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang <xiaoyuz@microsoft.com>
Date: Wed, 19 Jun 2024 10:24:12 -0700
Subject: [PATCH 06/12] fix build error

---
 .../Microsoft.ML.TorchSharp.Tests.csproj                    | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj b/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
index 0091475ff4..fe4e783706 100644
--- a/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
+++ b/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
@@ -25,11 +25,7 @@
   </ItemGroup>
 
   <ItemGroup Condition="'$(TargetArchitecture)' == 'x64'">
-    <PackageReference include="TorchSharp-cpu" Version="$(TorchSharpVersion)" />
-    <!-- <PackageReference Include="libtorch-cpu-win-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Windows')) AND '$(TargetArchitecture)' == 'x64'" />
-    <PackageReference Include="libtorch-cpu-linux-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Linux')) AND '$(TargetArchitecture)' == 'x64'" />
-    <PackageReference Include="libtorch-cpu-osx-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('OSX')) AND '$(TargetArchitecture)' == 'x64'" /> -->
-
+    <PackageReference Include="TorchSharp-cpu" Version="$(TorchSharpVersion)" />
     <PackageReference Include="MathNet.Numerics.Signed" Version="5.0.0" />
   </ItemGroup>
 

From a390717dc4c94cd9566aa24890a044ac51a93291 Mon Sep 17 00:00:00 2001
From: Xiaoyun Zhang <bigmiao.zhang@gmail.com>
Date: Fri, 21 Jun 2024 12:15:02 -0700
Subject: [PATCH 07/12] Update job-template.yml

---
 build/ci/job-template.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/ci/job-template.yml b/build/ci/job-template.yml
index 4c033c30cd..d437a2960f 100644
--- a/build/ci/job-template.yml
+++ b/build/ci/job-template.yml
@@ -121,7 +121,7 @@ jobs:
     - ${{ if eq(parameters.nightlyBuild, 'false') }}:
       - ${{ if eq(parameters.innerLoop, 'false') }}:
         - ${{ if and(eq(parameters.runSpecific, 'false'), eq(parameters.useVSTestTask, 'false')) }}:
-          - script: set PATH=%PATH%;%USERPROFILE%\.nuget\packages\libtorch-cpu-win-x64\2.1.0.1\runtimes\win-x64\native;%USERPROFILE%\.nuget\packages\torchsharp\0.101.5\runtimes\win-x64\native & ${{ parameters.buildScript }} /p:Build=false -configuration $(_configuration) /p:TargetArchitecture=${{ parameters.architecture }} /p:TestArchitectures=${{ parameters.architecture }} -test -integrationTest /p:Coverage=${{ parameters.codeCoverage }} $(testTargetFramework)
+          - script: set PATH=%PATH%;%USERPROFILE%\.nuget\packages\libtorch-cpu-win-x64\2.2.1.1\runtimes\win-x64\native;%USERPROFILE%\.nuget\packages\torchsharp\0.102.5\runtimes\win-x64\native & ${{ parameters.buildScript }} /p:Build=false -configuration $(_configuration) /p:TargetArchitecture=${{ parameters.architecture }} /p:TestArchitectures=${{ parameters.architecture }} -test -integrationTest /p:Coverage=${{ parameters.codeCoverage }} $(testTargetFramework)
             displayName: Run All Tests.
         - ${{ if and(eq(parameters.runSpecific, 'true'), eq(parameters.useVSTestTask, 'false')) }}:
           - script: ${{ parameters.buildScript }} /p:Build=false -configuration $(_configuration) /p:TargetArchitecture=${{ parameters.architecture }} /p:TestArchitectures=${{ parameters.architecture }} -test -integrationTest /p:TestRunnerAdditionalArguments='-trait$(spaceValue)Category=RunSpecificTest' /p:Coverage=${{ parameters.codeCoverage }} $(testTargetFramework)

From 656066965777343d772e0bc22a467c9349babe17 Mon Sep 17 00:00:00 2001
From: Xiaoyun Zhang <bigmiao.zhang@gmail.com>
Date: Fri, 21 Jun 2024 12:42:59 -0700
Subject: [PATCH 08/12] Update Microsoft.ML.TorchSharp.Tests.csproj

---
 .../Microsoft.ML.TorchSharp.Tests.csproj                     | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj b/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
index fe4e783706..4824d46ea8 100644
--- a/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
+++ b/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
@@ -25,7 +25,10 @@
   </ItemGroup>
 
   <ItemGroup Condition="'$(TargetArchitecture)' == 'x64'">
-    <PackageReference Include="TorchSharp-cpu" Version="$(TorchSharpVersion)" />
+    <PackageReference Include="libtorch-cpu-win-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Windows')) AND '$(TargetArchitecture)' == 'x64'" />
+    <!-- <PackageReference Include="TorchSharp-cuda-windows" Version="$(TorchSharpVersion)" Condition="$([MSBuild]::IsOSPlatform('Windows'))" />    -->
+    <PackageReference Include="libtorch-cpu-linux-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Linux')) AND '$(TargetArchitecture)' == 'x64'" />
+    <PackageReference Include="libtorch-cpu-osx-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('OSX')) AND '$(TargetArchitecture)' == 'x64'" />
     <PackageReference Include="MathNet.Numerics.Signed" Version="5.0.0" />
   </ItemGroup>
 

From 9fcfec7fa306391157000b6d56ab82c6d04834ef Mon Sep 17 00:00:00 2001
From: Xiaoyun Zhang <bigmiao.zhang@gmail.com>
Date: Fri, 21 Jun 2024 12:43:45 -0700
Subject: [PATCH 09/12] Update Microsoft.ML.TorchSharp.Tests.csproj

---
 .../Microsoft.ML.TorchSharp.Tests.csproj                         | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj b/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
index 4824d46ea8..4d7de3e37e 100644
--- a/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
+++ b/test/Microsoft.ML.TorchSharp.Tests/Microsoft.ML.TorchSharp.Tests.csproj
@@ -29,6 +29,7 @@
     <!-- <PackageReference Include="TorchSharp-cuda-windows" Version="$(TorchSharpVersion)" Condition="$([MSBuild]::IsOSPlatform('Windows'))" />    -->
     <PackageReference Include="libtorch-cpu-linux-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Linux')) AND '$(TargetArchitecture)' == 'x64'" />
     <PackageReference Include="libtorch-cpu-osx-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('OSX')) AND '$(TargetArchitecture)' == 'x64'" />
+
     <PackageReference Include="MathNet.Numerics.Signed" Version="5.0.0" />
   </ItemGroup>
 

From a86bb7efe8cbb93751381a7dbb583fa2a6046466 Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang <xiaoyuz@microsoft.com>
Date: Mon, 24 Jun 2024 09:18:54 -0700
Subject: [PATCH 10/12] revert the TorchSharp runtime version update

---
 eng/Versions.props                                            | 4 ++--
 src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs        | 2 +-
 .../NasBert/Optimizers/BaseOptimizer.cs                       | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/eng/Versions.props b/eng/Versions.props
index 95200a2599..b1d4979662 100644
--- a/eng/Versions.props
+++ b/eng/Versions.props
@@ -63,8 +63,8 @@
     <TensorflowDotNETVersion>0.20.1</TensorflowDotNETVersion>
     <TensorFlowMajorVersion>2</TensorFlowMajorVersion>
     <TensorFlowVersion>2.3.1</TensorFlowVersion>
-    <TorchSharpVersion>0.102.5</TorchSharpVersion>
-    <LibTorchVersion>2.2.1.1</LibTorchVersion>
+    <TorchSharpVersion>0.101.5</TorchSharpVersion>
+    <LibTorchVersion>2.1.0.1</LibTorchVersion>
     <!-- Build/infrastructure Dependencies -->
     <CodecovVersion>1.12.4</CodecovVersion>
     <CoverletCollectorVersion>3.1.2</CoverletCollectorVersion>
diff --git a/src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs b/src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs
index 8568239fc5..f8f8167cce 100644
--- a/src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs
+++ b/src/Microsoft.ML.TorchSharp/AutoFormerV2/ConvModule.cs
@@ -35,7 +35,7 @@ public class ConvModule : Module<Tensor, Tensor>
         public ConvModule(int inChannel, int outChannel, int kernelSize, int stride = 1, int padding = 0, int dilation = 1, bool bias = true, bool useRelu = true)
             : base(nameof(ConvModule))
         {
-            this.conv = nn.Conv2d(in_channels: inChannel, out_channels: outChannel, kernelSize: kernelSize, stride: stride, padding: padding, dilation: dilation, bias: bias);
+            this.conv = nn.Conv2d(inputChannel: inChannel, outputChannel: outChannel, kernelSize: kernelSize, stride: stride, padding: padding, dilation: dilation, bias: bias);
             this.useRelu = useRelu;
             if (this.useRelu)
             {
diff --git a/src/Microsoft.ML.TorchSharp/NasBert/Optimizers/BaseOptimizer.cs b/src/Microsoft.ML.TorchSharp/NasBert/Optimizers/BaseOptimizer.cs
index b7c0595094..ff94553b93 100644
--- a/src/Microsoft.ML.TorchSharp/NasBert/Optimizers/BaseOptimizer.cs
+++ b/src/Microsoft.ML.TorchSharp/NasBert/Optimizers/BaseOptimizer.cs
@@ -63,7 +63,7 @@ public void Step()
         public double GetGradNorm()
         {
             return Math.Sqrt(Parameters
-                .Select(p => p.grad)
+                .Select(p => p.grad())
                 .Where(grad => grad.IsNotNull())      // parameters unused have no gradient
                 .Select(grad => grad.square().sum().ToDouble())
                 .Sum());
@@ -82,7 +82,7 @@ public void MultiplyGrads(double c)
         {
             foreach (var p in Parameters)
             {
-                using var grad = p.grad;
+                using var grad = p.grad();
                 if (grad.IsNotNull())
                 {
                     grad.mul_(c);

From d478444ebb8e20cd114d68087c0df54f309b2fd6 Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang <xiaoyuz@microsoft.com>
Date: Mon, 24 Jun 2024 09:34:45 -0700
Subject: [PATCH 11/12] use explicit torchsharp version

---
 .../Microsoft.ML.GenAI.Core.csproj                        | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
index 3346b40caa..6e1ffed0c9 100644
--- a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
+++ b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
@@ -9,10 +9,10 @@
 
   <ItemGroup>
     <PackageReference Include="System.Memory" Version="$(SystemMemoryVersion)" />
-    <PackageReference Include="TorchSharp" Version="$(TorchSharpVersion)" />
-    <PackageReference Include="libtorch-cpu-win-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Windows'))" PrivateAssets="all" />
-    <PackageReference Include="libtorch-cpu-linux-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('Linux'))" PrivateAssets="all" />
-    <PackageReference Include="libtorch-cpu-osx-x64" Version="$(LibTorchVersion)" Condition="$([MSBuild]::IsOSPlatform('OSX'))" PrivateAssets="all" />
+    <PackageReference Include="TorchSharp" Version="0.102.5" />
+    <PackageReference Include="libtorch-cpu-win-x64" Version="2.2.1.1" Condition="$([MSBuild]::IsOSPlatform('Windows'))" PrivateAssets="all" />
+    <PackageReference Include="libtorch-cpu-linux-x64" Version="2.2.1.1" Condition="$([MSBuild]::IsOSPlatform('Linux'))" PrivateAssets="all" />
+    <PackageReference Include="libtorch-cpu-osx-x64" Version="2.2.1.1" Condition="$([MSBuild]::IsOSPlatform('OSX'))" PrivateAssets="all" />
   </ItemGroup>
 
   <ItemGroup>

From e39c8fc07f45c7b8d59a015bf8531211240a1c9b Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang <xiaoyuz@microsoft.com>
Date: Tue, 25 Jun 2024 16:04:15 -0700
Subject: [PATCH 12/12] fix comments

---
 build/ci/job-template.yml                                      | 2 +-
 ...usalMLPipelineExtension.cs => CausalLMPipelineExtension.cs} | 3 +--
 src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs       | 1 -
 3 files changed, 2 insertions(+), 4 deletions(-)
 rename src/Microsoft.ML.GenAI.Core/Extension/{CausalMLPipelineExtension.cs => CausalLMPipelineExtension.cs} (97%)

diff --git a/build/ci/job-template.yml b/build/ci/job-template.yml
index d437a2960f..4c033c30cd 100644
--- a/build/ci/job-template.yml
+++ b/build/ci/job-template.yml
@@ -121,7 +121,7 @@ jobs:
     - ${{ if eq(parameters.nightlyBuild, 'false') }}:
       - ${{ if eq(parameters.innerLoop, 'false') }}:
         - ${{ if and(eq(parameters.runSpecific, 'false'), eq(parameters.useVSTestTask, 'false')) }}:
-          - script: set PATH=%PATH%;%USERPROFILE%\.nuget\packages\libtorch-cpu-win-x64\2.2.1.1\runtimes\win-x64\native;%USERPROFILE%\.nuget\packages\torchsharp\0.102.5\runtimes\win-x64\native & ${{ parameters.buildScript }} /p:Build=false -configuration $(_configuration) /p:TargetArchitecture=${{ parameters.architecture }} /p:TestArchitectures=${{ parameters.architecture }} -test -integrationTest /p:Coverage=${{ parameters.codeCoverage }} $(testTargetFramework)
+          - script: set PATH=%PATH%;%USERPROFILE%\.nuget\packages\libtorch-cpu-win-x64\2.1.0.1\runtimes\win-x64\native;%USERPROFILE%\.nuget\packages\torchsharp\0.101.5\runtimes\win-x64\native & ${{ parameters.buildScript }} /p:Build=false -configuration $(_configuration) /p:TargetArchitecture=${{ parameters.architecture }} /p:TestArchitectures=${{ parameters.architecture }} -test -integrationTest /p:Coverage=${{ parameters.codeCoverage }} $(testTargetFramework)
             displayName: Run All Tests.
         - ${{ if and(eq(parameters.runSpecific, 'true'), eq(parameters.useVSTestTask, 'false')) }}:
           - script: ${{ parameters.buildScript }} /p:Build=false -configuration $(_configuration) /p:TargetArchitecture=${{ parameters.architecture }} /p:TestArchitectures=${{ parameters.architecture }} -test -integrationTest /p:TestRunnerAdditionalArguments='-trait$(spaceValue)Category=RunSpecificTest' /p:Coverage=${{ parameters.codeCoverage }} $(testTargetFramework)
diff --git a/src/Microsoft.ML.GenAI.Core/Extension/CausalMLPipelineExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/CausalLMPipelineExtension.cs
similarity index 97%
rename from src/Microsoft.ML.GenAI.Core/Extension/CausalMLPipelineExtension.cs
rename to src/Microsoft.ML.GenAI.Core/Extension/CausalLMPipelineExtension.cs
index 4bb2f64a66..3a1041ee8a 100644
--- a/src/Microsoft.ML.GenAI.Core/Extension/CausalMLPipelineExtension.cs
+++ b/src/Microsoft.ML.GenAI.Core/Extension/CausalLMPipelineExtension.cs
@@ -12,9 +12,8 @@
 
 namespace Microsoft.ML.GenAI.Core.Extension;
 
-public static class CausalMLPipelineExtension
+public static class CausalLMPipelineExtension
 {
-
     public static string? Generate(
         this CausalLMPipeline pipeline,
         string prompt,
diff --git a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
index e7b253d325..6395ffd3fd 100644
--- a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
+++ b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
@@ -89,7 +89,6 @@ public static T ToDynamicLoadingModel<T>(
             return model;
         }
 
-        //var dynamicModules = model.named_modules().Where(x => x.module is IDynamicLoadModule).Select(x => x.name).ToList();
         // for each module in the model, update device if it is IDynamicLoadModule
         foreach (var (key, value) in model.named_children())
         {