From 7b17731c06b5ab694e5c81eedfc7ef55a094329d Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Tue, 13 Aug 2024 15:59:07 -0700 Subject: [PATCH 01/24] add llama --- Microsoft.ML.sln | 24 +- .../Microsoft.ML.GenAI.Core.csproj | 9 +- .../Module/Attention.cs | 222 +++++++++++++ .../Module/GenAILinear.cs | 2 +- .../Module/NewGELUActivation.cs | 2 +- .../Module/QuantizedLinear.cs | 3 +- .../Module/RMSNorm.cs} | 8 +- .../Module/RotaryEmbedding.cs | 125 ++++++++ ...lLMModelInput.cs => CausalLMModelInput.cs} | 4 +- ...MModelOutput.cs => CausalLMModelOutput.cs} | 4 +- .../Pipeline/CausalLMPipeline.cs | 12 +- src/Microsoft.ML.GenAI.Core/Utils.cs | 2 +- src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs | 110 +++++++ .../LlamaForCausalLM.cs | 44 +++ .../Microsoft.ML.GenAI.LLaMA.csproj | 26 ++ .../Module/LlamaDecoderLayer.cs | 152 +++++++++ .../Module/LlamaMLP.cs | 62 ++++ .../Module/LlamaModel.cs | 138 +++++++++ .../Config/meta-llama-3.1-8B-Instruct.json | 38 +++ .../RopeScalingObject.cs | 7 + src/Microsoft.ML.GenAI.LLaMA/Utils.cs | 100 ++++++ .../Microsoft.ML.GenAI.Phi.csproj | 7 - .../Module/Phi2Attention.cs | 1 + src/Microsoft.ML.GenAI.Phi/Module/Phi2MLP.cs | 1 + .../Module/Phi3Attention.cs | 148 ++------- .../Module/Phi3DecoderLayer.cs | 14 +- src/Microsoft.ML.GenAI.Phi/Module/Phi3MLP.cs | 3 +- .../Module/Phi3Model.cs | 10 +- .../Module/Phi3RotaryEmbedding.cs | 81 ----- .../Module/Phi3SuScaledRotaryEmbedding.cs | 5 +- .../Phi2/Phi2ForCasualLM.cs | 6 +- .../Phi3/Phi3ForCasualLM.cs | 4 +- src/Microsoft.ML.GenAI.Phi/Utils.cs | 12 - ...1Tests.Llama_3_1_8b_ShapeTest.approved.txt | 291 ++++++++++++++++++ ...1Tests.Llama_3_1_8b_ShapeTest.received.txt | 291 ++++++++++++++++++ .../LLaMA3_1Tests.cs | 42 +++ .../Microsoft.ML.GenAI.LLaMA.Tests.csproj | 39 +++ 37 files changed, 1774 insertions(+), 275 deletions(-) create mode 100644 src/Microsoft.ML.GenAI.Core/Module/Attention.cs rename src/{Microsoft.ML.GenAI.Phi/Module/Phi3RMSNorm.cs => Microsoft.ML.GenAI.Core/Module/RMSNorm.cs} (92%) create mode 100644 src/Microsoft.ML.GenAI.Core/Module/RotaryEmbedding.cs rename src/Microsoft.ML.GenAI.Core/Pipeline/{CasualLMModelInput.cs => CausalLMModelInput.cs} (96%) rename src/Microsoft.ML.GenAI.Core/Pipeline/{CasualLMModelOutput.cs => CausalLMModelOutput.cs} (94%) create mode 100644 src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs create mode 100644 src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Module/LlamaMLP.cs create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json create mode 100644 src/Microsoft.ML.GenAI.LLaMA/RopeScalingObject.cs create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Utils.cs delete mode 100644 src/Microsoft.ML.GenAI.Phi/Module/Phi3RotaryEmbedding.cs create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.approved.txt create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.received.txt create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/Microsoft.ML.GenAI.LLaMA.Tests.csproj diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index d3985d1777..c55f5797f2 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -184,7 +184,11 @@ 
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.GenAI.Phi.Test EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.GenAI.Samples", "docs\samples\Microsoft.ML.GenAI.Samples\Microsoft.ML.GenAI.Samples.csproj", "{1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Core.Tests", "test\Microsoft.ML.GenAI.Core.Tests\Microsoft.ML.GenAI.Core.Tests.csproj", "{14AB0804-D4CE-4634-B544-5A8587620783}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.GenAI.Core.Tests", "test\Microsoft.ML.GenAI.Core.Tests\Microsoft.ML.GenAI.Core.Tests.csproj", "{14AB0804-D4CE-4634-B544-5A8587620783}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.GenAI.LLaMA", "src\Microsoft.ML.GenAI.LLaMA\Microsoft.ML.GenAI.LLaMA.csproj", "{0AA6D5CB-195F-457A-8792-4221E76E6C44}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.LLaMA.Tests", "test\Microsoft.ML.GenAI.LLaMA.Tests\Microsoft.ML.GenAI.LLaMA.Tests.csproj", "{D202353D-6FAF-4263-9A01-BDCFBC92391F}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -878,6 +882,22 @@ Global {14AB0804-D4CE-4634-B544-5A8587620783}.Release|Any CPU.Build.0 = Release|Any CPU {14AB0804-D4CE-4634-B544-5A8587620783}.Release|x64.ActiveCfg = Release|Any CPU {14AB0804-D4CE-4634-B544-5A8587620783}.Release|x64.Build.0 = Release|Any CPU + {0AA6D5CB-195F-457A-8792-4221E76E6C44}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0AA6D5CB-195F-457A-8792-4221E76E6C44}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0AA6D5CB-195F-457A-8792-4221E76E6C44}.Debug|x64.ActiveCfg = Debug|Any CPU + {0AA6D5CB-195F-457A-8792-4221E76E6C44}.Debug|x64.Build.0 = Debug|Any CPU + {0AA6D5CB-195F-457A-8792-4221E76E6C44}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0AA6D5CB-195F-457A-8792-4221E76E6C44}.Release|Any CPU.Build.0 = Release|Any CPU + {0AA6D5CB-195F-457A-8792-4221E76E6C44}.Release|x64.ActiveCfg = Release|Any CPU + {0AA6D5CB-195F-457A-8792-4221E76E6C44}.Release|x64.Build.0 = Release|Any CPU + {D202353D-6FAF-4263-9A01-BDCFBC92391F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D202353D-6FAF-4263-9A01-BDCFBC92391F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D202353D-6FAF-4263-9A01-BDCFBC92391F}.Debug|x64.ActiveCfg = Debug|Any CPU + {D202353D-6FAF-4263-9A01-BDCFBC92391F}.Debug|x64.Build.0 = Debug|Any CPU + {D202353D-6FAF-4263-9A01-BDCFBC92391F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D202353D-6FAF-4263-9A01-BDCFBC92391F}.Release|Any CPU.Build.0 = Release|Any CPU + {D202353D-6FAF-4263-9A01-BDCFBC92391F}.Release|x64.ActiveCfg = Release|Any CPU + {D202353D-6FAF-4263-9A01-BDCFBC92391F}.Release|x64.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -969,6 +989,8 @@ Global {867FFC34-DFA7-400F-B9BB-85158326CE08} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47} = {DA452A53-2E94-4433-B08C-041EDEC729E6} {14AB0804-D4CE-4634-B544-5A8587620783} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} + {0AA6D5CB-195F-457A-8792-4221E76E6C44} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} diff --git a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj index dfb64082fb..2827fa237a 
100644 --- a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj +++ b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj @@ -11,13 +11,6 @@ - @@ -25,6 +18,8 @@ + + diff --git a/src/Microsoft.ML.GenAI.Core/Module/Attention.cs b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs new file mode 100644 index 0000000000..d6938b27f9 --- /dev/null +++ b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs @@ -0,0 +1,222 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Diagnostics.Contracts; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.ML.GenAI.Core; +using TorchSharp; +using TorchSharp.Modules; +using static TorchSharp.torch; + +namespace Microsoft.ML.GenAI.Core; + +internal class AttentionInput +{ + public AttentionInput( + Tensor hiddenStates, + Tensor positionIds, + Tensor? attentionMask = null, + IKVCache? cache = null, + bool outputAttentions = false) + { + this.HiddenStates = hiddenStates; + this.AttentionMask = attentionMask; + this.PositionIds = positionIds; + this.Cache = cache; + this.OutputAttentions = outputAttentions; + } + public Tensor HiddenStates { get; set; } + + public Tensor? AttentionMask { get; set; } + + public Tensor PositionIds { get; set; } + + public IKVCache? Cache { get; set; } + + public bool OutputAttentions { get; set; } +} + +internal class AttentionOutput +{ + public AttentionOutput( + Tensor hiddenStates, + Tensor? attentions = null, + IKVCache? cache = null) + { + this.HiddenStates = hiddenStates; + this.Attentions = attentions; + this.Cache = cache; + } + + public Tensor HiddenStates { get; set; } + + public Tensor? Attentions { get; set; } + + public IKVCache? Cache { get; set; } +} + +internal class Attention : nn.Module +{ + private readonly int _layerIdx; + private readonly double _attentionDropout; + private readonly int _hiddenSize; + private readonly int _numHeads; + private readonly int _headDim; + private readonly int _numKeyValueHeads; + private readonly int _numKeyValueGroups; + private readonly int _maxPositionEmbeddings; + private readonly int _originalMaxPositionEmbeddings; +#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format + private readonly QuantizedLinear o_proj; + private readonly QuantizedLinear? qkv_proj; + private readonly QuantizedLinear? q_proj; + private readonly QuantizedLinear? k_proj; + private readonly QuantizedLinear? 
v_proj; + private readonly nn.Module rotary_emb; +#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format + + public Attention( + double attentionDropout, + int hiddenSize, + int numHeads, + int headDim, + int numKeyValueHeads, + int numKeyValueGroups, + int maxPositionEmbeddings, + int originalMaxPositionEmbeddings, + int layerIdx, + ScalarType dtype, + nn.Module rotaryEmbedding, + bool attentionBias = false, + bool useQkvProj = true) + : base(nameof(Attention)) + { + this._layerIdx = layerIdx; + this._attentionDropout = attentionDropout; + this._hiddenSize = hiddenSize; + this._numHeads = numHeads; + this._headDim = headDim; + this._numKeyValueHeads = numKeyValueHeads; + this._numKeyValueGroups = numKeyValueGroups; + this._maxPositionEmbeddings = maxPositionEmbeddings; + this._originalMaxPositionEmbeddings = originalMaxPositionEmbeddings; + + Contract.Assert(this._hiddenSize % (this._headDim * this._numHeads) == 0, "hidden_size must be divisible by num_heads"); + + this.o_proj = new QuantizedLinear(this._hiddenSize, this._hiddenSize, hasBias: attentionBias, dtype: dtype); + if (useQkvProj) + { + var opSize = this._numHeads * this._headDim + 2 * (this._numKeyValueHeads * this._headDim); + this.qkv_proj = new QuantizedLinear(this._hiddenSize, opSize, hasBias: attentionBias, dtype: dtype); + } + else + { + this.q_proj = new QuantizedLinear(this._hiddenSize, this._numHeads * this._headDim, hasBias: attentionBias, dtype: dtype); + this.k_proj = new QuantizedLinear(this._hiddenSize, this._numKeyValueHeads * this._headDim, hasBias: attentionBias, dtype: dtype); + this.v_proj = new QuantizedLinear(this._hiddenSize, this._numKeyValueHeads * this._headDim, hasBias: attentionBias, dtype: dtype); + } + + this.rotary_emb = rotaryEmbedding; + } + +#pragma warning disable MSML_GeneralName // This name should be PascalCased + public override AttentionOutput forward(AttentionInput input) +#pragma warning restore MSML_GeneralName // This name should be PascalCased + { + using (var _ = NewDisposeScope()) + { + var hiddenStates = input.HiddenStates; + var positionIds = input.PositionIds; + var outputAttentions = input.OutputAttentions; + var bsz = hiddenStates.shape[0]; + var qLen = hiddenStates.shape[1]; + + Tensor queryStates; + Tensor keyStates; + Tensor valueStates; + + if (this.qkv_proj is not null) + { + var qkv = this.qkv_proj.forward(hiddenStates); + var queryPos = this._numHeads * this._headDim; + queryStates = qkv[.., .., ..queryPos]; + keyStates = qkv[.., .., queryPos..(queryPos + this._numKeyValueHeads * this._headDim)]; + valueStates = qkv[.., .., (queryPos + this._numKeyValueHeads * this._headDim)..]; + } + else if (this.q_proj is not null && this.k_proj is not null && this.v_proj is not null) + { + queryStates = this.q_proj.forward(hiddenStates); + keyStates = this.k_proj.forward(hiddenStates); + valueStates = this.v_proj.forward(hiddenStates); + } + else + { + throw new InvalidOperationException("Invalid state, either qkv_proj or q_proj, k_proj, v_proj should be initialized"); + } + + queryStates = queryStates.view(bsz, qLen, this._numHeads, this._headDim).transpose(1, 2); + keyStates = keyStates.view(bsz, qLen, this._numKeyValueHeads, this._headDim).transpose(1, 2); + valueStates = valueStates.view(bsz, qLen, this._numKeyValueHeads, this._headDim).transpose(1, 2); + + var kvSeqLen = keyStates.IntShape()[^2]; + var pastKeyValue = input.Cache; + if (pastKeyValue is not null) + { + kvSeqLen += pastKeyValue.GetUsableLength(kvSeqLen, this._layerIdx); + } + + 
var embOutput = this.rotary_emb.forward(new RotaryEmbeddingInput(valueStates, positionIds, kvSeqLen)); + (var cos, var sin) = (embOutput.Cos, embOutput.Sin); + + (queryStates, keyStates) = Utils.ApplyRotaryPosEmb(queryStates, keyStates, cos, sin); + + if (pastKeyValue is not null) + { + (keyStates, valueStates) = pastKeyValue.UpdateKVCache(keyStates, valueStates, this._layerIdx); + } + + // repeat k/v heads if n_kv_heads < n_heads + keyStates = Utils.RepeatKV(keyStates, this._numKeyValueGroups); + valueStates = Utils.RepeatKV(valueStates, this._numKeyValueGroups); + + var attnWeights = torch.matmul(queryStates, keyStates.transpose(2, 3)); + attnWeights = attnWeights / Math.Sqrt(this._headDim); + + // attnWeight's shape should be [bsz, this._numHeads, qLen, kvSeqLen] + Contract.Assert(attnWeights.shape.Length == 4); + Contract.Assert(attnWeights.shape[0] == bsz); + Contract.Assert(attnWeights.shape[1] == this._numHeads); + Contract.Assert(attnWeights.shape[2] == qLen); + Contract.Assert(attnWeights.shape[3] == kvSeqLen); + + var attentionMask = input.AttentionMask; + if (attentionMask is not null) + { + Contract.Assert(attentionMask.shape.Length == 4); + Contract.Assert(attentionMask.shape[0] == bsz); + Contract.Assert(attentionMask.shape[1] == 1); + Contract.Assert(attentionMask.shape[2] == qLen); + Contract.Assert(attentionMask.shape[3] == kvSeqLen); + attnWeights = attnWeights + attentionMask; + } + + // upscale attention to fp32 to avoid overflow + attnWeights = nn.functional.softmax(attnWeights, dim: -1, dtype: ScalarType.Float32).to(valueStates.dtype); + attnWeights = nn.functional.dropout(attnWeights, this._attentionDropout, this.training); + + var attnOutput = torch.matmul(attnWeights, valueStates); + + attnOutput = attnOutput.transpose(1, 2).contiguous(); + attnOutput = attnOutput.reshape(bsz, qLen, this._hiddenSize); + + attnOutput = this.o_proj.forward(attnOutput); + + return new(attnOutput.MoveToOuterDisposeScope(), outputAttentions ? 
attnWeights.MoveToOuterDisposeScope() : null, pastKeyValue); + } + } +} diff --git a/src/Microsoft.ML.GenAI.Core/Module/GenAILinear.cs b/src/Microsoft.ML.GenAI.Core/Module/GenAILinear.cs index 77bcadeb82..178b8fddda 100644 --- a/src/Microsoft.ML.GenAI.Core/Module/GenAILinear.cs +++ b/src/Microsoft.ML.GenAI.Core/Module/GenAILinear.cs @@ -5,7 +5,7 @@ using TorchSharp; using static TorchSharp.torch; -namespace Microsoft.ML.GenAI; +namespace Microsoft.ML.GenAI.Core; internal class GenAILinear : nn.Module { #pragma warning disable MSML_GeneralName // This name should be PascalCased diff --git a/src/Microsoft.ML.GenAI.Core/Module/NewGELUActivation.cs b/src/Microsoft.ML.GenAI.Core/Module/NewGELUActivation.cs index 4c46e53104..a1b523a4df 100644 --- a/src/Microsoft.ML.GenAI.Core/Module/NewGELUActivation.cs +++ b/src/Microsoft.ML.GenAI.Core/Module/NewGELUActivation.cs @@ -6,7 +6,7 @@ using TorchSharp; using static TorchSharp.torch; -namespace Microsoft.ML.GenAI; +namespace Microsoft.ML.GenAI.Core; #pragma warning disable MSML_GeneralName // This name should be PascalCased internal class NewGELUActivation : torch.nn.Module #pragma warning disable MSML_GeneralName // This name should be PascalCased diff --git a/src/Microsoft.ML.GenAI.Core/Module/QuantizedLinear.cs b/src/Microsoft.ML.GenAI.Core/Module/QuantizedLinear.cs index 268ac0a4a4..f399efe324 100644 --- a/src/Microsoft.ML.GenAI.Core/Module/QuantizedLinear.cs +++ b/src/Microsoft.ML.GenAI.Core/Module/QuantizedLinear.cs @@ -5,7 +5,7 @@ using Microsoft.ML.GenAI.Core; using TorchSharp; using static TorchSharp.torch; -namespace Microsoft.ML.GenAI; +namespace Microsoft.ML.GenAI.Core; internal class QuantizedLinear : GenAILinear, IQuantizeModule { @@ -74,6 +74,7 @@ public void Int8() this.register_buffer("scale", scale); } } + #pragma warning disable MSML_GeneralName // This name should be PascalCased public override Tensor forward(Tensor input) #pragma warning restore MSML_GeneralName // This name should be PascalCased diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3RMSNorm.cs b/src/Microsoft.ML.GenAI.Core/Module/RMSNorm.cs similarity index 92% rename from src/Microsoft.ML.GenAI.Phi/Module/Phi3RMSNorm.cs rename to src/Microsoft.ML.GenAI.Core/Module/RMSNorm.cs index e8c847268e..b9555cd845 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3RMSNorm.cs +++ b/src/Microsoft.ML.GenAI.Core/Module/RMSNorm.cs @@ -11,10 +11,10 @@ using TorchSharp.Modules; using static TorchSharp.torch; -namespace Microsoft.ML.GenAI.Phi.Module; +namespace Microsoft.ML.GenAI.Core; #pragma warning disable MSML_GeneralName // This name should be PascalCased -internal class Phi3RMSNorm : torch.nn.Module +internal class RMSNorm : torch.nn.Module #pragma warning restore MSML_GeneralName // This name should be PascalCased { private readonly int _dim; @@ -23,11 +23,11 @@ internal class Phi3RMSNorm : torch.nn.Module private readonly Parameter weight; #pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format - public Phi3RMSNorm( + public RMSNorm( int hiddenSize, float eps = 1e-6f, ScalarType dtype = ScalarType.Float32) - : base(nameof(Phi3RMSNorm)) + : base(nameof(RMSNorm)) { this._dim = hiddenSize; this._eps = eps; diff --git a/src/Microsoft.ML.GenAI.Core/Module/RotaryEmbedding.cs b/src/Microsoft.ML.GenAI.Core/Module/RotaryEmbedding.cs new file mode 100644 index 0000000000..8e06c838d5 --- /dev/null +++ b/src/Microsoft.ML.GenAI.Core/Module/RotaryEmbedding.cs @@ -0,0 +1,125 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Text.Json.Serialization; +using TorchSharp; +using static TorchSharp.torch; + +namespace Microsoft.ML.GenAI.Core; + +public class RopeScalingConfig +{ + public RopeScalingConfig() + { + this.Factor = 1.0f; + this.LowFreqFactor = 1.0f; + this.HighFreqFactor = 1.0f; + this.OriginalMaxPositionEmbeddings = 8192; + this.RopeType = "default"; + } + + [JsonPropertyName("factor")] + public float Factor { get; set; } + + [JsonPropertyName("low_freq_factor")] + public float LowFreqFactor { get; set; } + + [JsonPropertyName("high_freq_factor")] + public float HighFreqFactor { get; set; } + + [JsonPropertyName("original_max_position_embeddings")] + public int OriginalMaxPositionEmbeddings { get; set; } + + [JsonPropertyName("rope_type")] + public string RopeType { get; set; } +} + + +internal class RotaryEmbeddingInput +{ + public RotaryEmbeddingInput(Tensor input, Tensor positionIds, int? seqLen = null) + { + Input = input; + PositionIds = positionIds; + SeqLen = seqLen; + } + + public Tensor Input { get; set; } + + public Tensor PositionIds { get; set; } + + public int? SeqLen { get; set; } +} + +internal class RotaryEmbeddingOutput +{ + public RotaryEmbeddingOutput(Tensor cos, Tensor sin) + { + Cos = cos; + Sin = sin; + } + + public Tensor Cos { get; set; } + + public Tensor Sin { get; set; } +} + + +internal class RotaryEmbedding : nn.Module< + RotaryEmbeddingInput, + RotaryEmbeddingOutput> +{ + private readonly double _base; + private readonly int _maxPositionEmbeddings; + private readonly int _dim; + + public RotaryEmbedding(double baseValue, int maxPositionEmbeddings, int dim) + : this(baseValue, dim, new RopeScalingConfig() { RopeType = "default", OriginalMaxPositionEmbeddings = maxPositionEmbeddings }) + { + } + + public RotaryEmbedding(double baseValue, int dim, RopeScalingConfig config) + : base(nameof(RotaryEmbedding)) + { + _base = baseValue; + _maxPositionEmbeddings = config.OriginalMaxPositionEmbeddings; + _dim = dim; + + if (config.RopeType == "default") + { + var thetaNumerator = torch.arange(0, _dim, 2, dtype: ScalarType.Int64).to(torch.float32); + this.register_buffer("inv_freq", torch.pow(baseValue, -1.0f * (thetaNumerator / dim)), persistent: false); + } + else + { + throw new NotImplementedException("Rope type not implemented"); + } + } + + public int Dim => _dim; + +#pragma warning disable MSML_GeneralName // This name should be PascalCased + public override RotaryEmbeddingOutput forward(RotaryEmbeddingInput input) +#pragma warning restore MSML_GeneralName // This name should be PascalCased + { + var x = input.Input; + var positionIds = input.PositionIds; + var seqLen = input.SeqLen; + // TODO + // can be calculated once and cached + var invFreq = this.get_buffer("inv_freq").to(x.device); + var invFreqExpanded = invFreq.unsqueeze(0).unsqueeze(-1); + invFreqExpanded = invFreqExpanded.expand(new long[] { positionIds.shape[0], -1, 1 }); + var positionIdsExpanded = positionIds.unsqueeze(1).to(torch.float32); + var freqs = invFreqExpanded * positionIdsExpanded; + freqs = freqs.transpose(1, 2); + var emb = torch.cat([freqs, freqs], dim: -1); + + var cos = torch.cos(emb); + var sin = torch.sin(emb); + + return new(cos.to_type(x.dtype), sin.to_type(x.dtype)); + } +} diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMModelInput.cs similarity index 96% 
rename from src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs rename to src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMModelInput.cs index 49fcfef627..eaf94f2a80 100644 --- a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs +++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMModelInput.cs @@ -6,7 +6,7 @@ namespace Microsoft.ML.GenAI.Core; -public class CasualLMModelInput +public class CausalLMModelInput { internal static class Defaults { @@ -18,7 +18,7 @@ internal static class Defaults internal const bool OutputAttentions = false; internal const bool OutputHiddenStates = false; } - public CasualLMModelInput( + public CausalLMModelInput( Tensor inputIds, Tensor? attentionMask = Defaults.AttentionMask, Tensor? positionIds = Defaults.PositionIds, diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMModelOutput.cs similarity index 94% rename from src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs rename to src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMModelOutput.cs index afaa84e778..c10b68e60f 100644 --- a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs +++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMModelOutput.cs @@ -6,7 +6,7 @@ namespace Microsoft.ML.GenAI.Core; -public class CasualLMModelOutput +public class CausalLMModelOutput { internal static class Defaults { @@ -15,7 +15,7 @@ internal static class Defaults internal const Tensor[]? Attentions = null; internal const IKVCache? Cache = null; } - public CasualLMModelOutput( + public CausalLMModelOutput( Tensor lastHiddenState, Tensor? logits = Defaults.Logits, Tensor[]? allHiddenStates = Defaults.AllHiddenStates, diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs index 9decdd3207..7ecb64f761 100644 --- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs +++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs @@ -16,7 +16,7 @@ namespace Microsoft.ML.GenAI.Core; public interface ICausalLMPipeline : ICausalLMPipeline where TTokenizer : Tokenizer - where TModel : nn.Module + where TModel : nn.Module { TTokenizer Tokenizer { get; } @@ -58,7 +58,7 @@ IEnumerable GenerateStreaming( public class CausalLMPipeline : CausalLMPipeline, ICausalLMPipeline where TTokenizer : Tokenizer - where TModel : nn.Module + where TModel : nn.Module { public CausalLMPipeline( TTokenizer tokenizer, @@ -86,7 +86,7 @@ internal static class Defaults public CausalLMPipeline( Tokenizer tokenizer, - nn.Module model, + nn.Module model, string device = Defaults.Device) { this.Tokenizer = tokenizer; @@ -106,7 +106,7 @@ private protected CausalLMPipeline() public Tokenizer Tokenizer { get; } - public nn.Module Model { get; } + public nn.Module Model { get; } public Device Device { get; } @@ -134,7 +134,7 @@ private protected CausalLMPipeline() var cache = new DynamicKVCache(); if (promptLength == totalLen) { - var input = new CasualLMModelInput(inputIds, attentionMask, pastKeyValuesLength: 0) + var input = new CausalLMModelInput(inputIds, attentionMask, pastKeyValuesLength: 0) { OverrideCache = cache, }; @@ -143,7 +143,7 @@ private protected CausalLMPipeline() } for (var curPos = promptLength; curPos != totalLen; curPos++) { - var input = new CasualLMModelInput(inputIds[.., prevPos..curPos], attentionMask[.., prevPos..curPos], pastKeyValuesLength: prevPos) + var input = new CausalLMModelInput(inputIds[.., prevPos..curPos], attentionMask[.., prevPos..curPos], pastKeyValuesLength: 
prevPos) { OverrideCache = cache, }; diff --git a/src/Microsoft.ML.GenAI.Core/Utils.cs b/src/Microsoft.ML.GenAI.Core/Utils.cs index 2f46e7d43d..161b8d5185 100644 --- a/src/Microsoft.ML.GenAI.Core/Utils.cs +++ b/src/Microsoft.ML.GenAI.Core/Utils.cs @@ -145,7 +145,7 @@ public static Tensor Phi2RepeatKV(Tensor x, int nRep) .view(batchSize, seqLen, nKVHeads * nRep, headDim); } - public static Tensor Phi3RepeatKV(Tensor x, int nRep) + public static Tensor RepeatKV(Tensor x, int nRep) { var batchSize = x.shape[0]; var nKVHeads = x.shape[1]; diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs new file mode 100644 index 0000000000..b10c6c02f5 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs @@ -0,0 +1,110 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Text.Json.Serialization; +using System.Threading.Tasks; +using Microsoft.ML.GenAI.Core; +using TorchSharp; + +namespace Microsoft.ML.GenAI.LLaMA; + +public class LlamaConfig +{ + public LlamaConfig() + { + this.AttentionBias = false; + this.AttentionDropout = 0.0; + this.HiddenAct = "silu"; + this.HiddenSize = 4096; + this.InitializerRange = 0.02; + this.IntermediateSize = 14336; + this.MaxPositionEmbeddings = 131072; + this.MlpBias = false; + this.NumAttentionHeads = 32; + this.NumHiddenLayers = 32; + this.NumKeyValueHeads = 8; + this.PretrainingTp = 1; + this.RmsNormEps = 1e-05f; + this.RopeScaling = new RopeScalingConfig(); + this.RopeTheta = 500000.0; + this.TieWordEmbeddings = false; + this.VocabSize = 128256; + this.AttnImplementation = "eager"; + this.DType = torch.ScalarType.BFloat16; + } + + static LlamaConfig() + { +#pragma warning disable MSML_ParameterLocalVarName // Parameter or local variable name not standard + var llama3_1_8b_content = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.LLaMA.Resource.Config.meta-llama-3.1-8B-Instruct.json"); +#pragma warning restore MSML_ParameterLocalVarName // Parameter or local variable name not standard + + Llama3_1_8B_Instruct = JsonSerializer.Deserialize(llama3_1_8b_content) ?? throw new ArgumentNullException(nameof(llama3_1_8b_content)); + } + +#pragma warning disable MSML_GeneralName // This name should be PascalCased + /// + /// The llama-3.1-8B-Instruct configuration created from https://huggingface.co/meta-llama/Meta-Llama-3.1-8B. 
+ /// + public static LlamaConfig Llama3_1_8B_Instruct { get; } +#pragma warning restore MSML_GeneralName // This name should be PascalCased + + [JsonPropertyName("attention_bias")] + public bool AttentionBias { get; set; } + + [JsonPropertyName("attention_dropout")] + public double AttentionDropout { get; set; } + + [JsonPropertyName("hidden_act")] + public string HiddenAct { get; set; } + + [JsonPropertyName("hidden_size")] + public int HiddenSize { get; set; } + + [JsonPropertyName("initializer_range")] + public double InitializerRange { get; set; } + + [JsonPropertyName("intermediate_size")] + public int IntermediateSize { get; set; } + + [JsonPropertyName("max_position_embeddings")] + public int MaxPositionEmbeddings { get; set; } + + [JsonPropertyName("mlp_bias")] + public bool MlpBias { get; set; } + + [JsonPropertyName("num_attention_heads")] + public int NumAttentionHeads { get; set; } + + [JsonPropertyName("num_hidden_layers")] + public int NumHiddenLayers { get; set; } + + [JsonPropertyName("num_key_value_heads")] + public int NumKeyValueHeads { get; set; } + + [JsonPropertyName("pretraining_tp")] + public int PretrainingTp { get; set; } + + [JsonPropertyName("rms_norm_eps")] + public float RmsNormEps { get; set; } + + public RopeScalingConfig RopeScaling { get; set; } + + [JsonPropertyName("rope_theta")] + public double RopeTheta { get; set; } + + [JsonPropertyName("tie_word_embeddings")] + public bool TieWordEmbeddings { get; set; } + + [JsonPropertyName("vocab_size")] + public int VocabSize { get; set; } + public int? PadTokenId { get; set; } + public torch.ScalarType DType { get; set; } + public string AttnImplementation { get; set; } +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs new file mode 100644 index 0000000000..6b38d15ebd --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs @@ -0,0 +1,44 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.LLaMA.Module; +using static TorchSharp.torch; + +namespace Microsoft.ML.GenAI.LLaMA; + +public class LlamaForCausalLM : nn.Module +{ + private readonly LlamaConfig _config; + private readonly int _vocabSize; + +#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format + private readonly GenAILinear lm_head; + private readonly LlamaModel model; +#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format + + public LlamaForCausalLM(LlamaConfig config) + : base(nameof(LlamaForCausalLM)) + { + _config = config; + _vocabSize = config.VocabSize; + + model = new LlamaModel(config); + lm_head = new GenAILinear(config.HiddenSize, config.VocabSize, hasBias: false); + + this.RegisterComponents(); + } + +#pragma warning disable MSML_GeneralName // This name should be PascalCased + public override CausalLMModelOutput forward(CausalLMModelInput input) +#pragma warning restore MSML_GeneralName // This name should be PascalCased + { + var outputs = this.model.forward(input); + var logits = this.lm_head.forward(outputs.LastHiddenState); + logits = logits.to_type(ScalarType.Float32); + outputs.Logits = logits; + + return outputs; + } +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj b/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj new file mode 100644 index 0000000000..a9b21b5737 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj @@ -0,0 +1,26 @@ + + + + net6.0;net8.0 + enable + enable + + + + + + + + + + + + + + + + + + + + diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs new file mode 100644 index 0000000000..57f141978a --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs @@ -0,0 +1,152 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.ML.GenAI.Core; +using static TorchSharp.torch; + +namespace Microsoft.ML.GenAI.LLaMA.Module; + + +internal class DecoderLayerInput +{ + public DecoderLayerInput( + Tensor hiddenStates, + Tensor attentionMask, + Tensor positionIds, + IKVCache? pastKeyValue = null, + (Tensor, Tensor)? positionEmbeddings = null, // cos, sin + bool outputAttentions = false) + { + this.HiddenStates = hiddenStates; + this.AttentionMask = attentionMask; + this.PositionIds = positionIds; + this.PastKeyValue = pastKeyValue; + this.OutputAttentions = outputAttentions; + } + + public Tensor HiddenStates { get; set; } + + public Tensor AttentionMask { get; set; } + + public Tensor PositionIds { get; set; } + + public (Tensor, Tensor) PositionalEmbeddings { get; set; } + + public IKVCache? PastKeyValue { get; set; } + + public bool OutputAttentions { get; set; } +} + +internal class DecoderLayerOutput +{ + public DecoderLayerOutput( + Tensor hiddenStates, + Tensor? attentions = null, + IKVCache? pastKeyValue = null) + { + this.HiddenStates = hiddenStates; + this.Attentions = attentions; + this.PastKeyValue = pastKeyValue; + } + + public Tensor HiddenStates { get; set; } + + public Tensor? Attentions { get; set; } + + public IKVCache? 
PastKeyValue { get; set; } +} +internal class LlamaDecoderLayer : nn.Module, IDynamicLoadModule +{ + private readonly LlamaConfig _llamaConfig; + private readonly int _layerIndex; + private readonly int _hiddenSize; + +#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format + private readonly LlamaMLP mlp; + private readonly Core.RMSNorm input_layernorm; + private readonly Core.RMSNorm post_attention_layernorm; + private readonly Attention self_attn; + + public Action? LoadToDeviceFunc { get; set; } + public Action? UnloadFromDeviceFunc { get; set; } + +#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format + + public LlamaDecoderLayer(LlamaConfig config, int layerIndex) + : base(nameof(LlamaDecoderLayer)) + { + _llamaConfig = config; + _layerIndex = layerIndex; + _hiddenSize = config.HiddenSize; + + this.self_attn = CreateAttention(config, layerIndex); + this.mlp = new LlamaMLP(config); + this.input_layernorm = new Core.RMSNorm(this._hiddenSize, eps: config.RmsNormEps); + this.post_attention_layernorm = new Core.RMSNorm(this._hiddenSize, eps: config.RmsNormEps); + } + + private Attention CreateAttention(LlamaConfig config, int layerIndex) + { + var headDim = config.HiddenSize / config.NumAttentionHeads; + return new Attention( + attentionDropout: config.AttentionDropout, + hiddenSize: config.HiddenSize, + numHeads: config.NumAttentionHeads, + headDim: headDim, + numKeyValueHeads: config.NumKeyValueHeads, + numKeyValueGroups: config.NumAttentionHeads / config.NumKeyValueHeads, + maxPositionEmbeddings: config.MaxPositionEmbeddings, + originalMaxPositionEmbeddings: config.MaxPositionEmbeddings, + layerIdx: layerIndex, + useQkvProj: false, + dtype: config.DType, + attentionBias: config.AttentionBias, + rotaryEmbedding: config.RopeScaling switch + { + null => new RotaryEmbedding(config.RopeTheta, config.MaxPositionEmbeddings, headDim), + _ => new RotaryEmbedding(config.RopeTheta, headDim, config.RopeScaling), + }); + } + +#pragma warning disable MSML_GeneralName // This name should be PascalCased + public override DecoderLayerOutput forward(DecoderLayerInput input) +#pragma warning restore MSML_GeneralName // This name should be PascalCased + { + if (LoadToDeviceFunc != null) + { + LoadToDeviceFunc(this); + } + + using var disposeScope = NewDisposeScope(); + var residual = input.HiddenStates; + var hiddenStates = this.input_layernorm.forward(input.HiddenStates); + + var selfAttnInput = new AttentionInput( + hiddenStates: hiddenStates, + attentionMask: input.AttentionMask, + positionIds: input.PositionIds, + cache: input.PastKeyValue, + outputAttentions: input.OutputAttentions); + + var selfAttnOutput = this.self_attn.forward(selfAttnInput); + + hiddenStates = residual + selfAttnOutput.HiddenStates; + + // Fully connected + residual = hiddenStates; + hiddenStates = this.post_attention_layernorm.forward(hiddenStates); + hiddenStates = this.mlp.forward(hiddenStates); + hiddenStates = residual + hiddenStates; + + return new DecoderLayerOutput( + hiddenStates: hiddenStates, + attentions: input.OutputAttentions ? selfAttnOutput.Attentions : null, + pastKeyValue: selfAttnOutput.Cache); + } +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaMLP.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaMLP.cs new file mode 100644 index 0000000000..09052b5602 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaMLP.cs @@ -0,0 +1,62 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.LLaMA;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.LLaMA.Module;
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+internal class LlamaMLP : torch.nn.Module<Tensor, Tensor>
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+{
+    private readonly int _pretrainingTp;
+    private readonly int _intermediateSize;
+    private readonly int _hiddenSize;
+    private readonly bool _hasBias;
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+    private readonly QuantizedLinear gate_proj;
+    private readonly QuantizedLinear up_proj;
+    private readonly QuantizedLinear down_proj;
+    private readonly torch.nn.Module<Tensor, Tensor> activation_fn;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+    public LlamaMLP(LlamaConfig config)
+        : base(nameof(LlamaMLP))
+    {
+        this._hiddenSize = config.HiddenSize;
+        this._intermediateSize = config.IntermediateSize;
+        this._hasBias = config.MlpBias;
+        this._pretrainingTp = config.PretrainingTp;
+        var hiddenAct = config.HiddenAct;
+        this.gate_proj = new QuantizedLinear(this._hiddenSize, this._intermediateSize, hasBias: this._hasBias, dtype: config.DType);
+        this.up_proj = new QuantizedLinear(this._hiddenSize, this._intermediateSize, hasBias: this._hasBias, dtype: config.DType);
+        this.down_proj = new QuantizedLinear(this._intermediateSize, this._hiddenSize, hasBias: this._hasBias, dtype: config.DType);
+        this.RegisterComponents();
+        this.activation_fn = Core.Utils.GetActivation(hiddenAct);
+    }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+    public override Tensor forward(Tensor input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+    {
+        if (this._pretrainingTp > 1)
+        {
+            throw new NotImplementedException("PretrainingTp > 1 is not supported yet.");
+        }
+
+        using var disposeScope = NewDisposeScope();
+        var input1 = this.gate_proj.forward(input);
+        input1 = this.activation_fn.forward(input1);
+        input1 = input1 * this.up_proj.forward(input); // SwiGLU: down_proj(act(gate_proj(x)) * up_proj(x))
+        return this.down_proj.forward(input1).MoveToOuterDisposeScope();
+    }
+}
diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs
new file mode 100644
index 0000000000..b15dcde532
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs
@@ -0,0 +1,138 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.ML.GenAI.Core;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.LLaMA.Module;
+
+internal class LlamaModel : nn.Module<CausalLMModelInput, CausalLMModelOutput>
+{
+    private readonly LlamaConfig _config;
+    private readonly int?
_paddingIdx; + private readonly int _vocabSize; + private IKVCache _cache; +#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format + private readonly Embedding embed_tokens; + private readonly ModuleList layers; + private readonly RMSNorm norm; +#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format + + public LlamaModel(LlamaConfig config) + : base(nameof(LlamaModel)) + { + this._config = config; + this._paddingIdx = config.PadTokenId; + this._vocabSize = config.VocabSize; + + this.embed_tokens = nn.Embedding(config.VocabSize, config.HiddenSize, padding_idx: this._paddingIdx, dtype: config.DType); + this.layers = new ModuleList(); + + for (int i = 0; i < config.NumHiddenLayers; i++) + { + this.layers.Add(new LlamaDecoderLayer(config, i)); + } + this.norm = new RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType); + this._cache = new DynamicKVCache(); + this.RegisterComponents(); + } + +#pragma warning disable MSML_GeneralName // This name should be PascalCased + public override CausalLMModelOutput forward(CausalLMModelInput input) +#pragma warning restore MSML_GeneralName // This name should be PascalCased + { + if (input.OverrideCache is not null) + { + this._cache = input.OverrideCache; + } + + var outputAttentions = input.OutputAttentions; + var outputHiddenStates = input.OutputHiddenStates; + var attentionMask = input.AttentionMask; + Device device; + var inputIds = input.InputIds; + var positionIds = input.PositionIds; + var inputsEmbeds = input.InputEmbeddings; + int batchSize; + int seqLength; + if (inputIds is not null && inputsEmbeds is not null) + { + throw new ArgumentException("Only one of input_ids or inputs_embeds may be set"); + } + else if (inputIds is not null) + { + batchSize = inputIds.IntShape()[0]; + seqLength = inputIds.IntShape()[1]; + inputsEmbeds = this.embed_tokens.forward(inputIds); + device = inputIds.device; + } + else if (inputsEmbeds is not null) + { + batchSize = inputsEmbeds.IntShape()[0]; + seqLength = inputsEmbeds.IntShape()[1]; + device = inputsEmbeds.device; + } + else + { + throw new ArgumentException("Either input_ids or inputs_embeds must be set"); + } + + var pastKeyValuesLength = input.PastKeyValuesLength; + + if (positionIds is null) + { + positionIds = torch.arange(pastKeyValuesLength, seqLength + pastKeyValuesLength, device: device); + positionIds = positionIds.unsqueeze(0).view(-1, seqLength); + } + else + { + positionIds = ((long)positionIds.view(-1, seqLength)); + } + + if (this._config.AttnImplementation == "flash_attention_2") + { + throw new NotImplementedException(); + } + else + { + attentionMask = AttentionMaskConverter.Create4DCausalAttentionMask(attentionMask, [batchSize, seqLength], inputsEmbeds.dtype, device, pastKeyValuesLength); + } + + var hiddenStates = inputsEmbeds; + + var allHiddenStates = new List(); + var allAttentions = new List(); + + foreach (var layer in this.layers) + { + if (outputHiddenStates) + { + allHiddenStates.Add(hiddenStates); + } + + var decoderInput = new DecoderLayerInput(hiddenStates, attentionMask!, positionIds, this._cache, outputAttentions: outputAttentions); + var layerOutput = layer.forward(decoderInput); + hiddenStates = layerOutput.HiddenStates; + if (outputAttentions && layerOutput.Attentions is not null) + { + allAttentions.Add(layerOutput.Attentions); + } + } + + hiddenStates = this.norm.forward(hiddenStates); + if (outputHiddenStates) + { + allHiddenStates.Add(hiddenStates); + } + + return new 
CausalLMModelOutput(lastHiddenState: hiddenStates, allHiddenStates: allHiddenStates.ToArray(), attentions: allAttentions.ToArray(), cache: this._cache); + } +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json new file mode 100644 index 0000000000..0bb6fd75b3 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json @@ -0,0 +1,38 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.42.3", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/RopeScalingObject.cs b/src/Microsoft.ML.GenAI.LLaMA/RopeScalingObject.cs new file mode 100644 index 0000000000..ab5d0238e7 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/RopeScalingObject.cs @@ -0,0 +1,7 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Text.Json.Serialization; + +namespace Microsoft.ML.GenAI.LLaMA; diff --git a/src/Microsoft.ML.GenAI.LLaMA/Utils.cs b/src/Microsoft.ML.GenAI.LLaMA/Utils.cs new file mode 100644 index 0000000000..db849d2064 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/Utils.cs @@ -0,0 +1,100 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Reflection; +using TorchSharp; +using static TorchSharp.torch; + +namespace Microsoft.ML.GenAI.LLaMA; + +internal static class Utils +{ + public static Tensor ApplyRotaryEmbeddings(Tensor input, Tensor freqsComplex) + { + // Separate the last dimension pairs of two values, representing the real and imaginary parts of the complex number + // Two consecutive values will become a single complex number + // (B, Seq_Len, H, Head_Dim) -> (B, Seq_Len, H, Head_Dim/2) + var inputComplex = input.to_type(ScalarType.Float32).reshape(input.shape[0], input.shape[1], input.shape[2], -1, 2).view_as_complex(); + + // Reshape the freqs_complex tensor to match the shape of the x_complex tensor. 
So we need to add the batch dimension and the head dimension + // (Seq_Len, Head_Dim/2) --> (1, Seq_Len, 1, Head_Dim/2) + var freqsComplexReshaped = freqsComplex.unsqueeze(0).unsqueeze(2); + + // Multiply each complex number in the x_complex tensor by the corresponding complex number in the freqs_complex tensor + // Which results in the rotation of the complex number as shown in the Figure 1 of the paper + // (B, Seq_Len, H, Head_Dim/2) * (1, Seq_Len, 1, Head_Dim/2) = (B, Seq_Len, H, Head_Dim/2) + var rotatedComplex = inputComplex * freqsComplexReshaped; + // Console.WriteLine(rotated_complex.mean().ToSingle()); + + // Convert the complex number back to the real number + // (B, Seq_Len, H, Head_Dim/2) -> (B, Seq_Len, H, Head_Dim/2, 2) + var rotated = rotatedComplex.view_as_real(); + + // (B, Seq_Len, H, Head_Dim/2, 2) -> (B, Seq_Len, H, Head_Dim) + var rotatedReshaped = rotated.reshape(rotated.shape[0], rotated.shape[1], rotated.shape[2], -1); + + return rotatedReshaped.type_as(input); + } + + public static Tensor PrecomputeThetaPosFrequencies(int headDim, int seqLen, float theta = 10000.0f) + { + // As written in the paragraph 3.2.2 of the paper + // >> In order to generalize our results in 2D to any xi ∈ Rd where **d is even**, [...] + if (headDim % 2 != 0) + { + throw new ArgumentException("Dimension must be divisible by 2", nameof(headDim)); + } + + // Build the theta parameter + // According to the formula theta_i = 10000^(-2(i-1)/dim) for i = [1, 2, ... dim/2] + // Shape: (Head_Dim / 2) + var thetaNumerator = torch.arange(0, headDim, 2).to(torch.float32); + // Shape: (Head_Dim / 2) + var thetaInput = torch.pow(theta, -1.0f * (thetaNumerator / headDim)); // (Dim / 2) + // Construct the positions (the "m" parameter) + // Shape: (Seq_Len) + var m = torch.arange(seqLen); + // Multiply each theta by each position using the outer product. 
+ // Shape: (Seq_Len) outer_product* (Head_Dim / 2) -> (Seq_Len, Head_Dim / 2) + var freqs = torch.outer(m, thetaInput).to(torch.float32); + + // We can compute complex numbers in the polar form c = R * exp(m * theta), where R = 1 as follows: + // (Seq_Len, Head_Dim / 2) -> (Seq_Len, Head_Dim / 2) + var freqsComplex = torch.polar(torch.ones_like(freqs), freqs); + + return freqsComplex; + } + + + public static Tensor RepeatKV(Tensor x, int nRep) + { + var batchSize = x.shape[0]; + var seqLen = x.shape[1]; + var nKVHeads = x.shape[2]; + var headDim = x.shape[3]; + if (nRep == 1) + { + return x; + } + + return x.unsqueeze(3) + .expand(batchSize, seqLen, nKVHeads, nRep, headDim) + .reshape(batchSize, seqLen, nKVHeads * nRep, headDim); + } + + public static string GetEmbeddedResource(string resourceName) + { + // read file content from embedded resource + var assembly = Assembly.GetExecutingAssembly(); + var resourceStream = assembly.GetManifestResourceStream(resourceName); + + if (resourceStream == null) + { + throw new ArgumentException("Resource not found", nameof(resourceName)); + } + + using var reader = new System.IO.StreamReader(resourceStream); + return reader.ReadToEnd(); + } +} diff --git a/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj b/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj index a9556443dd..af8b6aed6e 100644 --- a/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj +++ b/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj @@ -11,15 +11,8 @@ - - - diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi2Attention.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi2Attention.cs index 918ae7c99b..fe0021980f 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi2Attention.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi2Attention.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using System.Diagnostics.Contracts; +using Microsoft.ML.GenAI.Core; using TorchSharp; using TorchSharp.Modules; using static TorchSharp.torch; diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi2MLP.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi2MLP.cs index 384d012e22..42bd892588 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi2MLP.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi2MLP.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.ML.GenAI.Core; using TorchSharp; using TorchSharp.Modules; using static TorchSharp.torch; diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs index 72c7c8946a..d8a3393fcb 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs @@ -60,137 +60,27 @@ public Phi3AttentionOutput( public IKVCache? Cache { get; set; } } -internal class Phi3Attention : nn.Module +internal class Phi3Attention { - private readonly Phi3Config _config; - private readonly int _layerIdx; - private readonly double _attentionDropout; - private readonly int _hiddenSize; - private readonly int _numHeads; - private readonly int _headDim; - private readonly int _numKeyValueHeads; - private readonly int _numKeyValueGroups; - private readonly int _maxPositionEmbeddings; - private readonly int _originalMaxPositionEmbeddings; - private readonly double _ropeTheta; - private readonly Dictionary? 
_ropeScaling; -#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format - private readonly QuantizedLinear o_proj; - private readonly QuantizedLinear qkv_proj; - private nn.Module rotary_emb = null!; -#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format - - public Phi3Attention(Phi3Config config, int layerIdx) - : base(nameof(Phi3Attention)) + public static Attention FromConfig(Phi3Config config, int layerIdx) { - this._config = config; - this._layerIdx = layerIdx; - this._attentionDropout = config.AttentionDropout; - this._hiddenSize = config.HiddenSize; - this._numHeads = config.NumAttentionHeads; - this._headDim = this._hiddenSize / this._numHeads; - this._numKeyValueHeads = config.NumKeyValueHeads ?? throw new ArgumentException("num_key_value_heads must be specified"); - this._numKeyValueGroups = this._numHeads / this._numKeyValueHeads; - this._maxPositionEmbeddings = config.MaxPositionEmbeddings; - this._originalMaxPositionEmbeddings = config.OriginalMaxPositionEmbeddings; - this._ropeTheta = config.RopeTheta; - this._ropeScaling = config.RopeScaling; - - Contract.Assert(this._hiddenSize % (this._headDim * this._numHeads) == 0, "hidden_size must be divisible by num_heads"); - - var opSize = this._numHeads * this._headDim + 2 * (this._numKeyValueHeads * this._headDim); - this.o_proj = new QuantizedLinear(this._numHeads * this._headDim, this._hiddenSize, hasBias: false, dtype: config.DType); - this.qkv_proj = new QuantizedLinear(this._hiddenSize, opSize, hasBias: false, dtype: config.DType); - this.InitRope(); - } - - private void InitRope() - { - if (this._ropeScaling is null) - { - this.rotary_emb = new Phi3RotaryEmbedding(this._ropeTheta, this._maxPositionEmbeddings, this._headDim); - } - else - { - this.rotary_emb = new Phi3SuScaledRotaryEmbedding(this._headDim, this._config); - } - } - -#pragma warning disable MSML_GeneralName // This name should be PascalCased - public override Phi3AttentionOutput forward(Phi3AttentionInput input) -#pragma warning restore MSML_GeneralName // This name should be PascalCased - { - using (var _ = NewDisposeScope()) - { - var hiddenStates = input.HiddenStates; - var positionIds = input.PositionIds; - var outputAttentions = input.OutputAttentions; - var bsz = hiddenStates.shape[0]; - var qLen = hiddenStates.shape[1]; - - var qkv = this.qkv_proj.forward(hiddenStates); - var queryPos = this._numHeads * this._headDim; - var queryStates = qkv[.., .., ..queryPos]; - var keyStates = qkv[.., .., queryPos..(queryPos + this._numKeyValueHeads * this._headDim)]; - var valueStates = qkv[.., .., (queryPos + this._numKeyValueHeads * this._headDim)..]; - queryStates = queryStates.view(bsz, qLen, this._numHeads, this._headDim).transpose(1, 2); - keyStates = keyStates.view(bsz, qLen, this._numKeyValueHeads, this._headDim).transpose(1, 2); - valueStates = valueStates.view(bsz, qLen, this._numKeyValueHeads, this._headDim).transpose(1, 2); - - var kvSeqLen = keyStates.IntShape()[^2]; - var pastKeyValue = input.Cache; - if (pastKeyValue is not null) + var headDim = config.HiddenSize / config.NumAttentionHeads; + return new Attention( + attentionDropout: config.AttentionDropout, + hiddenSize: config.HiddenSize, + numHeads: config.NumAttentionHeads, + headDim: headDim, + numKeyValueHeads: config.NumKeyValueHeads ?? throw new ArgumentException("num_key_value_heads must be specified"), + numKeyValueGroups: config.NumAttentionHeads / config.NumKeyValueHeads ?? 
throw new ArgumentException("num_key_value_heads must be specified"), + maxPositionEmbeddings: config.MaxPositionEmbeddings, + originalMaxPositionEmbeddings: config.OriginalMaxPositionEmbeddings, + layerIdx: layerIdx, + useQkvProj: true, + dtype: config.DType, + rotaryEmbedding: config.RopeScaling switch { - kvSeqLen += pastKeyValue.GetUsableLength(kvSeqLen, this._layerIdx); - } - - var embOutput = this.rotary_emb.forward(new Phi3RotaryEmbeddingInput(valueStates, positionIds, kvSeqLen)); - (var cos, var sin) = (embOutput.Cos, embOutput.Sin); - - (queryStates, keyStates) = Utils.ApplyRotaryPosEmb(queryStates, keyStates, cos, sin); - - if (pastKeyValue is not null) - { - (keyStates, valueStates) = pastKeyValue.UpdateKVCache(keyStates, valueStates, this._layerIdx); - } - - // repeat k/v heads if n_kv_heads < n_heads - keyStates = Utils.Phi3RepeatKV(keyStates, this._numKeyValueGroups); - valueStates = Utils.Phi3RepeatKV(valueStates, this._numKeyValueGroups); - - var attnWeights = torch.matmul(queryStates, keyStates.transpose(2, 3)); - attnWeights = attnWeights / Math.Sqrt(this._headDim); - - // attnWeight's shape should be [bsz, this._numHeads, qLen, kvSeqLen] - Contract.Assert(attnWeights.shape.Length == 4); - Contract.Assert(attnWeights.shape[0] == bsz); - Contract.Assert(attnWeights.shape[1] == this._numHeads); - Contract.Assert(attnWeights.shape[2] == qLen); - Contract.Assert(attnWeights.shape[3] == kvSeqLen); - - var attentionMask = input.AttentionMask; - if (attentionMask is not null) - { - Contract.Assert(attentionMask.shape.Length == 4); - Contract.Assert(attentionMask.shape[0] == bsz); - Contract.Assert(attentionMask.shape[1] == 1); - Contract.Assert(attentionMask.shape[2] == qLen); - Contract.Assert(attentionMask.shape[3] == kvSeqLen); - attnWeights = attnWeights + attentionMask; - } - - // upscale attention to fp32 to avoid overflow - attnWeights = nn.functional.softmax(attnWeights, dim: -1, dtype: ScalarType.Float32).to(valueStates.dtype); - attnWeights = nn.functional.dropout(attnWeights, this._attentionDropout, this.training); - - var attnOutput = torch.matmul(attnWeights, valueStates); - - attnOutput = attnOutput.transpose(1, 2).contiguous(); - attnOutput = attnOutput.reshape(bsz, qLen, this._hiddenSize); - - attnOutput = this.o_proj.forward(attnOutput); - - return new(attnOutput.MoveToOuterDisposeScope(), outputAttentions ? 
attnWeights.MoveToOuterDisposeScope() : null, pastKeyValue); - } + null => new RotaryEmbedding(config.RopeTheta, config.MaxPositionEmbeddings, headDim), + _ => new Phi3SuScaledRotaryEmbedding(headDim, config), + }); } } diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs index 399cd25646..bada15bbfd 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs @@ -63,12 +63,12 @@ internal class Phi3DecoderLayer : nn.Module<Phi3DecoderLayerInput, Phi3DecoderLayerOutput> - private readonly nn.Module<Phi3AttentionInput, Phi3AttentionOutput> self_attn; + private readonly nn.Module<AttentionInput, AttentionOutput> self_attn; private readonly Phi3MLP mlp; - private readonly Phi3RMSNorm input_layernorm; + private readonly RMSNorm input_layernorm; private readonly Dropout resid_attn_dropout; private readonly Dropout resid_mlp_dropout; - private readonly Phi3RMSNorm post_attention_layernorm; + private readonly RMSNorm post_attention_layernorm; #pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format public Phi3DecoderLayer(Phi3Config config, int layerIdx) @@ -77,7 +77,7 @@ public Phi3DecoderLayer(Phi3Config config, int layerIdx) this._config = config; if (config.AttnImplementation == "eager") { - this.self_attn = new Phi3Attention(config, layerIdx); + this.self_attn = Phi3Attention.FromConfig(config, layerIdx); } else { @@ -85,11 +85,11 @@ public Phi3DecoderLayer(Phi3Config config, int layerIdx) } this.mlp = new Phi3MLP(config); - this.input_layernorm = new Phi3RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType); + this.input_layernorm = new RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType); this.resid_attn_dropout = nn.Dropout(config.ResidPdrop); this.resid_mlp_dropout = nn.Dropout(config.ResidPdrop); - this.post_attention_layernorm = new Phi3RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType); + this.post_attention_layernorm = new RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType); } public Action? 
LoadToDeviceFunc { get; set; } @@ -109,7 +109,7 @@ public override Phi3DecoderLayerOutput forward(Phi3DecoderLayerInput input) var residual = input.HiddenStates; hiddenStates = this.input_layernorm.forward(hiddenStates); - var attentionInput = new Phi3AttentionInput(hiddenStates, input.PositionIds, input.AttentionMask, input.PastKeyValue, input.OutputAttentions); + var attentionInput = new AttentionInput(hiddenStates, input.PositionIds, input.AttentionMask, input.PastKeyValue, input.OutputAttentions); var output = this.self_attn.forward(attentionInput); var attnOutputs = output.HiddenStates; var selfAttnWeights = output.Attentions; diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3MLP.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3MLP.cs index 745c000800..65c0413e39 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3MLP.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3MLP.cs @@ -7,6 +7,7 @@ using System.Linq; using System.Text; using System.Threading.Tasks; +using Microsoft.ML.GenAI.Core; using TorchSharp; using TorchSharp.Modules; using static TorchSharp.torch; @@ -33,7 +34,7 @@ public Phi3MLP(int hiddenSize, int intermediateSize, string hiddenAct, ScalarTyp this.gate_up_proj = new QuantizedLinear(hiddenSize, 2 * intermediateSize, hasBias: false, dtype: dtype); this.down_proj = new QuantizedLinear(intermediateSize, hiddenSize, hasBias: false, dtype: dtype); this.RegisterComponents(); - this.activation_fn = Utils.GetActivation(hiddenAct); + this.activation_fn = Core.Utils.GetActivation(hiddenAct); } #pragma warning disable MSML_GeneralName // This name should be PascalCased diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs index 9f9f0a17ab..839f9c7cc1 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs @@ -9,7 +9,7 @@ namespace Microsoft.ML.GenAI.Phi.Module; -internal class Phi3Model : nn.Module<CasualLMModelInput, CasualLMModelOutput> +internal class Phi3Model : nn.Module<CausalLMModelInput, CausalLMModelOutput> { private readonly Phi3Config _config; private readonly int _paddingIdx; @@ -19,7 +19,7 @@ internal class Phi3Model : nn.Module<CasualLMModelInput, CasualLMModelOutput> private readonly Embedding embed_tokens; private readonly Dropout embed_dropout; private readonly ModuleList<Phi3DecoderLayer> layers; - private readonly Phi3RMSNorm norm; + private readonly RMSNorm norm; #pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format public Phi3Model(Phi3Config config) @@ -37,12 +37,12 @@ public Phi3Model(Phi3Config config) { this.layers.Add(new Phi3DecoderLayer(config, i)); } - this.norm = new Phi3RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType); + this.norm = new RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType); this._cache = new DynamicKVCache(); this.RegisterComponents(); } #pragma warning disable MSML_GeneralName // This name should be PascalCased - public override CasualLMModelOutput forward(CasualLMModelInput input) + public override CausalLMModelOutput forward(CausalLMModelInput input) #pragma warning restore MSML_GeneralName // This name should be PascalCased { if (input.OverrideCache is not null) @@ -129,6 +129,6 @@ public override CasualLMModelOutput forward(CasualLMModelInput input) allHiddenStates.Add(hiddenStates); } - return new CasualLMModelOutput(lastHiddenState: hiddenStates, allHiddenStates: allHiddenStates.ToArray(), attentions: allAttentions.ToArray(), cache: this._cache); + return new CausalLMModelOutput(lastHiddenState: hiddenStates, allHiddenStates: allHiddenStates.ToArray(), attentions: allAttentions.ToArray(), cache: 
this._cache); } } diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3RotaryEmbedding.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3RotaryEmbedding.cs deleted file mode 100644 index 9b04a301d6..0000000000 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3RotaryEmbedding.cs +++ /dev/null @@ -1,81 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using TorchSharp; -using static TorchSharp.torch; - -namespace Microsoft.ML.GenAI.Phi.Module; -internal class Phi3RotaryEmbeddingInput -{ - public Phi3RotaryEmbeddingInput(Tensor input, Tensor positionIds, int? seqLen = null) - { - Input = input; - PositionIds = positionIds; - SeqLen = seqLen; - } - - public Tensor Input { get; set; } - - public Tensor PositionIds { get; set; } - - public int? SeqLen { get; set; } -} - -internal class Phi3RotaryEmbeddingOutput -{ - public Phi3RotaryEmbeddingOutput(Tensor cos, Tensor sin) - { - Cos = cos; - Sin = sin; - } - - public Tensor Cos { get; set; } - - public Tensor Sin { get; set; } -} - - -internal class Phi3RotaryEmbedding : nn.Module< - Phi3RotaryEmbeddingInput, - Phi3RotaryEmbeddingOutput> -{ - private readonly double _base; - private readonly int _maxPositionEmbeddings; - private readonly int _dim; - - public Phi3RotaryEmbedding(double baseValue, int maxPositionEmbeddings, int dim) - : base(nameof(Phi3RotaryEmbedding)) - { - _base = baseValue; - _maxPositionEmbeddings = maxPositionEmbeddings; - _dim = dim; - var thetaNumerator = torch.arange(0, _dim, 2, dtype: ScalarType.Int64).to(torch.float32); - this.register_buffer("inv_freq", torch.pow(baseValue, -1.0f * (thetaNumerator / dim)), persistent: false); - } - - public int Dim => _dim; - -#pragma warning disable MSML_GeneralName // This name should be PascalCased - public override Phi3RotaryEmbeddingOutput forward(Phi3RotaryEmbeddingInput input) -#pragma warning restore MSML_GeneralName // This name should be PascalCased - { - var x = input.Input; - var positionIds = input.PositionIds; - var seqLen = input.SeqLen; - // TODO - // can be calculated once and cached - var invFreq = this.get_buffer("inv_freq").to(x.device); - var invFreqExpanded = invFreq.unsqueeze(0).unsqueeze(-1); - invFreqExpanded = invFreqExpanded.expand(new long[] { positionIds.shape[0], -1, 1 }); - var positionIdsExpanded = positionIds.unsqueeze(1).to(torch.float32); - var freqs = invFreqExpanded * positionIdsExpanded; - freqs = freqs.transpose(1, 2); - var emb = torch.cat([freqs, freqs], dim: -1); - - var cos = torch.cos(emb); - var sin = torch.sin(emb); - - return new(cos.to_type(x.dtype), sin.to_type(x.dtype)); - } -} diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3SuScaledRotaryEmbedding.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3SuScaledRotaryEmbedding.cs index ce0e70b686..e2170493e4 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3SuScaledRotaryEmbedding.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3SuScaledRotaryEmbedding.cs @@ -8,12 +8,13 @@ using System.Text; using System.Text.Json; using System.Threading.Tasks; +using Microsoft.ML.GenAI.Core; using TorchSharp; using static TorchSharp.torch; namespace Microsoft.ML.GenAI.Phi.Module; -internal class Phi3SuScaledRotaryEmbedding : Phi3RotaryEmbedding +internal class Phi3SuScaledRotaryEmbedding : RotaryEmbedding { private readonly double[] _shortFactor; private readonly double[] _longFactor; @@ -35,7 +36,7 @@ public Phi3SuScaledRotaryEmbedding(int dim, Phi3Config config) 
} #pragma warning disable MSML_GeneralName // This name should be PascalCased - public override Phi3RotaryEmbeddingOutput forward(Phi3RotaryEmbeddingInput input) + public override RotaryEmbeddingOutput forward(RotaryEmbeddingInput input) #pragma warning restore MSML_GeneralName // This name should be PascalCased { var seqLen = (torch.max(input.PositionIds) + 1).ToInt32(); diff --git a/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2ForCasualLM.cs b/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2ForCasualLM.cs index efb3f23de9..1d49375565 100644 --- a/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2ForCasualLM.cs +++ b/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2ForCasualLM.cs @@ -14,7 +14,7 @@ namespace Microsoft.ML.GenAI.Phi; -public class Phi2ForCasualLM : nn.Module<CasualLMModelInput, CasualLMModelOutput> +public class Phi2ForCasualLM : nn.Module<CausalLMModelInput, CausalLMModelOutput> { #pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format private readonly Phi2Model model; @@ -30,7 +30,7 @@ public Phi2ForCasualLM(Phi2Config config) } #pragma warning disable MSML_GeneralName // This name should be PascalCased - public override CasualLMModelOutput forward(CasualLMModelInput input) // use_cache, output_attentions, output_hidden_states + public override CausalLMModelOutput forward(CausalLMModelInput input) // use_cache, output_attentions, output_hidden_states #pragma warning restore MSML_GeneralName // This name should be PascalCased { var inputIds = input.InputIds; @@ -44,7 +44,7 @@ public override CasualLMModelOutput forward(CasualLMModelInput input) // use_cac var lmLogits = this.lm_head.forward(hiddenState); - return new CasualLMModelOutput(lastHiddenState: hiddenState, logits: lmLogits); + return new CausalLMModelOutput(lastHiddenState: hiddenState, logits: lmLogits); } public static Phi2ForCasualLM FromPretrained( diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs index 41b2d970fd..c67741377e 100644 --- a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs +++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs @@ -17,7 +17,7 @@ namespace Microsoft.ML.GenAI.Phi; -public class Phi3ForCasualLM : nn.Module<CasualLMModelInput, CasualLMModelOutput> +public class Phi3ForCasualLM : nn.Module<CausalLMModelInput, CausalLMModelOutput> { private readonly Phi3Config _config; @@ -37,7 +37,7 @@ public Phi3ForCasualLM(Phi3Config config) } #pragma warning disable MSML_GeneralName // This name should be PascalCased - public override CasualLMModelOutput forward(CasualLMModelInput input) + public override CausalLMModelOutput forward(CausalLMModelInput input) #pragma warning restore MSML_GeneralName // This name should be PascalCased { var outputs = this.model.forward(input); diff --git a/src/Microsoft.ML.GenAI.Phi/Utils.cs b/src/Microsoft.ML.GenAI.Phi/Utils.cs index 4591d94f14..aa5a71719e 100644 --- a/src/Microsoft.ML.GenAI.Phi/Utils.cs +++ b/src/Microsoft.ML.GenAI.Phi/Utils.cs @@ -130,18 +130,6 @@ public static (Tensor, Tensor) ApplyRotaryPosEmb(Tensor q, Tensor k, Tensor cos, return (qEmbed, kEmbed); } - public static Module<Tensor, Tensor> GetActivation(string actFn) - { - return actFn switch - { - "silu" => nn.SiLU(), - "relu" => nn.ReLU(), - "gelu" => nn.GELU(), - "tanh" => nn.Tanh(), - "swish" => nn.SiLU(), - _ => throw new ArgumentException("Invalid activation function", actFn), - }; - } public static Tensor Phi2RepeatKV(Tensor x, int nRep) diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.approved.txt b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.approved.txt new file mode 100644 index 0000000000..887b49cfa6 --- /dev/null 
+++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.approved.txt @@ -0,0 +1,291 @@ +0: lm_head.weight shape: [128256, 4096] +1: model.embed_tokens.weight shape: [128256, 4096] +2: model.layers.0.input_layernorm.weight shape: [4096] +3: model.layers.0.mlp.down_proj.weight shape: [4096, 14336] +4: model.layers.0.mlp.gate_proj.weight shape: [14336, 4096] +5: model.layers.0.mlp.up_proj.weight shape: [14336, 4096] +6: model.layers.0.post_attention_layernorm.weight shape: [4096] +7: model.layers.0.self_attn.k_proj.weight shape: [1024, 4096] +8: model.layers.0.self_attn.o_proj.weight shape: [4096, 4096] +9: model.layers.0.self_attn.q_proj.weight shape: [4096, 4096] +10: model.layers.0.self_attn.v_proj.weight shape: [1024, 4096] +11: model.layers.1.input_layernorm.weight shape: [4096] +12: model.layers.1.mlp.down_proj.weight shape: [4096, 14336] +13: model.layers.1.mlp.gate_proj.weight shape: [14336, 4096] +14: model.layers.1.mlp.up_proj.weight shape: [14336, 4096] +15: model.layers.1.post_attention_layernorm.weight shape: [4096] +16: model.layers.1.self_attn.k_proj.weight shape: [1024, 4096] +17: model.layers.1.self_attn.o_proj.weight shape: [4096, 4096] +18: model.layers.1.self_attn.q_proj.weight shape: [4096, 4096] +19: model.layers.1.self_attn.v_proj.weight shape: [1024, 4096] +20: model.layers.10.input_layernorm.weight shape: [4096] +21: model.layers.10.mlp.down_proj.weight shape: [4096, 14336] +22: model.layers.10.mlp.gate_proj.weight shape: [14336, 4096] +23: model.layers.10.mlp.up_proj.weight shape: [14336, 4096] +24: model.layers.10.post_attention_layernorm.weight shape: [4096] +25: model.layers.10.self_attn.k_proj.weight shape: [1024, 4096] +26: model.layers.10.self_attn.o_proj.weight shape: [4096, 4096] +27: model.layers.10.self_attn.q_proj.weight shape: [4096, 4096] +28: model.layers.10.self_attn.v_proj.weight shape: [1024, 4096] +29: model.layers.11.input_layernorm.weight shape: [4096] +30: model.layers.11.mlp.down_proj.weight shape: [4096, 14336] +31: model.layers.11.mlp.gate_proj.weight shape: [14336, 4096] +32: model.layers.11.mlp.up_proj.weight shape: [14336, 4096] +33: model.layers.11.post_attention_layernorm.weight shape: [4096] +34: model.layers.11.self_attn.k_proj.weight shape: [1024, 4096] +35: model.layers.11.self_attn.o_proj.weight shape: [4096, 4096] +36: model.layers.11.self_attn.q_proj.weight shape: [4096, 4096] +37: model.layers.11.self_attn.v_proj.weight shape: [1024, 4096] +38: model.layers.12.input_layernorm.weight shape: [4096] +39: model.layers.12.mlp.down_proj.weight shape: [4096, 14336] +40: model.layers.12.mlp.gate_proj.weight shape: [14336, 4096] +41: model.layers.12.mlp.up_proj.weight shape: [14336, 4096] +42: model.layers.12.post_attention_layernorm.weight shape: [4096] +43: model.layers.12.self_attn.k_proj.weight shape: [1024, 4096] +44: model.layers.12.self_attn.o_proj.weight shape: [4096, 4096] +45: model.layers.12.self_attn.q_proj.weight shape: [4096, 4096] +46: model.layers.12.self_attn.v_proj.weight shape: [1024, 4096] +47: model.layers.13.input_layernorm.weight shape: [4096] +48: model.layers.13.mlp.down_proj.weight shape: [4096, 14336] +49: model.layers.13.mlp.gate_proj.weight shape: [14336, 4096] +50: model.layers.13.mlp.up_proj.weight shape: [14336, 4096] +51: model.layers.13.post_attention_layernorm.weight shape: [4096] +52: model.layers.13.self_attn.k_proj.weight shape: [1024, 4096] +53: model.layers.13.self_attn.o_proj.weight shape: [4096, 4096] +54: model.layers.13.self_attn.q_proj.weight shape: [4096, 4096] 
+55: model.layers.13.self_attn.v_proj.weight shape: [1024, 4096] +56: model.layers.14.input_layernorm.weight shape: [4096] +57: model.layers.14.mlp.down_proj.weight shape: [4096, 14336] +58: model.layers.14.mlp.gate_proj.weight shape: [14336, 4096] +59: model.layers.14.mlp.up_proj.weight shape: [14336, 4096] +60: model.layers.14.post_attention_layernorm.weight shape: [4096] +61: model.layers.14.self_attn.k_proj.weight shape: [1024, 4096] +62: model.layers.14.self_attn.o_proj.weight shape: [4096, 4096] +63: model.layers.14.self_attn.q_proj.weight shape: [4096, 4096] +64: model.layers.14.self_attn.v_proj.weight shape: [1024, 4096] +65: model.layers.15.input_layernorm.weight shape: [4096] +66: model.layers.15.mlp.down_proj.weight shape: [4096, 14336] +67: model.layers.15.mlp.gate_proj.weight shape: [14336, 4096] +68: model.layers.15.mlp.up_proj.weight shape: [14336, 4096] +69: model.layers.15.post_attention_layernorm.weight shape: [4096] +70: model.layers.15.self_attn.k_proj.weight shape: [1024, 4096] +71: model.layers.15.self_attn.o_proj.weight shape: [4096, 4096] +72: model.layers.15.self_attn.q_proj.weight shape: [4096, 4096] +73: model.layers.15.self_attn.v_proj.weight shape: [1024, 4096] +74: model.layers.16.input_layernorm.weight shape: [4096] +75: model.layers.16.mlp.down_proj.weight shape: [4096, 14336] +76: model.layers.16.mlp.gate_proj.weight shape: [14336, 4096] +77: model.layers.16.mlp.up_proj.weight shape: [14336, 4096] +78: model.layers.16.post_attention_layernorm.weight shape: [4096] +79: model.layers.16.self_attn.k_proj.weight shape: [1024, 4096] +80: model.layers.16.self_attn.o_proj.weight shape: [4096, 4096] +81: model.layers.16.self_attn.q_proj.weight shape: [4096, 4096] +82: model.layers.16.self_attn.v_proj.weight shape: [1024, 4096] +83: model.layers.17.input_layernorm.weight shape: [4096] +84: model.layers.17.mlp.down_proj.weight shape: [4096, 14336] +85: model.layers.17.mlp.gate_proj.weight shape: [14336, 4096] +86: model.layers.17.mlp.up_proj.weight shape: [14336, 4096] +87: model.layers.17.post_attention_layernorm.weight shape: [4096] +88: model.layers.17.self_attn.k_proj.weight shape: [1024, 4096] +89: model.layers.17.self_attn.o_proj.weight shape: [4096, 4096] +90: model.layers.17.self_attn.q_proj.weight shape: [4096, 4096] +91: model.layers.17.self_attn.v_proj.weight shape: [1024, 4096] +92: model.layers.18.input_layernorm.weight shape: [4096] +93: model.layers.18.mlp.down_proj.weight shape: [4096, 14336] +94: model.layers.18.mlp.gate_proj.weight shape: [14336, 4096] +95: model.layers.18.mlp.up_proj.weight shape: [14336, 4096] +96: model.layers.18.post_attention_layernorm.weight shape: [4096] +97: model.layers.18.self_attn.k_proj.weight shape: [1024, 4096] +98: model.layers.18.self_attn.o_proj.weight shape: [4096, 4096] +99: model.layers.18.self_attn.q_proj.weight shape: [4096, 4096] +100: model.layers.18.self_attn.v_proj.weight shape: [1024, 4096] +101: model.layers.19.input_layernorm.weight shape: [4096] +102: model.layers.19.mlp.down_proj.weight shape: [4096, 14336] +103: model.layers.19.mlp.gate_proj.weight shape: [14336, 4096] +104: model.layers.19.mlp.up_proj.weight shape: [14336, 4096] +105: model.layers.19.post_attention_layernorm.weight shape: [4096] +106: model.layers.19.self_attn.k_proj.weight shape: [1024, 4096] +107: model.layers.19.self_attn.o_proj.weight shape: [4096, 4096] +108: model.layers.19.self_attn.q_proj.weight shape: [4096, 4096] +109: model.layers.19.self_attn.v_proj.weight shape: [1024, 4096] +110: model.layers.2.input_layernorm.weight 
shape: [4096] +111: model.layers.2.mlp.down_proj.weight shape: [4096, 14336] +112: model.layers.2.mlp.gate_proj.weight shape: [14336, 4096] +113: model.layers.2.mlp.up_proj.weight shape: [14336, 4096] +114: model.layers.2.post_attention_layernorm.weight shape: [4096] +115: model.layers.2.self_attn.k_proj.weight shape: [1024, 4096] +116: model.layers.2.self_attn.o_proj.weight shape: [4096, 4096] +117: model.layers.2.self_attn.q_proj.weight shape: [4096, 4096] +118: model.layers.2.self_attn.v_proj.weight shape: [1024, 4096] +119: model.layers.20.input_layernorm.weight shape: [4096] +120: model.layers.20.mlp.down_proj.weight shape: [4096, 14336] +121: model.layers.20.mlp.gate_proj.weight shape: [14336, 4096] +122: model.layers.20.mlp.up_proj.weight shape: [14336, 4096] +123: model.layers.20.post_attention_layernorm.weight shape: [4096] +124: model.layers.20.self_attn.k_proj.weight shape: [1024, 4096] +125: model.layers.20.self_attn.o_proj.weight shape: [4096, 4096] +126: model.layers.20.self_attn.q_proj.weight shape: [4096, 4096] +127: model.layers.20.self_attn.v_proj.weight shape: [1024, 4096] +128: model.layers.21.input_layernorm.weight shape: [4096] +129: model.layers.21.mlp.down_proj.weight shape: [4096, 14336] +130: model.layers.21.mlp.gate_proj.weight shape: [14336, 4096] +131: model.layers.21.mlp.up_proj.weight shape: [14336, 4096] +132: model.layers.21.post_attention_layernorm.weight shape: [4096] +133: model.layers.21.self_attn.k_proj.weight shape: [1024, 4096] +134: model.layers.21.self_attn.o_proj.weight shape: [4096, 4096] +135: model.layers.21.self_attn.q_proj.weight shape: [4096, 4096] +136: model.layers.21.self_attn.v_proj.weight shape: [1024, 4096] +137: model.layers.22.input_layernorm.weight shape: [4096] +138: model.layers.22.mlp.down_proj.weight shape: [4096, 14336] +139: model.layers.22.mlp.gate_proj.weight shape: [14336, 4096] +140: model.layers.22.mlp.up_proj.weight shape: [14336, 4096] +141: model.layers.22.post_attention_layernorm.weight shape: [4096] +142: model.layers.22.self_attn.k_proj.weight shape: [1024, 4096] +143: model.layers.22.self_attn.o_proj.weight shape: [4096, 4096] +144: model.layers.22.self_attn.q_proj.weight shape: [4096, 4096] +145: model.layers.22.self_attn.v_proj.weight shape: [1024, 4096] +146: model.layers.23.input_layernorm.weight shape: [4096] +147: model.layers.23.mlp.down_proj.weight shape: [4096, 14336] +148: model.layers.23.mlp.gate_proj.weight shape: [14336, 4096] +149: model.layers.23.mlp.up_proj.weight shape: [14336, 4096] +150: model.layers.23.post_attention_layernorm.weight shape: [4096] +151: model.layers.23.self_attn.k_proj.weight shape: [1024, 4096] +152: model.layers.23.self_attn.o_proj.weight shape: [4096, 4096] +153: model.layers.23.self_attn.q_proj.weight shape: [4096, 4096] +154: model.layers.23.self_attn.v_proj.weight shape: [1024, 4096] +155: model.layers.24.input_layernorm.weight shape: [4096] +156: model.layers.24.mlp.down_proj.weight shape: [4096, 14336] +157: model.layers.24.mlp.gate_proj.weight shape: [14336, 4096] +158: model.layers.24.mlp.up_proj.weight shape: [14336, 4096] +159: model.layers.24.post_attention_layernorm.weight shape: [4096] +160: model.layers.24.self_attn.k_proj.weight shape: [1024, 4096] +161: model.layers.24.self_attn.o_proj.weight shape: [4096, 4096] +162: model.layers.24.self_attn.q_proj.weight shape: [4096, 4096] +163: model.layers.24.self_attn.v_proj.weight shape: [1024, 4096] +164: model.layers.25.input_layernorm.weight shape: [4096] +165: model.layers.25.mlp.down_proj.weight shape: [4096, 
14336] +166: model.layers.25.mlp.gate_proj.weight shape: [14336, 4096] +167: model.layers.25.mlp.up_proj.weight shape: [14336, 4096] +168: model.layers.25.post_attention_layernorm.weight shape: [4096] +169: model.layers.25.self_attn.k_proj.weight shape: [1024, 4096] +170: model.layers.25.self_attn.o_proj.weight shape: [4096, 4096] +171: model.layers.25.self_attn.q_proj.weight shape: [4096, 4096] +172: model.layers.25.self_attn.v_proj.weight shape: [1024, 4096] +173: model.layers.26.input_layernorm.weight shape: [4096] +174: model.layers.26.mlp.down_proj.weight shape: [4096, 14336] +175: model.layers.26.mlp.gate_proj.weight shape: [14336, 4096] +176: model.layers.26.mlp.up_proj.weight shape: [14336, 4096] +177: model.layers.26.post_attention_layernorm.weight shape: [4096] +178: model.layers.26.self_attn.k_proj.weight shape: [1024, 4096] +179: model.layers.26.self_attn.o_proj.weight shape: [4096, 4096] +180: model.layers.26.self_attn.q_proj.weight shape: [4096, 4096] +181: model.layers.26.self_attn.v_proj.weight shape: [1024, 4096] +182: model.layers.27.input_layernorm.weight shape: [4096] +183: model.layers.27.mlp.down_proj.weight shape: [4096, 14336] +184: model.layers.27.mlp.gate_proj.weight shape: [14336, 4096] +185: model.layers.27.mlp.up_proj.weight shape: [14336, 4096] +186: model.layers.27.post_attention_layernorm.weight shape: [4096] +187: model.layers.27.self_attn.k_proj.weight shape: [1024, 4096] +188: model.layers.27.self_attn.o_proj.weight shape: [4096, 4096] +189: model.layers.27.self_attn.q_proj.weight shape: [4096, 4096] +190: model.layers.27.self_attn.v_proj.weight shape: [1024, 4096] +191: model.layers.28.input_layernorm.weight shape: [4096] +192: model.layers.28.mlp.down_proj.weight shape: [4096, 14336] +193: model.layers.28.mlp.gate_proj.weight shape: [14336, 4096] +194: model.layers.28.mlp.up_proj.weight shape: [14336, 4096] +195: model.layers.28.post_attention_layernorm.weight shape: [4096] +196: model.layers.28.self_attn.k_proj.weight shape: [1024, 4096] +197: model.layers.28.self_attn.o_proj.weight shape: [4096, 4096] +198: model.layers.28.self_attn.q_proj.weight shape: [4096, 4096] +199: model.layers.28.self_attn.v_proj.weight shape: [1024, 4096] +200: model.layers.29.input_layernorm.weight shape: [4096] +201: model.layers.29.mlp.down_proj.weight shape: [4096, 14336] +202: model.layers.29.mlp.gate_proj.weight shape: [14336, 4096] +203: model.layers.29.mlp.up_proj.weight shape: [14336, 4096] +204: model.layers.29.post_attention_layernorm.weight shape: [4096] +205: model.layers.29.self_attn.k_proj.weight shape: [1024, 4096] +206: model.layers.29.self_attn.o_proj.weight shape: [4096, 4096] +207: model.layers.29.self_attn.q_proj.weight shape: [4096, 4096] +208: model.layers.29.self_attn.v_proj.weight shape: [1024, 4096] +209: model.layers.3.input_layernorm.weight shape: [4096] +210: model.layers.3.mlp.down_proj.weight shape: [4096, 14336] +211: model.layers.3.mlp.gate_proj.weight shape: [14336, 4096] +212: model.layers.3.mlp.up_proj.weight shape: [14336, 4096] +213: model.layers.3.post_attention_layernorm.weight shape: [4096] +214: model.layers.3.self_attn.k_proj.weight shape: [1024, 4096] +215: model.layers.3.self_attn.o_proj.weight shape: [4096, 4096] +216: model.layers.3.self_attn.q_proj.weight shape: [4096, 4096] +217: model.layers.3.self_attn.v_proj.weight shape: [1024, 4096] +218: model.layers.30.input_layernorm.weight shape: [4096] +219: model.layers.30.mlp.down_proj.weight shape: [4096, 14336] +220: model.layers.30.mlp.gate_proj.weight shape: [14336, 4096] +221: 
model.layers.30.mlp.up_proj.weight shape: [14336, 4096] +222: model.layers.30.post_attention_layernorm.weight shape: [4096] +223: model.layers.30.self_attn.k_proj.weight shape: [1024, 4096] +224: model.layers.30.self_attn.o_proj.weight shape: [4096, 4096] +225: model.layers.30.self_attn.q_proj.weight shape: [4096, 4096] +226: model.layers.30.self_attn.v_proj.weight shape: [1024, 4096] +227: model.layers.31.input_layernorm.weight shape: [4096] +228: model.layers.31.mlp.down_proj.weight shape: [4096, 14336] +229: model.layers.31.mlp.gate_proj.weight shape: [14336, 4096] +230: model.layers.31.mlp.up_proj.weight shape: [14336, 4096] +231: model.layers.31.post_attention_layernorm.weight shape: [4096] +232: model.layers.31.self_attn.k_proj.weight shape: [1024, 4096] +233: model.layers.31.self_attn.o_proj.weight shape: [4096, 4096] +234: model.layers.31.self_attn.q_proj.weight shape: [4096, 4096] +235: model.layers.31.self_attn.v_proj.weight shape: [1024, 4096] +236: model.layers.4.input_layernorm.weight shape: [4096] +237: model.layers.4.mlp.down_proj.weight shape: [4096, 14336] +238: model.layers.4.mlp.gate_proj.weight shape: [14336, 4096] +239: model.layers.4.mlp.up_proj.weight shape: [14336, 4096] +240: model.layers.4.post_attention_layernorm.weight shape: [4096] +241: model.layers.4.self_attn.k_proj.weight shape: [1024, 4096] +242: model.layers.4.self_attn.o_proj.weight shape: [4096, 4096] +243: model.layers.4.self_attn.q_proj.weight shape: [4096, 4096] +244: model.layers.4.self_attn.v_proj.weight shape: [1024, 4096] +245: model.layers.5.input_layernorm.weight shape: [4096] +246: model.layers.5.mlp.down_proj.weight shape: [4096, 14336] +247: model.layers.5.mlp.gate_proj.weight shape: [14336, 4096] +248: model.layers.5.mlp.up_proj.weight shape: [14336, 4096] +249: model.layers.5.post_attention_layernorm.weight shape: [4096] +250: model.layers.5.self_attn.k_proj.weight shape: [1024, 4096] +251: model.layers.5.self_attn.o_proj.weight shape: [4096, 4096] +252: model.layers.5.self_attn.q_proj.weight shape: [4096, 4096] +253: model.layers.5.self_attn.v_proj.weight shape: [1024, 4096] +254: model.layers.6.input_layernorm.weight shape: [4096] +255: model.layers.6.mlp.down_proj.weight shape: [4096, 14336] +256: model.layers.6.mlp.gate_proj.weight shape: [14336, 4096] +257: model.layers.6.mlp.up_proj.weight shape: [14336, 4096] +258: model.layers.6.post_attention_layernorm.weight shape: [4096] +259: model.layers.6.self_attn.k_proj.weight shape: [1024, 4096] +260: model.layers.6.self_attn.o_proj.weight shape: [4096, 4096] +261: model.layers.6.self_attn.q_proj.weight shape: [4096, 4096] +262: model.layers.6.self_attn.v_proj.weight shape: [1024, 4096] +263: model.layers.7.input_layernorm.weight shape: [4096] +264: model.layers.7.mlp.down_proj.weight shape: [4096, 14336] +265: model.layers.7.mlp.gate_proj.weight shape: [14336, 4096] +266: model.layers.7.mlp.up_proj.weight shape: [14336, 4096] +267: model.layers.7.post_attention_layernorm.weight shape: [4096] +268: model.layers.7.self_attn.k_proj.weight shape: [1024, 4096] +269: model.layers.7.self_attn.o_proj.weight shape: [4096, 4096] +270: model.layers.7.self_attn.q_proj.weight shape: [4096, 4096] +271: model.layers.7.self_attn.v_proj.weight shape: [1024, 4096] +272: model.layers.8.input_layernorm.weight shape: [4096] +273: model.layers.8.mlp.down_proj.weight shape: [4096, 14336] +274: model.layers.8.mlp.gate_proj.weight shape: [14336, 4096] +275: model.layers.8.mlp.up_proj.weight shape: [14336, 4096] +276: model.layers.8.post_attention_layernorm.weight 
shape: [4096] +277: model.layers.8.self_attn.k_proj.weight shape: [1024, 4096] +278: model.layers.8.self_attn.o_proj.weight shape: [4096, 4096] +279: model.layers.8.self_attn.q_proj.weight shape: [4096, 4096] +280: model.layers.8.self_attn.v_proj.weight shape: [1024, 4096] +281: model.layers.9.input_layernorm.weight shape: [4096] +282: model.layers.9.mlp.down_proj.weight shape: [4096, 14336] +283: model.layers.9.mlp.gate_proj.weight shape: [14336, 4096] +284: model.layers.9.mlp.up_proj.weight shape: [14336, 4096] +285: model.layers.9.post_attention_layernorm.weight shape: [4096] +286: model.layers.9.self_attn.k_proj.weight shape: [1024, 4096] +287: model.layers.9.self_attn.o_proj.weight shape: [4096, 4096] +288: model.layers.9.self_attn.q_proj.weight shape: [4096, 4096] +289: model.layers.9.self_attn.v_proj.weight shape: [1024, 4096] +290: model.norm.weight shape: [4096] diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.received.txt b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.received.txt new file mode 100644 index 0000000000..887b49cfa6 --- /dev/null +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.received.txt @@ -0,0 +1,291 @@ +0: lm_head.weight shape: [128256, 4096] +1: model.embed_tokens.weight shape: [128256, 4096] +2: model.layers.0.input_layernorm.weight shape: [4096] +3: model.layers.0.mlp.down_proj.weight shape: [4096, 14336] +4: model.layers.0.mlp.gate_proj.weight shape: [14336, 4096] +5: model.layers.0.mlp.up_proj.weight shape: [14336, 4096] +6: model.layers.0.post_attention_layernorm.weight shape: [4096] +7: model.layers.0.self_attn.k_proj.weight shape: [1024, 4096] +8: model.layers.0.self_attn.o_proj.weight shape: [4096, 4096] +9: model.layers.0.self_attn.q_proj.weight shape: [4096, 4096] +10: model.layers.0.self_attn.v_proj.weight shape: [1024, 4096] +11: model.layers.1.input_layernorm.weight shape: [4096] +12: model.layers.1.mlp.down_proj.weight shape: [4096, 14336] +13: model.layers.1.mlp.gate_proj.weight shape: [14336, 4096] +14: model.layers.1.mlp.up_proj.weight shape: [14336, 4096] +15: model.layers.1.post_attention_layernorm.weight shape: [4096] +16: model.layers.1.self_attn.k_proj.weight shape: [1024, 4096] +17: model.layers.1.self_attn.o_proj.weight shape: [4096, 4096] +18: model.layers.1.self_attn.q_proj.weight shape: [4096, 4096] +19: model.layers.1.self_attn.v_proj.weight shape: [1024, 4096] +20: model.layers.10.input_layernorm.weight shape: [4096] +21: model.layers.10.mlp.down_proj.weight shape: [4096, 14336] +22: model.layers.10.mlp.gate_proj.weight shape: [14336, 4096] +23: model.layers.10.mlp.up_proj.weight shape: [14336, 4096] +24: model.layers.10.post_attention_layernorm.weight shape: [4096] +25: model.layers.10.self_attn.k_proj.weight shape: [1024, 4096] +26: model.layers.10.self_attn.o_proj.weight shape: [4096, 4096] +27: model.layers.10.self_attn.q_proj.weight shape: [4096, 4096] +28: model.layers.10.self_attn.v_proj.weight shape: [1024, 4096] +29: model.layers.11.input_layernorm.weight shape: [4096] +30: model.layers.11.mlp.down_proj.weight shape: [4096, 14336] +31: model.layers.11.mlp.gate_proj.weight shape: [14336, 4096] +32: model.layers.11.mlp.up_proj.weight shape: [14336, 4096] +33: model.layers.11.post_attention_layernorm.weight shape: [4096] +34: model.layers.11.self_attn.k_proj.weight shape: [1024, 4096] +35: model.layers.11.self_attn.o_proj.weight shape: [4096, 4096] +36: model.layers.11.self_attn.q_proj.weight shape: 
[4096, 4096] +37: model.layers.11.self_attn.v_proj.weight shape: [1024, 4096] +38: model.layers.12.input_layernorm.weight shape: [4096] +39: model.layers.12.mlp.down_proj.weight shape: [4096, 14336] +40: model.layers.12.mlp.gate_proj.weight shape: [14336, 4096] +41: model.layers.12.mlp.up_proj.weight shape: [14336, 4096] +42: model.layers.12.post_attention_layernorm.weight shape: [4096] +43: model.layers.12.self_attn.k_proj.weight shape: [1024, 4096] +44: model.layers.12.self_attn.o_proj.weight shape: [4096, 4096] +45: model.layers.12.self_attn.q_proj.weight shape: [4096, 4096] +46: model.layers.12.self_attn.v_proj.weight shape: [1024, 4096] +47: model.layers.13.input_layernorm.weight shape: [4096] +48: model.layers.13.mlp.down_proj.weight shape: [4096, 14336] +49: model.layers.13.mlp.gate_proj.weight shape: [14336, 4096] +50: model.layers.13.mlp.up_proj.weight shape: [14336, 4096] +51: model.layers.13.post_attention_layernorm.weight shape: [4096] +52: model.layers.13.self_attn.k_proj.weight shape: [1024, 4096] +53: model.layers.13.self_attn.o_proj.weight shape: [4096, 4096] +54: model.layers.13.self_attn.q_proj.weight shape: [4096, 4096] +55: model.layers.13.self_attn.v_proj.weight shape: [1024, 4096] +56: model.layers.14.input_layernorm.weight shape: [4096] +57: model.layers.14.mlp.down_proj.weight shape: [4096, 14336] +58: model.layers.14.mlp.gate_proj.weight shape: [14336, 4096] +59: model.layers.14.mlp.up_proj.weight shape: [14336, 4096] +60: model.layers.14.post_attention_layernorm.weight shape: [4096] +61: model.layers.14.self_attn.k_proj.weight shape: [1024, 4096] +62: model.layers.14.self_attn.o_proj.weight shape: [4096, 4096] +63: model.layers.14.self_attn.q_proj.weight shape: [4096, 4096] +64: model.layers.14.self_attn.v_proj.weight shape: [1024, 4096] +65: model.layers.15.input_layernorm.weight shape: [4096] +66: model.layers.15.mlp.down_proj.weight shape: [4096, 14336] +67: model.layers.15.mlp.gate_proj.weight shape: [14336, 4096] +68: model.layers.15.mlp.up_proj.weight shape: [14336, 4096] +69: model.layers.15.post_attention_layernorm.weight shape: [4096] +70: model.layers.15.self_attn.k_proj.weight shape: [1024, 4096] +71: model.layers.15.self_attn.o_proj.weight shape: [4096, 4096] +72: model.layers.15.self_attn.q_proj.weight shape: [4096, 4096] +73: model.layers.15.self_attn.v_proj.weight shape: [1024, 4096] +74: model.layers.16.input_layernorm.weight shape: [4096] +75: model.layers.16.mlp.down_proj.weight shape: [4096, 14336] +76: model.layers.16.mlp.gate_proj.weight shape: [14336, 4096] +77: model.layers.16.mlp.up_proj.weight shape: [14336, 4096] +78: model.layers.16.post_attention_layernorm.weight shape: [4096] +79: model.layers.16.self_attn.k_proj.weight shape: [1024, 4096] +80: model.layers.16.self_attn.o_proj.weight shape: [4096, 4096] +81: model.layers.16.self_attn.q_proj.weight shape: [4096, 4096] +82: model.layers.16.self_attn.v_proj.weight shape: [1024, 4096] +83: model.layers.17.input_layernorm.weight shape: [4096] +84: model.layers.17.mlp.down_proj.weight shape: [4096, 14336] +85: model.layers.17.mlp.gate_proj.weight shape: [14336, 4096] +86: model.layers.17.mlp.up_proj.weight shape: [14336, 4096] +87: model.layers.17.post_attention_layernorm.weight shape: [4096] +88: model.layers.17.self_attn.k_proj.weight shape: [1024, 4096] +89: model.layers.17.self_attn.o_proj.weight shape: [4096, 4096] +90: model.layers.17.self_attn.q_proj.weight shape: [4096, 4096] +91: model.layers.17.self_attn.v_proj.weight shape: [1024, 4096] +92: model.layers.18.input_layernorm.weight 
shape: [4096] +93: model.layers.18.mlp.down_proj.weight shape: [4096, 14336] +94: model.layers.18.mlp.gate_proj.weight shape: [14336, 4096] +95: model.layers.18.mlp.up_proj.weight shape: [14336, 4096] +96: model.layers.18.post_attention_layernorm.weight shape: [4096] +97: model.layers.18.self_attn.k_proj.weight shape: [1024, 4096] +98: model.layers.18.self_attn.o_proj.weight shape: [4096, 4096] +99: model.layers.18.self_attn.q_proj.weight shape: [4096, 4096] +100: model.layers.18.self_attn.v_proj.weight shape: [1024, 4096] +101: model.layers.19.input_layernorm.weight shape: [4096] +102: model.layers.19.mlp.down_proj.weight shape: [4096, 14336] +103: model.layers.19.mlp.gate_proj.weight shape: [14336, 4096] +104: model.layers.19.mlp.up_proj.weight shape: [14336, 4096] +105: model.layers.19.post_attention_layernorm.weight shape: [4096] +106: model.layers.19.self_attn.k_proj.weight shape: [1024, 4096] +107: model.layers.19.self_attn.o_proj.weight shape: [4096, 4096] +108: model.layers.19.self_attn.q_proj.weight shape: [4096, 4096] +109: model.layers.19.self_attn.v_proj.weight shape: [1024, 4096] +110: model.layers.2.input_layernorm.weight shape: [4096] +111: model.layers.2.mlp.down_proj.weight shape: [4096, 14336] +112: model.layers.2.mlp.gate_proj.weight shape: [14336, 4096] +113: model.layers.2.mlp.up_proj.weight shape: [14336, 4096] +114: model.layers.2.post_attention_layernorm.weight shape: [4096] +115: model.layers.2.self_attn.k_proj.weight shape: [1024, 4096] +116: model.layers.2.self_attn.o_proj.weight shape: [4096, 4096] +117: model.layers.2.self_attn.q_proj.weight shape: [4096, 4096] +118: model.layers.2.self_attn.v_proj.weight shape: [1024, 4096] +119: model.layers.20.input_layernorm.weight shape: [4096] +120: model.layers.20.mlp.down_proj.weight shape: [4096, 14336] +121: model.layers.20.mlp.gate_proj.weight shape: [14336, 4096] +122: model.layers.20.mlp.up_proj.weight shape: [14336, 4096] +123: model.layers.20.post_attention_layernorm.weight shape: [4096] +124: model.layers.20.self_attn.k_proj.weight shape: [1024, 4096] +125: model.layers.20.self_attn.o_proj.weight shape: [4096, 4096] +126: model.layers.20.self_attn.q_proj.weight shape: [4096, 4096] +127: model.layers.20.self_attn.v_proj.weight shape: [1024, 4096] +128: model.layers.21.input_layernorm.weight shape: [4096] +129: model.layers.21.mlp.down_proj.weight shape: [4096, 14336] +130: model.layers.21.mlp.gate_proj.weight shape: [14336, 4096] +131: model.layers.21.mlp.up_proj.weight shape: [14336, 4096] +132: model.layers.21.post_attention_layernorm.weight shape: [4096] +133: model.layers.21.self_attn.k_proj.weight shape: [1024, 4096] +134: model.layers.21.self_attn.o_proj.weight shape: [4096, 4096] +135: model.layers.21.self_attn.q_proj.weight shape: [4096, 4096] +136: model.layers.21.self_attn.v_proj.weight shape: [1024, 4096] +137: model.layers.22.input_layernorm.weight shape: [4096] +138: model.layers.22.mlp.down_proj.weight shape: [4096, 14336] +139: model.layers.22.mlp.gate_proj.weight shape: [14336, 4096] +140: model.layers.22.mlp.up_proj.weight shape: [14336, 4096] +141: model.layers.22.post_attention_layernorm.weight shape: [4096] +142: model.layers.22.self_attn.k_proj.weight shape: [1024, 4096] +143: model.layers.22.self_attn.o_proj.weight shape: [4096, 4096] +144: model.layers.22.self_attn.q_proj.weight shape: [4096, 4096] +145: model.layers.22.self_attn.v_proj.weight shape: [1024, 4096] +146: model.layers.23.input_layernorm.weight shape: [4096] +147: model.layers.23.mlp.down_proj.weight shape: [4096, 14336] +148: 
model.layers.23.mlp.gate_proj.weight shape: [14336, 4096] +149: model.layers.23.mlp.up_proj.weight shape: [14336, 4096] +150: model.layers.23.post_attention_layernorm.weight shape: [4096] +151: model.layers.23.self_attn.k_proj.weight shape: [1024, 4096] +152: model.layers.23.self_attn.o_proj.weight shape: [4096, 4096] +153: model.layers.23.self_attn.q_proj.weight shape: [4096, 4096] +154: model.layers.23.self_attn.v_proj.weight shape: [1024, 4096] +155: model.layers.24.input_layernorm.weight shape: [4096] +156: model.layers.24.mlp.down_proj.weight shape: [4096, 14336] +157: model.layers.24.mlp.gate_proj.weight shape: [14336, 4096] +158: model.layers.24.mlp.up_proj.weight shape: [14336, 4096] +159: model.layers.24.post_attention_layernorm.weight shape: [4096] +160: model.layers.24.self_attn.k_proj.weight shape: [1024, 4096] +161: model.layers.24.self_attn.o_proj.weight shape: [4096, 4096] +162: model.layers.24.self_attn.q_proj.weight shape: [4096, 4096] +163: model.layers.24.self_attn.v_proj.weight shape: [1024, 4096] +164: model.layers.25.input_layernorm.weight shape: [4096] +165: model.layers.25.mlp.down_proj.weight shape: [4096, 14336] +166: model.layers.25.mlp.gate_proj.weight shape: [14336, 4096] +167: model.layers.25.mlp.up_proj.weight shape: [14336, 4096] +168: model.layers.25.post_attention_layernorm.weight shape: [4096] +169: model.layers.25.self_attn.k_proj.weight shape: [1024, 4096] +170: model.layers.25.self_attn.o_proj.weight shape: [4096, 4096] +171: model.layers.25.self_attn.q_proj.weight shape: [4096, 4096] +172: model.layers.25.self_attn.v_proj.weight shape: [1024, 4096] +173: model.layers.26.input_layernorm.weight shape: [4096] +174: model.layers.26.mlp.down_proj.weight shape: [4096, 14336] +175: model.layers.26.mlp.gate_proj.weight shape: [14336, 4096] +176: model.layers.26.mlp.up_proj.weight shape: [14336, 4096] +177: model.layers.26.post_attention_layernorm.weight shape: [4096] +178: model.layers.26.self_attn.k_proj.weight shape: [1024, 4096] +179: model.layers.26.self_attn.o_proj.weight shape: [4096, 4096] +180: model.layers.26.self_attn.q_proj.weight shape: [4096, 4096] +181: model.layers.26.self_attn.v_proj.weight shape: [1024, 4096] +182: model.layers.27.input_layernorm.weight shape: [4096] +183: model.layers.27.mlp.down_proj.weight shape: [4096, 14336] +184: model.layers.27.mlp.gate_proj.weight shape: [14336, 4096] +185: model.layers.27.mlp.up_proj.weight shape: [14336, 4096] +186: model.layers.27.post_attention_layernorm.weight shape: [4096] +187: model.layers.27.self_attn.k_proj.weight shape: [1024, 4096] +188: model.layers.27.self_attn.o_proj.weight shape: [4096, 4096] +189: model.layers.27.self_attn.q_proj.weight shape: [4096, 4096] +190: model.layers.27.self_attn.v_proj.weight shape: [1024, 4096] +191: model.layers.28.input_layernorm.weight shape: [4096] +192: model.layers.28.mlp.down_proj.weight shape: [4096, 14336] +193: model.layers.28.mlp.gate_proj.weight shape: [14336, 4096] +194: model.layers.28.mlp.up_proj.weight shape: [14336, 4096] +195: model.layers.28.post_attention_layernorm.weight shape: [4096] +196: model.layers.28.self_attn.k_proj.weight shape: [1024, 4096] +197: model.layers.28.self_attn.o_proj.weight shape: [4096, 4096] +198: model.layers.28.self_attn.q_proj.weight shape: [4096, 4096] +199: model.layers.28.self_attn.v_proj.weight shape: [1024, 4096] +200: model.layers.29.input_layernorm.weight shape: [4096] +201: model.layers.29.mlp.down_proj.weight shape: [4096, 14336] +202: model.layers.29.mlp.gate_proj.weight shape: [14336, 4096] +203: 
model.layers.29.mlp.up_proj.weight shape: [14336, 4096] +204: model.layers.29.post_attention_layernorm.weight shape: [4096] +205: model.layers.29.self_attn.k_proj.weight shape: [1024, 4096] +206: model.layers.29.self_attn.o_proj.weight shape: [4096, 4096] +207: model.layers.29.self_attn.q_proj.weight shape: [4096, 4096] +208: model.layers.29.self_attn.v_proj.weight shape: [1024, 4096] +209: model.layers.3.input_layernorm.weight shape: [4096] +210: model.layers.3.mlp.down_proj.weight shape: [4096, 14336] +211: model.layers.3.mlp.gate_proj.weight shape: [14336, 4096] +212: model.layers.3.mlp.up_proj.weight shape: [14336, 4096] +213: model.layers.3.post_attention_layernorm.weight shape: [4096] +214: model.layers.3.self_attn.k_proj.weight shape: [1024, 4096] +215: model.layers.3.self_attn.o_proj.weight shape: [4096, 4096] +216: model.layers.3.self_attn.q_proj.weight shape: [4096, 4096] +217: model.layers.3.self_attn.v_proj.weight shape: [1024, 4096] +218: model.layers.30.input_layernorm.weight shape: [4096] +219: model.layers.30.mlp.down_proj.weight shape: [4096, 14336] +220: model.layers.30.mlp.gate_proj.weight shape: [14336, 4096] +221: model.layers.30.mlp.up_proj.weight shape: [14336, 4096] +222: model.layers.30.post_attention_layernorm.weight shape: [4096] +223: model.layers.30.self_attn.k_proj.weight shape: [1024, 4096] +224: model.layers.30.self_attn.o_proj.weight shape: [4096, 4096] +225: model.layers.30.self_attn.q_proj.weight shape: [4096, 4096] +226: model.layers.30.self_attn.v_proj.weight shape: [1024, 4096] +227: model.layers.31.input_layernorm.weight shape: [4096] +228: model.layers.31.mlp.down_proj.weight shape: [4096, 14336] +229: model.layers.31.mlp.gate_proj.weight shape: [14336, 4096] +230: model.layers.31.mlp.up_proj.weight shape: [14336, 4096] +231: model.layers.31.post_attention_layernorm.weight shape: [4096] +232: model.layers.31.self_attn.k_proj.weight shape: [1024, 4096] +233: model.layers.31.self_attn.o_proj.weight shape: [4096, 4096] +234: model.layers.31.self_attn.q_proj.weight shape: [4096, 4096] +235: model.layers.31.self_attn.v_proj.weight shape: [1024, 4096] +236: model.layers.4.input_layernorm.weight shape: [4096] +237: model.layers.4.mlp.down_proj.weight shape: [4096, 14336] +238: model.layers.4.mlp.gate_proj.weight shape: [14336, 4096] +239: model.layers.4.mlp.up_proj.weight shape: [14336, 4096] +240: model.layers.4.post_attention_layernorm.weight shape: [4096] +241: model.layers.4.self_attn.k_proj.weight shape: [1024, 4096] +242: model.layers.4.self_attn.o_proj.weight shape: [4096, 4096] +243: model.layers.4.self_attn.q_proj.weight shape: [4096, 4096] +244: model.layers.4.self_attn.v_proj.weight shape: [1024, 4096] +245: model.layers.5.input_layernorm.weight shape: [4096] +246: model.layers.5.mlp.down_proj.weight shape: [4096, 14336] +247: model.layers.5.mlp.gate_proj.weight shape: [14336, 4096] +248: model.layers.5.mlp.up_proj.weight shape: [14336, 4096] +249: model.layers.5.post_attention_layernorm.weight shape: [4096] +250: model.layers.5.self_attn.k_proj.weight shape: [1024, 4096] +251: model.layers.5.self_attn.o_proj.weight shape: [4096, 4096] +252: model.layers.5.self_attn.q_proj.weight shape: [4096, 4096] +253: model.layers.5.self_attn.v_proj.weight shape: [1024, 4096] +254: model.layers.6.input_layernorm.weight shape: [4096] +255: model.layers.6.mlp.down_proj.weight shape: [4096, 14336] +256: model.layers.6.mlp.gate_proj.weight shape: [14336, 4096] +257: model.layers.6.mlp.up_proj.weight shape: [14336, 4096] +258: 
model.layers.6.post_attention_layernorm.weight shape: [4096] +259: model.layers.6.self_attn.k_proj.weight shape: [1024, 4096] +260: model.layers.6.self_attn.o_proj.weight shape: [4096, 4096] +261: model.layers.6.self_attn.q_proj.weight shape: [4096, 4096] +262: model.layers.6.self_attn.v_proj.weight shape: [1024, 4096] +263: model.layers.7.input_layernorm.weight shape: [4096] +264: model.layers.7.mlp.down_proj.weight shape: [4096, 14336] +265: model.layers.7.mlp.gate_proj.weight shape: [14336, 4096] +266: model.layers.7.mlp.up_proj.weight shape: [14336, 4096] +267: model.layers.7.post_attention_layernorm.weight shape: [4096] +268: model.layers.7.self_attn.k_proj.weight shape: [1024, 4096] +269: model.layers.7.self_attn.o_proj.weight shape: [4096, 4096] +270: model.layers.7.self_attn.q_proj.weight shape: [4096, 4096] +271: model.layers.7.self_attn.v_proj.weight shape: [1024, 4096] +272: model.layers.8.input_layernorm.weight shape: [4096] +273: model.layers.8.mlp.down_proj.weight shape: [4096, 14336] +274: model.layers.8.mlp.gate_proj.weight shape: [14336, 4096] +275: model.layers.8.mlp.up_proj.weight shape: [14336, 4096] +276: model.layers.8.post_attention_layernorm.weight shape: [4096] +277: model.layers.8.self_attn.k_proj.weight shape: [1024, 4096] +278: model.layers.8.self_attn.o_proj.weight shape: [4096, 4096] +279: model.layers.8.self_attn.q_proj.weight shape: [4096, 4096] +280: model.layers.8.self_attn.v_proj.weight shape: [1024, 4096] +281: model.layers.9.input_layernorm.weight shape: [4096] +282: model.layers.9.mlp.down_proj.weight shape: [4096, 14336] +283: model.layers.9.mlp.gate_proj.weight shape: [14336, 4096] +284: model.layers.9.mlp.up_proj.weight shape: [14336, 4096] +285: model.layers.9.post_attention_layernorm.weight shape: [4096] +286: model.layers.9.self_attn.k_proj.weight shape: [1024, 4096] +287: model.layers.9.self_attn.o_proj.weight shape: [4096, 4096] +288: model.layers.9.self_attn.q_proj.weight shape: [4096, 4096] +289: model.layers.9.self_attn.v_proj.weight shape: [1024, 4096] +290: model.norm.weight shape: [4096] diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs new file mode 100644 index 0000000000..69d66e9df6 --- /dev/null +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -0,0 +1,42 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using ApprovalTests; +using ApprovalTests.Namers; +using ApprovalTests.Reporters; +using TorchSharp; +using Xunit; +using Microsoft.ML.GenAI.Core.Extension; + +namespace Microsoft.ML.GenAI.LLaMA.Tests; + +[Collection("NoParallelization")] +public class LLaMA3_1Tests +{ + public LLaMA3_1Tests() + { + if (Environment.GetEnvironmentVariable("HELIX_CORRELATION_ID") != null) + { + Approvals.UseAssemblyLocationForApprovedFiles(); + } + + torch.set_default_device("meta"); + } + + [Fact] + [UseReporter(typeof(DiffReporter))] + [UseApprovalSubdirectory("Approvals")] + public void Llama_3_1_8b_ShapeTest() + { + var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_8B_Instruct); + var stateDictStr = model.PeekShape(); + Approvals.Verify(stateDictStr); + } + +} diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Microsoft.ML.GenAI.LLaMA.Tests.csproj b/test/Microsoft.ML.GenAI.LLaMA.Tests/Microsoft.ML.GenAI.LLaMA.Tests.csproj new file mode 100644 index 0000000000..a810482d7e --- /dev/null +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/Microsoft.ML.GenAI.LLaMA.Tests.csproj @@ -0,0 +1,39 @@ + + + + net6.0 + enable + $(NoWarn);MSML_ExtendBaseTestClass + enable + true + + + + + + + + + + + + + + + + + + + + PreserveNewest + + + + + + + + + + + From 996548b154a06d8d9c52274c2136fcd4d987942d Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Tue, 13 Aug 2024 21:06:52 -0700 Subject: [PATCH 02/24] add test for tokenizer --- .../Llama3_1TokenizerHelper.cs | 51 +++ ...1Tests.Llama_3_1_8b_ShapeTest.received.txt | 291 ------------------ .../LLaMA3_1Tests.TokenizerTest.approved.txt | 8 + .../LLaMA3_1Tests.TokenizerTest.received.txt | 6 + .../LLaMA3_1Tests.cs | 33 ++ 5 files changed, 98 insertions(+), 291 deletions(-) create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs delete mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.received.txt create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.TokenizerTest.approved.txt create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.TokenizerTest.received.txt diff --git a/src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs b/src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs new file mode 100644 index 0000000000..1d509a1e30 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs @@ -0,0 +1,51 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using Microsoft.ML.Tokenizers; + +namespace Microsoft.ML.GenAI.LLaMA; + +#pragma warning disable MSML_GeneralName // This name should be PascalCased +public class Llama3_1TokenizerHelper +#pragma warning restore MSML_GeneralName // This name should be PascalCased +{ + /// <summary> + /// https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/blob/main/tokenizer.json#pre_tokenizer.pretokenizers.pattern + /// </summary> + private const string _re = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"; + + /// <summary> + /// https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/blob/main/tokenizer.json#added_tokens + /// </summary> + private static readonly Dictionary<string, int> _specialTokens = new() + { + { "<|begin_of_text|>", 128000 }, + { "<|end_of_text|>", 128001 }, + { "<|finetune_right_pad_id|>", 128004 }, + { "<|start_header_id|>", 128006 }, + { "<|end_header_id|>", 128007 }, + { "<|eom_id|>", 128008 }, + { "<|eot_id|>", 128009 }, + { "<|system|>", 32006 }, + { "<|user|>", 32010 }, + { "<|assistant|>", 32001 }, + { "<|end|>", 32007 } + }; + + /// <summary> + /// Create from tokenizer model file. + /// </summary> + /// <param name="modelPath">path to tokenizer model file</param> + public static TiktokenTokenizer FromPretrained(string modelPath) + { + var preTokenizer = new TiktokenPreTokenizer(new Regex(_re), _specialTokens); + return TiktokenTokenizer.Create(File.OpenRead(modelPath), preTokenizer, normalizer: null, specialTokens: _specialTokens); + } +} diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.received.txt b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.received.txt deleted file mode 100644 index 887b49cfa6..0000000000 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.received.txt +++ /dev/null @@ -1,291 +0,0 @@ -0: lm_head.weight shape: [128256, 4096] -1: model.embed_tokens.weight shape: [128256, 4096] -2: model.layers.0.input_layernorm.weight shape: [4096] -3: model.layers.0.mlp.down_proj.weight shape: [4096, 14336] -4: model.layers.0.mlp.gate_proj.weight shape: [14336, 4096] -5: model.layers.0.mlp.up_proj.weight shape: [14336, 4096] -6: model.layers.0.post_attention_layernorm.weight shape: [4096] -7: model.layers.0.self_attn.k_proj.weight shape: [1024, 4096] -8: model.layers.0.self_attn.o_proj.weight shape: [4096, 4096] -9: model.layers.0.self_attn.q_proj.weight shape: [4096, 4096] -10: model.layers.0.self_attn.v_proj.weight shape: [1024, 4096] -11: model.layers.1.input_layernorm.weight shape: [4096] -12: model.layers.1.mlp.down_proj.weight shape: [4096, 14336] -13: model.layers.1.mlp.gate_proj.weight shape: [14336, 4096] -14: model.layers.1.mlp.up_proj.weight shape: [14336, 4096] -15: model.layers.1.post_attention_layernorm.weight shape: [4096] -16: model.layers.1.self_attn.k_proj.weight shape: [1024, 4096] -17: model.layers.1.self_attn.o_proj.weight shape: [4096, 4096] -18: model.layers.1.self_attn.q_proj.weight shape: [4096, 4096] -19: model.layers.1.self_attn.v_proj.weight shape: [1024, 4096] -20: model.layers.10.input_layernorm.weight shape: [4096] -21: model.layers.10.mlp.down_proj.weight shape: [4096, 14336] -22: model.layers.10.mlp.gate_proj.weight shape: [14336, 4096] -23: model.layers.10.mlp.up_proj.weight shape: [14336, 4096] -24: model.layers.10.post_attention_layernorm.weight shape: [4096]
-25: model.layers.10.self_attn.k_proj.weight shape: [1024, 4096] -26: model.layers.10.self_attn.o_proj.weight shape: [4096, 4096] -27: model.layers.10.self_attn.q_proj.weight shape: [4096, 4096] -28: model.layers.10.self_attn.v_proj.weight shape: [1024, 4096] -29: model.layers.11.input_layernorm.weight shape: [4096] -30: model.layers.11.mlp.down_proj.weight shape: [4096, 14336] -31: model.layers.11.mlp.gate_proj.weight shape: [14336, 4096] -32: model.layers.11.mlp.up_proj.weight shape: [14336, 4096] -33: model.layers.11.post_attention_layernorm.weight shape: [4096] -34: model.layers.11.self_attn.k_proj.weight shape: [1024, 4096] -35: model.layers.11.self_attn.o_proj.weight shape: [4096, 4096] -36: model.layers.11.self_attn.q_proj.weight shape: [4096, 4096] -37: model.layers.11.self_attn.v_proj.weight shape: [1024, 4096] -38: model.layers.12.input_layernorm.weight shape: [4096] -39: model.layers.12.mlp.down_proj.weight shape: [4096, 14336] -40: model.layers.12.mlp.gate_proj.weight shape: [14336, 4096] -41: model.layers.12.mlp.up_proj.weight shape: [14336, 4096] -42: model.layers.12.post_attention_layernorm.weight shape: [4096] -43: model.layers.12.self_attn.k_proj.weight shape: [1024, 4096] -44: model.layers.12.self_attn.o_proj.weight shape: [4096, 4096] -45: model.layers.12.self_attn.q_proj.weight shape: [4096, 4096] -46: model.layers.12.self_attn.v_proj.weight shape: [1024, 4096] -47: model.layers.13.input_layernorm.weight shape: [4096] -48: model.layers.13.mlp.down_proj.weight shape: [4096, 14336] -49: model.layers.13.mlp.gate_proj.weight shape: [14336, 4096] -50: model.layers.13.mlp.up_proj.weight shape: [14336, 4096] -51: model.layers.13.post_attention_layernorm.weight shape: [4096] -52: model.layers.13.self_attn.k_proj.weight shape: [1024, 4096] -53: model.layers.13.self_attn.o_proj.weight shape: [4096, 4096] -54: model.layers.13.self_attn.q_proj.weight shape: [4096, 4096] -55: model.layers.13.self_attn.v_proj.weight shape: [1024, 4096] -56: model.layers.14.input_layernorm.weight shape: [4096] -57: model.layers.14.mlp.down_proj.weight shape: [4096, 14336] -58: model.layers.14.mlp.gate_proj.weight shape: [14336, 4096] -59: model.layers.14.mlp.up_proj.weight shape: [14336, 4096] -60: model.layers.14.post_attention_layernorm.weight shape: [4096] -61: model.layers.14.self_attn.k_proj.weight shape: [1024, 4096] -62: model.layers.14.self_attn.o_proj.weight shape: [4096, 4096] -63: model.layers.14.self_attn.q_proj.weight shape: [4096, 4096] -64: model.layers.14.self_attn.v_proj.weight shape: [1024, 4096] -65: model.layers.15.input_layernorm.weight shape: [4096] -66: model.layers.15.mlp.down_proj.weight shape: [4096, 14336] -67: model.layers.15.mlp.gate_proj.weight shape: [14336, 4096] -68: model.layers.15.mlp.up_proj.weight shape: [14336, 4096] -69: model.layers.15.post_attention_layernorm.weight shape: [4096] -70: model.layers.15.self_attn.k_proj.weight shape: [1024, 4096] -71: model.layers.15.self_attn.o_proj.weight shape: [4096, 4096] -72: model.layers.15.self_attn.q_proj.weight shape: [4096, 4096] -73: model.layers.15.self_attn.v_proj.weight shape: [1024, 4096] -74: model.layers.16.input_layernorm.weight shape: [4096] -75: model.layers.16.mlp.down_proj.weight shape: [4096, 14336] -76: model.layers.16.mlp.gate_proj.weight shape: [14336, 4096] -77: model.layers.16.mlp.up_proj.weight shape: [14336, 4096] -78: model.layers.16.post_attention_layernorm.weight shape: [4096] -79: model.layers.16.self_attn.k_proj.weight shape: [1024, 4096] -80: model.layers.16.self_attn.o_proj.weight shape: 
[4096, 4096] -81: model.layers.16.self_attn.q_proj.weight shape: [4096, 4096] -82: model.layers.16.self_attn.v_proj.weight shape: [1024, 4096] -83: model.layers.17.input_layernorm.weight shape: [4096] -84: model.layers.17.mlp.down_proj.weight shape: [4096, 14336] -85: model.layers.17.mlp.gate_proj.weight shape: [14336, 4096] -86: model.layers.17.mlp.up_proj.weight shape: [14336, 4096] -87: model.layers.17.post_attention_layernorm.weight shape: [4096] -88: model.layers.17.self_attn.k_proj.weight shape: [1024, 4096] -89: model.layers.17.self_attn.o_proj.weight shape: [4096, 4096] -90: model.layers.17.self_attn.q_proj.weight shape: [4096, 4096] -91: model.layers.17.self_attn.v_proj.weight shape: [1024, 4096] -92: model.layers.18.input_layernorm.weight shape: [4096] -93: model.layers.18.mlp.down_proj.weight shape: [4096, 14336] -94: model.layers.18.mlp.gate_proj.weight shape: [14336, 4096] -95: model.layers.18.mlp.up_proj.weight shape: [14336, 4096] -96: model.layers.18.post_attention_layernorm.weight shape: [4096] -97: model.layers.18.self_attn.k_proj.weight shape: [1024, 4096] -98: model.layers.18.self_attn.o_proj.weight shape: [4096, 4096] -99: model.layers.18.self_attn.q_proj.weight shape: [4096, 4096] -100: model.layers.18.self_attn.v_proj.weight shape: [1024, 4096] -101: model.layers.19.input_layernorm.weight shape: [4096] -102: model.layers.19.mlp.down_proj.weight shape: [4096, 14336] -103: model.layers.19.mlp.gate_proj.weight shape: [14336, 4096] -104: model.layers.19.mlp.up_proj.weight shape: [14336, 4096] -105: model.layers.19.post_attention_layernorm.weight shape: [4096] -106: model.layers.19.self_attn.k_proj.weight shape: [1024, 4096] -107: model.layers.19.self_attn.o_proj.weight shape: [4096, 4096] -108: model.layers.19.self_attn.q_proj.weight shape: [4096, 4096] -109: model.layers.19.self_attn.v_proj.weight shape: [1024, 4096] -110: model.layers.2.input_layernorm.weight shape: [4096] -111: model.layers.2.mlp.down_proj.weight shape: [4096, 14336] -112: model.layers.2.mlp.gate_proj.weight shape: [14336, 4096] -113: model.layers.2.mlp.up_proj.weight shape: [14336, 4096] -114: model.layers.2.post_attention_layernorm.weight shape: [4096] -115: model.layers.2.self_attn.k_proj.weight shape: [1024, 4096] -116: model.layers.2.self_attn.o_proj.weight shape: [4096, 4096] -117: model.layers.2.self_attn.q_proj.weight shape: [4096, 4096] -118: model.layers.2.self_attn.v_proj.weight shape: [1024, 4096] -119: model.layers.20.input_layernorm.weight shape: [4096] -120: model.layers.20.mlp.down_proj.weight shape: [4096, 14336] -121: model.layers.20.mlp.gate_proj.weight shape: [14336, 4096] -122: model.layers.20.mlp.up_proj.weight shape: [14336, 4096] -123: model.layers.20.post_attention_layernorm.weight shape: [4096] -124: model.layers.20.self_attn.k_proj.weight shape: [1024, 4096] -125: model.layers.20.self_attn.o_proj.weight shape: [4096, 4096] -126: model.layers.20.self_attn.q_proj.weight shape: [4096, 4096] -127: model.layers.20.self_attn.v_proj.weight shape: [1024, 4096] -128: model.layers.21.input_layernorm.weight shape: [4096] -129: model.layers.21.mlp.down_proj.weight shape: [4096, 14336] -130: model.layers.21.mlp.gate_proj.weight shape: [14336, 4096] -131: model.layers.21.mlp.up_proj.weight shape: [14336, 4096] -132: model.layers.21.post_attention_layernorm.weight shape: [4096] -133: model.layers.21.self_attn.k_proj.weight shape: [1024, 4096] -134: model.layers.21.self_attn.o_proj.weight shape: [4096, 4096] -135: model.layers.21.self_attn.q_proj.weight shape: [4096, 4096] -136: 
model.layers.21.self_attn.v_proj.weight shape: [1024, 4096] -137: model.layers.22.input_layernorm.weight shape: [4096] -138: model.layers.22.mlp.down_proj.weight shape: [4096, 14336] -139: model.layers.22.mlp.gate_proj.weight shape: [14336, 4096] -140: model.layers.22.mlp.up_proj.weight shape: [14336, 4096] -141: model.layers.22.post_attention_layernorm.weight shape: [4096] -142: model.layers.22.self_attn.k_proj.weight shape: [1024, 4096] -143: model.layers.22.self_attn.o_proj.weight shape: [4096, 4096] -144: model.layers.22.self_attn.q_proj.weight shape: [4096, 4096] -145: model.layers.22.self_attn.v_proj.weight shape: [1024, 4096] -146: model.layers.23.input_layernorm.weight shape: [4096] -147: model.layers.23.mlp.down_proj.weight shape: [4096, 14336] -148: model.layers.23.mlp.gate_proj.weight shape: [14336, 4096] -149: model.layers.23.mlp.up_proj.weight shape: [14336, 4096] -150: model.layers.23.post_attention_layernorm.weight shape: [4096] -151: model.layers.23.self_attn.k_proj.weight shape: [1024, 4096] -152: model.layers.23.self_attn.o_proj.weight shape: [4096, 4096] -153: model.layers.23.self_attn.q_proj.weight shape: [4096, 4096] -154: model.layers.23.self_attn.v_proj.weight shape: [1024, 4096] -155: model.layers.24.input_layernorm.weight shape: [4096] -156: model.layers.24.mlp.down_proj.weight shape: [4096, 14336] -157: model.layers.24.mlp.gate_proj.weight shape: [14336, 4096] -158: model.layers.24.mlp.up_proj.weight shape: [14336, 4096] -159: model.layers.24.post_attention_layernorm.weight shape: [4096] -160: model.layers.24.self_attn.k_proj.weight shape: [1024, 4096] -161: model.layers.24.self_attn.o_proj.weight shape: [4096, 4096] -162: model.layers.24.self_attn.q_proj.weight shape: [4096, 4096] -163: model.layers.24.self_attn.v_proj.weight shape: [1024, 4096] -164: model.layers.25.input_layernorm.weight shape: [4096] -165: model.layers.25.mlp.down_proj.weight shape: [4096, 14336] -166: model.layers.25.mlp.gate_proj.weight shape: [14336, 4096] -167: model.layers.25.mlp.up_proj.weight shape: [14336, 4096] -168: model.layers.25.post_attention_layernorm.weight shape: [4096] -169: model.layers.25.self_attn.k_proj.weight shape: [1024, 4096] -170: model.layers.25.self_attn.o_proj.weight shape: [4096, 4096] -171: model.layers.25.self_attn.q_proj.weight shape: [4096, 4096] -172: model.layers.25.self_attn.v_proj.weight shape: [1024, 4096] -173: model.layers.26.input_layernorm.weight shape: [4096] -174: model.layers.26.mlp.down_proj.weight shape: [4096, 14336] -175: model.layers.26.mlp.gate_proj.weight shape: [14336, 4096] -176: model.layers.26.mlp.up_proj.weight shape: [14336, 4096] -177: model.layers.26.post_attention_layernorm.weight shape: [4096] -178: model.layers.26.self_attn.k_proj.weight shape: [1024, 4096] -179: model.layers.26.self_attn.o_proj.weight shape: [4096, 4096] -180: model.layers.26.self_attn.q_proj.weight shape: [4096, 4096] -181: model.layers.26.self_attn.v_proj.weight shape: [1024, 4096] -182: model.layers.27.input_layernorm.weight shape: [4096] -183: model.layers.27.mlp.down_proj.weight shape: [4096, 14336] -184: model.layers.27.mlp.gate_proj.weight shape: [14336, 4096] -185: model.layers.27.mlp.up_proj.weight shape: [14336, 4096] -186: model.layers.27.post_attention_layernorm.weight shape: [4096] -187: model.layers.27.self_attn.k_proj.weight shape: [1024, 4096] -188: model.layers.27.self_attn.o_proj.weight shape: [4096, 4096] -189: model.layers.27.self_attn.q_proj.weight shape: [4096, 4096] -190: model.layers.27.self_attn.v_proj.weight shape: [1024, 4096] -191: 
model.layers.28.input_layernorm.weight shape: [4096] -192: model.layers.28.mlp.down_proj.weight shape: [4096, 14336] -193: model.layers.28.mlp.gate_proj.weight shape: [14336, 4096] -194: model.layers.28.mlp.up_proj.weight shape: [14336, 4096] -195: model.layers.28.post_attention_layernorm.weight shape: [4096] -196: model.layers.28.self_attn.k_proj.weight shape: [1024, 4096] -197: model.layers.28.self_attn.o_proj.weight shape: [4096, 4096] -198: model.layers.28.self_attn.q_proj.weight shape: [4096, 4096] -199: model.layers.28.self_attn.v_proj.weight shape: [1024, 4096] -200: model.layers.29.input_layernorm.weight shape: [4096] -201: model.layers.29.mlp.down_proj.weight shape: [4096, 14336] -202: model.layers.29.mlp.gate_proj.weight shape: [14336, 4096] -203: model.layers.29.mlp.up_proj.weight shape: [14336, 4096] -204: model.layers.29.post_attention_layernorm.weight shape: [4096] -205: model.layers.29.self_attn.k_proj.weight shape: [1024, 4096] -206: model.layers.29.self_attn.o_proj.weight shape: [4096, 4096] -207: model.layers.29.self_attn.q_proj.weight shape: [4096, 4096] -208: model.layers.29.self_attn.v_proj.weight shape: [1024, 4096] -209: model.layers.3.input_layernorm.weight shape: [4096] -210: model.layers.3.mlp.down_proj.weight shape: [4096, 14336] -211: model.layers.3.mlp.gate_proj.weight shape: [14336, 4096] -212: model.layers.3.mlp.up_proj.weight shape: [14336, 4096] -213: model.layers.3.post_attention_layernorm.weight shape: [4096] -214: model.layers.3.self_attn.k_proj.weight shape: [1024, 4096] -215: model.layers.3.self_attn.o_proj.weight shape: [4096, 4096] -216: model.layers.3.self_attn.q_proj.weight shape: [4096, 4096] -217: model.layers.3.self_attn.v_proj.weight shape: [1024, 4096] -218: model.layers.30.input_layernorm.weight shape: [4096] -219: model.layers.30.mlp.down_proj.weight shape: [4096, 14336] -220: model.layers.30.mlp.gate_proj.weight shape: [14336, 4096] -221: model.layers.30.mlp.up_proj.weight shape: [14336, 4096] -222: model.layers.30.post_attention_layernorm.weight shape: [4096] -223: model.layers.30.self_attn.k_proj.weight shape: [1024, 4096] -224: model.layers.30.self_attn.o_proj.weight shape: [4096, 4096] -225: model.layers.30.self_attn.q_proj.weight shape: [4096, 4096] -226: model.layers.30.self_attn.v_proj.weight shape: [1024, 4096] -227: model.layers.31.input_layernorm.weight shape: [4096] -228: model.layers.31.mlp.down_proj.weight shape: [4096, 14336] -229: model.layers.31.mlp.gate_proj.weight shape: [14336, 4096] -230: model.layers.31.mlp.up_proj.weight shape: [14336, 4096] -231: model.layers.31.post_attention_layernorm.weight shape: [4096] -232: model.layers.31.self_attn.k_proj.weight shape: [1024, 4096] -233: model.layers.31.self_attn.o_proj.weight shape: [4096, 4096] -234: model.layers.31.self_attn.q_proj.weight shape: [4096, 4096] -235: model.layers.31.self_attn.v_proj.weight shape: [1024, 4096] -236: model.layers.4.input_layernorm.weight shape: [4096] -237: model.layers.4.mlp.down_proj.weight shape: [4096, 14336] -238: model.layers.4.mlp.gate_proj.weight shape: [14336, 4096] -239: model.layers.4.mlp.up_proj.weight shape: [14336, 4096] -240: model.layers.4.post_attention_layernorm.weight shape: [4096] -241: model.layers.4.self_attn.k_proj.weight shape: [1024, 4096] -242: model.layers.4.self_attn.o_proj.weight shape: [4096, 4096] -243: model.layers.4.self_attn.q_proj.weight shape: [4096, 4096] -244: model.layers.4.self_attn.v_proj.weight shape: [1024, 4096] -245: model.layers.5.input_layernorm.weight shape: [4096] -246: 
model.layers.5.mlp.down_proj.weight shape: [4096, 14336] -247: model.layers.5.mlp.gate_proj.weight shape: [14336, 4096] -248: model.layers.5.mlp.up_proj.weight shape: [14336, 4096] -249: model.layers.5.post_attention_layernorm.weight shape: [4096] -250: model.layers.5.self_attn.k_proj.weight shape: [1024, 4096] -251: model.layers.5.self_attn.o_proj.weight shape: [4096, 4096] -252: model.layers.5.self_attn.q_proj.weight shape: [4096, 4096] -253: model.layers.5.self_attn.v_proj.weight shape: [1024, 4096] -254: model.layers.6.input_layernorm.weight shape: [4096] -255: model.layers.6.mlp.down_proj.weight shape: [4096, 14336] -256: model.layers.6.mlp.gate_proj.weight shape: [14336, 4096] -257: model.layers.6.mlp.up_proj.weight shape: [14336, 4096] -258: model.layers.6.post_attention_layernorm.weight shape: [4096] -259: model.layers.6.self_attn.k_proj.weight shape: [1024, 4096] -260: model.layers.6.self_attn.o_proj.weight shape: [4096, 4096] -261: model.layers.6.self_attn.q_proj.weight shape: [4096, 4096] -262: model.layers.6.self_attn.v_proj.weight shape: [1024, 4096] -263: model.layers.7.input_layernorm.weight shape: [4096] -264: model.layers.7.mlp.down_proj.weight shape: [4096, 14336] -265: model.layers.7.mlp.gate_proj.weight shape: [14336, 4096] -266: model.layers.7.mlp.up_proj.weight shape: [14336, 4096] -267: model.layers.7.post_attention_layernorm.weight shape: [4096] -268: model.layers.7.self_attn.k_proj.weight shape: [1024, 4096] -269: model.layers.7.self_attn.o_proj.weight shape: [4096, 4096] -270: model.layers.7.self_attn.q_proj.weight shape: [4096, 4096] -271: model.layers.7.self_attn.v_proj.weight shape: [1024, 4096] -272: model.layers.8.input_layernorm.weight shape: [4096] -273: model.layers.8.mlp.down_proj.weight shape: [4096, 14336] -274: model.layers.8.mlp.gate_proj.weight shape: [14336, 4096] -275: model.layers.8.mlp.up_proj.weight shape: [14336, 4096] -276: model.layers.8.post_attention_layernorm.weight shape: [4096] -277: model.layers.8.self_attn.k_proj.weight shape: [1024, 4096] -278: model.layers.8.self_attn.o_proj.weight shape: [4096, 4096] -279: model.layers.8.self_attn.q_proj.weight shape: [4096, 4096] -280: model.layers.8.self_attn.v_proj.weight shape: [1024, 4096] -281: model.layers.9.input_layernorm.weight shape: [4096] -282: model.layers.9.mlp.down_proj.weight shape: [4096, 14336] -283: model.layers.9.mlp.gate_proj.weight shape: [14336, 4096] -284: model.layers.9.mlp.up_proj.weight shape: [14336, 4096] -285: model.layers.9.post_attention_layernorm.weight shape: [4096] -286: model.layers.9.self_attn.k_proj.weight shape: [1024, 4096] -287: model.layers.9.self_attn.o_proj.weight shape: [4096, 4096] -288: model.layers.9.self_attn.q_proj.weight shape: [4096, 4096] -289: model.layers.9.self_attn.v_proj.weight shape: [1024, 4096] -290: model.norm.weight shape: [4096] diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.TokenizerTest.approved.txt b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.TokenizerTest.approved.txt new file mode 100644 index 0000000000..fc0568084b --- /dev/null +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.TokenizerTest.approved.txt @@ -0,0 +1,8 @@ +Can you provide ways to eat combinations of bananas and dragonfruits? +6854, 499, 3493, 5627, 311, 8343, 28559, 315, 68442, 323, 26161, 1658, 12059, 30 +Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. 
Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey. +40914, 0, 5810, 527, 1063, 5627, 311, 8343, 68442, 323, 26161, 1658, 12059, 3871, 25, 220, 16, 13, 76924, 323, 26161, 36698, 11113, 648, 25, 55248, 68442, 323, 26161, 1658, 12059, 3871, 449, 1063, 14403, 323, 26828, 13, 220, 17, 13, 76924, 323, 26161, 36698, 33566, 25, 19771, 48715, 68442, 323, 26161, 1658, 12059, 3871, 449, 1063, 30564, 23661, 323, 26828, 13 +What about solving an 2x + 3 = 7 equation? +3923, 922, 22581, 459, 220, 17, 87, 489, 220, 18, 284, 220, 22, 24524, 30 +<|begin_of_text|>Hello World<|end_of_text|> +128000, 9906, 4435, 128001 diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.TokenizerTest.received.txt b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.TokenizerTest.received.txt new file mode 100644 index 0000000000..9bb3220214 --- /dev/null +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.TokenizerTest.received.txt @@ -0,0 +1,6 @@ +Can you provide ways to eat combinations of bananas and dragonfruits? +6854, 499, 3493, 5627, 311, 8343, 28559, 315, 68442, 323, 26161, 1658, 12059, 30 +Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey. +40914, 0, 5810, 527, 1063, 5627, 311, 8343, 68442, 323, 26161, 1658, 12059, 3871, 25, 220, 16, 13, 76924, 323, 26161, 36698, 11113, 648, 25, 55248, 68442, 323, 26161, 1658, 12059, 3871, 449, 1063, 14403, 323, 26828, 13, 220, 17, 13, 76924, 323, 26161, 36698, 33566, 25, 19771, 48715, 68442, 323, 26161, 1658, 12059, 3871, 449, 1063, 30564, 23661, 323, 26828, 13 +What about solving an 2x + 3 = 7 equation? +3923, 922, 22581, 459, 220, 17, 87, 489, 220, 18, 284, 220, 22, 24524, 30 diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs index 69d66e9df6..9028b8933c 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -13,6 +13,9 @@ using TorchSharp; using Xunit; using Microsoft.ML.GenAI.Core.Extension; +using Microsoft.ML.Tokenizers; +using FluentAssertions; +using System.Text.RegularExpressions; namespace Microsoft.ML.GenAI.LLaMA.Tests; @@ -39,4 +42,34 @@ public void Llama_3_1_8b_ShapeTest() Approvals.Verify(stateDictStr); } + [Fact] + [UseReporter(typeof(DiffReporter))] + [UseApprovalSubdirectory("Approvals")] + public void TokenizerTest() + { + var modelWeightFolder = Path.Join("C:\\Users\\xiaoyuz\\source\\repos\\Meta-Llama-3.1-8B-Instruct\\original"); + var tokenizer = Llama3_1TokenizerHelper.FromPretrained(Path.Join(modelWeightFolder, "tokenizer.model")); + + var messages = new string[] + { + "Can you provide ways to eat combinations of bananas and dragonfruits?", + "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. 
Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey.", + "What about solving an 2x + 3 = 7 equation?", + """ + <|begin_of_text|>Hello World<|end_of_text|> + """ + }; + + var sb = new StringBuilder(); + foreach (var message in messages) + { + var tokenizeIds = tokenizer.EncodeToIds(message, true, false); + var decodeToString = tokenizer.Decode(tokenizeIds); + sb.AppendLine(decodeToString); + var tokenizedStr = string.Join(", ", tokenizeIds.Select(x => x.ToString())); + + sb.AppendLine(tokenizedStr); + } + Approvals.Verify(sb.ToString()); + } } From 023c9f7e629c62412c417a10d42a1bfa2943345f Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Wed, 14 Aug 2024 15:53:24 -0700 Subject: [PATCH 03/24] make llama 3.1 working --- .../Microsoft.ML.GenAI.Samples/Llama/test.cs | 56 +++++++++++++++++++ .../Microsoft.ML.GenAI.Samples.csproj | 1 + .../Microsoft.ML.GenAI.Samples/Program.cs | 4 +- .../Module/Attention.cs | 3 +- src/Microsoft.ML.GenAI.Core/Utils.cs | 4 +- .../Llama3_1TokenizerHelper.cs | 10 +++- .../LlamaForCausalLM.cs | 27 +++++++++ .../Module/LlamaDecoderLayer.cs | 15 +++-- .../Module/LlamaMLP.cs | 9 ++- .../Module/LlamaModel.cs | 3 +- .../Config/meta-llama-3.1-8B-Instruct.json | 1 - .../Module/Phi3DecoderLayer.cs | 1 + .../Module/Phi3Model.cs | 5 +- 13 files changed, 118 insertions(+), 21 deletions(-) create mode 100644 docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs new file mode 100644 index 0000000000..804a7e3b77 --- /dev/null +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs @@ -0,0 +1,56 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.LLaMA; +using Microsoft.ML.Tokenizers; +using TorchSharp; +using static TorchSharp.torch; + +namespace Microsoft.ML.GenAI.Samples.Llama; + +internal class LlamaSample +{ + public static void Run() + { + var device = "cuda"; + if (device == "cuda") + { + torch.InitializeDeviceType(DeviceType.CUDA); + } + + var defaultType = ScalarType.Float16; + torch.manual_seed(1); + torch.set_default_dtype(defaultType); + var weightFolder = @"C:\Users\xiaoyuz\source\repos\Meta-Llama-3.1-8B-Instruct"; + var originalWeightFolder = Path.Combine(weightFolder, "original"); + + Console.WriteLine("Loading Llama from huggingface model weight folder"); + var stopWatch = System.Diagnostics.Stopwatch.StartNew(); + stopWatch.Start(); + var tokenizer = Llama3_1TokenizerHelper.FromPretrained(originalWeightFolder); + var model = LlamaForCausalLM.FromPretrained(weightFolder, device: device); + stopWatch.Stop(); + + Console.WriteLine($"Loading time: {stopWatch.ElapsedMilliseconds} ms"); + + var pipeline = new CausalLMPipeline(tokenizer, model, device); + + var prompt = """ + <|begin_of_text|> + <|start_header_id|>system<|end_header_id|> + You are a pirate chatbot who always responds in pirate speak!<|eot_id|> + <|start_header_id|>user<|end_header_id|> + Who are you?<|eot_id|> + + <|start_header_id|>assistant<|end_header_id|> + """; + + foreach (var word in pipeline.GenerateStreaming(prompt, stopSequences: ["<|eot_id|>"])) + { + Console.Write(word); + } + } +} diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Microsoft.ML.GenAI.Samples.csproj b/docs/samples/Microsoft.ML.GenAI.Samples/Microsoft.ML.GenAI.Samples.csproj index 0331a32fc1..d9932106d6 100644 --- 
a/docs/samples/Microsoft.ML.GenAI.Samples/Microsoft.ML.GenAI.Samples.csproj +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Microsoft.ML.GenAI.Samples.csproj @@ -9,6 +9,7 @@ + diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs index 1560bad306..100748ca7b 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs @@ -1,4 +1,6 @@ // See https://aka.ms/new-console-template for more information +using Microsoft.ML.GenAI.Samples.Llama; using Microsoft.ML.GenAI.Samples.Phi3Mini; -await SemanticKernelSample.RunChatCompletionSample(); +LlamaSample.Run(); +//await AutoGenSample.RunAsync(); diff --git a/src/Microsoft.ML.GenAI.Core/Module/Attention.cs b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs index d6938b27f9..e059af949d 100644 --- a/src/Microsoft.ML.GenAI.Core/Module/Attention.cs +++ b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs @@ -9,6 +9,7 @@ using System.Text; using System.Threading.Tasks; using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Core.Extension; using TorchSharp; using TorchSharp.Modules; using static TorchSharp.torch; @@ -162,7 +163,6 @@ public override AttentionOutput forward(AttentionInput input) queryStates = queryStates.view(bsz, qLen, this._numHeads, this._headDim).transpose(1, 2); keyStates = keyStates.view(bsz, qLen, this._numKeyValueHeads, this._headDim).transpose(1, 2); valueStates = valueStates.view(bsz, qLen, this._numKeyValueHeads, this._headDim).transpose(1, 2); - var kvSeqLen = keyStates.IntShape()[^2]; var pastKeyValue = input.Cache; if (pastKeyValue is not null) @@ -184,6 +184,7 @@ public override AttentionOutput forward(AttentionInput input) keyStates = Utils.RepeatKV(keyStates, this._numKeyValueGroups); valueStates = Utils.RepeatKV(valueStates, this._numKeyValueGroups); + // to fp32 to avoid overflow var attnWeights = torch.matmul(queryStates, keyStates.transpose(2, 3)); attnWeights = attnWeights / Math.Sqrt(this._headDim); diff --git a/src/Microsoft.ML.GenAI.Core/Utils.cs b/src/Microsoft.ML.GenAI.Core/Utils.cs index 161b8d5185..e4e1078d2e 100644 --- a/src/Microsoft.ML.GenAI.Core/Utils.cs +++ b/src/Microsoft.ML.GenAI.Core/Utils.cs @@ -156,9 +156,9 @@ public static Tensor RepeatKV(Tensor x, int nRep) return x; } - return x.unsqueeze(3) + return x.unsqueeze(2) .expand(batchSize, nKVHeads, nRep, seqLen, headDim) - .view(batchSize, nKVHeads * nRep, seqLen, headDim); + .reshape(batchSize, nKVHeads * nRep, seqLen, headDim); } } diff --git a/src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs b/src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs index 1d509a1e30..74a61b2cee 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs @@ -42,10 +42,14 @@ public class Llama3_1TokenizerHelper /// /// Create from tokenizer model file. 
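+    /// (Illustrative note: the file is expected to contain TikToken BPE data; it is read and combined with the pre-tokenizer regex and special-token table declared above in this class.)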
/// - /// path to tokenizer model file - public static TiktokenTokenizer FromPretrained(string modelPath) + /// path to tokenizer model folder + /// tokenizer model file name + public static TiktokenTokenizer FromPretrained( + string modelWeightFolder, + string modelFile = "tokenizer.model") { + var modelFilePath = Path.Join(modelWeightFolder, modelFile); var preTokenizer = new TiktokenPreTokenizer(new Regex(_re), _specialTokens); - return TiktokenTokenizer.Create(File.OpenRead(modelPath), preTokenizer, normalizer: null, specialTokens: _specialTokens); + return TiktokenTokenizer.Create(File.OpenRead(modelFilePath), preTokenizer, normalizer: null, specialTokens: _specialTokens); } } diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs index 6b38d15ebd..3fadd9aeb4 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs @@ -2,8 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Diagnostics; +using System.Text.Json; using Microsoft.ML.GenAI.Core; using Microsoft.ML.GenAI.LLaMA.Module; +using TorchSharp; +using TorchSharp.PyBridge; using static TorchSharp.torch; namespace Microsoft.ML.GenAI.LLaMA; @@ -41,4 +45,27 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) return outputs; } + + public static LlamaForCausalLM FromPretrained( + string modelFolder, + string configName = "config.json", + string checkPointName = "model.safetensors.index.json", + ScalarType torchDtype = ScalarType.BFloat16, + string device = "cpu") + { + var config = Path.Join(modelFolder, configName); + var modelConfig = JsonSerializer.Deserialize(File.ReadAllText(config)) ?? 
throw new ArgumentNullException(nameof(config)); + modelConfig.DType = torchDtype; + var model = new LlamaForCausalLM(modelConfig); + + model.LoadSafeTensors(modelFolder, checkPointName); + model = model.to(device); + + return model; + } + + public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json") + { + this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: true, useTqdm: false, loadedParameters: loadedParameters); + } } diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs index 57f141978a..bedd255bad 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs @@ -8,11 +8,11 @@ using System.Text; using System.Threading.Tasks; using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Core.Extension; using static TorchSharp.torch; namespace Microsoft.ML.GenAI.LLaMA.Module; - internal class DecoderLayerInput { public DecoderLayerInput( @@ -87,8 +87,8 @@ public LlamaDecoderLayer(LlamaConfig config, int layerIndex) this.self_attn = CreateAttention(config, layerIndex); this.mlp = new LlamaMLP(config); - this.input_layernorm = new Core.RMSNorm(this._hiddenSize, eps: config.RmsNormEps); - this.post_attention_layernorm = new Core.RMSNorm(this._hiddenSize, eps: config.RmsNormEps); + this.input_layernorm = new Core.RMSNorm(this._hiddenSize, eps: config.RmsNormEps, config.DType); + this.post_attention_layernorm = new Core.RMSNorm(this._hiddenSize, eps: config.RmsNormEps, config.DType); } private Attention CreateAttention(LlamaConfig config, int layerIndex) @@ -144,9 +144,14 @@ public override DecoderLayerOutput forward(DecoderLayerInput input) hiddenStates = this.mlp.forward(hiddenStates); hiddenStates = residual + hiddenStates; + if (UnloadFromDeviceFunc != null) + { + UnloadFromDeviceFunc(this); + } + return new DecoderLayerOutput( - hiddenStates: hiddenStates, - attentions: input.OutputAttentions ? selfAttnOutput.Attentions : null, + hiddenStates: hiddenStates.MoveToOuterDisposeScope(), + attentions: input.OutputAttentions ? 
selfAttnOutput.Attentions?.MoveToOuterDisposeScope() : null, pastKeyValue: selfAttnOutput.Cache); } } diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaMLP.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaMLP.cs index 09052b5602..cbc841f144 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaMLP.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaMLP.cs @@ -53,10 +53,9 @@ public override Tensor forward(Tensor input) throw new NotImplementedException("PretrainingTp > 1 is not supported yet."); } - using var disposeScope = NewDisposeScope(); - var input1 = this.gate_proj.forward(input); - input1 = this.activation_fn.forward(input1); - input1 = this.up_proj.forward(input1); - return this.down_proj.forward(input1).MoveToOuterDisposeScope(); + using var input1 = this.gate_proj.forward(input); + using var input2 = this.activation_fn.forward(input1); + using var input3 = input2 * this.up_proj.forward(input); + return this.down_proj.forward(input3); } } diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs index b15dcde532..0f271edfd8 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs @@ -8,6 +8,7 @@ using System.Text; using System.Threading.Tasks; using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Core.Extension; using TorchSharp; using TorchSharp.Modules; using static TorchSharp.torch; @@ -103,7 +104,7 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) } else { - attentionMask = AttentionMaskConverter.Create4DCausalAttentionMask(attentionMask, [batchSize, seqLength], inputsEmbeds.dtype, device, pastKeyValuesLength); + attentionMask = AttentionMaskConverter.Create4DCausalAttentionMask(attentionMask, [batchSize, seqLength], inputsEmbeds.dtype, device, pastKeyValuesLength, 2048); } var hiddenStates = inputsEmbeds; diff --git a/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json index 0bb6fd75b3..4eaf7c29ef 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json +++ b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json @@ -32,7 +32,6 @@ "rope_theta": 500000.0, "tie_word_embeddings": false, "torch_dtype": "bfloat16", - "transformers_version": "4.42.3", "use_cache": true, "vocab_size": 128256 } diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs index bada15bbfd..b42b6a81fe 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs @@ -8,6 +8,7 @@ using System.Text; using System.Threading.Tasks; using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Core.Extension; using TorchSharp.Modules; using static TorchSharp.torch; diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs index 839f9c7cc1..03114d93fb 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs @@ -5,6 +5,7 @@ using Microsoft.ML.GenAI.Core; using TorchSharp; using TorchSharp.Modules; +using Microsoft.ML.GenAI.Core.Extension; using static TorchSharp.torch; namespace Microsoft.ML.GenAI.Phi.Module; @@ -106,14 +107,14 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) var allHiddenStates = new List(); var allAttentions = new List(); - + var i = 0; foreach 
(var layer in this.layers) { if (outputHiddenStates) { allHiddenStates.Add(hiddenStates); } - + Console.WriteLine($"{i++}: {hiddenStates.Peek("hidden_state")}"); var decoderInput = new Phi3DecoderLayerInput(hiddenStates, attentionMask!, positionIds, this._cache, outputAttentions); var layerOutput = layer.forward(decoderInput); hiddenStates = layerOutput.HiddenStates; From 63be6557ad75afacdd334379b440bd74b11a8841 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Wed, 14 Aug 2024 16:21:40 -0700 Subject: [PATCH 04/24] update --- src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs index 3fadd9aeb4..9f70749ff9 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs @@ -66,6 +66,6 @@ public static LlamaForCausalLM FromPretrained( public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json") { - this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: true, useTqdm: false, loadedParameters: loadedParameters); + this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: true, useTqdm: false); } } From 44b3302f32604c78c328cab87a69c7559a82400a Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Thu, 15 Aug 2024 10:09:48 -0700 Subject: [PATCH 05/24] add shape test for 70b and 405b --- .../Microsoft.ML.GenAI.Samples/Llama/test.cs | 82 +- .../Phi3Mini/AutoGenSample.cs | 4 +- .../Phi3Mini/Utils.cs | 2 +- .../Module/Attention.cs | 2 +- src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs | 14 + .../LlamaForCausalLM.cs | 2 +- ...nizerHelper.cs => LlamaTokenizerHelper.cs} | 2 +- .../Microsoft.ML.GenAI.LLaMA.csproj | 5 + .../Module/LlamaModel.cs | 3 +- .../Config/meta-llama-3.1-405B-Instruct.json | 32 + .../Config/meta-llama-3.1-70B-Instruct.json | 32 + .../Config/meta-llama-3.1-8B-Instruct.json | 4 - src/Microsoft.ML.GenAI.LLaMA/Utils.cs | 2 +- .../Module/Phi3Model.cs | 2 - ...ests.Llama_3_1_405b_ShapeTest.approved.txt | 1137 +++++++++++++++++ ...Tests.Llama_3_1_70b_ShapeTest.approved.txt | 723 +++++++++++ .../LLaMA3_1Tests.cs | 22 +- 17 files changed, 2050 insertions(+), 20 deletions(-) rename src/Microsoft.ML.GenAI.LLaMA/{Llama3_1TokenizerHelper.cs => LlamaTokenizerHelper.cs} (98%) create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-405B-Instruct.json create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-70B-Instruct.json create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_405b_ShapeTest.approved.txt create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_70b_ShapeTest.approved.txt diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs index 804a7e3b77..bd7f6ed996 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs @@ -2,8 +2,10 @@ using System.Collections.Generic; using System.Linq; using System.Text; +using System.Text.Json; using System.Threading.Tasks; using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Core.Extension; using Microsoft.ML.GenAI.LLaMA; using Microsoft.ML.Tokenizers; using TorchSharp; @@ -24,19 +26,89 @@ public static void Run() var defaultType = ScalarType.Float16; torch.manual_seed(1); torch.set_default_dtype(defaultType); - 
var weightFolder = @"C:\Users\xiaoyuz\source\repos\Meta-Llama-3.1-8B-Instruct"; + var weightFolder = @"C:\Users\xiaoyuz\source\repos\Meta-Llama-3.1-70B-Instruct"; + var configName = "config.json"; + var quantizeToInt8 = false; + var quantizeToInt4 = false; + var modelSizeOnCudaInGB = 18; + var modelSizeOnMemoryInGB = 640; + var modelSizeOnDiskInGB = 200; var originalWeightFolder = Path.Combine(weightFolder, "original"); Console.WriteLine("Loading Llama from huggingface model weight folder"); var stopWatch = System.Diagnostics.Stopwatch.StartNew(); stopWatch.Start(); - var tokenizer = Llama3_1TokenizerHelper.FromPretrained(originalWeightFolder); - var model = LlamaForCausalLM.FromPretrained(weightFolder, device: device); - stopWatch.Stop(); + var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder); + Console.WriteLine("Loading Phi3 from huggingface model weight folder"); + torch.set_default_device("meta"); + var configPath = System.IO.Path.Combine(weightFolder, configName); + var config = JsonSerializer.Deserialize(System.IO.File.ReadAllText(configPath)) ?? throw new ArgumentNullException(nameof(configPath)); + var timer = System.Diagnostics.Stopwatch.StartNew(); + var model = new LlamaForCausalLM(config); + var tokenzierPath = System.IO.Path.Combine(weightFolder, "tokenizer.model"); - Console.WriteLine($"Loading time: {stopWatch.ElapsedMilliseconds} ms"); + if (quantizeToInt8) + { + model.ToInt8QuantizeModule(); + } + else if (quantizeToInt4) + { + model.ToInt4QuantizeModule(); + } + + var deviceSizeMap = new Dictionary + { + ["cuda"] = modelSizeOnCudaInGB * 1L * 1024 * 1024 * 1024, + ["cpu"] = modelSizeOnMemoryInGB * 1L * 1024 * 1024 * 1024, + ["disk"] = modelSizeOnDiskInGB * 1L * 1024 * 1024 * 1024, + }; + + var deviceMap = model.InferDeviceMapForEachLayer( + devices: ["cuda", "cpu", "disk"], + deviceSizeMapInByte: deviceSizeMap); + + var deviceMapJson = JsonSerializer.Serialize(deviceMap, new JsonSerializerOptions { WriteIndented = true }); + Console.WriteLine($"Device map:"); + Console.WriteLine(deviceMapJson); + + // load weight + torch.set_default_device("cpu"); + + Console.WriteLine("Start loading"); + timer = System.Diagnostics.Stopwatch.StartNew(); + model = new LlamaForCausalLM(config); + timer.Stop(); + Console.WriteLine($"model created in {timer.ElapsedMilliseconds / 1000} s"); + + timer = System.Diagnostics.Stopwatch.StartNew(); + model.LoadSafeTensors(weightFolder); + timer.Stop(); + Console.WriteLine($"weight loaded in {timer.ElapsedMilliseconds / 1000} s"); + + if (quantizeToInt8 || quantizeToInt4) + { + timer = System.Diagnostics.Stopwatch.StartNew(); + Console.WriteLine("Start quantizing if needed"); + if (quantizeToInt8) + { + model.ToInt8QuantizeModule(); + } + else if (quantizeToInt4) + { + model.ToInt4QuantizeModule(); + } + Console.WriteLine("Quantizing done"); + timer.Stop(); + Console.WriteLine($"Quantizing done in {timer.ElapsedMilliseconds / 1000} s"); + } + timer = System.Diagnostics.Stopwatch.StartNew(); + Console.WriteLine($"Start loading to device: {device}"); + model = model.ToDynamicLoadingModel(deviceMap, "cuda"); + timer.Stop(); + Console.WriteLine($"Phi3 loaded to device: {device} in {timer.ElapsedMilliseconds / 1000} s"); var pipeline = new CausalLMPipeline(tokenizer, model, device); + torch.set_default_device(device); var prompt = """ <|begin_of_text|> diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs index 379fd2b97b..5b3dce01de 100644 
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs @@ -25,8 +25,8 @@ public static async Task RunAsync() var defaultType = ScalarType.Float16; torch.manual_seed(1); torch.set_default_dtype(defaultType); - var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct"; - var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device); + var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-medium-4k-instruct"; + var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device, quantizeToInt8: true); // agent var agent = new Phi3Agent(pipeline, "assistant") diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs index 5e53ef0ac4..33819a8df4 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs @@ -20,7 +20,7 @@ public static ICausalLMPipeline LoadPhi3Mini4KFromFo string weightFolder, string configName = "config.json", string device = "cuda", - int modelSizeOnCudaInGB = 16, + int modelSizeOnCudaInGB = 55, int modelSizeOnMemoryInGB = 64, int modelSizeOnDiskInGB = 200, bool quantizeToInt8 = false, diff --git a/src/Microsoft.ML.GenAI.Core/Module/Attention.cs b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs index e059af949d..6a846cb684 100644 --- a/src/Microsoft.ML.GenAI.Core/Module/Attention.cs +++ b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs @@ -202,7 +202,7 @@ public override AttentionOutput forward(AttentionInput input) Contract.Assert(attentionMask.shape[0] == bsz); Contract.Assert(attentionMask.shape[1] == 1); Contract.Assert(attentionMask.shape[2] == qLen); - Contract.Assert(attentionMask.shape[3] == kvSeqLen); + //Contract.Assert(attentionMask.shape[3] == kvSeqLen); attnWeights = attnWeights + attentionMask; } diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs index b10c6c02f5..a8a6985ee8 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs @@ -43,9 +43,13 @@ static LlamaConfig() { #pragma warning disable MSML_ParameterLocalVarName // Parameter or local variable name not standard var llama3_1_8b_content = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.LLaMA.Resource.Config.meta-llama-3.1-8B-Instruct.json"); + var llama3_1_70b_content = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.LLaMA.Resource.Config.meta-llama-3.1-70B-Instruct.json"); + var llama3_1_405b_content = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.LLaMA.Resource.Config.meta-llama-3.1-405B-Instruct.json"); #pragma warning restore MSML_ParameterLocalVarName // Parameter or local variable name not standard Llama3_1_8B_Instruct = JsonSerializer.Deserialize(llama3_1_8b_content) ?? throw new ArgumentNullException(nameof(llama3_1_8b_content)); + Llama3_1_70B_Instruct = JsonSerializer.Deserialize(llama3_1_70b_content) ?? throw new ArgumentNullException(nameof(llama3_1_70b_content)); + Llama3_1_405B_Instruct = JsonSerializer.Deserialize(llama3_1_405b_content) ?? throw new ArgumentNullException(nameof(llama3_1_405b_content)); } #pragma warning disable MSML_GeneralName // This name should be PascalCased @@ -53,6 +57,16 @@ static LlamaConfig() /// The llama-3.1-8B-Instruct configuration created from https://huggingface.co/meta-llama/Meta-Llama-3.1-8B. 
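+    /// (Illustrative usage, assuming the weights are loaded separately: var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_8B_Instruct); model.LoadSafeTensors(weightFolder);)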
/// public static LlamaConfig Llama3_1_8B_Instruct { get; } + + /// + /// The llama-3.1-70B-Instruct configuration created from https://huggingface.co/meta-llama/Meta-Llama-3.1-70B. + /// + public static LlamaConfig Llama3_1_70B_Instruct { get; } + + /// + /// The llama-3.1-405B-Instruct configuration created from https://huggingface.co/meta-llama/Meta-Llama-3.1-405B. + /// + public static LlamaConfig Llama3_1_405B_Instruct { get; } #pragma warning restore MSML_GeneralName // This name should be PascalCased [JsonPropertyName("attention_bias")] diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs index 3fadd9aeb4..9f70749ff9 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs @@ -66,6 +66,6 @@ public static LlamaForCausalLM FromPretrained( public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json") { - this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: true, useTqdm: false, loadedParameters: loadedParameters); + this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: true, useTqdm: false); } } diff --git a/src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaTokenizerHelper.cs similarity index 98% rename from src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs rename to src/Microsoft.ML.GenAI.LLaMA/LlamaTokenizerHelper.cs index 74a61b2cee..ea6f49edf7 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Llama3_1TokenizerHelper.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaTokenizerHelper.cs @@ -13,7 +13,7 @@ namespace Microsoft.ML.GenAI.LLaMA; #pragma warning disable MSML_GeneralName // This name should be PascalCased -public class Llama3_1TokenizerHelper +public class LlamaTokenizerHelper #pragma warning restore MSML_GeneralName // This name should be PascalCased { /// diff --git a/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj b/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj index a9b21b5737..8c7200fa1e 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj +++ b/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj @@ -23,4 +23,9 @@ + + + + + diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs index 0f271edfd8..cf08f31b54 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs @@ -104,7 +104,8 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) } else { - attentionMask = AttentionMaskConverter.Create4DCausalAttentionMask(attentionMask, [batchSize, seqLength], inputsEmbeds.dtype, device, pastKeyValuesLength, 2048); + // the following behavior of creating 4d causal mask doesn't match python's, remember to look into it when there's time. 
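+                // Illustrative note on the expected mask semantics (an assumption based on how Attention.forward consumes it):
+                // the returned mask is additive, shaped [batchSize, 1, seqLength, pastKeyValuesLength + seqLength],
+                // holding 0 where a token may attend and the dtype's minimum value where it may not,
+                // so it can be added directly to the raw attention scores before softmax.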
+ attentionMask = AttentionMaskConverter.Create4DCausalAttentionMask(attentionMask, [batchSize, seqLength], inputsEmbeds.dtype, device, pastKeyValuesLength); } var hiddenStates = inputsEmbeds; diff --git a/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-405B-Instruct.json b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-405B-Instruct.json new file mode 100644 index 0000000000..373b94f4f6 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-405B-Instruct.json @@ -0,0 +1,32 @@ +{ + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 16384, + "initializer_range": 0.02, + "intermediate_size": 53248, + "max_position_embeddings": 131072, + "mlp_bias": false, + "num_attention_heads": 128, + "num_hidden_layers": 126, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 128256 +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-70B-Instruct.json b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-70B-Instruct.json new file mode 100644 index 0000000000..2cd3ad59ac --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-70B-Instruct.json @@ -0,0 +1,32 @@ +{ + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 28672, + "max_position_embeddings": 131072, + "mlp_bias": false, + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 128256 +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json index 4eaf7c29ef..750f5671d6 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json +++ b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.1-8B-Instruct.json @@ -1,7 +1,4 @@ { - "architectures": [ - "LlamaForCausalLM" - ], "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 128000, @@ -31,7 +28,6 @@ }, "rope_theta": 500000.0, "tie_word_embeddings": false, - "torch_dtype": "bfloat16", "use_cache": true, "vocab_size": 128256 } diff --git a/src/Microsoft.ML.GenAI.LLaMA/Utils.cs b/src/Microsoft.ML.GenAI.LLaMA/Utils.cs index db849d2064..b3dec789f3 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Utils.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Utils.cs @@ -91,7 +91,7 @@ public static string GetEmbeddedResource(string resourceName) if (resourceStream == null) { - throw new ArgumentException("Resource not found", nameof(resourceName)); + throw new ArgumentException("Resource not found", resourceName); } using var reader = new System.IO.StreamReader(resourceStream); diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs 
b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs index 03114d93fb..463ea5cddc 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs @@ -107,14 +107,12 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) var allHiddenStates = new List(); var allAttentions = new List(); - var i = 0; foreach (var layer in this.layers) { if (outputHiddenStates) { allHiddenStates.Add(hiddenStates); } - Console.WriteLine($"{i++}: {hiddenStates.Peek("hidden_state")}"); var decoderInput = new Phi3DecoderLayerInput(hiddenStates, attentionMask!, positionIds, this._cache, outputAttentions); var layerOutput = layer.forward(decoderInput); hiddenStates = layerOutput.HiddenStates; diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_405b_ShapeTest.approved.txt b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_405b_ShapeTest.approved.txt new file mode 100644 index 0000000000..6b8d7749dc --- /dev/null +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_405b_ShapeTest.approved.txt @@ -0,0 +1,1137 @@ +0: lm_head.weight shape: [128256, 16384] +1: model.embed_tokens.weight shape: [128256, 16384] +2: model.layers.0.input_layernorm.weight shape: [16384] +3: model.layers.0.mlp.down_proj.weight shape: [16384, 53248] +4: model.layers.0.mlp.gate_proj.weight shape: [53248, 16384] +5: model.layers.0.mlp.up_proj.weight shape: [53248, 16384] +6: model.layers.0.post_attention_layernorm.weight shape: [16384] +7: model.layers.0.self_attn.k_proj.weight shape: [1024, 16384] +8: model.layers.0.self_attn.o_proj.weight shape: [16384, 16384] +9: model.layers.0.self_attn.q_proj.weight shape: [16384, 16384] +10: model.layers.0.self_attn.v_proj.weight shape: [1024, 16384] +11: model.layers.1.input_layernorm.weight shape: [16384] +12: model.layers.1.mlp.down_proj.weight shape: [16384, 53248] +13: model.layers.1.mlp.gate_proj.weight shape: [53248, 16384] +14: model.layers.1.mlp.up_proj.weight shape: [53248, 16384] +15: model.layers.1.post_attention_layernorm.weight shape: [16384] +16: model.layers.1.self_attn.k_proj.weight shape: [1024, 16384] +17: model.layers.1.self_attn.o_proj.weight shape: [16384, 16384] +18: model.layers.1.self_attn.q_proj.weight shape: [16384, 16384] +19: model.layers.1.self_attn.v_proj.weight shape: [1024, 16384] +20: model.layers.10.input_layernorm.weight shape: [16384] +21: model.layers.10.mlp.down_proj.weight shape: [16384, 53248] +22: model.layers.10.mlp.gate_proj.weight shape: [53248, 16384] +23: model.layers.10.mlp.up_proj.weight shape: [53248, 16384] +24: model.layers.10.post_attention_layernorm.weight shape: [16384] +25: model.layers.10.self_attn.k_proj.weight shape: [1024, 16384] +26: model.layers.10.self_attn.o_proj.weight shape: [16384, 16384] +27: model.layers.10.self_attn.q_proj.weight shape: [16384, 16384] +28: model.layers.10.self_attn.v_proj.weight shape: [1024, 16384] +29: model.layers.100.input_layernorm.weight shape: [16384] +30: model.layers.100.mlp.down_proj.weight shape: [16384, 53248] +31: model.layers.100.mlp.gate_proj.weight shape: [53248, 16384] +32: model.layers.100.mlp.up_proj.weight shape: [53248, 16384] +33: model.layers.100.post_attention_layernorm.weight shape: [16384] +34: model.layers.100.self_attn.k_proj.weight shape: [1024, 16384] +35: model.layers.100.self_attn.o_proj.weight shape: [16384, 16384] +36: model.layers.100.self_attn.q_proj.weight shape: [16384, 16384] +37: model.layers.100.self_attn.v_proj.weight shape: [1024, 16384] +38: 
model.layers.101.input_layernorm.weight shape: [16384] +39: model.layers.101.mlp.down_proj.weight shape: [16384, 53248] +40: model.layers.101.mlp.gate_proj.weight shape: [53248, 16384] +41: model.layers.101.mlp.up_proj.weight shape: [53248, 16384] +42: model.layers.101.post_attention_layernorm.weight shape: [16384] +43: model.layers.101.self_attn.k_proj.weight shape: [1024, 16384] +44: model.layers.101.self_attn.o_proj.weight shape: [16384, 16384] +45: model.layers.101.self_attn.q_proj.weight shape: [16384, 16384] +46: model.layers.101.self_attn.v_proj.weight shape: [1024, 16384] +47: model.layers.102.input_layernorm.weight shape: [16384] +48: model.layers.102.mlp.down_proj.weight shape: [16384, 53248] +49: model.layers.102.mlp.gate_proj.weight shape: [53248, 16384] +50: model.layers.102.mlp.up_proj.weight shape: [53248, 16384] +51: model.layers.102.post_attention_layernorm.weight shape: [16384] +52: model.layers.102.self_attn.k_proj.weight shape: [1024, 16384] +53: model.layers.102.self_attn.o_proj.weight shape: [16384, 16384] +54: model.layers.102.self_attn.q_proj.weight shape: [16384, 16384] +55: model.layers.102.self_attn.v_proj.weight shape: [1024, 16384] +56: model.layers.103.input_layernorm.weight shape: [16384] +57: model.layers.103.mlp.down_proj.weight shape: [16384, 53248] +58: model.layers.103.mlp.gate_proj.weight shape: [53248, 16384] +59: model.layers.103.mlp.up_proj.weight shape: [53248, 16384] +60: model.layers.103.post_attention_layernorm.weight shape: [16384] +61: model.layers.103.self_attn.k_proj.weight shape: [1024, 16384] +62: model.layers.103.self_attn.o_proj.weight shape: [16384, 16384] +63: model.layers.103.self_attn.q_proj.weight shape: [16384, 16384] +64: model.layers.103.self_attn.v_proj.weight shape: [1024, 16384] +65: model.layers.104.input_layernorm.weight shape: [16384] +66: model.layers.104.mlp.down_proj.weight shape: [16384, 53248] +67: model.layers.104.mlp.gate_proj.weight shape: [53248, 16384] +68: model.layers.104.mlp.up_proj.weight shape: [53248, 16384] +69: model.layers.104.post_attention_layernorm.weight shape: [16384] +70: model.layers.104.self_attn.k_proj.weight shape: [1024, 16384] +71: model.layers.104.self_attn.o_proj.weight shape: [16384, 16384] +72: model.layers.104.self_attn.q_proj.weight shape: [16384, 16384] +73: model.layers.104.self_attn.v_proj.weight shape: [1024, 16384] +74: model.layers.105.input_layernorm.weight shape: [16384] +75: model.layers.105.mlp.down_proj.weight shape: [16384, 53248] +76: model.layers.105.mlp.gate_proj.weight shape: [53248, 16384] +77: model.layers.105.mlp.up_proj.weight shape: [53248, 16384] +78: model.layers.105.post_attention_layernorm.weight shape: [16384] +79: model.layers.105.self_attn.k_proj.weight shape: [1024, 16384] +80: model.layers.105.self_attn.o_proj.weight shape: [16384, 16384] +81: model.layers.105.self_attn.q_proj.weight shape: [16384, 16384] +82: model.layers.105.self_attn.v_proj.weight shape: [1024, 16384] +83: model.layers.106.input_layernorm.weight shape: [16384] +84: model.layers.106.mlp.down_proj.weight shape: [16384, 53248] +85: model.layers.106.mlp.gate_proj.weight shape: [53248, 16384] +86: model.layers.106.mlp.up_proj.weight shape: [53248, 16384] +87: model.layers.106.post_attention_layernorm.weight shape: [16384] +88: model.layers.106.self_attn.k_proj.weight shape: [1024, 16384] +89: model.layers.106.self_attn.o_proj.weight shape: [16384, 16384] +90: model.layers.106.self_attn.q_proj.weight shape: [16384, 16384] +91: model.layers.106.self_attn.v_proj.weight shape: [1024, 16384] +92: 
model.layers.107.input_layernorm.weight shape: [16384] +93: model.layers.107.mlp.down_proj.weight shape: [16384, 53248] +94: model.layers.107.mlp.gate_proj.weight shape: [53248, 16384] +95: model.layers.107.mlp.up_proj.weight shape: [53248, 16384] +96: model.layers.107.post_attention_layernorm.weight shape: [16384] +97: model.layers.107.self_attn.k_proj.weight shape: [1024, 16384] +98: model.layers.107.self_attn.o_proj.weight shape: [16384, 16384] +99: model.layers.107.self_attn.q_proj.weight shape: [16384, 16384] +100: model.layers.107.self_attn.v_proj.weight shape: [1024, 16384] +101: model.layers.108.input_layernorm.weight shape: [16384] +102: model.layers.108.mlp.down_proj.weight shape: [16384, 53248] +103: model.layers.108.mlp.gate_proj.weight shape: [53248, 16384] +104: model.layers.108.mlp.up_proj.weight shape: [53248, 16384] +105: model.layers.108.post_attention_layernorm.weight shape: [16384] +106: model.layers.108.self_attn.k_proj.weight shape: [1024, 16384] +107: model.layers.108.self_attn.o_proj.weight shape: [16384, 16384] +108: model.layers.108.self_attn.q_proj.weight shape: [16384, 16384] +109: model.layers.108.self_attn.v_proj.weight shape: [1024, 16384] +110: model.layers.109.input_layernorm.weight shape: [16384] +111: model.layers.109.mlp.down_proj.weight shape: [16384, 53248] +112: model.layers.109.mlp.gate_proj.weight shape: [53248, 16384] +113: model.layers.109.mlp.up_proj.weight shape: [53248, 16384] +114: model.layers.109.post_attention_layernorm.weight shape: [16384] +115: model.layers.109.self_attn.k_proj.weight shape: [1024, 16384] +116: model.layers.109.self_attn.o_proj.weight shape: [16384, 16384] +117: model.layers.109.self_attn.q_proj.weight shape: [16384, 16384] +118: model.layers.109.self_attn.v_proj.weight shape: [1024, 16384] +119: model.layers.11.input_layernorm.weight shape: [16384] +120: model.layers.11.mlp.down_proj.weight shape: [16384, 53248] +121: model.layers.11.mlp.gate_proj.weight shape: [53248, 16384] +122: model.layers.11.mlp.up_proj.weight shape: [53248, 16384] +123: model.layers.11.post_attention_layernorm.weight shape: [16384] +124: model.layers.11.self_attn.k_proj.weight shape: [1024, 16384] +125: model.layers.11.self_attn.o_proj.weight shape: [16384, 16384] +126: model.layers.11.self_attn.q_proj.weight shape: [16384, 16384] +127: model.layers.11.self_attn.v_proj.weight shape: [1024, 16384] +128: model.layers.110.input_layernorm.weight shape: [16384] +129: model.layers.110.mlp.down_proj.weight shape: [16384, 53248] +130: model.layers.110.mlp.gate_proj.weight shape: [53248, 16384] +131: model.layers.110.mlp.up_proj.weight shape: [53248, 16384] +132: model.layers.110.post_attention_layernorm.weight shape: [16384] +133: model.layers.110.self_attn.k_proj.weight shape: [1024, 16384] +134: model.layers.110.self_attn.o_proj.weight shape: [16384, 16384] +135: model.layers.110.self_attn.q_proj.weight shape: [16384, 16384] +136: model.layers.110.self_attn.v_proj.weight shape: [1024, 16384] +137: model.layers.111.input_layernorm.weight shape: [16384] +138: model.layers.111.mlp.down_proj.weight shape: [16384, 53248] +139: model.layers.111.mlp.gate_proj.weight shape: [53248, 16384] +140: model.layers.111.mlp.up_proj.weight shape: [53248, 16384] +141: model.layers.111.post_attention_layernorm.weight shape: [16384] +142: model.layers.111.self_attn.k_proj.weight shape: [1024, 16384] +143: model.layers.111.self_attn.o_proj.weight shape: [16384, 16384] +144: model.layers.111.self_attn.q_proj.weight shape: [16384, 16384] +145: 
model.layers.111.self_attn.v_proj.weight shape: [1024, 16384] +146: model.layers.112.input_layernorm.weight shape: [16384] +147: model.layers.112.mlp.down_proj.weight shape: [16384, 53248] +148: model.layers.112.mlp.gate_proj.weight shape: [53248, 16384] +149: model.layers.112.mlp.up_proj.weight shape: [53248, 16384] +150: model.layers.112.post_attention_layernorm.weight shape: [16384] +151: model.layers.112.self_attn.k_proj.weight shape: [1024, 16384] +152: model.layers.112.self_attn.o_proj.weight shape: [16384, 16384] +153: model.layers.112.self_attn.q_proj.weight shape: [16384, 16384] +154: model.layers.112.self_attn.v_proj.weight shape: [1024, 16384] +155: model.layers.113.input_layernorm.weight shape: [16384] +156: model.layers.113.mlp.down_proj.weight shape: [16384, 53248] +157: model.layers.113.mlp.gate_proj.weight shape: [53248, 16384] +158: model.layers.113.mlp.up_proj.weight shape: [53248, 16384] +159: model.layers.113.post_attention_layernorm.weight shape: [16384] +160: model.layers.113.self_attn.k_proj.weight shape: [1024, 16384] +161: model.layers.113.self_attn.o_proj.weight shape: [16384, 16384] +162: model.layers.113.self_attn.q_proj.weight shape: [16384, 16384] +163: model.layers.113.self_attn.v_proj.weight shape: [1024, 16384] +164: model.layers.114.input_layernorm.weight shape: [16384] +165: model.layers.114.mlp.down_proj.weight shape: [16384, 53248] +166: model.layers.114.mlp.gate_proj.weight shape: [53248, 16384] +167: model.layers.114.mlp.up_proj.weight shape: [53248, 16384] +168: model.layers.114.post_attention_layernorm.weight shape: [16384] +169: model.layers.114.self_attn.k_proj.weight shape: [1024, 16384] +170: model.layers.114.self_attn.o_proj.weight shape: [16384, 16384] +171: model.layers.114.self_attn.q_proj.weight shape: [16384, 16384] +172: model.layers.114.self_attn.v_proj.weight shape: [1024, 16384] +173: model.layers.115.input_layernorm.weight shape: [16384] +174: model.layers.115.mlp.down_proj.weight shape: [16384, 53248] +175: model.layers.115.mlp.gate_proj.weight shape: [53248, 16384] +176: model.layers.115.mlp.up_proj.weight shape: [53248, 16384] +177: model.layers.115.post_attention_layernorm.weight shape: [16384] +178: model.layers.115.self_attn.k_proj.weight shape: [1024, 16384] +179: model.layers.115.self_attn.o_proj.weight shape: [16384, 16384] +180: model.layers.115.self_attn.q_proj.weight shape: [16384, 16384] +181: model.layers.115.self_attn.v_proj.weight shape: [1024, 16384] +182: model.layers.116.input_layernorm.weight shape: [16384] +183: model.layers.116.mlp.down_proj.weight shape: [16384, 53248] +184: model.layers.116.mlp.gate_proj.weight shape: [53248, 16384] +185: model.layers.116.mlp.up_proj.weight shape: [53248, 16384] +186: model.layers.116.post_attention_layernorm.weight shape: [16384] +187: model.layers.116.self_attn.k_proj.weight shape: [1024, 16384] +188: model.layers.116.self_attn.o_proj.weight shape: [16384, 16384] +189: model.layers.116.self_attn.q_proj.weight shape: [16384, 16384] +190: model.layers.116.self_attn.v_proj.weight shape: [1024, 16384] +191: model.layers.117.input_layernorm.weight shape: [16384] +192: model.layers.117.mlp.down_proj.weight shape: [16384, 53248] +193: model.layers.117.mlp.gate_proj.weight shape: [53248, 16384] +194: model.layers.117.mlp.up_proj.weight shape: [53248, 16384] +195: model.layers.117.post_attention_layernorm.weight shape: [16384] +196: model.layers.117.self_attn.k_proj.weight shape: [1024, 16384] +197: model.layers.117.self_attn.o_proj.weight shape: [16384, 16384] +198: 
model.layers.117.self_attn.q_proj.weight shape: [16384, 16384] +199: model.layers.117.self_attn.v_proj.weight shape: [1024, 16384] +200: model.layers.118.input_layernorm.weight shape: [16384] +201: model.layers.118.mlp.down_proj.weight shape: [16384, 53248] +202: model.layers.118.mlp.gate_proj.weight shape: [53248, 16384] +203: model.layers.118.mlp.up_proj.weight shape: [53248, 16384] +204: model.layers.118.post_attention_layernorm.weight shape: [16384] +205: model.layers.118.self_attn.k_proj.weight shape: [1024, 16384] +206: model.layers.118.self_attn.o_proj.weight shape: [16384, 16384] +207: model.layers.118.self_attn.q_proj.weight shape: [16384, 16384] +208: model.layers.118.self_attn.v_proj.weight shape: [1024, 16384] +209: model.layers.119.input_layernorm.weight shape: [16384] +210: model.layers.119.mlp.down_proj.weight shape: [16384, 53248] +211: model.layers.119.mlp.gate_proj.weight shape: [53248, 16384] +212: model.layers.119.mlp.up_proj.weight shape: [53248, 16384] +213: model.layers.119.post_attention_layernorm.weight shape: [16384] +214: model.layers.119.self_attn.k_proj.weight shape: [1024, 16384] +215: model.layers.119.self_attn.o_proj.weight shape: [16384, 16384] +216: model.layers.119.self_attn.q_proj.weight shape: [16384, 16384] +217: model.layers.119.self_attn.v_proj.weight shape: [1024, 16384] +218: model.layers.12.input_layernorm.weight shape: [16384] +219: model.layers.12.mlp.down_proj.weight shape: [16384, 53248] +220: model.layers.12.mlp.gate_proj.weight shape: [53248, 16384] +221: model.layers.12.mlp.up_proj.weight shape: [53248, 16384] +222: model.layers.12.post_attention_layernorm.weight shape: [16384] +223: model.layers.12.self_attn.k_proj.weight shape: [1024, 16384] +224: model.layers.12.self_attn.o_proj.weight shape: [16384, 16384] +225: model.layers.12.self_attn.q_proj.weight shape: [16384, 16384] +226: model.layers.12.self_attn.v_proj.weight shape: [1024, 16384] +227: model.layers.120.input_layernorm.weight shape: [16384] +228: model.layers.120.mlp.down_proj.weight shape: [16384, 53248] +229: model.layers.120.mlp.gate_proj.weight shape: [53248, 16384] +230: model.layers.120.mlp.up_proj.weight shape: [53248, 16384] +231: model.layers.120.post_attention_layernorm.weight shape: [16384] +232: model.layers.120.self_attn.k_proj.weight shape: [1024, 16384] +233: model.layers.120.self_attn.o_proj.weight shape: [16384, 16384] +234: model.layers.120.self_attn.q_proj.weight shape: [16384, 16384] +235: model.layers.120.self_attn.v_proj.weight shape: [1024, 16384] +236: model.layers.121.input_layernorm.weight shape: [16384] +237: model.layers.121.mlp.down_proj.weight shape: [16384, 53248] +238: model.layers.121.mlp.gate_proj.weight shape: [53248, 16384] +239: model.layers.121.mlp.up_proj.weight shape: [53248, 16384] +240: model.layers.121.post_attention_layernorm.weight shape: [16384] +241: model.layers.121.self_attn.k_proj.weight shape: [1024, 16384] +242: model.layers.121.self_attn.o_proj.weight shape: [16384, 16384] +243: model.layers.121.self_attn.q_proj.weight shape: [16384, 16384] +244: model.layers.121.self_attn.v_proj.weight shape: [1024, 16384] +245: model.layers.122.input_layernorm.weight shape: [16384] +246: model.layers.122.mlp.down_proj.weight shape: [16384, 53248] +247: model.layers.122.mlp.gate_proj.weight shape: [53248, 16384] +248: model.layers.122.mlp.up_proj.weight shape: [53248, 16384] +249: model.layers.122.post_attention_layernorm.weight shape: [16384] +250: model.layers.122.self_attn.k_proj.weight shape: [1024, 16384] +251: 
model.layers.122.self_attn.o_proj.weight shape: [16384, 16384] +252: model.layers.122.self_attn.q_proj.weight shape: [16384, 16384] +253: model.layers.122.self_attn.v_proj.weight shape: [1024, 16384] +254: model.layers.123.input_layernorm.weight shape: [16384] +255: model.layers.123.mlp.down_proj.weight shape: [16384, 53248] +256: model.layers.123.mlp.gate_proj.weight shape: [53248, 16384] +257: model.layers.123.mlp.up_proj.weight shape: [53248, 16384] +258: model.layers.123.post_attention_layernorm.weight shape: [16384] +259: model.layers.123.self_attn.k_proj.weight shape: [1024, 16384] +260: model.layers.123.self_attn.o_proj.weight shape: [16384, 16384] +261: model.layers.123.self_attn.q_proj.weight shape: [16384, 16384] +262: model.layers.123.self_attn.v_proj.weight shape: [1024, 16384] +263: model.layers.124.input_layernorm.weight shape: [16384] +264: model.layers.124.mlp.down_proj.weight shape: [16384, 53248] +265: model.layers.124.mlp.gate_proj.weight shape: [53248, 16384] +266: model.layers.124.mlp.up_proj.weight shape: [53248, 16384] +267: model.layers.124.post_attention_layernorm.weight shape: [16384] +268: model.layers.124.self_attn.k_proj.weight shape: [1024, 16384] +269: model.layers.124.self_attn.o_proj.weight shape: [16384, 16384] +270: model.layers.124.self_attn.q_proj.weight shape: [16384, 16384] +271: model.layers.124.self_attn.v_proj.weight shape: [1024, 16384] +272: model.layers.125.input_layernorm.weight shape: [16384] +273: model.layers.125.mlp.down_proj.weight shape: [16384, 53248] +274: model.layers.125.mlp.gate_proj.weight shape: [53248, 16384] +275: model.layers.125.mlp.up_proj.weight shape: [53248, 16384] +276: model.layers.125.post_attention_layernorm.weight shape: [16384] +277: model.layers.125.self_attn.k_proj.weight shape: [1024, 16384] +278: model.layers.125.self_attn.o_proj.weight shape: [16384, 16384] +279: model.layers.125.self_attn.q_proj.weight shape: [16384, 16384] +280: model.layers.125.self_attn.v_proj.weight shape: [1024, 16384] +281: model.layers.13.input_layernorm.weight shape: [16384] +282: model.layers.13.mlp.down_proj.weight shape: [16384, 53248] +283: model.layers.13.mlp.gate_proj.weight shape: [53248, 16384] +284: model.layers.13.mlp.up_proj.weight shape: [53248, 16384] +285: model.layers.13.post_attention_layernorm.weight shape: [16384] +286: model.layers.13.self_attn.k_proj.weight shape: [1024, 16384] +287: model.layers.13.self_attn.o_proj.weight shape: [16384, 16384] +288: model.layers.13.self_attn.q_proj.weight shape: [16384, 16384] +289: model.layers.13.self_attn.v_proj.weight shape: [1024, 16384] +290: model.layers.14.input_layernorm.weight shape: [16384] +291: model.layers.14.mlp.down_proj.weight shape: [16384, 53248] +292: model.layers.14.mlp.gate_proj.weight shape: [53248, 16384] +293: model.layers.14.mlp.up_proj.weight shape: [53248, 16384] +294: model.layers.14.post_attention_layernorm.weight shape: [16384] +295: model.layers.14.self_attn.k_proj.weight shape: [1024, 16384] +296: model.layers.14.self_attn.o_proj.weight shape: [16384, 16384] +297: model.layers.14.self_attn.q_proj.weight shape: [16384, 16384] +298: model.layers.14.self_attn.v_proj.weight shape: [1024, 16384] +299: model.layers.15.input_layernorm.weight shape: [16384] +300: model.layers.15.mlp.down_proj.weight shape: [16384, 53248] +301: model.layers.15.mlp.gate_proj.weight shape: [53248, 16384] +302: model.layers.15.mlp.up_proj.weight shape: [53248, 16384] +303: model.layers.15.post_attention_layernorm.weight shape: [16384] +304: model.layers.15.self_attn.k_proj.weight 
shape: [1024, 16384] +305: model.layers.15.self_attn.o_proj.weight shape: [16384, 16384] +306: model.layers.15.self_attn.q_proj.weight shape: [16384, 16384] +307: model.layers.15.self_attn.v_proj.weight shape: [1024, 16384] +308: model.layers.16.input_layernorm.weight shape: [16384] +309: model.layers.16.mlp.down_proj.weight shape: [16384, 53248] +310: model.layers.16.mlp.gate_proj.weight shape: [53248, 16384] +311: model.layers.16.mlp.up_proj.weight shape: [53248, 16384] +312: model.layers.16.post_attention_layernorm.weight shape: [16384] +313: model.layers.16.self_attn.k_proj.weight shape: [1024, 16384] +314: model.layers.16.self_attn.o_proj.weight shape: [16384, 16384] +315: model.layers.16.self_attn.q_proj.weight shape: [16384, 16384] +316: model.layers.16.self_attn.v_proj.weight shape: [1024, 16384] +317: model.layers.17.input_layernorm.weight shape: [16384] +318: model.layers.17.mlp.down_proj.weight shape: [16384, 53248] +319: model.layers.17.mlp.gate_proj.weight shape: [53248, 16384] +320: model.layers.17.mlp.up_proj.weight shape: [53248, 16384] +321: model.layers.17.post_attention_layernorm.weight shape: [16384] +322: model.layers.17.self_attn.k_proj.weight shape: [1024, 16384] +323: model.layers.17.self_attn.o_proj.weight shape: [16384, 16384] +324: model.layers.17.self_attn.q_proj.weight shape: [16384, 16384] +325: model.layers.17.self_attn.v_proj.weight shape: [1024, 16384] +326: model.layers.18.input_layernorm.weight shape: [16384] +327: model.layers.18.mlp.down_proj.weight shape: [16384, 53248] +328: model.layers.18.mlp.gate_proj.weight shape: [53248, 16384] +329: model.layers.18.mlp.up_proj.weight shape: [53248, 16384] +330: model.layers.18.post_attention_layernorm.weight shape: [16384] +331: model.layers.18.self_attn.k_proj.weight shape: [1024, 16384] +332: model.layers.18.self_attn.o_proj.weight shape: [16384, 16384] +333: model.layers.18.self_attn.q_proj.weight shape: [16384, 16384] +334: model.layers.18.self_attn.v_proj.weight shape: [1024, 16384] +335: model.layers.19.input_layernorm.weight shape: [16384] +336: model.layers.19.mlp.down_proj.weight shape: [16384, 53248] +337: model.layers.19.mlp.gate_proj.weight shape: [53248, 16384] +338: model.layers.19.mlp.up_proj.weight shape: [53248, 16384] +339: model.layers.19.post_attention_layernorm.weight shape: [16384] +340: model.layers.19.self_attn.k_proj.weight shape: [1024, 16384] +341: model.layers.19.self_attn.o_proj.weight shape: [16384, 16384] +342: model.layers.19.self_attn.q_proj.weight shape: [16384, 16384] +343: model.layers.19.self_attn.v_proj.weight shape: [1024, 16384] +344: model.layers.2.input_layernorm.weight shape: [16384] +345: model.layers.2.mlp.down_proj.weight shape: [16384, 53248] +346: model.layers.2.mlp.gate_proj.weight shape: [53248, 16384] +347: model.layers.2.mlp.up_proj.weight shape: [53248, 16384] +348: model.layers.2.post_attention_layernorm.weight shape: [16384] +349: model.layers.2.self_attn.k_proj.weight shape: [1024, 16384] +350: model.layers.2.self_attn.o_proj.weight shape: [16384, 16384] +351: model.layers.2.self_attn.q_proj.weight shape: [16384, 16384] +352: model.layers.2.self_attn.v_proj.weight shape: [1024, 16384] +353: model.layers.20.input_layernorm.weight shape: [16384] +354: model.layers.20.mlp.down_proj.weight shape: [16384, 53248] +355: model.layers.20.mlp.gate_proj.weight shape: [53248, 16384] +356: model.layers.20.mlp.up_proj.weight shape: [53248, 16384] +357: model.layers.20.post_attention_layernorm.weight shape: [16384] +358: model.layers.20.self_attn.k_proj.weight shape: 
[1024, 16384] +359: model.layers.20.self_attn.o_proj.weight shape: [16384, 16384] +360: model.layers.20.self_attn.q_proj.weight shape: [16384, 16384] +361: model.layers.20.self_attn.v_proj.weight shape: [1024, 16384] +362: model.layers.21.input_layernorm.weight shape: [16384] +363: model.layers.21.mlp.down_proj.weight shape: [16384, 53248] +364: model.layers.21.mlp.gate_proj.weight shape: [53248, 16384] +365: model.layers.21.mlp.up_proj.weight shape: [53248, 16384] +366: model.layers.21.post_attention_layernorm.weight shape: [16384] +367: model.layers.21.self_attn.k_proj.weight shape: [1024, 16384] +368: model.layers.21.self_attn.o_proj.weight shape: [16384, 16384] +369: model.layers.21.self_attn.q_proj.weight shape: [16384, 16384] +370: model.layers.21.self_attn.v_proj.weight shape: [1024, 16384] +371: model.layers.22.input_layernorm.weight shape: [16384] +372: model.layers.22.mlp.down_proj.weight shape: [16384, 53248] +373: model.layers.22.mlp.gate_proj.weight shape: [53248, 16384] +374: model.layers.22.mlp.up_proj.weight shape: [53248, 16384] +375: model.layers.22.post_attention_layernorm.weight shape: [16384] +376: model.layers.22.self_attn.k_proj.weight shape: [1024, 16384] +377: model.layers.22.self_attn.o_proj.weight shape: [16384, 16384] +378: model.layers.22.self_attn.q_proj.weight shape: [16384, 16384] +379: model.layers.22.self_attn.v_proj.weight shape: [1024, 16384] +380: model.layers.23.input_layernorm.weight shape: [16384] +381: model.layers.23.mlp.down_proj.weight shape: [16384, 53248] +382: model.layers.23.mlp.gate_proj.weight shape: [53248, 16384] +383: model.layers.23.mlp.up_proj.weight shape: [53248, 16384] +384: model.layers.23.post_attention_layernorm.weight shape: [16384] +385: model.layers.23.self_attn.k_proj.weight shape: [1024, 16384] +386: model.layers.23.self_attn.o_proj.weight shape: [16384, 16384] +387: model.layers.23.self_attn.q_proj.weight shape: [16384, 16384] +388: model.layers.23.self_attn.v_proj.weight shape: [1024, 16384] +389: model.layers.24.input_layernorm.weight shape: [16384] +390: model.layers.24.mlp.down_proj.weight shape: [16384, 53248] +391: model.layers.24.mlp.gate_proj.weight shape: [53248, 16384] +392: model.layers.24.mlp.up_proj.weight shape: [53248, 16384] +393: model.layers.24.post_attention_layernorm.weight shape: [16384] +394: model.layers.24.self_attn.k_proj.weight shape: [1024, 16384] +395: model.layers.24.self_attn.o_proj.weight shape: [16384, 16384] +396: model.layers.24.self_attn.q_proj.weight shape: [16384, 16384] +397: model.layers.24.self_attn.v_proj.weight shape: [1024, 16384] +398: model.layers.25.input_layernorm.weight shape: [16384] +399: model.layers.25.mlp.down_proj.weight shape: [16384, 53248] +400: model.layers.25.mlp.gate_proj.weight shape: [53248, 16384] +401: model.layers.25.mlp.up_proj.weight shape: [53248, 16384] +402: model.layers.25.post_attention_layernorm.weight shape: [16384] +403: model.layers.25.self_attn.k_proj.weight shape: [1024, 16384] +404: model.layers.25.self_attn.o_proj.weight shape: [16384, 16384] +405: model.layers.25.self_attn.q_proj.weight shape: [16384, 16384] +406: model.layers.25.self_attn.v_proj.weight shape: [1024, 16384] +407: model.layers.26.input_layernorm.weight shape: [16384] +408: model.layers.26.mlp.down_proj.weight shape: [16384, 53248] +409: model.layers.26.mlp.gate_proj.weight shape: [53248, 16384] +410: model.layers.26.mlp.up_proj.weight shape: [53248, 16384] +411: model.layers.26.post_attention_layernorm.weight shape: [16384] +412: model.layers.26.self_attn.k_proj.weight shape: 
[1024, 16384] +413: model.layers.26.self_attn.o_proj.weight shape: [16384, 16384] +414: model.layers.26.self_attn.q_proj.weight shape: [16384, 16384] +415: model.layers.26.self_attn.v_proj.weight shape: [1024, 16384] +416: model.layers.27.input_layernorm.weight shape: [16384] +417: model.layers.27.mlp.down_proj.weight shape: [16384, 53248] +418: model.layers.27.mlp.gate_proj.weight shape: [53248, 16384] +419: model.layers.27.mlp.up_proj.weight shape: [53248, 16384] +420: model.layers.27.post_attention_layernorm.weight shape: [16384] +421: model.layers.27.self_attn.k_proj.weight shape: [1024, 16384] +422: model.layers.27.self_attn.o_proj.weight shape: [16384, 16384] +423: model.layers.27.self_attn.q_proj.weight shape: [16384, 16384] +424: model.layers.27.self_attn.v_proj.weight shape: [1024, 16384] +425: model.layers.28.input_layernorm.weight shape: [16384] +426: model.layers.28.mlp.down_proj.weight shape: [16384, 53248] +427: model.layers.28.mlp.gate_proj.weight shape: [53248, 16384] +428: model.layers.28.mlp.up_proj.weight shape: [53248, 16384] +429: model.layers.28.post_attention_layernorm.weight shape: [16384] +430: model.layers.28.self_attn.k_proj.weight shape: [1024, 16384] +431: model.layers.28.self_attn.o_proj.weight shape: [16384, 16384] +432: model.layers.28.self_attn.q_proj.weight shape: [16384, 16384] +433: model.layers.28.self_attn.v_proj.weight shape: [1024, 16384] +434: model.layers.29.input_layernorm.weight shape: [16384] +435: model.layers.29.mlp.down_proj.weight shape: [16384, 53248] +436: model.layers.29.mlp.gate_proj.weight shape: [53248, 16384] +437: model.layers.29.mlp.up_proj.weight shape: [53248, 16384] +438: model.layers.29.post_attention_layernorm.weight shape: [16384] +439: model.layers.29.self_attn.k_proj.weight shape: [1024, 16384] +440: model.layers.29.self_attn.o_proj.weight shape: [16384, 16384] +441: model.layers.29.self_attn.q_proj.weight shape: [16384, 16384] +442: model.layers.29.self_attn.v_proj.weight shape: [1024, 16384] +443: model.layers.3.input_layernorm.weight shape: [16384] +444: model.layers.3.mlp.down_proj.weight shape: [16384, 53248] +445: model.layers.3.mlp.gate_proj.weight shape: [53248, 16384] +446: model.layers.3.mlp.up_proj.weight shape: [53248, 16384] +447: model.layers.3.post_attention_layernorm.weight shape: [16384] +448: model.layers.3.self_attn.k_proj.weight shape: [1024, 16384] +449: model.layers.3.self_attn.o_proj.weight shape: [16384, 16384] +450: model.layers.3.self_attn.q_proj.weight shape: [16384, 16384] +451: model.layers.3.self_attn.v_proj.weight shape: [1024, 16384] +452: model.layers.30.input_layernorm.weight shape: [16384] +453: model.layers.30.mlp.down_proj.weight shape: [16384, 53248] +454: model.layers.30.mlp.gate_proj.weight shape: [53248, 16384] +455: model.layers.30.mlp.up_proj.weight shape: [53248, 16384] +456: model.layers.30.post_attention_layernorm.weight shape: [16384] +457: model.layers.30.self_attn.k_proj.weight shape: [1024, 16384] +458: model.layers.30.self_attn.o_proj.weight shape: [16384, 16384] +459: model.layers.30.self_attn.q_proj.weight shape: [16384, 16384] +460: model.layers.30.self_attn.v_proj.weight shape: [1024, 16384] +461: model.layers.31.input_layernorm.weight shape: [16384] +462: model.layers.31.mlp.down_proj.weight shape: [16384, 53248] +463: model.layers.31.mlp.gate_proj.weight shape: [53248, 16384] +464: model.layers.31.mlp.up_proj.weight shape: [53248, 16384] +465: model.layers.31.post_attention_layernorm.weight shape: [16384] +466: model.layers.31.self_attn.k_proj.weight shape: [1024, 
16384] +467: model.layers.31.self_attn.o_proj.weight shape: [16384, 16384] +468: model.layers.31.self_attn.q_proj.weight shape: [16384, 16384] +469: model.layers.31.self_attn.v_proj.weight shape: [1024, 16384] +470: model.layers.32.input_layernorm.weight shape: [16384] +471: model.layers.32.mlp.down_proj.weight shape: [16384, 53248] +472: model.layers.32.mlp.gate_proj.weight shape: [53248, 16384] +473: model.layers.32.mlp.up_proj.weight shape: [53248, 16384] +474: model.layers.32.post_attention_layernorm.weight shape: [16384] +475: model.layers.32.self_attn.k_proj.weight shape: [1024, 16384] +476: model.layers.32.self_attn.o_proj.weight shape: [16384, 16384] +477: model.layers.32.self_attn.q_proj.weight shape: [16384, 16384] +478: model.layers.32.self_attn.v_proj.weight shape: [1024, 16384] +479: model.layers.33.input_layernorm.weight shape: [16384] +480: model.layers.33.mlp.down_proj.weight shape: [16384, 53248] +481: model.layers.33.mlp.gate_proj.weight shape: [53248, 16384] +482: model.layers.33.mlp.up_proj.weight shape: [53248, 16384] +483: model.layers.33.post_attention_layernorm.weight shape: [16384] +484: model.layers.33.self_attn.k_proj.weight shape: [1024, 16384] +485: model.layers.33.self_attn.o_proj.weight shape: [16384, 16384] +486: model.layers.33.self_attn.q_proj.weight shape: [16384, 16384] +487: model.layers.33.self_attn.v_proj.weight shape: [1024, 16384] +488: model.layers.34.input_layernorm.weight shape: [16384] +489: model.layers.34.mlp.down_proj.weight shape: [16384, 53248] +490: model.layers.34.mlp.gate_proj.weight shape: [53248, 16384] +491: model.layers.34.mlp.up_proj.weight shape: [53248, 16384] +492: model.layers.34.post_attention_layernorm.weight shape: [16384] +493: model.layers.34.self_attn.k_proj.weight shape: [1024, 16384] +494: model.layers.34.self_attn.o_proj.weight shape: [16384, 16384] +495: model.layers.34.self_attn.q_proj.weight shape: [16384, 16384] +496: model.layers.34.self_attn.v_proj.weight shape: [1024, 16384] +497: model.layers.35.input_layernorm.weight shape: [16384] +498: model.layers.35.mlp.down_proj.weight shape: [16384, 53248] +499: model.layers.35.mlp.gate_proj.weight shape: [53248, 16384] +500: model.layers.35.mlp.up_proj.weight shape: [53248, 16384] +501: model.layers.35.post_attention_layernorm.weight shape: [16384] +502: model.layers.35.self_attn.k_proj.weight shape: [1024, 16384] +503: model.layers.35.self_attn.o_proj.weight shape: [16384, 16384] +504: model.layers.35.self_attn.q_proj.weight shape: [16384, 16384] +505: model.layers.35.self_attn.v_proj.weight shape: [1024, 16384] +506: model.layers.36.input_layernorm.weight shape: [16384] +507: model.layers.36.mlp.down_proj.weight shape: [16384, 53248] +508: model.layers.36.mlp.gate_proj.weight shape: [53248, 16384] +509: model.layers.36.mlp.up_proj.weight shape: [53248, 16384] +510: model.layers.36.post_attention_layernorm.weight shape: [16384] +511: model.layers.36.self_attn.k_proj.weight shape: [1024, 16384] +512: model.layers.36.self_attn.o_proj.weight shape: [16384, 16384] +513: model.layers.36.self_attn.q_proj.weight shape: [16384, 16384] +514: model.layers.36.self_attn.v_proj.weight shape: [1024, 16384] +515: model.layers.37.input_layernorm.weight shape: [16384] +516: model.layers.37.mlp.down_proj.weight shape: [16384, 53248] +517: model.layers.37.mlp.gate_proj.weight shape: [53248, 16384] +518: model.layers.37.mlp.up_proj.weight shape: [53248, 16384] +519: model.layers.37.post_attention_layernorm.weight shape: [16384] +520: model.layers.37.self_attn.k_proj.weight shape: [1024, 
16384] +521: model.layers.37.self_attn.o_proj.weight shape: [16384, 16384] +522: model.layers.37.self_attn.q_proj.weight shape: [16384, 16384] +523: model.layers.37.self_attn.v_proj.weight shape: [1024, 16384] +524: model.layers.38.input_layernorm.weight shape: [16384] +525: model.layers.38.mlp.down_proj.weight shape: [16384, 53248] +526: model.layers.38.mlp.gate_proj.weight shape: [53248, 16384] +527: model.layers.38.mlp.up_proj.weight shape: [53248, 16384] +528: model.layers.38.post_attention_layernorm.weight shape: [16384] +529: model.layers.38.self_attn.k_proj.weight shape: [1024, 16384] +530: model.layers.38.self_attn.o_proj.weight shape: [16384, 16384] +531: model.layers.38.self_attn.q_proj.weight shape: [16384, 16384] +532: model.layers.38.self_attn.v_proj.weight shape: [1024, 16384] +533: model.layers.39.input_layernorm.weight shape: [16384] +534: model.layers.39.mlp.down_proj.weight shape: [16384, 53248] +535: model.layers.39.mlp.gate_proj.weight shape: [53248, 16384] +536: model.layers.39.mlp.up_proj.weight shape: [53248, 16384] +537: model.layers.39.post_attention_layernorm.weight shape: [16384] +538: model.layers.39.self_attn.k_proj.weight shape: [1024, 16384] +539: model.layers.39.self_attn.o_proj.weight shape: [16384, 16384] +540: model.layers.39.self_attn.q_proj.weight shape: [16384, 16384] +541: model.layers.39.self_attn.v_proj.weight shape: [1024, 16384] +542: model.layers.4.input_layernorm.weight shape: [16384] +543: model.layers.4.mlp.down_proj.weight shape: [16384, 53248] +544: model.layers.4.mlp.gate_proj.weight shape: [53248, 16384] +545: model.layers.4.mlp.up_proj.weight shape: [53248, 16384] +546: model.layers.4.post_attention_layernorm.weight shape: [16384] +547: model.layers.4.self_attn.k_proj.weight shape: [1024, 16384] +548: model.layers.4.self_attn.o_proj.weight shape: [16384, 16384] +549: model.layers.4.self_attn.q_proj.weight shape: [16384, 16384] +550: model.layers.4.self_attn.v_proj.weight shape: [1024, 16384] +551: model.layers.40.input_layernorm.weight shape: [16384] +552: model.layers.40.mlp.down_proj.weight shape: [16384, 53248] +553: model.layers.40.mlp.gate_proj.weight shape: [53248, 16384] +554: model.layers.40.mlp.up_proj.weight shape: [53248, 16384] +555: model.layers.40.post_attention_layernorm.weight shape: [16384] +556: model.layers.40.self_attn.k_proj.weight shape: [1024, 16384] +557: model.layers.40.self_attn.o_proj.weight shape: [16384, 16384] +558: model.layers.40.self_attn.q_proj.weight shape: [16384, 16384] +559: model.layers.40.self_attn.v_proj.weight shape: [1024, 16384] +560: model.layers.41.input_layernorm.weight shape: [16384] +561: model.layers.41.mlp.down_proj.weight shape: [16384, 53248] +562: model.layers.41.mlp.gate_proj.weight shape: [53248, 16384] +563: model.layers.41.mlp.up_proj.weight shape: [53248, 16384] +564: model.layers.41.post_attention_layernorm.weight shape: [16384] +565: model.layers.41.self_attn.k_proj.weight shape: [1024, 16384] +566: model.layers.41.self_attn.o_proj.weight shape: [16384, 16384] +567: model.layers.41.self_attn.q_proj.weight shape: [16384, 16384] +568: model.layers.41.self_attn.v_proj.weight shape: [1024, 16384] +569: model.layers.42.input_layernorm.weight shape: [16384] +570: model.layers.42.mlp.down_proj.weight shape: [16384, 53248] +571: model.layers.42.mlp.gate_proj.weight shape: [53248, 16384] +572: model.layers.42.mlp.up_proj.weight shape: [53248, 16384] +573: model.layers.42.post_attention_layernorm.weight shape: [16384] +574: model.layers.42.self_attn.k_proj.weight shape: [1024, 16384] 
+575: model.layers.42.self_attn.o_proj.weight shape: [16384, 16384] +576: model.layers.42.self_attn.q_proj.weight shape: [16384, 16384] +577: model.layers.42.self_attn.v_proj.weight shape: [1024, 16384] +578: model.layers.43.input_layernorm.weight shape: [16384] +579: model.layers.43.mlp.down_proj.weight shape: [16384, 53248] +580: model.layers.43.mlp.gate_proj.weight shape: [53248, 16384] +581: model.layers.43.mlp.up_proj.weight shape: [53248, 16384] +582: model.layers.43.post_attention_layernorm.weight shape: [16384] +583: model.layers.43.self_attn.k_proj.weight shape: [1024, 16384] +584: model.layers.43.self_attn.o_proj.weight shape: [16384, 16384] +585: model.layers.43.self_attn.q_proj.weight shape: [16384, 16384] +586: model.layers.43.self_attn.v_proj.weight shape: [1024, 16384] +587: model.layers.44.input_layernorm.weight shape: [16384] +588: model.layers.44.mlp.down_proj.weight shape: [16384, 53248] +589: model.layers.44.mlp.gate_proj.weight shape: [53248, 16384] +590: model.layers.44.mlp.up_proj.weight shape: [53248, 16384] +591: model.layers.44.post_attention_layernorm.weight shape: [16384] +592: model.layers.44.self_attn.k_proj.weight shape: [1024, 16384] +593: model.layers.44.self_attn.o_proj.weight shape: [16384, 16384] +594: model.layers.44.self_attn.q_proj.weight shape: [16384, 16384] +595: model.layers.44.self_attn.v_proj.weight shape: [1024, 16384] +596: model.layers.45.input_layernorm.weight shape: [16384] +597: model.layers.45.mlp.down_proj.weight shape: [16384, 53248] +598: model.layers.45.mlp.gate_proj.weight shape: [53248, 16384] +599: model.layers.45.mlp.up_proj.weight shape: [53248, 16384] +600: model.layers.45.post_attention_layernorm.weight shape: [16384] +601: model.layers.45.self_attn.k_proj.weight shape: [1024, 16384] +602: model.layers.45.self_attn.o_proj.weight shape: [16384, 16384] +603: model.layers.45.self_attn.q_proj.weight shape: [16384, 16384] +604: model.layers.45.self_attn.v_proj.weight shape: [1024, 16384] +605: model.layers.46.input_layernorm.weight shape: [16384] +606: model.layers.46.mlp.down_proj.weight shape: [16384, 53248] +607: model.layers.46.mlp.gate_proj.weight shape: [53248, 16384] +608: model.layers.46.mlp.up_proj.weight shape: [53248, 16384] +609: model.layers.46.post_attention_layernorm.weight shape: [16384] +610: model.layers.46.self_attn.k_proj.weight shape: [1024, 16384] +611: model.layers.46.self_attn.o_proj.weight shape: [16384, 16384] +612: model.layers.46.self_attn.q_proj.weight shape: [16384, 16384] +613: model.layers.46.self_attn.v_proj.weight shape: [1024, 16384] +614: model.layers.47.input_layernorm.weight shape: [16384] +615: model.layers.47.mlp.down_proj.weight shape: [16384, 53248] +616: model.layers.47.mlp.gate_proj.weight shape: [53248, 16384] +617: model.layers.47.mlp.up_proj.weight shape: [53248, 16384] +618: model.layers.47.post_attention_layernorm.weight shape: [16384] +619: model.layers.47.self_attn.k_proj.weight shape: [1024, 16384] +620: model.layers.47.self_attn.o_proj.weight shape: [16384, 16384] +621: model.layers.47.self_attn.q_proj.weight shape: [16384, 16384] +622: model.layers.47.self_attn.v_proj.weight shape: [1024, 16384] +623: model.layers.48.input_layernorm.weight shape: [16384] +624: model.layers.48.mlp.down_proj.weight shape: [16384, 53248] +625: model.layers.48.mlp.gate_proj.weight shape: [53248, 16384] +626: model.layers.48.mlp.up_proj.weight shape: [53248, 16384] +627: model.layers.48.post_attention_layernorm.weight shape: [16384] +628: model.layers.48.self_attn.k_proj.weight shape: [1024, 16384] 
+629: model.layers.48.self_attn.o_proj.weight shape: [16384, 16384] +630: model.layers.48.self_attn.q_proj.weight shape: [16384, 16384] +631: model.layers.48.self_attn.v_proj.weight shape: [1024, 16384] +632: model.layers.49.input_layernorm.weight shape: [16384] +633: model.layers.49.mlp.down_proj.weight shape: [16384, 53248] +634: model.layers.49.mlp.gate_proj.weight shape: [53248, 16384] +635: model.layers.49.mlp.up_proj.weight shape: [53248, 16384] +636: model.layers.49.post_attention_layernorm.weight shape: [16384] +637: model.layers.49.self_attn.k_proj.weight shape: [1024, 16384] +638: model.layers.49.self_attn.o_proj.weight shape: [16384, 16384] +639: model.layers.49.self_attn.q_proj.weight shape: [16384, 16384] +640: model.layers.49.self_attn.v_proj.weight shape: [1024, 16384] +641: model.layers.5.input_layernorm.weight shape: [16384] +642: model.layers.5.mlp.down_proj.weight shape: [16384, 53248] +643: model.layers.5.mlp.gate_proj.weight shape: [53248, 16384] +644: model.layers.5.mlp.up_proj.weight shape: [53248, 16384] +645: model.layers.5.post_attention_layernorm.weight shape: [16384] +646: model.layers.5.self_attn.k_proj.weight shape: [1024, 16384] +647: model.layers.5.self_attn.o_proj.weight shape: [16384, 16384] +648: model.layers.5.self_attn.q_proj.weight shape: [16384, 16384] +649: model.layers.5.self_attn.v_proj.weight shape: [1024, 16384] +650: model.layers.50.input_layernorm.weight shape: [16384] +651: model.layers.50.mlp.down_proj.weight shape: [16384, 53248] +652: model.layers.50.mlp.gate_proj.weight shape: [53248, 16384] +653: model.layers.50.mlp.up_proj.weight shape: [53248, 16384] +654: model.layers.50.post_attention_layernorm.weight shape: [16384] +655: model.layers.50.self_attn.k_proj.weight shape: [1024, 16384] +656: model.layers.50.self_attn.o_proj.weight shape: [16384, 16384] +657: model.layers.50.self_attn.q_proj.weight shape: [16384, 16384] +658: model.layers.50.self_attn.v_proj.weight shape: [1024, 16384] +659: model.layers.51.input_layernorm.weight shape: [16384] +660: model.layers.51.mlp.down_proj.weight shape: [16384, 53248] +661: model.layers.51.mlp.gate_proj.weight shape: [53248, 16384] +662: model.layers.51.mlp.up_proj.weight shape: [53248, 16384] +663: model.layers.51.post_attention_layernorm.weight shape: [16384] +664: model.layers.51.self_attn.k_proj.weight shape: [1024, 16384] +665: model.layers.51.self_attn.o_proj.weight shape: [16384, 16384] +666: model.layers.51.self_attn.q_proj.weight shape: [16384, 16384] +667: model.layers.51.self_attn.v_proj.weight shape: [1024, 16384] +668: model.layers.52.input_layernorm.weight shape: [16384] +669: model.layers.52.mlp.down_proj.weight shape: [16384, 53248] +670: model.layers.52.mlp.gate_proj.weight shape: [53248, 16384] +671: model.layers.52.mlp.up_proj.weight shape: [53248, 16384] +672: model.layers.52.post_attention_layernorm.weight shape: [16384] +673: model.layers.52.self_attn.k_proj.weight shape: [1024, 16384] +674: model.layers.52.self_attn.o_proj.weight shape: [16384, 16384] +675: model.layers.52.self_attn.q_proj.weight shape: [16384, 16384] +676: model.layers.52.self_attn.v_proj.weight shape: [1024, 16384] +677: model.layers.53.input_layernorm.weight shape: [16384] +678: model.layers.53.mlp.down_proj.weight shape: [16384, 53248] +679: model.layers.53.mlp.gate_proj.weight shape: [53248, 16384] +680: model.layers.53.mlp.up_proj.weight shape: [53248, 16384] +681: model.layers.53.post_attention_layernorm.weight shape: [16384] +682: model.layers.53.self_attn.k_proj.weight shape: [1024, 16384] +683: 
model.layers.53.self_attn.o_proj.weight shape: [16384, 16384] +684: model.layers.53.self_attn.q_proj.weight shape: [16384, 16384] +685: model.layers.53.self_attn.v_proj.weight shape: [1024, 16384] +686: model.layers.54.input_layernorm.weight shape: [16384] +687: model.layers.54.mlp.down_proj.weight shape: [16384, 53248] +688: model.layers.54.mlp.gate_proj.weight shape: [53248, 16384] +689: model.layers.54.mlp.up_proj.weight shape: [53248, 16384] +690: model.layers.54.post_attention_layernorm.weight shape: [16384] +691: model.layers.54.self_attn.k_proj.weight shape: [1024, 16384] +692: model.layers.54.self_attn.o_proj.weight shape: [16384, 16384] +693: model.layers.54.self_attn.q_proj.weight shape: [16384, 16384] +694: model.layers.54.self_attn.v_proj.weight shape: [1024, 16384] +695: model.layers.55.input_layernorm.weight shape: [16384] +696: model.layers.55.mlp.down_proj.weight shape: [16384, 53248] +697: model.layers.55.mlp.gate_proj.weight shape: [53248, 16384] +698: model.layers.55.mlp.up_proj.weight shape: [53248, 16384] +699: model.layers.55.post_attention_layernorm.weight shape: [16384] +700: model.layers.55.self_attn.k_proj.weight shape: [1024, 16384] +701: model.layers.55.self_attn.o_proj.weight shape: [16384, 16384] +702: model.layers.55.self_attn.q_proj.weight shape: [16384, 16384] +703: model.layers.55.self_attn.v_proj.weight shape: [1024, 16384] +704: model.layers.56.input_layernorm.weight shape: [16384] +705: model.layers.56.mlp.down_proj.weight shape: [16384, 53248] +706: model.layers.56.mlp.gate_proj.weight shape: [53248, 16384] +707: model.layers.56.mlp.up_proj.weight shape: [53248, 16384] +708: model.layers.56.post_attention_layernorm.weight shape: [16384] +709: model.layers.56.self_attn.k_proj.weight shape: [1024, 16384] +710: model.layers.56.self_attn.o_proj.weight shape: [16384, 16384] +711: model.layers.56.self_attn.q_proj.weight shape: [16384, 16384] +712: model.layers.56.self_attn.v_proj.weight shape: [1024, 16384] +713: model.layers.57.input_layernorm.weight shape: [16384] +714: model.layers.57.mlp.down_proj.weight shape: [16384, 53248] +715: model.layers.57.mlp.gate_proj.weight shape: [53248, 16384] +716: model.layers.57.mlp.up_proj.weight shape: [53248, 16384] +717: model.layers.57.post_attention_layernorm.weight shape: [16384] +718: model.layers.57.self_attn.k_proj.weight shape: [1024, 16384] +719: model.layers.57.self_attn.o_proj.weight shape: [16384, 16384] +720: model.layers.57.self_attn.q_proj.weight shape: [16384, 16384] +721: model.layers.57.self_attn.v_proj.weight shape: [1024, 16384] +722: model.layers.58.input_layernorm.weight shape: [16384] +723: model.layers.58.mlp.down_proj.weight shape: [16384, 53248] +724: model.layers.58.mlp.gate_proj.weight shape: [53248, 16384] +725: model.layers.58.mlp.up_proj.weight shape: [53248, 16384] +726: model.layers.58.post_attention_layernorm.weight shape: [16384] +727: model.layers.58.self_attn.k_proj.weight shape: [1024, 16384] +728: model.layers.58.self_attn.o_proj.weight shape: [16384, 16384] +729: model.layers.58.self_attn.q_proj.weight shape: [16384, 16384] +730: model.layers.58.self_attn.v_proj.weight shape: [1024, 16384] +731: model.layers.59.input_layernorm.weight shape: [16384] +732: model.layers.59.mlp.down_proj.weight shape: [16384, 53248] +733: model.layers.59.mlp.gate_proj.weight shape: [53248, 16384] +734: model.layers.59.mlp.up_proj.weight shape: [53248, 16384] +735: model.layers.59.post_attention_layernorm.weight shape: [16384] +736: model.layers.59.self_attn.k_proj.weight shape: [1024, 16384] +737: 
model.layers.59.self_attn.o_proj.weight shape: [16384, 16384] +738: model.layers.59.self_attn.q_proj.weight shape: [16384, 16384] +739: model.layers.59.self_attn.v_proj.weight shape: [1024, 16384] +740: model.layers.6.input_layernorm.weight shape: [16384] +741: model.layers.6.mlp.down_proj.weight shape: [16384, 53248] +742: model.layers.6.mlp.gate_proj.weight shape: [53248, 16384] +743: model.layers.6.mlp.up_proj.weight shape: [53248, 16384] +744: model.layers.6.post_attention_layernorm.weight shape: [16384] +745: model.layers.6.self_attn.k_proj.weight shape: [1024, 16384] +746: model.layers.6.self_attn.o_proj.weight shape: [16384, 16384] +747: model.layers.6.self_attn.q_proj.weight shape: [16384, 16384] +748: model.layers.6.self_attn.v_proj.weight shape: [1024, 16384] +749: model.layers.60.input_layernorm.weight shape: [16384] +750: model.layers.60.mlp.down_proj.weight shape: [16384, 53248] +751: model.layers.60.mlp.gate_proj.weight shape: [53248, 16384] +752: model.layers.60.mlp.up_proj.weight shape: [53248, 16384] +753: model.layers.60.post_attention_layernorm.weight shape: [16384] +754: model.layers.60.self_attn.k_proj.weight shape: [1024, 16384] +755: model.layers.60.self_attn.o_proj.weight shape: [16384, 16384] +756: model.layers.60.self_attn.q_proj.weight shape: [16384, 16384] +757: model.layers.60.self_attn.v_proj.weight shape: [1024, 16384] +758: model.layers.61.input_layernorm.weight shape: [16384] +759: model.layers.61.mlp.down_proj.weight shape: [16384, 53248] +760: model.layers.61.mlp.gate_proj.weight shape: [53248, 16384] +761: model.layers.61.mlp.up_proj.weight shape: [53248, 16384] +762: model.layers.61.post_attention_layernorm.weight shape: [16384] +763: model.layers.61.self_attn.k_proj.weight shape: [1024, 16384] +764: model.layers.61.self_attn.o_proj.weight shape: [16384, 16384] +765: model.layers.61.self_attn.q_proj.weight shape: [16384, 16384] +766: model.layers.61.self_attn.v_proj.weight shape: [1024, 16384] +767: model.layers.62.input_layernorm.weight shape: [16384] +768: model.layers.62.mlp.down_proj.weight shape: [16384, 53248] +769: model.layers.62.mlp.gate_proj.weight shape: [53248, 16384] +770: model.layers.62.mlp.up_proj.weight shape: [53248, 16384] +771: model.layers.62.post_attention_layernorm.weight shape: [16384] +772: model.layers.62.self_attn.k_proj.weight shape: [1024, 16384] +773: model.layers.62.self_attn.o_proj.weight shape: [16384, 16384] +774: model.layers.62.self_attn.q_proj.weight shape: [16384, 16384] +775: model.layers.62.self_attn.v_proj.weight shape: [1024, 16384] +776: model.layers.63.input_layernorm.weight shape: [16384] +777: model.layers.63.mlp.down_proj.weight shape: [16384, 53248] +778: model.layers.63.mlp.gate_proj.weight shape: [53248, 16384] +779: model.layers.63.mlp.up_proj.weight shape: [53248, 16384] +780: model.layers.63.post_attention_layernorm.weight shape: [16384] +781: model.layers.63.self_attn.k_proj.weight shape: [1024, 16384] +782: model.layers.63.self_attn.o_proj.weight shape: [16384, 16384] +783: model.layers.63.self_attn.q_proj.weight shape: [16384, 16384] +784: model.layers.63.self_attn.v_proj.weight shape: [1024, 16384] +785: model.layers.64.input_layernorm.weight shape: [16384] +786: model.layers.64.mlp.down_proj.weight shape: [16384, 53248] +787: model.layers.64.mlp.gate_proj.weight shape: [53248, 16384] +788: model.layers.64.mlp.up_proj.weight shape: [53248, 16384] +789: model.layers.64.post_attention_layernorm.weight shape: [16384] +790: model.layers.64.self_attn.k_proj.weight shape: [1024, 16384] +791: 
model.layers.64.self_attn.o_proj.weight shape: [16384, 16384] +792: model.layers.64.self_attn.q_proj.weight shape: [16384, 16384] +793: model.layers.64.self_attn.v_proj.weight shape: [1024, 16384] +794: model.layers.65.input_layernorm.weight shape: [16384] +795: model.layers.65.mlp.down_proj.weight shape: [16384, 53248] +796: model.layers.65.mlp.gate_proj.weight shape: [53248, 16384] +797: model.layers.65.mlp.up_proj.weight shape: [53248, 16384] +798: model.layers.65.post_attention_layernorm.weight shape: [16384] +799: model.layers.65.self_attn.k_proj.weight shape: [1024, 16384] +800: model.layers.65.self_attn.o_proj.weight shape: [16384, 16384] +801: model.layers.65.self_attn.q_proj.weight shape: [16384, 16384] +802: model.layers.65.self_attn.v_proj.weight shape: [1024, 16384] +803: model.layers.66.input_layernorm.weight shape: [16384] +804: model.layers.66.mlp.down_proj.weight shape: [16384, 53248] +805: model.layers.66.mlp.gate_proj.weight shape: [53248, 16384] +806: model.layers.66.mlp.up_proj.weight shape: [53248, 16384] +807: model.layers.66.post_attention_layernorm.weight shape: [16384] +808: model.layers.66.self_attn.k_proj.weight shape: [1024, 16384] +809: model.layers.66.self_attn.o_proj.weight shape: [16384, 16384] +810: model.layers.66.self_attn.q_proj.weight shape: [16384, 16384] +811: model.layers.66.self_attn.v_proj.weight shape: [1024, 16384] +812: model.layers.67.input_layernorm.weight shape: [16384] +813: model.layers.67.mlp.down_proj.weight shape: [16384, 53248] +814: model.layers.67.mlp.gate_proj.weight shape: [53248, 16384] +815: model.layers.67.mlp.up_proj.weight shape: [53248, 16384] +816: model.layers.67.post_attention_layernorm.weight shape: [16384] +817: model.layers.67.self_attn.k_proj.weight shape: [1024, 16384] +818: model.layers.67.self_attn.o_proj.weight shape: [16384, 16384] +819: model.layers.67.self_attn.q_proj.weight shape: [16384, 16384] +820: model.layers.67.self_attn.v_proj.weight shape: [1024, 16384] +821: model.layers.68.input_layernorm.weight shape: [16384] +822: model.layers.68.mlp.down_proj.weight shape: [16384, 53248] +823: model.layers.68.mlp.gate_proj.weight shape: [53248, 16384] +824: model.layers.68.mlp.up_proj.weight shape: [53248, 16384] +825: model.layers.68.post_attention_layernorm.weight shape: [16384] +826: model.layers.68.self_attn.k_proj.weight shape: [1024, 16384] +827: model.layers.68.self_attn.o_proj.weight shape: [16384, 16384] +828: model.layers.68.self_attn.q_proj.weight shape: [16384, 16384] +829: model.layers.68.self_attn.v_proj.weight shape: [1024, 16384] +830: model.layers.69.input_layernorm.weight shape: [16384] +831: model.layers.69.mlp.down_proj.weight shape: [16384, 53248] +832: model.layers.69.mlp.gate_proj.weight shape: [53248, 16384] +833: model.layers.69.mlp.up_proj.weight shape: [53248, 16384] +834: model.layers.69.post_attention_layernorm.weight shape: [16384] +835: model.layers.69.self_attn.k_proj.weight shape: [1024, 16384] +836: model.layers.69.self_attn.o_proj.weight shape: [16384, 16384] +837: model.layers.69.self_attn.q_proj.weight shape: [16384, 16384] +838: model.layers.69.self_attn.v_proj.weight shape: [1024, 16384] +839: model.layers.7.input_layernorm.weight shape: [16384] +840: model.layers.7.mlp.down_proj.weight shape: [16384, 53248] +841: model.layers.7.mlp.gate_proj.weight shape: [53248, 16384] +842: model.layers.7.mlp.up_proj.weight shape: [53248, 16384] +843: model.layers.7.post_attention_layernorm.weight shape: [16384] +844: model.layers.7.self_attn.k_proj.weight shape: [1024, 16384] +845: 
model.layers.7.self_attn.o_proj.weight shape: [16384, 16384] +846: model.layers.7.self_attn.q_proj.weight shape: [16384, 16384] +847: model.layers.7.self_attn.v_proj.weight shape: [1024, 16384] +848: model.layers.70.input_layernorm.weight shape: [16384] +849: model.layers.70.mlp.down_proj.weight shape: [16384, 53248] +850: model.layers.70.mlp.gate_proj.weight shape: [53248, 16384] +851: model.layers.70.mlp.up_proj.weight shape: [53248, 16384] +852: model.layers.70.post_attention_layernorm.weight shape: [16384] +853: model.layers.70.self_attn.k_proj.weight shape: [1024, 16384] +854: model.layers.70.self_attn.o_proj.weight shape: [16384, 16384] +855: model.layers.70.self_attn.q_proj.weight shape: [16384, 16384] +856: model.layers.70.self_attn.v_proj.weight shape: [1024, 16384] +857: model.layers.71.input_layernorm.weight shape: [16384] +858: model.layers.71.mlp.down_proj.weight shape: [16384, 53248] +859: model.layers.71.mlp.gate_proj.weight shape: [53248, 16384] +860: model.layers.71.mlp.up_proj.weight shape: [53248, 16384] +861: model.layers.71.post_attention_layernorm.weight shape: [16384] +862: model.layers.71.self_attn.k_proj.weight shape: [1024, 16384] +863: model.layers.71.self_attn.o_proj.weight shape: [16384, 16384] +864: model.layers.71.self_attn.q_proj.weight shape: [16384, 16384] +865: model.layers.71.self_attn.v_proj.weight shape: [1024, 16384] +866: model.layers.72.input_layernorm.weight shape: [16384] +867: model.layers.72.mlp.down_proj.weight shape: [16384, 53248] +868: model.layers.72.mlp.gate_proj.weight shape: [53248, 16384] +869: model.layers.72.mlp.up_proj.weight shape: [53248, 16384] +870: model.layers.72.post_attention_layernorm.weight shape: [16384] +871: model.layers.72.self_attn.k_proj.weight shape: [1024, 16384] +872: model.layers.72.self_attn.o_proj.weight shape: [16384, 16384] +873: model.layers.72.self_attn.q_proj.weight shape: [16384, 16384] +874: model.layers.72.self_attn.v_proj.weight shape: [1024, 16384] +875: model.layers.73.input_layernorm.weight shape: [16384] +876: model.layers.73.mlp.down_proj.weight shape: [16384, 53248] +877: model.layers.73.mlp.gate_proj.weight shape: [53248, 16384] +878: model.layers.73.mlp.up_proj.weight shape: [53248, 16384] +879: model.layers.73.post_attention_layernorm.weight shape: [16384] +880: model.layers.73.self_attn.k_proj.weight shape: [1024, 16384] +881: model.layers.73.self_attn.o_proj.weight shape: [16384, 16384] +882: model.layers.73.self_attn.q_proj.weight shape: [16384, 16384] +883: model.layers.73.self_attn.v_proj.weight shape: [1024, 16384] +884: model.layers.74.input_layernorm.weight shape: [16384] +885: model.layers.74.mlp.down_proj.weight shape: [16384, 53248] +886: model.layers.74.mlp.gate_proj.weight shape: [53248, 16384] +887: model.layers.74.mlp.up_proj.weight shape: [53248, 16384] +888: model.layers.74.post_attention_layernorm.weight shape: [16384] +889: model.layers.74.self_attn.k_proj.weight shape: [1024, 16384] +890: model.layers.74.self_attn.o_proj.weight shape: [16384, 16384] +891: model.layers.74.self_attn.q_proj.weight shape: [16384, 16384] +892: model.layers.74.self_attn.v_proj.weight shape: [1024, 16384] +893: model.layers.75.input_layernorm.weight shape: [16384] +894: model.layers.75.mlp.down_proj.weight shape: [16384, 53248] +895: model.layers.75.mlp.gate_proj.weight shape: [53248, 16384] +896: model.layers.75.mlp.up_proj.weight shape: [53248, 16384] +897: model.layers.75.post_attention_layernorm.weight shape: [16384] +898: model.layers.75.self_attn.k_proj.weight shape: [1024, 16384] +899: 
model.layers.75.self_attn.o_proj.weight shape: [16384, 16384] +900: model.layers.75.self_attn.q_proj.weight shape: [16384, 16384] +901: model.layers.75.self_attn.v_proj.weight shape: [1024, 16384] +902: model.layers.76.input_layernorm.weight shape: [16384] +903: model.layers.76.mlp.down_proj.weight shape: [16384, 53248] +904: model.layers.76.mlp.gate_proj.weight shape: [53248, 16384] +905: model.layers.76.mlp.up_proj.weight shape: [53248, 16384] +906: model.layers.76.post_attention_layernorm.weight shape: [16384] +907: model.layers.76.self_attn.k_proj.weight shape: [1024, 16384] +908: model.layers.76.self_attn.o_proj.weight shape: [16384, 16384] +909: model.layers.76.self_attn.q_proj.weight shape: [16384, 16384] +910: model.layers.76.self_attn.v_proj.weight shape: [1024, 16384] +911: model.layers.77.input_layernorm.weight shape: [16384] +912: model.layers.77.mlp.down_proj.weight shape: [16384, 53248] +913: model.layers.77.mlp.gate_proj.weight shape: [53248, 16384] +914: model.layers.77.mlp.up_proj.weight shape: [53248, 16384] +915: model.layers.77.post_attention_layernorm.weight shape: [16384] +916: model.layers.77.self_attn.k_proj.weight shape: [1024, 16384] +917: model.layers.77.self_attn.o_proj.weight shape: [16384, 16384] +918: model.layers.77.self_attn.q_proj.weight shape: [16384, 16384] +919: model.layers.77.self_attn.v_proj.weight shape: [1024, 16384] +920: model.layers.78.input_layernorm.weight shape: [16384] +921: model.layers.78.mlp.down_proj.weight shape: [16384, 53248] +922: model.layers.78.mlp.gate_proj.weight shape: [53248, 16384] +923: model.layers.78.mlp.up_proj.weight shape: [53248, 16384] +924: model.layers.78.post_attention_layernorm.weight shape: [16384] +925: model.layers.78.self_attn.k_proj.weight shape: [1024, 16384] +926: model.layers.78.self_attn.o_proj.weight shape: [16384, 16384] +927: model.layers.78.self_attn.q_proj.weight shape: [16384, 16384] +928: model.layers.78.self_attn.v_proj.weight shape: [1024, 16384] +929: model.layers.79.input_layernorm.weight shape: [16384] +930: model.layers.79.mlp.down_proj.weight shape: [16384, 53248] +931: model.layers.79.mlp.gate_proj.weight shape: [53248, 16384] +932: model.layers.79.mlp.up_proj.weight shape: [53248, 16384] +933: model.layers.79.post_attention_layernorm.weight shape: [16384] +934: model.layers.79.self_attn.k_proj.weight shape: [1024, 16384] +935: model.layers.79.self_attn.o_proj.weight shape: [16384, 16384] +936: model.layers.79.self_attn.q_proj.weight shape: [16384, 16384] +937: model.layers.79.self_attn.v_proj.weight shape: [1024, 16384] +938: model.layers.8.input_layernorm.weight shape: [16384] +939: model.layers.8.mlp.down_proj.weight shape: [16384, 53248] +940: model.layers.8.mlp.gate_proj.weight shape: [53248, 16384] +941: model.layers.8.mlp.up_proj.weight shape: [53248, 16384] +942: model.layers.8.post_attention_layernorm.weight shape: [16384] +943: model.layers.8.self_attn.k_proj.weight shape: [1024, 16384] +944: model.layers.8.self_attn.o_proj.weight shape: [16384, 16384] +945: model.layers.8.self_attn.q_proj.weight shape: [16384, 16384] +946: model.layers.8.self_attn.v_proj.weight shape: [1024, 16384] +947: model.layers.80.input_layernorm.weight shape: [16384] +948: model.layers.80.mlp.down_proj.weight shape: [16384, 53248] +949: model.layers.80.mlp.gate_proj.weight shape: [53248, 16384] +950: model.layers.80.mlp.up_proj.weight shape: [53248, 16384] +951: model.layers.80.post_attention_layernorm.weight shape: [16384] +952: model.layers.80.self_attn.k_proj.weight shape: [1024, 16384] +953: 
model.layers.80.self_attn.o_proj.weight shape: [16384, 16384] +954: model.layers.80.self_attn.q_proj.weight shape: [16384, 16384] +955: model.layers.80.self_attn.v_proj.weight shape: [1024, 16384] +956: model.layers.81.input_layernorm.weight shape: [16384] +957: model.layers.81.mlp.down_proj.weight shape: [16384, 53248] +958: model.layers.81.mlp.gate_proj.weight shape: [53248, 16384] +959: model.layers.81.mlp.up_proj.weight shape: [53248, 16384] +960: model.layers.81.post_attention_layernorm.weight shape: [16384] +961: model.layers.81.self_attn.k_proj.weight shape: [1024, 16384] +962: model.layers.81.self_attn.o_proj.weight shape: [16384, 16384] +963: model.layers.81.self_attn.q_proj.weight shape: [16384, 16384] +964: model.layers.81.self_attn.v_proj.weight shape: [1024, 16384] +965: model.layers.82.input_layernorm.weight shape: [16384] +966: model.layers.82.mlp.down_proj.weight shape: [16384, 53248] +967: model.layers.82.mlp.gate_proj.weight shape: [53248, 16384] +968: model.layers.82.mlp.up_proj.weight shape: [53248, 16384] +969: model.layers.82.post_attention_layernorm.weight shape: [16384] +970: model.layers.82.self_attn.k_proj.weight shape: [1024, 16384] +971: model.layers.82.self_attn.o_proj.weight shape: [16384, 16384] +972: model.layers.82.self_attn.q_proj.weight shape: [16384, 16384] +973: model.layers.82.self_attn.v_proj.weight shape: [1024, 16384] +974: model.layers.83.input_layernorm.weight shape: [16384] +975: model.layers.83.mlp.down_proj.weight shape: [16384, 53248] +976: model.layers.83.mlp.gate_proj.weight shape: [53248, 16384] +977: model.layers.83.mlp.up_proj.weight shape: [53248, 16384] +978: model.layers.83.post_attention_layernorm.weight shape: [16384] +979: model.layers.83.self_attn.k_proj.weight shape: [1024, 16384] +980: model.layers.83.self_attn.o_proj.weight shape: [16384, 16384] +981: model.layers.83.self_attn.q_proj.weight shape: [16384, 16384] +982: model.layers.83.self_attn.v_proj.weight shape: [1024, 16384] +983: model.layers.84.input_layernorm.weight shape: [16384] +984: model.layers.84.mlp.down_proj.weight shape: [16384, 53248] +985: model.layers.84.mlp.gate_proj.weight shape: [53248, 16384] +986: model.layers.84.mlp.up_proj.weight shape: [53248, 16384] +987: model.layers.84.post_attention_layernorm.weight shape: [16384] +988: model.layers.84.self_attn.k_proj.weight shape: [1024, 16384] +989: model.layers.84.self_attn.o_proj.weight shape: [16384, 16384] +990: model.layers.84.self_attn.q_proj.weight shape: [16384, 16384] +991: model.layers.84.self_attn.v_proj.weight shape: [1024, 16384] +992: model.layers.85.input_layernorm.weight shape: [16384] +993: model.layers.85.mlp.down_proj.weight shape: [16384, 53248] +994: model.layers.85.mlp.gate_proj.weight shape: [53248, 16384] +995: model.layers.85.mlp.up_proj.weight shape: [53248, 16384] +996: model.layers.85.post_attention_layernorm.weight shape: [16384] +997: model.layers.85.self_attn.k_proj.weight shape: [1024, 16384] +998: model.layers.85.self_attn.o_proj.weight shape: [16384, 16384] +999: model.layers.85.self_attn.q_proj.weight shape: [16384, 16384] +1000: model.layers.85.self_attn.v_proj.weight shape: [1024, 16384] +1001: model.layers.86.input_layernorm.weight shape: [16384] +1002: model.layers.86.mlp.down_proj.weight shape: [16384, 53248] +1003: model.layers.86.mlp.gate_proj.weight shape: [53248, 16384] +1004: model.layers.86.mlp.up_proj.weight shape: [53248, 16384] +1005: model.layers.86.post_attention_layernorm.weight shape: [16384] +1006: model.layers.86.self_attn.k_proj.weight shape: [1024, 16384] 
+1007: model.layers.86.self_attn.o_proj.weight shape: [16384, 16384] +1008: model.layers.86.self_attn.q_proj.weight shape: [16384, 16384] +1009: model.layers.86.self_attn.v_proj.weight shape: [1024, 16384] +1010: model.layers.87.input_layernorm.weight shape: [16384] +1011: model.layers.87.mlp.down_proj.weight shape: [16384, 53248] +1012: model.layers.87.mlp.gate_proj.weight shape: [53248, 16384] +1013: model.layers.87.mlp.up_proj.weight shape: [53248, 16384] +1014: model.layers.87.post_attention_layernorm.weight shape: [16384] +1015: model.layers.87.self_attn.k_proj.weight shape: [1024, 16384] +1016: model.layers.87.self_attn.o_proj.weight shape: [16384, 16384] +1017: model.layers.87.self_attn.q_proj.weight shape: [16384, 16384] +1018: model.layers.87.self_attn.v_proj.weight shape: [1024, 16384] +1019: model.layers.88.input_layernorm.weight shape: [16384] +1020: model.layers.88.mlp.down_proj.weight shape: [16384, 53248] +1021: model.layers.88.mlp.gate_proj.weight shape: [53248, 16384] +1022: model.layers.88.mlp.up_proj.weight shape: [53248, 16384] +1023: model.layers.88.post_attention_layernorm.weight shape: [16384] +1024: model.layers.88.self_attn.k_proj.weight shape: [1024, 16384] +1025: model.layers.88.self_attn.o_proj.weight shape: [16384, 16384] +1026: model.layers.88.self_attn.q_proj.weight shape: [16384, 16384] +1027: model.layers.88.self_attn.v_proj.weight shape: [1024, 16384] +1028: model.layers.89.input_layernorm.weight shape: [16384] +1029: model.layers.89.mlp.down_proj.weight shape: [16384, 53248] +1030: model.layers.89.mlp.gate_proj.weight shape: [53248, 16384] +1031: model.layers.89.mlp.up_proj.weight shape: [53248, 16384] +1032: model.layers.89.post_attention_layernorm.weight shape: [16384] +1033: model.layers.89.self_attn.k_proj.weight shape: [1024, 16384] +1034: model.layers.89.self_attn.o_proj.weight shape: [16384, 16384] +1035: model.layers.89.self_attn.q_proj.weight shape: [16384, 16384] +1036: model.layers.89.self_attn.v_proj.weight shape: [1024, 16384] +1037: model.layers.9.input_layernorm.weight shape: [16384] +1038: model.layers.9.mlp.down_proj.weight shape: [16384, 53248] +1039: model.layers.9.mlp.gate_proj.weight shape: [53248, 16384] +1040: model.layers.9.mlp.up_proj.weight shape: [53248, 16384] +1041: model.layers.9.post_attention_layernorm.weight shape: [16384] +1042: model.layers.9.self_attn.k_proj.weight shape: [1024, 16384] +1043: model.layers.9.self_attn.o_proj.weight shape: [16384, 16384] +1044: model.layers.9.self_attn.q_proj.weight shape: [16384, 16384] +1045: model.layers.9.self_attn.v_proj.weight shape: [1024, 16384] +1046: model.layers.90.input_layernorm.weight shape: [16384] +1047: model.layers.90.mlp.down_proj.weight shape: [16384, 53248] +1048: model.layers.90.mlp.gate_proj.weight shape: [53248, 16384] +1049: model.layers.90.mlp.up_proj.weight shape: [53248, 16384] +1050: model.layers.90.post_attention_layernorm.weight shape: [16384] +1051: model.layers.90.self_attn.k_proj.weight shape: [1024, 16384] +1052: model.layers.90.self_attn.o_proj.weight shape: [16384, 16384] +1053: model.layers.90.self_attn.q_proj.weight shape: [16384, 16384] +1054: model.layers.90.self_attn.v_proj.weight shape: [1024, 16384] +1055: model.layers.91.input_layernorm.weight shape: [16384] +1056: model.layers.91.mlp.down_proj.weight shape: [16384, 53248] +1057: model.layers.91.mlp.gate_proj.weight shape: [53248, 16384] +1058: model.layers.91.mlp.up_proj.weight shape: [53248, 16384] +1059: model.layers.91.post_attention_layernorm.weight shape: [16384] +1060: 
model.layers.91.self_attn.k_proj.weight shape: [1024, 16384] +1061: model.layers.91.self_attn.o_proj.weight shape: [16384, 16384] +1062: model.layers.91.self_attn.q_proj.weight shape: [16384, 16384] +1063: model.layers.91.self_attn.v_proj.weight shape: [1024, 16384] +1064: model.layers.92.input_layernorm.weight shape: [16384] +1065: model.layers.92.mlp.down_proj.weight shape: [16384, 53248] +1066: model.layers.92.mlp.gate_proj.weight shape: [53248, 16384] +1067: model.layers.92.mlp.up_proj.weight shape: [53248, 16384] +1068: model.layers.92.post_attention_layernorm.weight shape: [16384] +1069: model.layers.92.self_attn.k_proj.weight shape: [1024, 16384] +1070: model.layers.92.self_attn.o_proj.weight shape: [16384, 16384] +1071: model.layers.92.self_attn.q_proj.weight shape: [16384, 16384] +1072: model.layers.92.self_attn.v_proj.weight shape: [1024, 16384] +1073: model.layers.93.input_layernorm.weight shape: [16384] +1074: model.layers.93.mlp.down_proj.weight shape: [16384, 53248] +1075: model.layers.93.mlp.gate_proj.weight shape: [53248, 16384] +1076: model.layers.93.mlp.up_proj.weight shape: [53248, 16384] +1077: model.layers.93.post_attention_layernorm.weight shape: [16384] +1078: model.layers.93.self_attn.k_proj.weight shape: [1024, 16384] +1079: model.layers.93.self_attn.o_proj.weight shape: [16384, 16384] +1080: model.layers.93.self_attn.q_proj.weight shape: [16384, 16384] +1081: model.layers.93.self_attn.v_proj.weight shape: [1024, 16384] +1082: model.layers.94.input_layernorm.weight shape: [16384] +1083: model.layers.94.mlp.down_proj.weight shape: [16384, 53248] +1084: model.layers.94.mlp.gate_proj.weight shape: [53248, 16384] +1085: model.layers.94.mlp.up_proj.weight shape: [53248, 16384] +1086: model.layers.94.post_attention_layernorm.weight shape: [16384] +1087: model.layers.94.self_attn.k_proj.weight shape: [1024, 16384] +1088: model.layers.94.self_attn.o_proj.weight shape: [16384, 16384] +1089: model.layers.94.self_attn.q_proj.weight shape: [16384, 16384] +1090: model.layers.94.self_attn.v_proj.weight shape: [1024, 16384] +1091: model.layers.95.input_layernorm.weight shape: [16384] +1092: model.layers.95.mlp.down_proj.weight shape: [16384, 53248] +1093: model.layers.95.mlp.gate_proj.weight shape: [53248, 16384] +1094: model.layers.95.mlp.up_proj.weight shape: [53248, 16384] +1095: model.layers.95.post_attention_layernorm.weight shape: [16384] +1096: model.layers.95.self_attn.k_proj.weight shape: [1024, 16384] +1097: model.layers.95.self_attn.o_proj.weight shape: [16384, 16384] +1098: model.layers.95.self_attn.q_proj.weight shape: [16384, 16384] +1099: model.layers.95.self_attn.v_proj.weight shape: [1024, 16384] +1100: model.layers.96.input_layernorm.weight shape: [16384] +1101: model.layers.96.mlp.down_proj.weight shape: [16384, 53248] +1102: model.layers.96.mlp.gate_proj.weight shape: [53248, 16384] +1103: model.layers.96.mlp.up_proj.weight shape: [53248, 16384] +1104: model.layers.96.post_attention_layernorm.weight shape: [16384] +1105: model.layers.96.self_attn.k_proj.weight shape: [1024, 16384] +1106: model.layers.96.self_attn.o_proj.weight shape: [16384, 16384] +1107: model.layers.96.self_attn.q_proj.weight shape: [16384, 16384] +1108: model.layers.96.self_attn.v_proj.weight shape: [1024, 16384] +1109: model.layers.97.input_layernorm.weight shape: [16384] +1110: model.layers.97.mlp.down_proj.weight shape: [16384, 53248] +1111: model.layers.97.mlp.gate_proj.weight shape: [53248, 16384] +1112: model.layers.97.mlp.up_proj.weight shape: [53248, 16384] +1113: 
model.layers.97.post_attention_layernorm.weight shape: [16384] +1114: model.layers.97.self_attn.k_proj.weight shape: [1024, 16384] +1115: model.layers.97.self_attn.o_proj.weight shape: [16384, 16384] +1116: model.layers.97.self_attn.q_proj.weight shape: [16384, 16384] +1117: model.layers.97.self_attn.v_proj.weight shape: [1024, 16384] +1118: model.layers.98.input_layernorm.weight shape: [16384] +1119: model.layers.98.mlp.down_proj.weight shape: [16384, 53248] +1120: model.layers.98.mlp.gate_proj.weight shape: [53248, 16384] +1121: model.layers.98.mlp.up_proj.weight shape: [53248, 16384] +1122: model.layers.98.post_attention_layernorm.weight shape: [16384] +1123: model.layers.98.self_attn.k_proj.weight shape: [1024, 16384] +1124: model.layers.98.self_attn.o_proj.weight shape: [16384, 16384] +1125: model.layers.98.self_attn.q_proj.weight shape: [16384, 16384] +1126: model.layers.98.self_attn.v_proj.weight shape: [1024, 16384] +1127: model.layers.99.input_layernorm.weight shape: [16384] +1128: model.layers.99.mlp.down_proj.weight shape: [16384, 53248] +1129: model.layers.99.mlp.gate_proj.weight shape: [53248, 16384] +1130: model.layers.99.mlp.up_proj.weight shape: [53248, 16384] +1131: model.layers.99.post_attention_layernorm.weight shape: [16384] +1132: model.layers.99.self_attn.k_proj.weight shape: [1024, 16384] +1133: model.layers.99.self_attn.o_proj.weight shape: [16384, 16384] +1134: model.layers.99.self_attn.q_proj.weight shape: [16384, 16384] +1135: model.layers.99.self_attn.v_proj.weight shape: [1024, 16384] +1136: model.norm.weight shape: [16384] diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_70b_ShapeTest.approved.txt b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_70b_ShapeTest.approved.txt new file mode 100644 index 0000000000..5add8770c5 --- /dev/null +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.Llama_3_1_70b_ShapeTest.approved.txt @@ -0,0 +1,723 @@ +0: lm_head.weight shape: [128256, 8192] +1: model.embed_tokens.weight shape: [128256, 8192] +2: model.layers.0.input_layernorm.weight shape: [8192] +3: model.layers.0.mlp.down_proj.weight shape: [8192, 28672] +4: model.layers.0.mlp.gate_proj.weight shape: [28672, 8192] +5: model.layers.0.mlp.up_proj.weight shape: [28672, 8192] +6: model.layers.0.post_attention_layernorm.weight shape: [8192] +7: model.layers.0.self_attn.k_proj.weight shape: [1024, 8192] +8: model.layers.0.self_attn.o_proj.weight shape: [8192, 8192] +9: model.layers.0.self_attn.q_proj.weight shape: [8192, 8192] +10: model.layers.0.self_attn.v_proj.weight shape: [1024, 8192] +11: model.layers.1.input_layernorm.weight shape: [8192] +12: model.layers.1.mlp.down_proj.weight shape: [8192, 28672] +13: model.layers.1.mlp.gate_proj.weight shape: [28672, 8192] +14: model.layers.1.mlp.up_proj.weight shape: [28672, 8192] +15: model.layers.1.post_attention_layernorm.weight shape: [8192] +16: model.layers.1.self_attn.k_proj.weight shape: [1024, 8192] +17: model.layers.1.self_attn.o_proj.weight shape: [8192, 8192] +18: model.layers.1.self_attn.q_proj.weight shape: [8192, 8192] +19: model.layers.1.self_attn.v_proj.weight shape: [1024, 8192] +20: model.layers.10.input_layernorm.weight shape: [8192] +21: model.layers.10.mlp.down_proj.weight shape: [8192, 28672] +22: model.layers.10.mlp.gate_proj.weight shape: [28672, 8192] +23: model.layers.10.mlp.up_proj.weight shape: [28672, 8192] +24: model.layers.10.post_attention_layernorm.weight shape: [8192] +25: model.layers.10.self_attn.k_proj.weight shape: [1024, 
8192] +26: model.layers.10.self_attn.o_proj.weight shape: [8192, 8192] +27: model.layers.10.self_attn.q_proj.weight shape: [8192, 8192] +28: model.layers.10.self_attn.v_proj.weight shape: [1024, 8192] +29: model.layers.11.input_layernorm.weight shape: [8192] +30: model.layers.11.mlp.down_proj.weight shape: [8192, 28672] +31: model.layers.11.mlp.gate_proj.weight shape: [28672, 8192] +32: model.layers.11.mlp.up_proj.weight shape: [28672, 8192] +33: model.layers.11.post_attention_layernorm.weight shape: [8192] +34: model.layers.11.self_attn.k_proj.weight shape: [1024, 8192] +35: model.layers.11.self_attn.o_proj.weight shape: [8192, 8192] +36: model.layers.11.self_attn.q_proj.weight shape: [8192, 8192] +37: model.layers.11.self_attn.v_proj.weight shape: [1024, 8192] +38: model.layers.12.input_layernorm.weight shape: [8192] +39: model.layers.12.mlp.down_proj.weight shape: [8192, 28672] +40: model.layers.12.mlp.gate_proj.weight shape: [28672, 8192] +41: model.layers.12.mlp.up_proj.weight shape: [28672, 8192] +42: model.layers.12.post_attention_layernorm.weight shape: [8192] +43: model.layers.12.self_attn.k_proj.weight shape: [1024, 8192] +44: model.layers.12.self_attn.o_proj.weight shape: [8192, 8192] +45: model.layers.12.self_attn.q_proj.weight shape: [8192, 8192] +46: model.layers.12.self_attn.v_proj.weight shape: [1024, 8192] +47: model.layers.13.input_layernorm.weight shape: [8192] +48: model.layers.13.mlp.down_proj.weight shape: [8192, 28672] +49: model.layers.13.mlp.gate_proj.weight shape: [28672, 8192] +50: model.layers.13.mlp.up_proj.weight shape: [28672, 8192] +51: model.layers.13.post_attention_layernorm.weight shape: [8192] +52: model.layers.13.self_attn.k_proj.weight shape: [1024, 8192] +53: model.layers.13.self_attn.o_proj.weight shape: [8192, 8192] +54: model.layers.13.self_attn.q_proj.weight shape: [8192, 8192] +55: model.layers.13.self_attn.v_proj.weight shape: [1024, 8192] +56: model.layers.14.input_layernorm.weight shape: [8192] +57: model.layers.14.mlp.down_proj.weight shape: [8192, 28672] +58: model.layers.14.mlp.gate_proj.weight shape: [28672, 8192] +59: model.layers.14.mlp.up_proj.weight shape: [28672, 8192] +60: model.layers.14.post_attention_layernorm.weight shape: [8192] +61: model.layers.14.self_attn.k_proj.weight shape: [1024, 8192] +62: model.layers.14.self_attn.o_proj.weight shape: [8192, 8192] +63: model.layers.14.self_attn.q_proj.weight shape: [8192, 8192] +64: model.layers.14.self_attn.v_proj.weight shape: [1024, 8192] +65: model.layers.15.input_layernorm.weight shape: [8192] +66: model.layers.15.mlp.down_proj.weight shape: [8192, 28672] +67: model.layers.15.mlp.gate_proj.weight shape: [28672, 8192] +68: model.layers.15.mlp.up_proj.weight shape: [28672, 8192] +69: model.layers.15.post_attention_layernorm.weight shape: [8192] +70: model.layers.15.self_attn.k_proj.weight shape: [1024, 8192] +71: model.layers.15.self_attn.o_proj.weight shape: [8192, 8192] +72: model.layers.15.self_attn.q_proj.weight shape: [8192, 8192] +73: model.layers.15.self_attn.v_proj.weight shape: [1024, 8192] +74: model.layers.16.input_layernorm.weight shape: [8192] +75: model.layers.16.mlp.down_proj.weight shape: [8192, 28672] +76: model.layers.16.mlp.gate_proj.weight shape: [28672, 8192] +77: model.layers.16.mlp.up_proj.weight shape: [28672, 8192] +78: model.layers.16.post_attention_layernorm.weight shape: [8192] +79: model.layers.16.self_attn.k_proj.weight shape: [1024, 8192] +80: model.layers.16.self_attn.o_proj.weight shape: [8192, 8192] +81: model.layers.16.self_attn.q_proj.weight shape: 
[8192, 8192] +82: model.layers.16.self_attn.v_proj.weight shape: [1024, 8192] +83: model.layers.17.input_layernorm.weight shape: [8192] +84: model.layers.17.mlp.down_proj.weight shape: [8192, 28672] +85: model.layers.17.mlp.gate_proj.weight shape: [28672, 8192] +86: model.layers.17.mlp.up_proj.weight shape: [28672, 8192] +87: model.layers.17.post_attention_layernorm.weight shape: [8192] +88: model.layers.17.self_attn.k_proj.weight shape: [1024, 8192] +89: model.layers.17.self_attn.o_proj.weight shape: [8192, 8192] +90: model.layers.17.self_attn.q_proj.weight shape: [8192, 8192] +91: model.layers.17.self_attn.v_proj.weight shape: [1024, 8192] +92: model.layers.18.input_layernorm.weight shape: [8192] +93: model.layers.18.mlp.down_proj.weight shape: [8192, 28672] +94: model.layers.18.mlp.gate_proj.weight shape: [28672, 8192] +95: model.layers.18.mlp.up_proj.weight shape: [28672, 8192] +96: model.layers.18.post_attention_layernorm.weight shape: [8192] +97: model.layers.18.self_attn.k_proj.weight shape: [1024, 8192] +98: model.layers.18.self_attn.o_proj.weight shape: [8192, 8192] +99: model.layers.18.self_attn.q_proj.weight shape: [8192, 8192] +100: model.layers.18.self_attn.v_proj.weight shape: [1024, 8192] +101: model.layers.19.input_layernorm.weight shape: [8192] +102: model.layers.19.mlp.down_proj.weight shape: [8192, 28672] +103: model.layers.19.mlp.gate_proj.weight shape: [28672, 8192] +104: model.layers.19.mlp.up_proj.weight shape: [28672, 8192] +105: model.layers.19.post_attention_layernorm.weight shape: [8192] +106: model.layers.19.self_attn.k_proj.weight shape: [1024, 8192] +107: model.layers.19.self_attn.o_proj.weight shape: [8192, 8192] +108: model.layers.19.self_attn.q_proj.weight shape: [8192, 8192] +109: model.layers.19.self_attn.v_proj.weight shape: [1024, 8192] +110: model.layers.2.input_layernorm.weight shape: [8192] +111: model.layers.2.mlp.down_proj.weight shape: [8192, 28672] +112: model.layers.2.mlp.gate_proj.weight shape: [28672, 8192] +113: model.layers.2.mlp.up_proj.weight shape: [28672, 8192] +114: model.layers.2.post_attention_layernorm.weight shape: [8192] +115: model.layers.2.self_attn.k_proj.weight shape: [1024, 8192] +116: model.layers.2.self_attn.o_proj.weight shape: [8192, 8192] +117: model.layers.2.self_attn.q_proj.weight shape: [8192, 8192] +118: model.layers.2.self_attn.v_proj.weight shape: [1024, 8192] +119: model.layers.20.input_layernorm.weight shape: [8192] +120: model.layers.20.mlp.down_proj.weight shape: [8192, 28672] +121: model.layers.20.mlp.gate_proj.weight shape: [28672, 8192] +122: model.layers.20.mlp.up_proj.weight shape: [28672, 8192] +123: model.layers.20.post_attention_layernorm.weight shape: [8192] +124: model.layers.20.self_attn.k_proj.weight shape: [1024, 8192] +125: model.layers.20.self_attn.o_proj.weight shape: [8192, 8192] +126: model.layers.20.self_attn.q_proj.weight shape: [8192, 8192] +127: model.layers.20.self_attn.v_proj.weight shape: [1024, 8192] +128: model.layers.21.input_layernorm.weight shape: [8192] +129: model.layers.21.mlp.down_proj.weight shape: [8192, 28672] +130: model.layers.21.mlp.gate_proj.weight shape: [28672, 8192] +131: model.layers.21.mlp.up_proj.weight shape: [28672, 8192] +132: model.layers.21.post_attention_layernorm.weight shape: [8192] +133: model.layers.21.self_attn.k_proj.weight shape: [1024, 8192] +134: model.layers.21.self_attn.o_proj.weight shape: [8192, 8192] +135: model.layers.21.self_attn.q_proj.weight shape: [8192, 8192] +136: model.layers.21.self_attn.v_proj.weight shape: [1024, 8192] +137: 
model.layers.22.input_layernorm.weight shape: [8192] +138: model.layers.22.mlp.down_proj.weight shape: [8192, 28672] +139: model.layers.22.mlp.gate_proj.weight shape: [28672, 8192] +140: model.layers.22.mlp.up_proj.weight shape: [28672, 8192] +141: model.layers.22.post_attention_layernorm.weight shape: [8192] +142: model.layers.22.self_attn.k_proj.weight shape: [1024, 8192] +143: model.layers.22.self_attn.o_proj.weight shape: [8192, 8192] +144: model.layers.22.self_attn.q_proj.weight shape: [8192, 8192] +145: model.layers.22.self_attn.v_proj.weight shape: [1024, 8192] +146: model.layers.23.input_layernorm.weight shape: [8192] +147: model.layers.23.mlp.down_proj.weight shape: [8192, 28672] +148: model.layers.23.mlp.gate_proj.weight shape: [28672, 8192] +149: model.layers.23.mlp.up_proj.weight shape: [28672, 8192] +150: model.layers.23.post_attention_layernorm.weight shape: [8192] +151: model.layers.23.self_attn.k_proj.weight shape: [1024, 8192] +152: model.layers.23.self_attn.o_proj.weight shape: [8192, 8192] +153: model.layers.23.self_attn.q_proj.weight shape: [8192, 8192] +154: model.layers.23.self_attn.v_proj.weight shape: [1024, 8192] +155: model.layers.24.input_layernorm.weight shape: [8192] +156: model.layers.24.mlp.down_proj.weight shape: [8192, 28672] +157: model.layers.24.mlp.gate_proj.weight shape: [28672, 8192] +158: model.layers.24.mlp.up_proj.weight shape: [28672, 8192] +159: model.layers.24.post_attention_layernorm.weight shape: [8192] +160: model.layers.24.self_attn.k_proj.weight shape: [1024, 8192] +161: model.layers.24.self_attn.o_proj.weight shape: [8192, 8192] +162: model.layers.24.self_attn.q_proj.weight shape: [8192, 8192] +163: model.layers.24.self_attn.v_proj.weight shape: [1024, 8192] +164: model.layers.25.input_layernorm.weight shape: [8192] +165: model.layers.25.mlp.down_proj.weight shape: [8192, 28672] +166: model.layers.25.mlp.gate_proj.weight shape: [28672, 8192] +167: model.layers.25.mlp.up_proj.weight shape: [28672, 8192] +168: model.layers.25.post_attention_layernorm.weight shape: [8192] +169: model.layers.25.self_attn.k_proj.weight shape: [1024, 8192] +170: model.layers.25.self_attn.o_proj.weight shape: [8192, 8192] +171: model.layers.25.self_attn.q_proj.weight shape: [8192, 8192] +172: model.layers.25.self_attn.v_proj.weight shape: [1024, 8192] +173: model.layers.26.input_layernorm.weight shape: [8192] +174: model.layers.26.mlp.down_proj.weight shape: [8192, 28672] +175: model.layers.26.mlp.gate_proj.weight shape: [28672, 8192] +176: model.layers.26.mlp.up_proj.weight shape: [28672, 8192] +177: model.layers.26.post_attention_layernorm.weight shape: [8192] +178: model.layers.26.self_attn.k_proj.weight shape: [1024, 8192] +179: model.layers.26.self_attn.o_proj.weight shape: [8192, 8192] +180: model.layers.26.self_attn.q_proj.weight shape: [8192, 8192] +181: model.layers.26.self_attn.v_proj.weight shape: [1024, 8192] +182: model.layers.27.input_layernorm.weight shape: [8192] +183: model.layers.27.mlp.down_proj.weight shape: [8192, 28672] +184: model.layers.27.mlp.gate_proj.weight shape: [28672, 8192] +185: model.layers.27.mlp.up_proj.weight shape: [28672, 8192] +186: model.layers.27.post_attention_layernorm.weight shape: [8192] +187: model.layers.27.self_attn.k_proj.weight shape: [1024, 8192] +188: model.layers.27.self_attn.o_proj.weight shape: [8192, 8192] +189: model.layers.27.self_attn.q_proj.weight shape: [8192, 8192] +190: model.layers.27.self_attn.v_proj.weight shape: [1024, 8192] +191: model.layers.28.input_layernorm.weight shape: [8192] +192: 
model.layers.28.mlp.down_proj.weight shape: [8192, 28672] +193: model.layers.28.mlp.gate_proj.weight shape: [28672, 8192] +194: model.layers.28.mlp.up_proj.weight shape: [28672, 8192] +195: model.layers.28.post_attention_layernorm.weight shape: [8192] +196: model.layers.28.self_attn.k_proj.weight shape: [1024, 8192] +197: model.layers.28.self_attn.o_proj.weight shape: [8192, 8192] +198: model.layers.28.self_attn.q_proj.weight shape: [8192, 8192] +199: model.layers.28.self_attn.v_proj.weight shape: [1024, 8192] +200: model.layers.29.input_layernorm.weight shape: [8192] +201: model.layers.29.mlp.down_proj.weight shape: [8192, 28672] +202: model.layers.29.mlp.gate_proj.weight shape: [28672, 8192] +203: model.layers.29.mlp.up_proj.weight shape: [28672, 8192] +204: model.layers.29.post_attention_layernorm.weight shape: [8192] +205: model.layers.29.self_attn.k_proj.weight shape: [1024, 8192] +206: model.layers.29.self_attn.o_proj.weight shape: [8192, 8192] +207: model.layers.29.self_attn.q_proj.weight shape: [8192, 8192] +208: model.layers.29.self_attn.v_proj.weight shape: [1024, 8192] +209: model.layers.3.input_layernorm.weight shape: [8192] +210: model.layers.3.mlp.down_proj.weight shape: [8192, 28672] +211: model.layers.3.mlp.gate_proj.weight shape: [28672, 8192] +212: model.layers.3.mlp.up_proj.weight shape: [28672, 8192] +213: model.layers.3.post_attention_layernorm.weight shape: [8192] +214: model.layers.3.self_attn.k_proj.weight shape: [1024, 8192] +215: model.layers.3.self_attn.o_proj.weight shape: [8192, 8192] +216: model.layers.3.self_attn.q_proj.weight shape: [8192, 8192] +217: model.layers.3.self_attn.v_proj.weight shape: [1024, 8192] +218: model.layers.30.input_layernorm.weight shape: [8192] +219: model.layers.30.mlp.down_proj.weight shape: [8192, 28672] +220: model.layers.30.mlp.gate_proj.weight shape: [28672, 8192] +221: model.layers.30.mlp.up_proj.weight shape: [28672, 8192] +222: model.layers.30.post_attention_layernorm.weight shape: [8192] +223: model.layers.30.self_attn.k_proj.weight shape: [1024, 8192] +224: model.layers.30.self_attn.o_proj.weight shape: [8192, 8192] +225: model.layers.30.self_attn.q_proj.weight shape: [8192, 8192] +226: model.layers.30.self_attn.v_proj.weight shape: [1024, 8192] +227: model.layers.31.input_layernorm.weight shape: [8192] +228: model.layers.31.mlp.down_proj.weight shape: [8192, 28672] +229: model.layers.31.mlp.gate_proj.weight shape: [28672, 8192] +230: model.layers.31.mlp.up_proj.weight shape: [28672, 8192] +231: model.layers.31.post_attention_layernorm.weight shape: [8192] +232: model.layers.31.self_attn.k_proj.weight shape: [1024, 8192] +233: model.layers.31.self_attn.o_proj.weight shape: [8192, 8192] +234: model.layers.31.self_attn.q_proj.weight shape: [8192, 8192] +235: model.layers.31.self_attn.v_proj.weight shape: [1024, 8192] +236: model.layers.32.input_layernorm.weight shape: [8192] +237: model.layers.32.mlp.down_proj.weight shape: [8192, 28672] +238: model.layers.32.mlp.gate_proj.weight shape: [28672, 8192] +239: model.layers.32.mlp.up_proj.weight shape: [28672, 8192] +240: model.layers.32.post_attention_layernorm.weight shape: [8192] +241: model.layers.32.self_attn.k_proj.weight shape: [1024, 8192] +242: model.layers.32.self_attn.o_proj.weight shape: [8192, 8192] +243: model.layers.32.self_attn.q_proj.weight shape: [8192, 8192] +244: model.layers.32.self_attn.v_proj.weight shape: [1024, 8192] +245: model.layers.33.input_layernorm.weight shape: [8192] +246: model.layers.33.mlp.down_proj.weight shape: [8192, 28672] +247: 
model.layers.33.mlp.gate_proj.weight shape: [28672, 8192] +248: model.layers.33.mlp.up_proj.weight shape: [28672, 8192] +249: model.layers.33.post_attention_layernorm.weight shape: [8192] +250: model.layers.33.self_attn.k_proj.weight shape: [1024, 8192] +251: model.layers.33.self_attn.o_proj.weight shape: [8192, 8192] +252: model.layers.33.self_attn.q_proj.weight shape: [8192, 8192] +253: model.layers.33.self_attn.v_proj.weight shape: [1024, 8192] +254: model.layers.34.input_layernorm.weight shape: [8192] +255: model.layers.34.mlp.down_proj.weight shape: [8192, 28672] +256: model.layers.34.mlp.gate_proj.weight shape: [28672, 8192] +257: model.layers.34.mlp.up_proj.weight shape: [28672, 8192] +258: model.layers.34.post_attention_layernorm.weight shape: [8192] +259: model.layers.34.self_attn.k_proj.weight shape: [1024, 8192] +260: model.layers.34.self_attn.o_proj.weight shape: [8192, 8192] +261: model.layers.34.self_attn.q_proj.weight shape: [8192, 8192] +262: model.layers.34.self_attn.v_proj.weight shape: [1024, 8192] +263: model.layers.35.input_layernorm.weight shape: [8192] +264: model.layers.35.mlp.down_proj.weight shape: [8192, 28672] +265: model.layers.35.mlp.gate_proj.weight shape: [28672, 8192] +266: model.layers.35.mlp.up_proj.weight shape: [28672, 8192] +267: model.layers.35.post_attention_layernorm.weight shape: [8192] +268: model.layers.35.self_attn.k_proj.weight shape: [1024, 8192] +269: model.layers.35.self_attn.o_proj.weight shape: [8192, 8192] +270: model.layers.35.self_attn.q_proj.weight shape: [8192, 8192] +271: model.layers.35.self_attn.v_proj.weight shape: [1024, 8192] +272: model.layers.36.input_layernorm.weight shape: [8192] +273: model.layers.36.mlp.down_proj.weight shape: [8192, 28672] +274: model.layers.36.mlp.gate_proj.weight shape: [28672, 8192] +275: model.layers.36.mlp.up_proj.weight shape: [28672, 8192] +276: model.layers.36.post_attention_layernorm.weight shape: [8192] +277: model.layers.36.self_attn.k_proj.weight shape: [1024, 8192] +278: model.layers.36.self_attn.o_proj.weight shape: [8192, 8192] +279: model.layers.36.self_attn.q_proj.weight shape: [8192, 8192] +280: model.layers.36.self_attn.v_proj.weight shape: [1024, 8192] +281: model.layers.37.input_layernorm.weight shape: [8192] +282: model.layers.37.mlp.down_proj.weight shape: [8192, 28672] +283: model.layers.37.mlp.gate_proj.weight shape: [28672, 8192] +284: model.layers.37.mlp.up_proj.weight shape: [28672, 8192] +285: model.layers.37.post_attention_layernorm.weight shape: [8192] +286: model.layers.37.self_attn.k_proj.weight shape: [1024, 8192] +287: model.layers.37.self_attn.o_proj.weight shape: [8192, 8192] +288: model.layers.37.self_attn.q_proj.weight shape: [8192, 8192] +289: model.layers.37.self_attn.v_proj.weight shape: [1024, 8192] +290: model.layers.38.input_layernorm.weight shape: [8192] +291: model.layers.38.mlp.down_proj.weight shape: [8192, 28672] +292: model.layers.38.mlp.gate_proj.weight shape: [28672, 8192] +293: model.layers.38.mlp.up_proj.weight shape: [28672, 8192] +294: model.layers.38.post_attention_layernorm.weight shape: [8192] +295: model.layers.38.self_attn.k_proj.weight shape: [1024, 8192] +296: model.layers.38.self_attn.o_proj.weight shape: [8192, 8192] +297: model.layers.38.self_attn.q_proj.weight shape: [8192, 8192] +298: model.layers.38.self_attn.v_proj.weight shape: [1024, 8192] +299: model.layers.39.input_layernorm.weight shape: [8192] +300: model.layers.39.mlp.down_proj.weight shape: [8192, 28672] +301: model.layers.39.mlp.gate_proj.weight shape: [28672, 8192] +302: 
model.layers.39.mlp.up_proj.weight shape: [28672, 8192] +303: model.layers.39.post_attention_layernorm.weight shape: [8192] +304: model.layers.39.self_attn.k_proj.weight shape: [1024, 8192] +305: model.layers.39.self_attn.o_proj.weight shape: [8192, 8192] +306: model.layers.39.self_attn.q_proj.weight shape: [8192, 8192] +307: model.layers.39.self_attn.v_proj.weight shape: [1024, 8192] +308: model.layers.4.input_layernorm.weight shape: [8192] +309: model.layers.4.mlp.down_proj.weight shape: [8192, 28672] +310: model.layers.4.mlp.gate_proj.weight shape: [28672, 8192] +311: model.layers.4.mlp.up_proj.weight shape: [28672, 8192] +312: model.layers.4.post_attention_layernorm.weight shape: [8192] +313: model.layers.4.self_attn.k_proj.weight shape: [1024, 8192] +314: model.layers.4.self_attn.o_proj.weight shape: [8192, 8192] +315: model.layers.4.self_attn.q_proj.weight shape: [8192, 8192] +316: model.layers.4.self_attn.v_proj.weight shape: [1024, 8192] +317: model.layers.40.input_layernorm.weight shape: [8192] +318: model.layers.40.mlp.down_proj.weight shape: [8192, 28672] +319: model.layers.40.mlp.gate_proj.weight shape: [28672, 8192] +320: model.layers.40.mlp.up_proj.weight shape: [28672, 8192] +321: model.layers.40.post_attention_layernorm.weight shape: [8192] +322: model.layers.40.self_attn.k_proj.weight shape: [1024, 8192] +323: model.layers.40.self_attn.o_proj.weight shape: [8192, 8192] +324: model.layers.40.self_attn.q_proj.weight shape: [8192, 8192] +325: model.layers.40.self_attn.v_proj.weight shape: [1024, 8192] +326: model.layers.41.input_layernorm.weight shape: [8192] +327: model.layers.41.mlp.down_proj.weight shape: [8192, 28672] +328: model.layers.41.mlp.gate_proj.weight shape: [28672, 8192] +329: model.layers.41.mlp.up_proj.weight shape: [28672, 8192] +330: model.layers.41.post_attention_layernorm.weight shape: [8192] +331: model.layers.41.self_attn.k_proj.weight shape: [1024, 8192] +332: model.layers.41.self_attn.o_proj.weight shape: [8192, 8192] +333: model.layers.41.self_attn.q_proj.weight shape: [8192, 8192] +334: model.layers.41.self_attn.v_proj.weight shape: [1024, 8192] +335: model.layers.42.input_layernorm.weight shape: [8192] +336: model.layers.42.mlp.down_proj.weight shape: [8192, 28672] +337: model.layers.42.mlp.gate_proj.weight shape: [28672, 8192] +338: model.layers.42.mlp.up_proj.weight shape: [28672, 8192] +339: model.layers.42.post_attention_layernorm.weight shape: [8192] +340: model.layers.42.self_attn.k_proj.weight shape: [1024, 8192] +341: model.layers.42.self_attn.o_proj.weight shape: [8192, 8192] +342: model.layers.42.self_attn.q_proj.weight shape: [8192, 8192] +343: model.layers.42.self_attn.v_proj.weight shape: [1024, 8192] +344: model.layers.43.input_layernorm.weight shape: [8192] +345: model.layers.43.mlp.down_proj.weight shape: [8192, 28672] +346: model.layers.43.mlp.gate_proj.weight shape: [28672, 8192] +347: model.layers.43.mlp.up_proj.weight shape: [28672, 8192] +348: model.layers.43.post_attention_layernorm.weight shape: [8192] +349: model.layers.43.self_attn.k_proj.weight shape: [1024, 8192] +350: model.layers.43.self_attn.o_proj.weight shape: [8192, 8192] +351: model.layers.43.self_attn.q_proj.weight shape: [8192, 8192] +352: model.layers.43.self_attn.v_proj.weight shape: [1024, 8192] +353: model.layers.44.input_layernorm.weight shape: [8192] +354: model.layers.44.mlp.down_proj.weight shape: [8192, 28672] +355: model.layers.44.mlp.gate_proj.weight shape: [28672, 8192] +356: model.layers.44.mlp.up_proj.weight shape: [28672, 8192] +357: 
model.layers.44.post_attention_layernorm.weight shape: [8192] +358: model.layers.44.self_attn.k_proj.weight shape: [1024, 8192] +359: model.layers.44.self_attn.o_proj.weight shape: [8192, 8192] +360: model.layers.44.self_attn.q_proj.weight shape: [8192, 8192] +361: model.layers.44.self_attn.v_proj.weight shape: [1024, 8192] +362: model.layers.45.input_layernorm.weight shape: [8192] +363: model.layers.45.mlp.down_proj.weight shape: [8192, 28672] +364: model.layers.45.mlp.gate_proj.weight shape: [28672, 8192] +365: model.layers.45.mlp.up_proj.weight shape: [28672, 8192] +366: model.layers.45.post_attention_layernorm.weight shape: [8192] +367: model.layers.45.self_attn.k_proj.weight shape: [1024, 8192] +368: model.layers.45.self_attn.o_proj.weight shape: [8192, 8192] +369: model.layers.45.self_attn.q_proj.weight shape: [8192, 8192] +370: model.layers.45.self_attn.v_proj.weight shape: [1024, 8192] +371: model.layers.46.input_layernorm.weight shape: [8192] +372: model.layers.46.mlp.down_proj.weight shape: [8192, 28672] +373: model.layers.46.mlp.gate_proj.weight shape: [28672, 8192] +374: model.layers.46.mlp.up_proj.weight shape: [28672, 8192] +375: model.layers.46.post_attention_layernorm.weight shape: [8192] +376: model.layers.46.self_attn.k_proj.weight shape: [1024, 8192] +377: model.layers.46.self_attn.o_proj.weight shape: [8192, 8192] +378: model.layers.46.self_attn.q_proj.weight shape: [8192, 8192] +379: model.layers.46.self_attn.v_proj.weight shape: [1024, 8192] +380: model.layers.47.input_layernorm.weight shape: [8192] +381: model.layers.47.mlp.down_proj.weight shape: [8192, 28672] +382: model.layers.47.mlp.gate_proj.weight shape: [28672, 8192] +383: model.layers.47.mlp.up_proj.weight shape: [28672, 8192] +384: model.layers.47.post_attention_layernorm.weight shape: [8192] +385: model.layers.47.self_attn.k_proj.weight shape: [1024, 8192] +386: model.layers.47.self_attn.o_proj.weight shape: [8192, 8192] +387: model.layers.47.self_attn.q_proj.weight shape: [8192, 8192] +388: model.layers.47.self_attn.v_proj.weight shape: [1024, 8192] +389: model.layers.48.input_layernorm.weight shape: [8192] +390: model.layers.48.mlp.down_proj.weight shape: [8192, 28672] +391: model.layers.48.mlp.gate_proj.weight shape: [28672, 8192] +392: model.layers.48.mlp.up_proj.weight shape: [28672, 8192] +393: model.layers.48.post_attention_layernorm.weight shape: [8192] +394: model.layers.48.self_attn.k_proj.weight shape: [1024, 8192] +395: model.layers.48.self_attn.o_proj.weight shape: [8192, 8192] +396: model.layers.48.self_attn.q_proj.weight shape: [8192, 8192] +397: model.layers.48.self_attn.v_proj.weight shape: [1024, 8192] +398: model.layers.49.input_layernorm.weight shape: [8192] +399: model.layers.49.mlp.down_proj.weight shape: [8192, 28672] +400: model.layers.49.mlp.gate_proj.weight shape: [28672, 8192] +401: model.layers.49.mlp.up_proj.weight shape: [28672, 8192] +402: model.layers.49.post_attention_layernorm.weight shape: [8192] +403: model.layers.49.self_attn.k_proj.weight shape: [1024, 8192] +404: model.layers.49.self_attn.o_proj.weight shape: [8192, 8192] +405: model.layers.49.self_attn.q_proj.weight shape: [8192, 8192] +406: model.layers.49.self_attn.v_proj.weight shape: [1024, 8192] +407: model.layers.5.input_layernorm.weight shape: [8192] +408: model.layers.5.mlp.down_proj.weight shape: [8192, 28672] +409: model.layers.5.mlp.gate_proj.weight shape: [28672, 8192] +410: model.layers.5.mlp.up_proj.weight shape: [28672, 8192] +411: model.layers.5.post_attention_layernorm.weight shape: [8192] +412: 
model.layers.5.self_attn.k_proj.weight shape: [1024, 8192] +413: model.layers.5.self_attn.o_proj.weight shape: [8192, 8192] +414: model.layers.5.self_attn.q_proj.weight shape: [8192, 8192] +415: model.layers.5.self_attn.v_proj.weight shape: [1024, 8192] +416: model.layers.50.input_layernorm.weight shape: [8192] +417: model.layers.50.mlp.down_proj.weight shape: [8192, 28672] +418: model.layers.50.mlp.gate_proj.weight shape: [28672, 8192] +419: model.layers.50.mlp.up_proj.weight shape: [28672, 8192] +420: model.layers.50.post_attention_layernorm.weight shape: [8192] +421: model.layers.50.self_attn.k_proj.weight shape: [1024, 8192] +422: model.layers.50.self_attn.o_proj.weight shape: [8192, 8192] +423: model.layers.50.self_attn.q_proj.weight shape: [8192, 8192] +424: model.layers.50.self_attn.v_proj.weight shape: [1024, 8192] +425: model.layers.51.input_layernorm.weight shape: [8192] +426: model.layers.51.mlp.down_proj.weight shape: [8192, 28672] +427: model.layers.51.mlp.gate_proj.weight shape: [28672, 8192] +428: model.layers.51.mlp.up_proj.weight shape: [28672, 8192] +429: model.layers.51.post_attention_layernorm.weight shape: [8192] +430: model.layers.51.self_attn.k_proj.weight shape: [1024, 8192] +431: model.layers.51.self_attn.o_proj.weight shape: [8192, 8192] +432: model.layers.51.self_attn.q_proj.weight shape: [8192, 8192] +433: model.layers.51.self_attn.v_proj.weight shape: [1024, 8192] +434: model.layers.52.input_layernorm.weight shape: [8192] +435: model.layers.52.mlp.down_proj.weight shape: [8192, 28672] +436: model.layers.52.mlp.gate_proj.weight shape: [28672, 8192] +437: model.layers.52.mlp.up_proj.weight shape: [28672, 8192] +438: model.layers.52.post_attention_layernorm.weight shape: [8192] +439: model.layers.52.self_attn.k_proj.weight shape: [1024, 8192] +440: model.layers.52.self_attn.o_proj.weight shape: [8192, 8192] +441: model.layers.52.self_attn.q_proj.weight shape: [8192, 8192] +442: model.layers.52.self_attn.v_proj.weight shape: [1024, 8192] +443: model.layers.53.input_layernorm.weight shape: [8192] +444: model.layers.53.mlp.down_proj.weight shape: [8192, 28672] +445: model.layers.53.mlp.gate_proj.weight shape: [28672, 8192] +446: model.layers.53.mlp.up_proj.weight shape: [28672, 8192] +447: model.layers.53.post_attention_layernorm.weight shape: [8192] +448: model.layers.53.self_attn.k_proj.weight shape: [1024, 8192] +449: model.layers.53.self_attn.o_proj.weight shape: [8192, 8192] +450: model.layers.53.self_attn.q_proj.weight shape: [8192, 8192] +451: model.layers.53.self_attn.v_proj.weight shape: [1024, 8192] +452: model.layers.54.input_layernorm.weight shape: [8192] +453: model.layers.54.mlp.down_proj.weight shape: [8192, 28672] +454: model.layers.54.mlp.gate_proj.weight shape: [28672, 8192] +455: model.layers.54.mlp.up_proj.weight shape: [28672, 8192] +456: model.layers.54.post_attention_layernorm.weight shape: [8192] +457: model.layers.54.self_attn.k_proj.weight shape: [1024, 8192] +458: model.layers.54.self_attn.o_proj.weight shape: [8192, 8192] +459: model.layers.54.self_attn.q_proj.weight shape: [8192, 8192] +460: model.layers.54.self_attn.v_proj.weight shape: [1024, 8192] +461: model.layers.55.input_layernorm.weight shape: [8192] +462: model.layers.55.mlp.down_proj.weight shape: [8192, 28672] +463: model.layers.55.mlp.gate_proj.weight shape: [28672, 8192] +464: model.layers.55.mlp.up_proj.weight shape: [28672, 8192] +465: model.layers.55.post_attention_layernorm.weight shape: [8192] +466: model.layers.55.self_attn.k_proj.weight shape: [1024, 8192] +467: 
model.layers.55.self_attn.o_proj.weight shape: [8192, 8192] +468: model.layers.55.self_attn.q_proj.weight shape: [8192, 8192] +469: model.layers.55.self_attn.v_proj.weight shape: [1024, 8192] +470: model.layers.56.input_layernorm.weight shape: [8192] +471: model.layers.56.mlp.down_proj.weight shape: [8192, 28672] +472: model.layers.56.mlp.gate_proj.weight shape: [28672, 8192] +473: model.layers.56.mlp.up_proj.weight shape: [28672, 8192] +474: model.layers.56.post_attention_layernorm.weight shape: [8192] +475: model.layers.56.self_attn.k_proj.weight shape: [1024, 8192] +476: model.layers.56.self_attn.o_proj.weight shape: [8192, 8192] +477: model.layers.56.self_attn.q_proj.weight shape: [8192, 8192] +478: model.layers.56.self_attn.v_proj.weight shape: [1024, 8192] +479: model.layers.57.input_layernorm.weight shape: [8192] +480: model.layers.57.mlp.down_proj.weight shape: [8192, 28672] +481: model.layers.57.mlp.gate_proj.weight shape: [28672, 8192] +482: model.layers.57.mlp.up_proj.weight shape: [28672, 8192] +483: model.layers.57.post_attention_layernorm.weight shape: [8192] +484: model.layers.57.self_attn.k_proj.weight shape: [1024, 8192] +485: model.layers.57.self_attn.o_proj.weight shape: [8192, 8192] +486: model.layers.57.self_attn.q_proj.weight shape: [8192, 8192] +487: model.layers.57.self_attn.v_proj.weight shape: [1024, 8192] +488: model.layers.58.input_layernorm.weight shape: [8192] +489: model.layers.58.mlp.down_proj.weight shape: [8192, 28672] +490: model.layers.58.mlp.gate_proj.weight shape: [28672, 8192] +491: model.layers.58.mlp.up_proj.weight shape: [28672, 8192] +492: model.layers.58.post_attention_layernorm.weight shape: [8192] +493: model.layers.58.self_attn.k_proj.weight shape: [1024, 8192] +494: model.layers.58.self_attn.o_proj.weight shape: [8192, 8192] +495: model.layers.58.self_attn.q_proj.weight shape: [8192, 8192] +496: model.layers.58.self_attn.v_proj.weight shape: [1024, 8192] +497: model.layers.59.input_layernorm.weight shape: [8192] +498: model.layers.59.mlp.down_proj.weight shape: [8192, 28672] +499: model.layers.59.mlp.gate_proj.weight shape: [28672, 8192] +500: model.layers.59.mlp.up_proj.weight shape: [28672, 8192] +501: model.layers.59.post_attention_layernorm.weight shape: [8192] +502: model.layers.59.self_attn.k_proj.weight shape: [1024, 8192] +503: model.layers.59.self_attn.o_proj.weight shape: [8192, 8192] +504: model.layers.59.self_attn.q_proj.weight shape: [8192, 8192] +505: model.layers.59.self_attn.v_proj.weight shape: [1024, 8192] +506: model.layers.6.input_layernorm.weight shape: [8192] +507: model.layers.6.mlp.down_proj.weight shape: [8192, 28672] +508: model.layers.6.mlp.gate_proj.weight shape: [28672, 8192] +509: model.layers.6.mlp.up_proj.weight shape: [28672, 8192] +510: model.layers.6.post_attention_layernorm.weight shape: [8192] +511: model.layers.6.self_attn.k_proj.weight shape: [1024, 8192] +512: model.layers.6.self_attn.o_proj.weight shape: [8192, 8192] +513: model.layers.6.self_attn.q_proj.weight shape: [8192, 8192] +514: model.layers.6.self_attn.v_proj.weight shape: [1024, 8192] +515: model.layers.60.input_layernorm.weight shape: [8192] +516: model.layers.60.mlp.down_proj.weight shape: [8192, 28672] +517: model.layers.60.mlp.gate_proj.weight shape: [28672, 8192] +518: model.layers.60.mlp.up_proj.weight shape: [28672, 8192] +519: model.layers.60.post_attention_layernorm.weight shape: [8192] +520: model.layers.60.self_attn.k_proj.weight shape: [1024, 8192] +521: model.layers.60.self_attn.o_proj.weight shape: [8192, 8192] +522: 
model.layers.60.self_attn.q_proj.weight shape: [8192, 8192] +523: model.layers.60.self_attn.v_proj.weight shape: [1024, 8192] +524: model.layers.61.input_layernorm.weight shape: [8192] +525: model.layers.61.mlp.down_proj.weight shape: [8192, 28672] +526: model.layers.61.mlp.gate_proj.weight shape: [28672, 8192] +527: model.layers.61.mlp.up_proj.weight shape: [28672, 8192] +528: model.layers.61.post_attention_layernorm.weight shape: [8192] +529: model.layers.61.self_attn.k_proj.weight shape: [1024, 8192] +530: model.layers.61.self_attn.o_proj.weight shape: [8192, 8192] +531: model.layers.61.self_attn.q_proj.weight shape: [8192, 8192] +532: model.layers.61.self_attn.v_proj.weight shape: [1024, 8192] +533: model.layers.62.input_layernorm.weight shape: [8192] +534: model.layers.62.mlp.down_proj.weight shape: [8192, 28672] +535: model.layers.62.mlp.gate_proj.weight shape: [28672, 8192] +536: model.layers.62.mlp.up_proj.weight shape: [28672, 8192] +537: model.layers.62.post_attention_layernorm.weight shape: [8192] +538: model.layers.62.self_attn.k_proj.weight shape: [1024, 8192] +539: model.layers.62.self_attn.o_proj.weight shape: [8192, 8192] +540: model.layers.62.self_attn.q_proj.weight shape: [8192, 8192] +541: model.layers.62.self_attn.v_proj.weight shape: [1024, 8192] +542: model.layers.63.input_layernorm.weight shape: [8192] +543: model.layers.63.mlp.down_proj.weight shape: [8192, 28672] +544: model.layers.63.mlp.gate_proj.weight shape: [28672, 8192] +545: model.layers.63.mlp.up_proj.weight shape: [28672, 8192] +546: model.layers.63.post_attention_layernorm.weight shape: [8192] +547: model.layers.63.self_attn.k_proj.weight shape: [1024, 8192] +548: model.layers.63.self_attn.o_proj.weight shape: [8192, 8192] +549: model.layers.63.self_attn.q_proj.weight shape: [8192, 8192] +550: model.layers.63.self_attn.v_proj.weight shape: [1024, 8192] +551: model.layers.64.input_layernorm.weight shape: [8192] +552: model.layers.64.mlp.down_proj.weight shape: [8192, 28672] +553: model.layers.64.mlp.gate_proj.weight shape: [28672, 8192] +554: model.layers.64.mlp.up_proj.weight shape: [28672, 8192] +555: model.layers.64.post_attention_layernorm.weight shape: [8192] +556: model.layers.64.self_attn.k_proj.weight shape: [1024, 8192] +557: model.layers.64.self_attn.o_proj.weight shape: [8192, 8192] +558: model.layers.64.self_attn.q_proj.weight shape: [8192, 8192] +559: model.layers.64.self_attn.v_proj.weight shape: [1024, 8192] +560: model.layers.65.input_layernorm.weight shape: [8192] +561: model.layers.65.mlp.down_proj.weight shape: [8192, 28672] +562: model.layers.65.mlp.gate_proj.weight shape: [28672, 8192] +563: model.layers.65.mlp.up_proj.weight shape: [28672, 8192] +564: model.layers.65.post_attention_layernorm.weight shape: [8192] +565: model.layers.65.self_attn.k_proj.weight shape: [1024, 8192] +566: model.layers.65.self_attn.o_proj.weight shape: [8192, 8192] +567: model.layers.65.self_attn.q_proj.weight shape: [8192, 8192] +568: model.layers.65.self_attn.v_proj.weight shape: [1024, 8192] +569: model.layers.66.input_layernorm.weight shape: [8192] +570: model.layers.66.mlp.down_proj.weight shape: [8192, 28672] +571: model.layers.66.mlp.gate_proj.weight shape: [28672, 8192] +572: model.layers.66.mlp.up_proj.weight shape: [28672, 8192] +573: model.layers.66.post_attention_layernorm.weight shape: [8192] +574: model.layers.66.self_attn.k_proj.weight shape: [1024, 8192] +575: model.layers.66.self_attn.o_proj.weight shape: [8192, 8192] +576: model.layers.66.self_attn.q_proj.weight shape: [8192, 8192] +577: 
model.layers.66.self_attn.v_proj.weight shape: [1024, 8192] +578: model.layers.67.input_layernorm.weight shape: [8192] +579: model.layers.67.mlp.down_proj.weight shape: [8192, 28672] +580: model.layers.67.mlp.gate_proj.weight shape: [28672, 8192] +581: model.layers.67.mlp.up_proj.weight shape: [28672, 8192] +582: model.layers.67.post_attention_layernorm.weight shape: [8192] +583: model.layers.67.self_attn.k_proj.weight shape: [1024, 8192] +584: model.layers.67.self_attn.o_proj.weight shape: [8192, 8192] +585: model.layers.67.self_attn.q_proj.weight shape: [8192, 8192] +586: model.layers.67.self_attn.v_proj.weight shape: [1024, 8192] +587: model.layers.68.input_layernorm.weight shape: [8192] +588: model.layers.68.mlp.down_proj.weight shape: [8192, 28672] +589: model.layers.68.mlp.gate_proj.weight shape: [28672, 8192] +590: model.layers.68.mlp.up_proj.weight shape: [28672, 8192] +591: model.layers.68.post_attention_layernorm.weight shape: [8192] +592: model.layers.68.self_attn.k_proj.weight shape: [1024, 8192] +593: model.layers.68.self_attn.o_proj.weight shape: [8192, 8192] +594: model.layers.68.self_attn.q_proj.weight shape: [8192, 8192] +595: model.layers.68.self_attn.v_proj.weight shape: [1024, 8192] +596: model.layers.69.input_layernorm.weight shape: [8192] +597: model.layers.69.mlp.down_proj.weight shape: [8192, 28672] +598: model.layers.69.mlp.gate_proj.weight shape: [28672, 8192] +599: model.layers.69.mlp.up_proj.weight shape: [28672, 8192] +600: model.layers.69.post_attention_layernorm.weight shape: [8192] +601: model.layers.69.self_attn.k_proj.weight shape: [1024, 8192] +602: model.layers.69.self_attn.o_proj.weight shape: [8192, 8192] +603: model.layers.69.self_attn.q_proj.weight shape: [8192, 8192] +604: model.layers.69.self_attn.v_proj.weight shape: [1024, 8192] +605: model.layers.7.input_layernorm.weight shape: [8192] +606: model.layers.7.mlp.down_proj.weight shape: [8192, 28672] +607: model.layers.7.mlp.gate_proj.weight shape: [28672, 8192] +608: model.layers.7.mlp.up_proj.weight shape: [28672, 8192] +609: model.layers.7.post_attention_layernorm.weight shape: [8192] +610: model.layers.7.self_attn.k_proj.weight shape: [1024, 8192] +611: model.layers.7.self_attn.o_proj.weight shape: [8192, 8192] +612: model.layers.7.self_attn.q_proj.weight shape: [8192, 8192] +613: model.layers.7.self_attn.v_proj.weight shape: [1024, 8192] +614: model.layers.70.input_layernorm.weight shape: [8192] +615: model.layers.70.mlp.down_proj.weight shape: [8192, 28672] +616: model.layers.70.mlp.gate_proj.weight shape: [28672, 8192] +617: model.layers.70.mlp.up_proj.weight shape: [28672, 8192] +618: model.layers.70.post_attention_layernorm.weight shape: [8192] +619: model.layers.70.self_attn.k_proj.weight shape: [1024, 8192] +620: model.layers.70.self_attn.o_proj.weight shape: [8192, 8192] +621: model.layers.70.self_attn.q_proj.weight shape: [8192, 8192] +622: model.layers.70.self_attn.v_proj.weight shape: [1024, 8192] +623: model.layers.71.input_layernorm.weight shape: [8192] +624: model.layers.71.mlp.down_proj.weight shape: [8192, 28672] +625: model.layers.71.mlp.gate_proj.weight shape: [28672, 8192] +626: model.layers.71.mlp.up_proj.weight shape: [28672, 8192] +627: model.layers.71.post_attention_layernorm.weight shape: [8192] +628: model.layers.71.self_attn.k_proj.weight shape: [1024, 8192] +629: model.layers.71.self_attn.o_proj.weight shape: [8192, 8192] +630: model.layers.71.self_attn.q_proj.weight shape: [8192, 8192] +631: model.layers.71.self_attn.v_proj.weight shape: [1024, 8192] +632: 
model.layers.72.input_layernorm.weight shape: [8192] +633: model.layers.72.mlp.down_proj.weight shape: [8192, 28672] +634: model.layers.72.mlp.gate_proj.weight shape: [28672, 8192] +635: model.layers.72.mlp.up_proj.weight shape: [28672, 8192] +636: model.layers.72.post_attention_layernorm.weight shape: [8192] +637: model.layers.72.self_attn.k_proj.weight shape: [1024, 8192] +638: model.layers.72.self_attn.o_proj.weight shape: [8192, 8192] +639: model.layers.72.self_attn.q_proj.weight shape: [8192, 8192] +640: model.layers.72.self_attn.v_proj.weight shape: [1024, 8192] +641: model.layers.73.input_layernorm.weight shape: [8192] +642: model.layers.73.mlp.down_proj.weight shape: [8192, 28672] +643: model.layers.73.mlp.gate_proj.weight shape: [28672, 8192] +644: model.layers.73.mlp.up_proj.weight shape: [28672, 8192] +645: model.layers.73.post_attention_layernorm.weight shape: [8192] +646: model.layers.73.self_attn.k_proj.weight shape: [1024, 8192] +647: model.layers.73.self_attn.o_proj.weight shape: [8192, 8192] +648: model.layers.73.self_attn.q_proj.weight shape: [8192, 8192] +649: model.layers.73.self_attn.v_proj.weight shape: [1024, 8192] +650: model.layers.74.input_layernorm.weight shape: [8192] +651: model.layers.74.mlp.down_proj.weight shape: [8192, 28672] +652: model.layers.74.mlp.gate_proj.weight shape: [28672, 8192] +653: model.layers.74.mlp.up_proj.weight shape: [28672, 8192] +654: model.layers.74.post_attention_layernorm.weight shape: [8192] +655: model.layers.74.self_attn.k_proj.weight shape: [1024, 8192] +656: model.layers.74.self_attn.o_proj.weight shape: [8192, 8192] +657: model.layers.74.self_attn.q_proj.weight shape: [8192, 8192] +658: model.layers.74.self_attn.v_proj.weight shape: [1024, 8192] +659: model.layers.75.input_layernorm.weight shape: [8192] +660: model.layers.75.mlp.down_proj.weight shape: [8192, 28672] +661: model.layers.75.mlp.gate_proj.weight shape: [28672, 8192] +662: model.layers.75.mlp.up_proj.weight shape: [28672, 8192] +663: model.layers.75.post_attention_layernorm.weight shape: [8192] +664: model.layers.75.self_attn.k_proj.weight shape: [1024, 8192] +665: model.layers.75.self_attn.o_proj.weight shape: [8192, 8192] +666: model.layers.75.self_attn.q_proj.weight shape: [8192, 8192] +667: model.layers.75.self_attn.v_proj.weight shape: [1024, 8192] +668: model.layers.76.input_layernorm.weight shape: [8192] +669: model.layers.76.mlp.down_proj.weight shape: [8192, 28672] +670: model.layers.76.mlp.gate_proj.weight shape: [28672, 8192] +671: model.layers.76.mlp.up_proj.weight shape: [28672, 8192] +672: model.layers.76.post_attention_layernorm.weight shape: [8192] +673: model.layers.76.self_attn.k_proj.weight shape: [1024, 8192] +674: model.layers.76.self_attn.o_proj.weight shape: [8192, 8192] +675: model.layers.76.self_attn.q_proj.weight shape: [8192, 8192] +676: model.layers.76.self_attn.v_proj.weight shape: [1024, 8192] +677: model.layers.77.input_layernorm.weight shape: [8192] +678: model.layers.77.mlp.down_proj.weight shape: [8192, 28672] +679: model.layers.77.mlp.gate_proj.weight shape: [28672, 8192] +680: model.layers.77.mlp.up_proj.weight shape: [28672, 8192] +681: model.layers.77.post_attention_layernorm.weight shape: [8192] +682: model.layers.77.self_attn.k_proj.weight shape: [1024, 8192] +683: model.layers.77.self_attn.o_proj.weight shape: [8192, 8192] +684: model.layers.77.self_attn.q_proj.weight shape: [8192, 8192] +685: model.layers.77.self_attn.v_proj.weight shape: [1024, 8192] +686: model.layers.78.input_layernorm.weight shape: [8192] +687: 
model.layers.78.mlp.down_proj.weight shape: [8192, 28672] +688: model.layers.78.mlp.gate_proj.weight shape: [28672, 8192] +689: model.layers.78.mlp.up_proj.weight shape: [28672, 8192] +690: model.layers.78.post_attention_layernorm.weight shape: [8192] +691: model.layers.78.self_attn.k_proj.weight shape: [1024, 8192] +692: model.layers.78.self_attn.o_proj.weight shape: [8192, 8192] +693: model.layers.78.self_attn.q_proj.weight shape: [8192, 8192] +694: model.layers.78.self_attn.v_proj.weight shape: [1024, 8192] +695: model.layers.79.input_layernorm.weight shape: [8192] +696: model.layers.79.mlp.down_proj.weight shape: [8192, 28672] +697: model.layers.79.mlp.gate_proj.weight shape: [28672, 8192] +698: model.layers.79.mlp.up_proj.weight shape: [28672, 8192] +699: model.layers.79.post_attention_layernorm.weight shape: [8192] +700: model.layers.79.self_attn.k_proj.weight shape: [1024, 8192] +701: model.layers.79.self_attn.o_proj.weight shape: [8192, 8192] +702: model.layers.79.self_attn.q_proj.weight shape: [8192, 8192] +703: model.layers.79.self_attn.v_proj.weight shape: [1024, 8192] +704: model.layers.8.input_layernorm.weight shape: [8192] +705: model.layers.8.mlp.down_proj.weight shape: [8192, 28672] +706: model.layers.8.mlp.gate_proj.weight shape: [28672, 8192] +707: model.layers.8.mlp.up_proj.weight shape: [28672, 8192] +708: model.layers.8.post_attention_layernorm.weight shape: [8192] +709: model.layers.8.self_attn.k_proj.weight shape: [1024, 8192] +710: model.layers.8.self_attn.o_proj.weight shape: [8192, 8192] +711: model.layers.8.self_attn.q_proj.weight shape: [8192, 8192] +712: model.layers.8.self_attn.v_proj.weight shape: [1024, 8192] +713: model.layers.9.input_layernorm.weight shape: [8192] +714: model.layers.9.mlp.down_proj.weight shape: [8192, 28672] +715: model.layers.9.mlp.gate_proj.weight shape: [28672, 8192] +716: model.layers.9.mlp.up_proj.weight shape: [28672, 8192] +717: model.layers.9.post_attention_layernorm.weight shape: [8192] +718: model.layers.9.self_attn.k_proj.weight shape: [1024, 8192] +719: model.layers.9.self_attn.o_proj.weight shape: [8192, 8192] +720: model.layers.9.self_attn.q_proj.weight shape: [8192, 8192] +721: model.layers.9.self_attn.v_proj.weight shape: [1024, 8192] +722: model.norm.weight shape: [8192] diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs index 9028b8933c..945919c56d 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -42,13 +42,33 @@ public void Llama_3_1_8b_ShapeTest() Approvals.Verify(stateDictStr); } + [Fact] + [UseReporter(typeof(DiffReporter))] + [UseApprovalSubdirectory("Approvals")] + public void Llama_3_1_70b_ShapeTest() + { + var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_70B_Instruct); + var stateDictStr = model.PeekShape(); + Approvals.Verify(stateDictStr); + } + + [Fact] + [UseReporter(typeof(DiffReporter))] + [UseApprovalSubdirectory("Approvals")] + public void Llama_3_1_405b_ShapeTest() + { + var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_405B_Instruct); + var stateDictStr = model.PeekShape(); + Approvals.Verify(stateDictStr); + } + [Fact] [UseReporter(typeof(DiffReporter))] [UseApprovalSubdirectory("Approvals")] public void TokenizerTest() { var modelWeightFolder = Path.Join("C:\\Users\\xiaoyuz\\source\\repos\\Meta-Llama-3.1-8B-Instruct\\original"); - var tokenizer = Llama3_1TokenizerHelper.FromPretrained(Path.Join(modelWeightFolder, "tokenizer.model")); + var 
tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder);
         var messages = new string[]
         {

From 3d4482d458274a69fa20d0f74cf9160ed536b5db Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Thu, 15 Aug 2024 10:10:51 -0700
Subject: [PATCH 06/24] clean up

---
 .../RopeScalingObject.cs              |  7 --
 src/Microsoft.ML.GenAI.LLaMA/Utils.cs | 73 -------------------
 2 files changed, 80 deletions(-)
 delete mode 100644 src/Microsoft.ML.GenAI.LLaMA/RopeScalingObject.cs

diff --git a/src/Microsoft.ML.GenAI.LLaMA/RopeScalingObject.cs b/src/Microsoft.ML.GenAI.LLaMA/RopeScalingObject.cs
deleted file mode 100644
index ab5d0238e7..0000000000
--- a/src/Microsoft.ML.GenAI.LLaMA/RopeScalingObject.cs
+++ /dev/null
@@ -1,7 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-using System.Text.Json.Serialization;
-
-namespace Microsoft.ML.GenAI.LLaMA;
diff --git a/src/Microsoft.ML.GenAI.LLaMA/Utils.cs b/src/Microsoft.ML.GenAI.LLaMA/Utils.cs
index b3dec789f3..622aba9fff 100644
--- a/src/Microsoft.ML.GenAI.LLaMA/Utils.cs
+++ b/src/Microsoft.ML.GenAI.LLaMA/Utils.cs
@@ -10,79 +10,6 @@ namespace Microsoft.ML.GenAI.LLaMA;
 internal static class Utils
 {
-    public static Tensor ApplyRotaryEmbeddings(Tensor input, Tensor freqsComplex)
-    {
-        // Separate the last dimension pairs of two values, representing the real and imaginary parts of the complex number
-        // Two consecutive values will become a single complex number
-        // (B, Seq_Len, H, Head_Dim) -> (B, Seq_Len, H, Head_Dim/2)
-        var inputComplex = input.to_type(ScalarType.Float32).reshape(input.shape[0], input.shape[1], input.shape[2], -1, 2).view_as_complex();
-
-        // Reshape the freqs_complex tensor to match the shape of the x_complex tensor. So we need to add the batch dimension and the head dimension
-        // (Seq_Len, Head_Dim/2) --> (1, Seq_Len, 1, Head_Dim/2)
-        var freqsComplexReshaped = freqsComplex.unsqueeze(0).unsqueeze(2);
-
-        // Multiply each complex number in the x_complex tensor by the corresponding complex number in the freqs_complex tensor
-        // Which results in the rotation of the complex number as shown in the Figure 1 of the paper
-        // (B, Seq_Len, H, Head_Dim/2) * (1, Seq_Len, 1, Head_Dim/2) = (B, Seq_Len, H, Head_Dim/2)
-        var rotatedComplex = inputComplex * freqsComplexReshaped;
-        // Console.WriteLine(rotated_complex.mean().ToSingle());
-
-        // Convert the complex number back to the real number
-        // (B, Seq_Len, H, Head_Dim/2) -> (B, Seq_Len, H, Head_Dim/2, 2)
-        var rotated = rotatedComplex.view_as_real();
-
-        // (B, Seq_Len, H, Head_Dim/2, 2) -> (B, Seq_Len, H, Head_Dim)
-        var rotatedReshaped = rotated.reshape(rotated.shape[0], rotated.shape[1], rotated.shape[2], -1);
-
-        return rotatedReshaped.type_as(input);
-    }
-
-    public static Tensor PrecomputeThetaPosFrequencies(int headDim, int seqLen, float theta = 10000.0f)
-    {
-        // As written in the paragraph 3.2.2 of the paper
-        // >> In order to generalize our results in 2D to any xi ∈ Rd where **d is even**, [...]
-        if (headDim % 2 != 0)
-        {
-            throw new ArgumentException("Dimension must be divisible by 2", nameof(headDim));
-        }
-
-        // Build the theta parameter
-        // According to the formula theta_i = 10000^(-2(i-1)/dim) for i = [1, 2, ... dim/2]
-        // Shape: (Head_Dim / 2)
-        var thetaNumerator = torch.arange(0, headDim, 2).to(torch.float32);
-        // Shape: (Head_Dim / 2)
-        var thetaInput = torch.pow(theta, -1.0f * (thetaNumerator / headDim)); // (Dim / 2)
-        // Construct the positions (the "m" parameter)
-        // Shape: (Seq_Len)
-        var m = torch.arange(seqLen);
-        // Multiply each theta by each position using the outer product.
-        // Shape: (Seq_Len) outer_product* (Head_Dim / 2) -> (Seq_Len, Head_Dim / 2)
-        var freqs = torch.outer(m, thetaInput).to(torch.float32);
-
-        // We can compute complex numbers in the polar form c = R * exp(m * theta), where R = 1 as follows:
-        // (Seq_Len, Head_Dim / 2) -> (Seq_Len, Head_Dim / 2)
-        var freqsComplex = torch.polar(torch.ones_like(freqs), freqs);
-
-        return freqsComplex;
-    }
-
-
-    public static Tensor RepeatKV(Tensor x, int nRep)
-    {
-        var batchSize = x.shape[0];
-        var seqLen = x.shape[1];
-        var nKVHeads = x.shape[2];
-        var headDim = x.shape[3];
-        if (nRep == 1)
-        {
-            return x;
-        }
-
-        return x.unsqueeze(3)
-            .expand(batchSize, seqLen, nKVHeads, nRep, headDim)
-            .reshape(batchSize, seqLen, nKVHeads * nRep, headDim);
-    }
-
     public static string GetEmbeddedResource(string resourceName)
     {
         // read file content from embedded resource

From c77b518d5752eea55928d4ecda5171a20ef4f236 Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Fri, 16 Aug 2024 12:39:33 -0700
Subject: [PATCH 07/24] add tests

---
 .../Microsoft.ML.GenAI.Samples/Llama/test.cs  | 23 ++---
 .../Microsoft.ML.GenAI.Core.csproj            |  2 +
 .../Utility/IChatTemplateBuilder.cs           | 27 ++++++
 .../Llama3_1ChatTemplateBuilder.cs            | 90 +++++++++++++++++++
 .../LlamaCausalLMAgent.cs                     | 89 ++++++++++++++++++
 .../LlamaChatCompletionService.cs             | 55 ++++++++++++
 .../LlamaTextCompletionService.cs             | 77 ++++++++++++++++
 .../Microsoft.ML.GenAI.LLaMA.csproj           |  2 -
 .../Microsoft.ML.GenAI.Phi.csproj             |  2 -
 .../Phi3/Phi3CausalLMChatCompletionService.cs |  4 +-
 ...emplateFromAutoGenChatHistory.approved.txt |  7 ++
 ...FromSemanticKernelChatHistory.approved.txt |  7 ++
 .../LLaMA3_1Tests.cs                          | 46 ++++++++--
 13 files changed, 403 insertions(+), 28 deletions(-)
 create mode 100644 src/Microsoft.ML.GenAI.Core/Utility/IChatTemplateBuilder.cs
 create mode 100644 src/Microsoft.ML.GenAI.LLaMA/Llama3_1ChatTemplateBuilder.cs
 create mode 100644 src/Microsoft.ML.GenAI.LLaMA/LlamaCausalLMAgent.cs
 create mode 100644 src/Microsoft.ML.GenAI.LLaMA/LlamaChatCompletionService.cs
 create mode 100644 src/Microsoft.ML.GenAI.LLaMA/LlamaTextCompletionService.cs
 create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.ItBuildChatTemplateFromAutoGenChatHistory.approved.txt
 create mode 100644 test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.ItBuildChatTemplateFromSemanticKernelChatHistory.approved.txt

diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs
index bd7f6ed996..ea274b0e6b 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs
@@ -4,6 +4,7 @@
 using System.Text;
 using System.Text.Json;
 using System.Threading.Tasks;
+using AutoGen.Core;
 using Microsoft.ML.GenAI.Core;
 using Microsoft.ML.GenAI.Core.Extension;
 using Microsoft.ML.GenAI.LLaMA;
@@ -15,7 +16,7 @@ namespace Microsoft.ML.GenAI.Samples.Llama;
 internal class LlamaSample
 {
-    public static void Run()
+    public static async void Run()
     {
         var device = "cuda";
         if (device == "cuda")
@@ -26,7 +27,7 @@ public static void Run()
         var defaultType = ScalarType.Float16;
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
-        var weightFolder = @"C:\Users\xiaoyuz\source\repos\Meta-Llama-3.1-70B-Instruct";
+        var weightFolder = @"C:\Users\xiaoyuz\source\repos\Meta-Llama-3.1-8B-Instruct";
         var configName = "config.json";
         var quantizeToInt8 = false;
         var quantizeToInt4 = false;
@@ -106,23 +107,17 @@ public static void Run()
         Console.WriteLine($"Start loading to device: {device}");
         model = model.ToDynamicLoadingModel(deviceMap, "cuda");
         timer.Stop();
-        Console.WriteLine($"Phi3 loaded to device: {device} in {timer.ElapsedMilliseconds / 1000} s");
+        Console.WriteLine($"Model loaded to device: {device} in {timer.ElapsedMilliseconds / 1000} s");
         var pipeline = new CausalLMPipeline(tokenizer, model, device);
         torch.set_default_device(device);
-        var prompt = """
-            <|begin_of_text|>
-            <|start_header_id|>system<|end_header_id|>
-            You are a pirate chatbot who always responds in pirate speak!<|eot_id|>
-            <|start_header_id|>user<|end_header_id|>
-            Who are you?<|eot_id|>
+        var agent = new LlamaCausalLMAgent(pipeline, "assistant")
+            .RegisterPrintMessage();
-            <|start_header_id|>assistant<|end_header_id|>
+        var task = """
+            Write a C# program to print the sum of two numbers.
             """;
-        foreach (var word in pipeline.GenerateStreaming(prompt, stopSequences: ["<|eot_id|>"]))
-        {
-            Console.Write(word);
-        }
+        await agent.SendAsync(task);
     }
 }
diff --git a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
index 2827fa237a..8745b81c6d 100644
--- a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
+++ b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
@@ -8,6 +8,8 @@
+
+
diff --git a/src/Microsoft.ML.GenAI.Core/Utility/IChatTemplateBuilder.cs b/src/Microsoft.ML.GenAI.Core/Utility/IChatTemplateBuilder.cs
new file mode 100644
index 0000000000..a0720694c3
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Utility/IChatTemplateBuilder.cs
@@ -0,0 +1,27 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using AutoGen.Core;
+using Microsoft.SemanticKernel.ChatCompletion;
+
+namespace Microsoft.ML.GenAI.Core;
+
+public interface ISemanticKernelChatTemplateBuilder
+{
+    string BuildPrompt(ChatHistory chatHistory);
+}
+
+public interface IAutoGenChatTemplateBuilder
+{
+    string BuildPrompt(IEnumerable messages);
+}
+
+public interface IChatTemplateBuilder : IAutoGenChatTemplateBuilder, ISemanticKernelChatTemplateBuilder
+{
+}
diff --git a/src/Microsoft.ML.GenAI.LLaMA/Llama3_1ChatTemplateBuilder.cs b/src/Microsoft.ML.GenAI.LLaMA/Llama3_1ChatTemplateBuilder.cs
new file mode 100644
index 0000000000..b96dee6dba
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.LLaMA/Llama3_1ChatTemplateBuilder.cs
@@ -0,0 +1,90 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+ +using System.Text; +using AutoGen.Core; +using Microsoft.ML.GenAI.Core; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.ChatCompletion; + +namespace Microsoft.ML.GenAI.LLaMA; +#pragma warning disable MSML_GeneralName // This name should be PascalCased +public class Llama3_1ChatTemplateBuilder : IChatTemplateBuilder +#pragma warning restore MSML_GeneralName // This name should be PascalCased +{ + private const char Newline = '\n'; + + public string BuildPrompt(IEnumerable messages) + { + var availableRoles = new[] { Role.System, Role.User, Role.Assistant }; + if (messages.Any(m => m.GetContent() is null)) + { + throw new InvalidOperationException("Please provide a message with content."); + } + + if (messages.Any(m => m.GetRole() is null || availableRoles.Contains(m.GetRole()!.Value) == false)) + { + throw new InvalidOperationException("Please provide a message with a valid role. The valid roles are System, User, and Assistant."); + } + + // construct template based on instruction from + // https://github.com/meta-llama/llama3/blob/11817d47e1ba7a4959b025eb1ca308572e0e3963/llama/generation.py#L280 + + var sb = new StringBuilder(); + sb.Append("<|begin_of_text|>"); + foreach (var message in messages) + { + var role = message.GetRole()!.Value; + var content = message.GetContent()!; + sb.Append(message switch + { + _ when message.GetRole() == Role.System => $"<|start_header_id|>system<|end_header_id|>{Newline}{content.Trim()}<|eot_id|>{Newline}", + _ when message.GetRole() == Role.User => $"<|start_header_id|>user<|end_header_id|>{Newline}{content.Trim()}<|eot_id|>{Newline}", + _ when message.GetRole() == Role.Assistant => $"<|start_header_id|>assistant<|end_header_id|>{Newline}{content.Trim()}<|eot_id|>{Newline}", + _ => throw new InvalidOperationException("Invalid role.") + }); + } + + sb.Append($"<|start_header_id|>assistant<|end_header_id|>{Newline}"); + var input = sb.ToString(); + + return input; + } + + public string BuildPrompt(ChatHistory chatHistory) + { + // build prompt from chat history + var sb = new StringBuilder(); + + sb.Append("<|begin_of_text|>"); + foreach (var message in chatHistory) + { + foreach (var item in message.Items) + { + if (item is not TextContent textContent) + { + throw new NotSupportedException($"Only text content is supported, but got {item.GetType().Name}"); + } + + var text = textContent.Text?.Trim() ?? string.Empty; + + var prompt = message.Role switch + { + _ when message.Role == AuthorRole.System => $"<|start_header_id|>system<|end_header_id|>{Newline}{text}<|eot_id|>{Newline}", + _ when message.Role == AuthorRole.User => $"<|start_header_id|>user<|end_header_id|>{Newline}{text}<|eot_id|>{Newline}", + _ when message.Role == AuthorRole.Assistant => $"<|start_header_id|>assistant<|end_header_id|>{Newline}{text}<|eot_id|>{Newline}", + _ => throw new NotSupportedException($"Unsupported role {message.Role}") + }; + + sb.Append(prompt); + } + } + + sb.Append($"<|start_header_id|>assistant<|end_header_id|>{Newline}"); + + return sb.ToString(); + } + + public static Llama3_1ChatTemplateBuilder Instance { get; } = new Llama3_1ChatTemplateBuilder(); +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaCausalLMAgent.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaCausalLMAgent.cs new file mode 100644 index 0000000000..5deabd6df2 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaCausalLMAgent.cs @@ -0,0 +1,89 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; +using AutoGen.Core; +using Microsoft.ML.GenAI.Core; +using Microsoft.ML.Tokenizers; + +namespace Microsoft.ML.GenAI.LLaMA; + +public class LlamaCausalLMAgent : IStreamingAgent +{ + private const char Newline = '\n'; + private readonly ICausalLMPipeline _pipeline; + private readonly string? _systemMessage; + private readonly IAutoGenChatTemplateBuilder _templateBuilder; + + /// + /// Create a new instance of . + /// + /// pipeline + /// agent name + /// system message. + /// the template builder to build chat prompt. If the value is null, would be used. + public LlamaCausalLMAgent( + ICausalLMPipeline pipeline, + string name, + string? systemMessage = "you are a helpful assistant", + IAutoGenChatTemplateBuilder? templateBuilder = null) + { + this.Name = name; + this._pipeline = pipeline; + this._systemMessage = systemMessage; + this._templateBuilder = templateBuilder ?? Llama3_1ChatTemplateBuilder.Instance; + } + + public string Name { get; } + + public Task GenerateReplyAsync(IEnumerable messages, GenerateReplyOptions? options = null, CancellationToken cancellationToken = default) + { + if (_systemMessage != null) + { + var systemMessage = new TextMessage(Role.System, _systemMessage, from: this.Name); + messages = messages.Prepend(systemMessage); + } + var input = _templateBuilder.BuildPrompt(messages); + var maxLen = options?.MaxToken ?? 1024; + var temperature = options?.Temperature ?? 0.7f; + var stopTokenSequence = options?.StopSequence ?? []; + stopTokenSequence = stopTokenSequence.Append("<|eot_id|>").ToArray(); + + var output = _pipeline.Generate( + input, + maxLen: maxLen, + temperature: temperature, + stopSequences: stopTokenSequence) ?? throw new InvalidOperationException("Failed to generate a reply."); + + return Task.FromResult(new TextMessage(Role.Assistant, output, from: this.Name)); + } + +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + public async IAsyncEnumerable GenerateStreamingReplyAsync( +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + IEnumerable messages, + GenerateReplyOptions? options = null, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + if (_systemMessage != null) + { + var systemMessage = new TextMessage(Role.System, _systemMessage, from: this.Name); + messages = messages.Prepend(systemMessage); + } + var input = _templateBuilder.BuildPrompt(messages); + var maxLen = options?.MaxToken ?? 1024; + var temperature = options?.Temperature ?? 0.7f; + var stopTokenSequence = options?.StopSequence ?? []; + stopTokenSequence = stopTokenSequence.Append("<|eot_id|>").ToArray(); + + foreach (var output in _pipeline.GenerateStreaming( + input, + maxLen: maxLen, + temperature: temperature, + stopSequences: stopTokenSequence)) + { + yield return new TextMessageUpdate(Role.Assistant, output, from: this.Name); + } + } +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaChatCompletionService.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaChatCompletionService.cs new file mode 100644 index 0000000000..3e43e7eefb --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaChatCompletionService.cs @@ -0,0 +1,55 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Runtime.CompilerServices; +using Microsoft.ML.GenAI.Core; +using Microsoft.ML.Tokenizers; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.ChatCompletion; + +namespace Microsoft.ML.GenAI.LLaMA; + +public class LlamaChatCompletionService : IChatCompletionService +{ + private readonly ICausalLMPipeline _pipeline; + private readonly LlamaTextCompletionService _textGenerationService; + private readonly ISemanticKernelChatTemplateBuilder _templateBuilder; + + /// + /// Create a new instance of . + /// + /// pipeline + /// The template builder to use for generating chat prompts, if not provided, will be used. + public LlamaChatCompletionService(ICausalLMPipeline pipeline, ISemanticKernelChatTemplateBuilder? templateBuilder = null) + { + _pipeline = pipeline; + _textGenerationService = new LlamaTextCompletionService(pipeline); + _templateBuilder = templateBuilder ?? Llama3_1ChatTemplateBuilder.Instance; + } + + public IReadOnlyDictionary Attributes => _textGenerationService.Attributes; + + public async Task> GetChatMessageContentsAsync(ChatHistory chatHistory, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, CancellationToken cancellationToken = default) + { + var prompt = _templateBuilder.BuildPrompt(chatHistory); + var replies = await _textGenerationService.GetTextContentsAsync(prompt, executionSettings, kernel, cancellationToken); + + return replies.Select(reply => new ChatMessageContent(AuthorRole.Assistant, reply.Text)).ToList(); + } + + public async IAsyncEnumerable GetStreamingChatMessageContentsAsync( + ChatHistory chatHistory, + PromptExecutionSettings? executionSettings = null, + Kernel? kernel = null, + [EnumeratorCancellation] + CancellationToken cancellationToken = default) + { + var prompt = _templateBuilder.BuildPrompt(chatHistory); + + await foreach (var reply in _textGenerationService.GetStreamingTextContentsAsync(prompt, executionSettings, kernel, cancellationToken)) + { + yield return new StreamingChatMessageContent(AuthorRole.Assistant, reply.Text); + } + } +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaTextCompletionService.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaTextCompletionService.cs new file mode 100644 index 0000000000..5ac0a9afb9 --- /dev/null +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaTextCompletionService.cs @@ -0,0 +1,77 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Text; +using System.Threading.Tasks; +using Microsoft.ML.GenAI.Core; +using Microsoft.ML.Tokenizers; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.TextGeneration; + +namespace Microsoft.ML.GenAI.LLaMA; + +public class LlamaTextCompletionService : ITextGenerationService +{ + private readonly ICausalLMPipeline _pipeline; + + public LlamaTextCompletionService(ICausalLMPipeline pipeline) + { + _pipeline = pipeline; + } + + public IReadOnlyDictionary Attributes => new Dictionary() + { + { "temperature", null }, + { "max_token", null }, + { "stop_token_sequence", null }, + { "top_p", null }, + }; + +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + public async IAsyncEnumerable GetStreamingTextContentsAsync( +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + string prompt, + PromptExecutionSettings? executionSettings = null, + Kernel? kernel = null, + [EnumeratorCancellation] + CancellationToken cancellationToken = default) + { + var temperature = executionSettings?.ExtensionData?["temperature"] as float? ?? 0.7f; + var maxToken = executionSettings?.ExtensionData?["max_token"] as int? ?? 100; + var stopTokenSequence = executionSettings?.ExtensionData?["stop_token_sequence"] as string[] ?? Array.Empty(); + var topP = executionSettings?.ExtensionData?["top_p"] as float? ?? 0.9f; + stopTokenSequence.Append("<|eot_id|>"); + + foreach (var item in _pipeline.GenerateStreaming( + prompt, + maxToken, + temperature, + topP, + stopTokenSequence)) + { + yield return new StreamingTextContent(item); + } + } + + public Task> GetTextContentsAsync(string prompt, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, CancellationToken cancellationToken = default) + { + var temperature = executionSettings?.ExtensionData?["temperature"] as float? ?? 0.7f; + var maxToken = executionSettings?.ExtensionData?["max_token"] as int? ?? 512; + var stopTokenSequence = executionSettings?.ExtensionData?["stop_token_sequence"] as List ?? new List(); + var topP = executionSettings?.ExtensionData?["top_p"] as float? ?? 
0.9f; + stopTokenSequence.Add("<|eot_id|>"); + var response = _pipeline.Generate( + prompt, + maxToken, + temperature, + stopSequences: stopTokenSequence.ToArray(), + topP: topP); + + return Task.FromResult>([new TextContent(response)]); + } +} diff --git a/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj b/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj index 8c7200fa1e..95b74f4001 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj +++ b/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj @@ -7,10 +7,8 @@ - - diff --git a/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj b/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj index af8b6aed6e..e8605ba403 100644 --- a/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj +++ b/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj @@ -7,10 +7,8 @@ - - diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMChatCompletionService.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMChatCompletionService.cs index efe3089fdb..480e0d7e04 100644 --- a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMChatCompletionService.cs +++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMChatCompletionService.cs @@ -33,8 +33,8 @@ public async Task> GetChatMessageContentsAsync CancellationToken cancellationToken = default) { var prompt = BuildPrompt(chatHistory); - var reply = await _textGenerationService.GetTextContentAsync(prompt, executionSettings, kernel, cancellationToken); - return [new ChatMessageContent(AuthorRole.Assistant, reply.Text)]; + var replies = await _textGenerationService.GetTextContentsAsync(prompt, executionSettings, kernel, cancellationToken); + return replies.Select(reply => new ChatMessageContent(AuthorRole.Assistant, reply.Text)).ToList(); } public async IAsyncEnumerable GetStreamingChatMessageContentsAsync( diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.ItBuildChatTemplateFromAutoGenChatHistory.approved.txt b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.ItBuildChatTemplateFromAutoGenChatHistory.approved.txt new file mode 100644 index 0000000000..e4a2466fec --- /dev/null +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.ItBuildChatTemplateFromAutoGenChatHistory.approved.txt @@ -0,0 +1,7 @@ +<|begin_of_text|><|start_header_id|>system<|end_header_id|> +You are a helpful AI assistant.<|eot_id|> +<|start_header_id|>user<|end_header_id|> +Hello?<|eot_id|> +<|start_header_id|>assistant<|end_header_id|> +World!<|eot_id|> +<|start_header_id|>assistant<|end_header_id|> diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.ItBuildChatTemplateFromSemanticKernelChatHistory.approved.txt b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.ItBuildChatTemplateFromSemanticKernelChatHistory.approved.txt new file mode 100644 index 0000000000..e4a2466fec --- /dev/null +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/Approvals/LLaMA3_1Tests.ItBuildChatTemplateFromSemanticKernelChatHistory.approved.txt @@ -0,0 +1,7 @@ +<|begin_of_text|><|start_header_id|>system<|end_header_id|> +You are a helpful AI assistant.<|eot_id|> +<|start_header_id|>user<|end_header_id|> +Hello?<|eot_id|> +<|start_header_id|>assistant<|end_header_id|> +World!<|eot_id|> +<|start_header_id|>assistant<|end_header_id|> diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs index 945919c56d..5f3c4b48d3 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs +++ 
b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -2,20 +2,16 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System; -using System.Collections.Generic; -using System.Linq; using System.Text; -using System.Threading.Tasks; using ApprovalTests; using ApprovalTests.Namers; using ApprovalTests.Reporters; +using AutoGen.Core; +using Microsoft.ML.GenAI.Core.Extension; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.ChatCompletion; using TorchSharp; using Xunit; -using Microsoft.ML.GenAI.Core.Extension; -using Microsoft.ML.Tokenizers; -using FluentAssertions; -using System.Text.RegularExpressions; namespace Microsoft.ML.GenAI.LLaMA.Tests; @@ -92,4 +88,38 @@ public void TokenizerTest() } Approvals.Verify(sb.ToString()); } + + [Fact] + [UseReporter(typeof(DiffReporter))] + [UseApprovalSubdirectory("Approvals")] + public void ItBuildChatTemplateFromAutoGenChatHistory() + { + var chatHistory = new List + { + new TextMessage(Role.System, "You are a helpful AI assistant."), + new TextMessage(Role.User, "Hello?"), + new TextMessage(Role.Assistant, "World!"), + }; + + var prompt = Llama3_1ChatTemplateBuilder.Instance.BuildPrompt(chatHistory); + + Approvals.Verify(prompt); + } + + [Fact] + [UseReporter(typeof(DiffReporter))] + [UseApprovalSubdirectory("Approvals")] + public void ItBuildChatTemplateFromSemanticKernelChatHistory() + { + var chatHistory = new ChatHistory + { + new ChatMessageContent(AuthorRole.System, "You are a helpful AI assistant."), + new ChatMessageContent(AuthorRole.User, "Hello?"), + new ChatMessageContent(AuthorRole.Assistant, "World!"), + }; + + var prompt = Llama3_1ChatTemplateBuilder.Instance.BuildPrompt(chatHistory); + + Approvals.Verify(prompt); + } } From 6498e2e110f3b415c6afd148881f7f4bb39b2d0a Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Fri, 16 Aug 2024 14:35:26 -0700 Subject: [PATCH 08/24] update --- .../Microsoft.ML.GenAI.Samples/Llama/test.cs | 75 +------------------ eng/Versions.props | 2 +- .../Extension/ModuleExtension.cs | 51 +++++++++++++ .../LlamaForCausalLM.cs | 49 ++++++++++++ 4 files changed, 103 insertions(+), 74 deletions(-) diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs index ea274b0e6b..9ce2f0016b 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs @@ -29,87 +29,16 @@ public static async void Run() torch.set_default_dtype(defaultType); var weightFolder = @"C:\Users\xiaoyuz\source\repos\Meta-Llama-3.1-8B-Instruct"; var configName = "config.json"; - var quantizeToInt8 = false; - var quantizeToInt4 = false; - var modelSizeOnCudaInGB = 18; - var modelSizeOnMemoryInGB = 640; - var modelSizeOnDiskInGB = 200; var originalWeightFolder = Path.Combine(weightFolder, "original"); Console.WriteLine("Loading Llama from huggingface model weight folder"); var stopWatch = System.Diagnostics.Stopwatch.StartNew(); stopWatch.Start(); var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder); - Console.WriteLine("Loading Phi3 from huggingface model weight folder"); - torch.set_default_device("meta"); - var configPath = System.IO.Path.Combine(weightFolder, configName); - var config = JsonSerializer.Deserialize(System.IO.File.ReadAllText(configPath)) ?? 
throw new ArgumentNullException(nameof(configPath)); - var timer = System.Diagnostics.Stopwatch.StartNew(); - var model = new LlamaForCausalLM(config); - var tokenzierPath = System.IO.Path.Combine(weightFolder, "tokenizer.model"); + Console.WriteLine("Loading llama from huggingface model weight folder"); + var model = LlamaForCausalLM.FromPretrained(weightFolder, configName, layersOnTargetDevice: 30); - if (quantizeToInt8) - { - model.ToInt8QuantizeModule(); - } - else if (quantizeToInt4) - { - model.ToInt4QuantizeModule(); - } - - var deviceSizeMap = new Dictionary - { - ["cuda"] = modelSizeOnCudaInGB * 1L * 1024 * 1024 * 1024, - ["cpu"] = modelSizeOnMemoryInGB * 1L * 1024 * 1024 * 1024, - ["disk"] = modelSizeOnDiskInGB * 1L * 1024 * 1024 * 1024, - }; - - var deviceMap = model.InferDeviceMapForEachLayer( - devices: ["cuda", "cpu", "disk"], - deviceSizeMapInByte: deviceSizeMap); - - var deviceMapJson = JsonSerializer.Serialize(deviceMap, new JsonSerializerOptions { WriteIndented = true }); - Console.WriteLine($"Device map:"); - Console.WriteLine(deviceMapJson); - - // load weight - torch.set_default_device("cpu"); - - Console.WriteLine("Start loading"); - timer = System.Diagnostics.Stopwatch.StartNew(); - model = new LlamaForCausalLM(config); - timer.Stop(); - Console.WriteLine($"model created in {timer.ElapsedMilliseconds / 1000} s"); - - timer = System.Diagnostics.Stopwatch.StartNew(); - model.LoadSafeTensors(weightFolder); - timer.Stop(); - Console.WriteLine($"weight loaded in {timer.ElapsedMilliseconds / 1000} s"); - - if (quantizeToInt8 || quantizeToInt4) - { - timer = System.Diagnostics.Stopwatch.StartNew(); - Console.WriteLine("Start quantizing if needed"); - if (quantizeToInt8) - { - model.ToInt8QuantizeModule(); - } - else if (quantizeToInt4) - { - model.ToInt4QuantizeModule(); - } - Console.WriteLine("Quantizing done"); - timer.Stop(); - Console.WriteLine($"Quantizing done in {timer.ElapsedMilliseconds / 1000} s"); - } - - timer = System.Diagnostics.Stopwatch.StartNew(); - Console.WriteLine($"Start loading to device: {device}"); - model = model.ToDynamicLoadingModel(deviceMap, "cuda"); - timer.Stop(); - Console.WriteLine($"Model loaded to device: {device} in {timer.ElapsedMilliseconds / 1000} s"); var pipeline = new CausalLMPipeline(tokenizer, model, device); - torch.set_default_device(device); var agent = new LlamaCausalLMAgent(pipeline, "assistant") .RegisterPrintMessage(); diff --git a/eng/Versions.props b/eng/Versions.props index 84b28e1b8f..3b7fe5bd01 100644 --- a/eng/Versions.props +++ b/eng/Versions.props @@ -96,7 +96,7 @@ 0.0.13-test 0.0.6-test 0.0.7-test - 2.0.0-beta.24219.1 + 2.0.0-beta.24415.1 4.8.6 1.0.118 1.6.24 diff --git a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs index 18633728a5..a904c394b9 100644 --- a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs +++ b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs @@ -197,6 +197,57 @@ public static Dictionary InferDeviceMapForEachLayer( return deviceMap; } + /// + /// Infer the device map for each layer in the model. + /// The device map is a dictionary where the key is the device id (e.g. "cuda:0") and the value is the memory size in bytes of the device. + /// When inferring the device map, each layer in the model will be placed on the device in the order of the devices list. + /// + /// + /// a list of key-value pairs where the key is the device id (e.g. "cuda:0") and the value is the number of layers to be placed on the device. 
+ /// If you want to place all remaining layers on the device, set that value to -1. + /// e.g. [{"cuda:0", 2}, {"cpu", -1}], the first 2 layers will be placed on "cuda:0" and the rest will be placed on "cpu". + /// + /// + public static Dictionary InferDeviceMapForEachLayer( + this nn.Module model, + IEnumerable> numberOfLayerToBePlaced) + { + var layerSizeMap = model.GetSizeForEachDynamicLayerInBytes() + .OrderByDescending(x => x.Value) + .ToList(); + + var deviceMap = new Dictionary(); + foreach (var (device, count) in numberOfLayerToBePlaced) + { + if (count != -1) + { + var topK = layerSizeMap.Take(count).ToList(); + layerSizeMap = layerSizeMap.Skip(count).ToList(); + foreach (var (key, value) in topK) + { + deviceMap[key] = device; + } + } + else + { + foreach (var (key, value) in layerSizeMap) + { + deviceMap[key] = device; + } + + layerSizeMap.Clear(); + break; + } + } + + if (layerSizeMap.Count > 0) + { + throw new ArgumentException("The layer count is not enough to cover all layers, did you forget to set the last layer count to -1?"); + } + + return deviceMap; + } + internal static string Peek(this nn.Module model) { var sb = new StringBuilder(); diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs index 9f70749ff9..e72e60f6d6 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Text.Json; using Microsoft.ML.GenAI.Core; +using Microsoft.ML.GenAI.Core.Extension; using Microsoft.ML.GenAI.LLaMA.Module; using TorchSharp; using TorchSharp.PyBridge; @@ -64,6 +65,54 @@ public static LlamaForCausalLM FromPretrained( return model; } + public static LlamaForCausalLM FromPretrained( + string modelFolder, + string configName = "config.json", + string checkPointName = "model.safetensors.index.json", + bool quantizeToInt8 = false, + bool quantizeToInt4 = false, + int layersOnTargetDevice = -1, + ScalarType torchDtype = ScalarType.BFloat16, + string targetDevice = "cuda") + { + if (layersOnTargetDevice == -1 && quantizeToInt4 == false && quantizeToInt8 == false) + { + return FromPretrained(modelFolder, configName, checkPointName, torchDtype, targetDevice); + } + + var originalDefaultDevice = torch.get_default_device(); + torch.set_default_device("meta"); + var config = Path.Join(modelFolder, configName); + var modelConfig = JsonSerializer.Deserialize(File.ReadAllText(config)) ?? 
throw new ArgumentNullException(nameof(config)); + modelConfig.DType = torchDtype; + var model = new LlamaForCausalLM(modelConfig); + + if (quantizeToInt8) + { + model.ToInt8QuantizeModule(); + } + else if (quantizeToInt4) + { + model.ToInt4QuantizeModule(); + } + + var deviceMap = model.InferDeviceMapForEachLayer( + [ + KeyValuePair.Create(targetDevice, layersOnTargetDevice), + KeyValuePair.Create("cpu", -1) + ]); + + torch.set_default_device("cpu"); + + model.LoadSafeTensors(modelFolder, checkPointName); + + model = model.ToDynamicLoadingModel(deviceMap, targetDevice); + + torch.set_default_device(originalDefaultDevice); + + return model; + } + public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json") { this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: true, useTqdm: false); From 6557eac10200705e99b97fa89ec85fbd4561834a Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Fri, 16 Aug 2024 14:43:49 -0700 Subject: [PATCH 09/24] fix error --- docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs | 3 +-- src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs index 9ce2f0016b..8b6b20704b 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs @@ -35,8 +35,7 @@ public static async void Run() var stopWatch = System.Diagnostics.Stopwatch.StartNew(); stopWatch.Start(); var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder); - Console.WriteLine("Loading llama from huggingface model weight folder"); - var model = LlamaForCausalLM.FromPretrained(weightFolder, configName, layersOnTargetDevice: 30); + var model = LlamaForCausalLM.FromPretrained(weightFolder, configName, layersOnTargetDevice: -1); var pipeline = new CausalLMPipeline(tokenizer, model, device); diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs index e72e60f6d6..59c01b340b 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs @@ -103,6 +103,7 @@ public static LlamaForCausalLM FromPretrained( ]); torch.set_default_device("cpu"); + model = new LlamaForCausalLM(modelConfig); model.LoadSafeTensors(modelFolder, checkPointName); From fd61cff18d7e304b46a26e96717c49cdf75e1b78 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Fri, 16 Aug 2024 14:57:48 -0700 Subject: [PATCH 10/24] calculate rotary embedding in model layer --- .../Module/Attention.cs | 17 ++++++++++++++--- .../Module/LlamaDecoderLayer.cs | 3 ++- .../Module/LlamaModel.cs | 18 ++++++++++++++++-- .../Module/Phi3DecoderLayer.cs | 2 +- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.GenAI.Core/Module/Attention.cs b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs index 6a846cb684..5242eaef9e 100644 --- a/src/Microsoft.ML.GenAI.Core/Module/Attention.cs +++ b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs @@ -23,6 +23,7 @@ public AttentionInput( Tensor positionIds, Tensor? attentionMask = null, IKVCache? cache = null, + (Tensor, Tensor)? positionalEmbeddings = null, // cos, sin bool outputAttentions = false) { this.HiddenStates = hiddenStates; @@ -37,6 +38,8 @@ public AttentionInput( public Tensor PositionIds { get; set; } + public (Tensor, Tensor)? PositionalEmbeddings { get; set; } + public IKVCache? 
Cache { get; set; } public bool OutputAttentions { get; set; } @@ -170,10 +173,18 @@ public override AttentionOutput forward(AttentionInput input) kvSeqLen += pastKeyValue.GetUsableLength(kvSeqLen, this._layerIdx); } - var embOutput = this.rotary_emb.forward(new RotaryEmbeddingInput(valueStates, positionIds, kvSeqLen)); - (var cos, var sin) = (embOutput.Cos, embOutput.Sin); + if (input.PositionalEmbeddings is (Tensor cos, Tensor sin)) + { + (queryStates, keyStates) = Utils.ApplyRotaryPosEmb(queryStates, keyStates, cos, sin); + } + else + { + throw new NotImplementedException("Positional embeddings are not implemented"); + //var embOutput = this.rotary_emb.forward(new RotaryEmbeddingInput(valueStates, positionIds, kvSeqLen)); + //(cos, sin) = (embOutput.Cos, embOutput.Sin); - (queryStates, keyStates) = Utils.ApplyRotaryPosEmb(queryStates, keyStates, cos, sin); + //(queryStates, keyStates) = Utils.ApplyRotaryPosEmb(queryStates, keyStates, cos, sin); + } if (pastKeyValue is not null) { diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs index bedd255bad..c41591f3da 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs @@ -36,7 +36,7 @@ public DecoderLayerInput( public Tensor PositionIds { get; set; } - public (Tensor, Tensor) PositionalEmbeddings { get; set; } + public (Tensor, Tensor)? PositionalEmbeddings { get; set; } public IKVCache? PastKeyValue { get; set; } @@ -132,6 +132,7 @@ public override DecoderLayerOutput forward(DecoderLayerInput input) attentionMask: input.AttentionMask, positionIds: input.PositionIds, cache: input.PastKeyValue, + positionalEmbeddings: input.PositionalEmbeddings, outputAttentions: input.OutputAttentions); var selfAttnOutput = this.self_attn.forward(selfAttnInput); diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs index cf08f31b54..658cd38563 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs @@ -26,6 +26,8 @@ internal class LlamaModel : nn.Module private readonly ModuleList layers; private readonly RMSNorm norm; #pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format + private readonly nn.Module _rotaryEmb; + public LlamaModel(LlamaConfig config) : base(nameof(LlamaModel)) @@ -33,7 +35,7 @@ public LlamaModel(LlamaConfig config) this._config = config; this._paddingIdx = config.PadTokenId; this._vocabSize = config.VocabSize; - + var headDim = config.HiddenSize / config.NumAttentionHeads; this.embed_tokens = nn.Embedding(config.VocabSize, config.HiddenSize, padding_idx: this._paddingIdx, dtype: config.DType); this.layers = new ModuleList(); @@ -44,6 +46,11 @@ public LlamaModel(LlamaConfig config) this.norm = new RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType); this._cache = new DynamicKVCache(); this.RegisterComponents(); + this._rotaryEmb = config.RopeScaling switch + { + null => new RotaryEmbedding(config.RopeTheta, config.MaxPositionEmbeddings, headDim), + _ => new RotaryEmbedding(config.RopeTheta, headDim, config.RopeScaling), + }; } #pragma warning disable MSML_GeneralName // This name should be PascalCased @@ -113,6 +120,7 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) var allHiddenStates = new List(); var allAttentions = new List(); + var embOutput = this._rotaryEmb.forward(new 
RotaryEmbeddingInput(hiddenStates, positionIds, pastKeyValuesLength)); foreach (var layer in this.layers) { if (outputHiddenStates) @@ -120,7 +128,13 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) allHiddenStates.Add(hiddenStates); } - var decoderInput = new DecoderLayerInput(hiddenStates, attentionMask!, positionIds, this._cache, outputAttentions: outputAttentions); + var decoderInput = new DecoderLayerInput( + hiddenStates, + attentionMask!, + positionIds, + this._cache, + positionEmbeddings: (embOutput.Cos, embOutput.Sin), + outputAttentions: outputAttentions); var layerOutput = layer.forward(decoderInput); hiddenStates = layerOutput.HiddenStates; if (outputAttentions && layerOutput.Attentions is not null) diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs index b42b6a81fe..d4dbc68aaa 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs @@ -110,7 +110,7 @@ public override Phi3DecoderLayerOutput forward(Phi3DecoderLayerInput input) var residual = input.HiddenStates; hiddenStates = this.input_layernorm.forward(hiddenStates); - var attentionInput = new AttentionInput(hiddenStates, input.PositionIds, input.AttentionMask, input.PastKeyValue, input.OutputAttentions); + var attentionInput = new AttentionInput(hiddenStates, input.PositionIds, input.AttentionMask, input.PastKeyValue, outputAttentions: input.OutputAttentions); var output = this.self_attn.forward(attentionInput); var attnOutputs = output.HiddenStates; var selfAttnWeights = output.Attentions; From a3f2e413bcf9bcc4353d0fcaf3b3bfcbb5da305b Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Fri, 16 Aug 2024 15:18:02 -0700 Subject: [PATCH 11/24] remove rotary_emb from attention --- .../Microsoft.ML.GenAI.Samples/Llama/test.cs | 2 +- .../Phi3Mini/AutoGenSample.cs | 4 +- .../Module/Attention.cs | 23 +---- .../Module/LlamaDecoderLayer.cs | 12 +-- .../Module/LlamaModel.cs | 10 +-- .../Module/Phi3Attention.cs | 86 ------------------- .../Module/Phi3DecoderLayer.cs | 31 ++++++- .../Module/Phi3Model.cs | 18 +++- src/Microsoft.ML.GenAI.Phi/README.md | 4 +- 9 files changed, 63 insertions(+), 127 deletions(-) delete mode 100644 src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs index 8b6b20704b..49fcdf5892 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs @@ -43,7 +43,7 @@ public static async void Run() .RegisterPrintMessage(); var task = """ - Write a C# program to print the sum of two numbers. + Write a C# program to print the sum of two numbers. Use top-level statement, put code between ```csharp and ```. 
"""; await agent.SendAsync(task); diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs index 5b3dce01de..392aec674d 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs @@ -25,8 +25,8 @@ public static async Task RunAsync() var defaultType = ScalarType.Float16; torch.manual_seed(1); torch.set_default_dtype(defaultType); - var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-medium-4k-instruct"; - var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device, quantizeToInt8: true); + var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct"; + var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device, quantizeToInt8: false); // agent var agent = new Phi3Agent(pipeline, "assistant") diff --git a/src/Microsoft.ML.GenAI.Core/Module/Attention.cs b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs index 5242eaef9e..869c213b74 100644 --- a/src/Microsoft.ML.GenAI.Core/Module/Attention.cs +++ b/src/Microsoft.ML.GenAI.Core/Module/Attention.cs @@ -21,15 +21,16 @@ internal class AttentionInput public AttentionInput( Tensor hiddenStates, Tensor positionIds, + RotaryEmbeddingOutput positionalEmbeddings, // cos, sin Tensor? attentionMask = null, IKVCache? cache = null, - (Tensor, Tensor)? positionalEmbeddings = null, // cos, sin bool outputAttentions = false) { this.HiddenStates = hiddenStates; this.AttentionMask = attentionMask; this.PositionIds = positionIds; this.Cache = cache; + this.PositionalEmbeddings = positionalEmbeddings; this.OutputAttentions = outputAttentions; } public Tensor HiddenStates { get; set; } @@ -38,7 +39,7 @@ public AttentionInput( public Tensor PositionIds { get; set; } - public (Tensor, Tensor)? PositionalEmbeddings { get; set; } + public RotaryEmbeddingOutput PositionalEmbeddings { get; set; } public IKVCache? Cache { get; set; } @@ -81,7 +82,6 @@ internal class Attention : nn.Module private readonly QuantizedLinear? q_proj; private readonly QuantizedLinear? k_proj; private readonly QuantizedLinear? 
v_proj; - private readonly nn.Module rotary_emb; #pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format public Attention( @@ -95,7 +95,6 @@ public Attention( int originalMaxPositionEmbeddings, int layerIdx, ScalarType dtype, - nn.Module rotaryEmbedding, bool attentionBias = false, bool useQkvProj = true) : base(nameof(Attention)) @@ -124,8 +123,6 @@ public Attention( this.k_proj = new QuantizedLinear(this._hiddenSize, this._numKeyValueHeads * this._headDim, hasBias: attentionBias, dtype: dtype); this.v_proj = new QuantizedLinear(this._hiddenSize, this._numKeyValueHeads * this._headDim, hasBias: attentionBias, dtype: dtype); } - - this.rotary_emb = rotaryEmbedding; } #pragma warning disable MSML_GeneralName // This name should be PascalCased @@ -172,19 +169,7 @@ public override AttentionOutput forward(AttentionInput input) { kvSeqLen += pastKeyValue.GetUsableLength(kvSeqLen, this._layerIdx); } - - if (input.PositionalEmbeddings is (Tensor cos, Tensor sin)) - { - (queryStates, keyStates) = Utils.ApplyRotaryPosEmb(queryStates, keyStates, cos, sin); - } - else - { - throw new NotImplementedException("Positional embeddings are not implemented"); - //var embOutput = this.rotary_emb.forward(new RotaryEmbeddingInput(valueStates, positionIds, kvSeqLen)); - //(cos, sin) = (embOutput.Cos, embOutput.Sin); - - //(queryStates, keyStates) = Utils.ApplyRotaryPosEmb(queryStates, keyStates, cos, sin); - } + (queryStates, keyStates) = Utils.ApplyRotaryPosEmb(queryStates, keyStates, input.PositionalEmbeddings.Cos, input.PositionalEmbeddings.Sin); if (pastKeyValue is not null) { diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs index c41591f3da..0e3132f739 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaDecoderLayer.cs @@ -19,8 +19,8 @@ public DecoderLayerInput( Tensor hiddenStates, Tensor attentionMask, Tensor positionIds, + RotaryEmbeddingOutput positionEmbeddings, // cos, sin IKVCache? pastKeyValue = null, - (Tensor, Tensor)? positionEmbeddings = null, // cos, sin bool outputAttentions = false) { this.HiddenStates = hiddenStates; @@ -28,6 +28,7 @@ public DecoderLayerInput( this.PositionIds = positionIds; this.PastKeyValue = pastKeyValue; this.OutputAttentions = outputAttentions; + this.PositionalEmbeddings = positionEmbeddings; } public Tensor HiddenStates { get; set; } @@ -36,7 +37,7 @@ public DecoderLayerInput( public Tensor PositionIds { get; set; } - public (Tensor, Tensor)? PositionalEmbeddings { get; set; } + public RotaryEmbeddingOutput PositionalEmbeddings { get; set; } public IKVCache? 
PastKeyValue { get; set; } @@ -106,12 +107,7 @@ private Attention CreateAttention(LlamaConfig config, int layerIndex) layerIdx: layerIndex, useQkvProj: false, dtype: config.DType, - attentionBias: config.AttentionBias, - rotaryEmbedding: config.RopeScaling switch - { - null => new RotaryEmbedding(config.RopeTheta, config.MaxPositionEmbeddings, headDim), - _ => new RotaryEmbedding(config.RopeTheta, headDim, config.RopeScaling), - }); + attentionBias: config.AttentionBias); } #pragma warning disable MSML_GeneralName // This name should be PascalCased diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs index 658cd38563..91616b336f 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs @@ -129,11 +129,11 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) } var decoderInput = new DecoderLayerInput( - hiddenStates, - attentionMask!, - positionIds, - this._cache, - positionEmbeddings: (embOutput.Cos, embOutput.Sin), + hiddenStates: hiddenStates, + attentionMask: attentionMask!, + positionIds: positionIds, + pastKeyValue: this._cache, + positionEmbeddings: embOutput, outputAttentions: outputAttentions); var layerOutput = layer.forward(decoderInput); hiddenStates = layerOutput.HiddenStates; diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs deleted file mode 100644 index d8a3393fcb..0000000000 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs +++ /dev/null @@ -1,86 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Diagnostics.Contracts; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Microsoft.ML.GenAI.Core; -using TorchSharp; -using TorchSharp.Modules; -using static TorchSharp.torch; - -namespace Microsoft.ML.GenAI.Phi.Module; - -internal class Phi3AttentionInput -{ - public Phi3AttentionInput( - Tensor hiddenStates, - Tensor positionIds, - Tensor? attentionMask = null, - IKVCache? cache = null, - bool outputAttentions = false) - { - this.HiddenStates = hiddenStates; - this.AttentionMask = attentionMask; - this.PositionIds = positionIds; - this.Cache = cache; - this.OutputAttentions = outputAttentions; - } - public Tensor HiddenStates { get; set; } - - public Tensor? AttentionMask { get; set; } - - public Tensor PositionIds { get; set; } - - public IKVCache? Cache { get; set; } - - public bool OutputAttentions { get; set; } -} - -internal class Phi3AttentionOutput -{ - public Phi3AttentionOutput( - Tensor hiddenStates, - Tensor? attentions = null, - IKVCache? cache = null) - { - this.HiddenStates = hiddenStates; - this.Attentions = attentions; - this.Cache = cache; - } - - public Tensor HiddenStates { get; set; } - - public Tensor? Attentions { get; set; } - - public IKVCache? Cache { get; set; } -} - -internal class Phi3Attention -{ - public static Attention FromConfig(Phi3Config config, int layerIdx) - { - var headDim = config.HiddenSize / config.NumAttentionHeads; - return new Attention( - attentionDropout: config.AttentionDropout, - hiddenSize: config.HiddenSize, - numHeads: config.NumAttentionHeads, - headDim: headDim, - numKeyValueHeads: config.NumKeyValueHeads ?? 
throw new ArgumentException("num_key_value_heads must be specified"), - numKeyValueGroups: config.NumAttentionHeads / config.NumKeyValueHeads ?? throw new ArgumentException("num_key_value_heads must be specified"), - maxPositionEmbeddings: config.MaxPositionEmbeddings, - originalMaxPositionEmbeddings: config.OriginalMaxPositionEmbeddings, - layerIdx: layerIdx, - useQkvProj: true, - dtype: config.DType, - rotaryEmbedding: config.RopeScaling switch - { - null => new RotaryEmbedding(config.RopeTheta, config.MaxPositionEmbeddings, headDim), - _ => new Phi3SuScaledRotaryEmbedding(headDim, config), - }); - } -} diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs index d4dbc68aaa..35b9313b33 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs @@ -20,6 +20,7 @@ public Phi3DecoderLayerInput( Tensor hiddenStates, Tensor attentionMask, Tensor positionIds, + RotaryEmbeddingOutput positionalEmbeddings, // cos, sin IKVCache? pastKeyValue = null, bool outputAttentions = false) { @@ -27,6 +28,7 @@ public Phi3DecoderLayerInput( this.AttentionMask = attentionMask; this.PositionIds = positionIds; this.PastKeyValue = pastKeyValue; + this.PositionalEmbeddings = positionalEmbeddings; this.OutputAttentions = outputAttentions; } @@ -36,6 +38,8 @@ public Phi3DecoderLayerInput( public Tensor PositionIds { get; set; } + public RotaryEmbeddingOutput PositionalEmbeddings { get; set; } // cos, sin + public IKVCache? PastKeyValue { get; set; } public bool OutputAttentions { get; set; } @@ -78,7 +82,7 @@ public Phi3DecoderLayer(Phi3Config config, int layerIdx) this._config = config; if (config.AttnImplementation == "eager") { - this.self_attn = Phi3Attention.FromConfig(config, layerIdx); + this.self_attn = this.CreateAttentionFromConfig(config, layerIdx); } else { @@ -110,7 +114,13 @@ public override Phi3DecoderLayerOutput forward(Phi3DecoderLayerInput input) var residual = input.HiddenStates; hiddenStates = this.input_layernorm.forward(hiddenStates); - var attentionInput = new AttentionInput(hiddenStates, input.PositionIds, input.AttentionMask, input.PastKeyValue, outputAttentions: input.OutputAttentions); + var attentionInput = new AttentionInput( + hiddenStates: hiddenStates, + positionIds: input.PositionIds, + attentionMask: input.AttentionMask, + cache: input.PastKeyValue, + positionalEmbeddings: input.PositionalEmbeddings, + outputAttentions: input.OutputAttentions); var output = this.self_attn.forward(attentionInput); var attnOutputs = output.HiddenStates; var selfAttnWeights = output.Attentions; @@ -127,4 +137,21 @@ public override Phi3DecoderLayerOutput forward(Phi3DecoderLayerInput input) } return new Phi3DecoderLayerOutput(hiddenStates.MoveToOuterDisposeScope(), selfAttnWeights?.MoveToOuterDisposeScope(), presentKeyValue); } + + private Attention CreateAttentionFromConfig(Phi3Config config, int layerIdx) + { + var headDim = config.HiddenSize / config.NumAttentionHeads; + return new Attention( + attentionDropout: config.AttentionDropout, + hiddenSize: config.HiddenSize, + numHeads: config.NumAttentionHeads, + headDim: headDim, + numKeyValueHeads: config.NumKeyValueHeads ?? throw new ArgumentException("num_key_value_heads must be specified"), + numKeyValueGroups: config.NumAttentionHeads / config.NumKeyValueHeads ?? 
throw new ArgumentException("num_key_value_heads must be specified"), + maxPositionEmbeddings: config.MaxPositionEmbeddings, + originalMaxPositionEmbeddings: config.OriginalMaxPositionEmbeddings, + layerIdx: layerIdx, + useQkvProj: true, + dtype: config.DType); + } } diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs index 463ea5cddc..e873ddd9d8 100644 --- a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs +++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs @@ -22,6 +22,7 @@ internal class Phi3Model : nn.Module private readonly ModuleList layers; private readonly RMSNorm norm; #pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format + private readonly nn.Module _rotaryEmb; public Phi3Model(Phi3Config config) : base(nameof(Phi3Model)) @@ -29,6 +30,7 @@ public Phi3Model(Phi3Config config) this._config = config; this._paddingIdx = config.PadTokenId ?? 32000; this._vocabSize = config.VocabSize; + var headDim = config.HiddenSize / config.NumAttentionHeads; this.embed_tokens = nn.Embedding(config.VocabSize, config.HiddenSize, padding_idx: this._paddingIdx, dtype: config.DType); this.embed_dropout = nn.Dropout(config.EmbdPdrop); @@ -41,6 +43,12 @@ public Phi3Model(Phi3Config config) this.norm = new RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType); this._cache = new DynamicKVCache(); this.RegisterComponents(); + + this._rotaryEmb = config.RopeScaling switch + { + null => new RotaryEmbedding(config.RopeTheta, config.MaxPositionEmbeddings, headDim), + _ => new Phi3SuScaledRotaryEmbedding(headDim, config), + }; } #pragma warning disable MSML_GeneralName // This name should be PascalCased public override CausalLMModelOutput forward(CausalLMModelInput input) @@ -104,7 +112,7 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) } var hiddenStates = inputsEmbeds; - + var positionEmbeddings = this._rotaryEmb.forward(new RotaryEmbeddingInput(hiddenStates, positionIds, seqLength)); var allHiddenStates = new List(); var allAttentions = new List(); foreach (var layer in this.layers) @@ -113,7 +121,13 @@ public override CausalLMModelOutput forward(CausalLMModelInput input) { allHiddenStates.Add(hiddenStates); } - var decoderInput = new Phi3DecoderLayerInput(hiddenStates, attentionMask!, positionIds, this._cache, outputAttentions); + var decoderInput = new Phi3DecoderLayerInput( + hiddenStates: hiddenStates, + attentionMask: attentionMask!, + positionIds: positionIds, + pastKeyValue: this._cache, + positionalEmbeddings: positionEmbeddings, + outputAttentions: outputAttentions); var layerOutput = layer.forward(decoderInput); hiddenStates = layerOutput.HiddenStates; if (outputAttentions && layerOutput.Attentions is not null) diff --git a/src/Microsoft.ML.GenAI.Phi/README.md b/src/Microsoft.ML.GenAI.Phi/README.md index 758a78ad47..2daf51039e 100644 --- a/src/Microsoft.ML.GenAI.Phi/README.md +++ b/src/Microsoft.ML.GenAI.Phi/README.md @@ -6,10 +6,10 @@ The following phi-models are supported and tested: - [x] [Phi-2](https://huggingface.co/microsoft/phi-2) - [x] [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) - [x] [Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) +- [x] [Phi-3-medium-4k-instruct](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct) +- [x] [Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) - [ ] 
[Phi-3-small-8k-instruct](https://huggingface.co/microsoft/Phi-3-small-8k-instruct) - [ ] [Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct) -- [ ] [Phi-3-medium-4k-instruct](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct) -- [ ] [Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) - [ ] [Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-large-4k-instruct) ## Getting Started with Semantic Kernel From 9a03accc0303269fa2168678410732ed9ac86097 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Fri, 16 Aug 2024 19:47:52 -0700 Subject: [PATCH 12/24] update feed --- NuGet.config | 4 ++-- test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/NuGet.config b/NuGet.config index 15f4fc551b..7e119ad408 100644 --- a/NuGet.config +++ b/NuGet.config @@ -12,7 +12,7 @@ - + @@ -37,7 +37,7 @@ - + diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs index 5f3c4b48d3..2a702174ed 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -63,7 +63,7 @@ public void Llama_3_1_405b_ShapeTest() [UseApprovalSubdirectory("Approvals")] public void TokenizerTest() { - var modelWeightFolder = Path.Join("C:\\Users\\xiaoyuz\\source\\repos\\Meta-Llama-3.1-8B-Instruct\\original"); + var modelWeightFolder = Path.Join("Llama-3.1"); var tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder); var messages = new string[] From 0542bf74ad79604117199f957ee9d6a2b5f5cc6a Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Fri, 16 Aug 2024 19:56:57 -0700 Subject: [PATCH 13/24] update .csproj --- src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj b/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj index 95b74f4001..5b0cb0acc0 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj +++ b/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj @@ -21,9 +21,4 @@ - - - - - From 485f9aff4697845bd926e7848db83dc74d4f50de Mon Sep 17 00:00:00 2001 From: Xiaoyun Zhang Date: Mon, 19 Aug 2024 09:30:32 -0700 Subject: [PATCH 14/24] Update NuGet.config --- NuGet.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NuGet.config b/NuGet.config index 7e119ad408..5f023aa721 100644 --- a/NuGet.config +++ b/NuGet.config @@ -12,6 +12,7 @@ + @@ -37,6 +38,9 @@ + + + From 896a21b4578aaa09ed9783d8913ab87b397d9865 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Mon, 19 Aug 2024 10:40:31 -0700 Subject: [PATCH 15/24] fix test --- .../Microsoft.ML.GenAI.LLaMA.Tests.csproj | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/Microsoft.ML.GenAI.LLaMA.Tests.csproj b/test/Microsoft.ML.GenAI.LLaMA.Tests/Microsoft.ML.GenAI.LLaMA.Tests.csproj index a810482d7e..643c1d91b2 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/Microsoft.ML.GenAI.LLaMA.Tests.csproj +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/Microsoft.ML.GenAI.LLaMA.Tests.csproj @@ -23,6 +23,11 @@ + + + + + PreserveNewest From d3d5b6156fac704f990edf2d91a6ae223fc052f9 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Tue, 20 Aug 2024 23:12:42 -0700 Subject: [PATCH 16/24] pass device --- test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) 
diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs index 2a702174ed..a910b9cc2d 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -33,7 +33,7 @@ public LLaMA3_1Tests() [UseApprovalSubdirectory("Approvals")] public void Llama_3_1_8b_ShapeTest() { - var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_8B_Instruct); + var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_8B_Instruct, "meta"); var stateDictStr = model.PeekShape(); Approvals.Verify(stateDictStr); } @@ -43,7 +43,7 @@ public void Llama_3_1_8b_ShapeTest() [UseApprovalSubdirectory("Approvals")] public void Llama_3_1_70b_ShapeTest() { - var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_70B_Instruct); + var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_70B_Instruct, "meta"); var stateDictStr = model.PeekShape(); Approvals.Verify(stateDictStr); } @@ -53,7 +53,7 @@ public void Llama_3_1_70b_ShapeTest() [UseApprovalSubdirectory("Approvals")] public void Llama_3_1_405b_ShapeTest() { - var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_405B_Instruct); + var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_405B_Instruct, "meta"); var stateDictStr = model.PeekShape(); Approvals.Verify(stateDictStr); } From d161b494896cbe0db3374a5da948e6dbb8bc24a0 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Wed, 21 Aug 2024 09:45:05 -0700 Subject: [PATCH 17/24] fix test --- src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs index 91616b336f..1ba7820a9f 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs @@ -29,14 +29,14 @@ internal class LlamaModel : nn.Module private readonly nn.Module _rotaryEmb; - public LlamaModel(LlamaConfig config) + public LlamaModel(LlamaConfig config, string? 
device = null) : base(nameof(LlamaModel)) { this._config = config; this._paddingIdx = config.PadTokenId; this._vocabSize = config.VocabSize; var headDim = config.HiddenSize / config.NumAttentionHeads; - this.embed_tokens = nn.Embedding(config.VocabSize, config.HiddenSize, padding_idx: this._paddingIdx, dtype: config.DType); + this.embed_tokens = nn.Embedding(config.VocabSize, config.HiddenSize, padding_idx: this._paddingIdx, dtype: config.DType, device: device); this.layers = new ModuleList(); for (int i = 0; i < config.NumHiddenLayers; i++) From 32b74e388bd608d88adaf6aa633e0fc62183aeff Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Wed, 21 Aug 2024 12:53:08 -0700 Subject: [PATCH 18/24] update constructor --- src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs index 59c01b340b..b7e038da1b 100644 --- a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs +++ b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs @@ -23,13 +23,13 @@ public class LlamaForCausalLM : nn.Module Date: Wed, 21 Aug 2024 16:24:06 -0700 Subject: [PATCH 19/24] disable 405b test --- test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs index a910b9cc2d..edac0f75d4 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -48,7 +48,7 @@ public void Llama_3_1_70b_ShapeTest() Approvals.Verify(stateDictStr); } - [Fact] + [Fact(Skip ="This test still takes too much space when running on helix")] [UseReporter(typeof(DiffReporter))] [UseApprovalSubdirectory("Approvals")] public void Llama_3_1_405b_ShapeTest() From 2b48080fdeb33a9266d05055cfb5fc3ee535f3bc Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Wed, 21 Aug 2024 22:16:33 -0700 Subject: [PATCH 20/24] update --- test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs index edac0f75d4..3435801c87 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -48,7 +48,7 @@ public void Llama_3_1_70b_ShapeTest() Approvals.Verify(stateDictStr); } - [Fact(Skip ="This test still takes too much space when running on helix")] + [Fact(Skip = "This test still takes too much space when running on helix")] [UseReporter(typeof(DiffReporter))] [UseApprovalSubdirectory("Approvals")] public void Llama_3_1_405b_ShapeTest() From e0ba71945b31afc7519d64e9c75525c56fceb9a5 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Thu, 22 Aug 2024 09:04:02 -0700 Subject: [PATCH 21/24] disable 70b test --- test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs index 3435801c87..d986f18cab 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -38,7 +38,7 @@ public void Llama_3_1_8b_ShapeTest() Approvals.Verify(stateDictStr); } - [Fact] + [Fact(Skip = "This test still takes too much space when running on helix")] 
[UseReporter(typeof(DiffReporter))] [UseApprovalSubdirectory("Approvals")] public void Llama_3_1_70b_ShapeTest() From 093a38e207543985f06a6fa12e3806392b6f6a19 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Mon, 26 Aug 2024 12:03:00 -0700 Subject: [PATCH 22/24] use windows only fact --- test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs index d986f18cab..7d97150f7b 100644 --- a/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs +++ b/test/Microsoft.ML.GenAI.LLaMA.Tests/LLaMA3_1Tests.cs @@ -38,7 +38,7 @@ public void Llama_3_1_8b_ShapeTest() Approvals.Verify(stateDictStr); } - [Fact(Skip = "This test still takes too much space when running on helix")] + [WindowsOnlyFact] [UseReporter(typeof(DiffReporter))] [UseApprovalSubdirectory("Approvals")] public void Llama_3_1_70b_ShapeTest() @@ -48,7 +48,7 @@ public void Llama_3_1_70b_ShapeTest() Approvals.Verify(stateDictStr); } - [Fact(Skip = "This test still takes too much space when running on helix")] + [WindowsOnlyFact] [UseReporter(typeof(DiffReporter))] [UseApprovalSubdirectory("Approvals")] public void Llama_3_1_405b_ShapeTest() From 51bdcc6e03386c558200b6e8e819eabdf8abd3a5 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Wed, 28 Aug 2024 10:39:08 -0700 Subject: [PATCH 23/24] revert change --- docs/samples/Microsoft.ML.GenAI.Samples/Program.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs index 100748ca7b..1560bad306 100644 --- a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs @@ -1,6 +1,4 @@ // See https://aka.ms/new-console-template for more information -using Microsoft.ML.GenAI.Samples.Llama; using Microsoft.ML.GenAI.Samples.Phi3Mini; -LlamaSample.Run(); -//await AutoGenSample.RunAsync(); +await SemanticKernelSample.RunChatCompletionSample(); From 19a2d7ea6ba8f75fae17cc2e59859be9d552296c Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Wed, 28 Aug 2024 10:41:11 -0700 Subject: [PATCH 24/24] rename test to LLaMA3_1 --- .../Microsoft.ML.GenAI.Samples/Llama/{test.cs => LLaMA3_1.cs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/samples/Microsoft.ML.GenAI.Samples/Llama/{test.cs => LLaMA3_1.cs} (100%) diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Llama/LLaMA3_1.cs similarity index 100% rename from docs/samples/Microsoft.ML.GenAI.Samples/Llama/test.cs rename to docs/samples/Microsoft.ML.GenAI.Samples/Llama/LLaMA3_1.cs
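
The shape tests touched throughout this series (`Llama_3_1_8b_ShapeTest` and friends) all follow the same pattern: construct the model on the `"meta"` device so the module graph and parameter shapes can be inspected without loading pretrained weights, then verify the dumped shapes against the checked-in approval files. The following is a minimal sketch of that pattern, not part of the patch itself; it assumes the `LlamaForCausalLM`, `LlamaConfig.Llama3_1_8B_Instruct`, and `PeekShape` APIs introduced in this series, and passes the device positionally exactly as the tests do.

```csharp
// Minimal sketch (assumed usage, mirroring the Llama_3_1_8b_ShapeTest case).
// "meta" is the target device string passed as the second constructor argument;
// it lets the model be built for shape inspection without fetching real weights.
using System;
using Microsoft.ML.GenAI.LLaMA;

var model = new LlamaForCausalLM(LlamaConfig.Llama3_1_8B_Instruct, "meta");

// PeekShape() returns the state-dict layout as a string; the tests compare this
// against Approvals/LLaMA3_1Tests.Llama_3_1_8b_ShapeTest.approved.txt.
var stateDictStr = model.PeekShape();
Console.WriteLine(stateDictStr);
```

The later commits in the series gate the 70B and 405B variants behind `WindowsOnlyFact` because, even with this approach, those shape dumps were too large to run reliably on Helix.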