diff --git a/LLama.Unittest/BasicTest.cs b/LLama.Unittest/BasicTest.cs index 2cd1806f9..2c06dd47b 100644 --- a/LLama.Unittest/BasicTest.cs +++ b/LLama.Unittest/BasicTest.cs @@ -1,16 +1,20 @@ using System.Text; using LLama.Common; +using LLama.Native; +using Xunit.Abstractions; namespace LLama.Unittest { public sealed class BasicTest : IDisposable { + private readonly ITestOutputHelper _testOutputHelper; private readonly ModelParams _params; private readonly LLamaWeights _model; - public BasicTest() + public BasicTest(ITestOutputHelper testOutputHelper) { + _testOutputHelper = testOutputHelper; _params = new ModelParams(Constants.ModelPath) { ContextSize = 2048 @@ -30,5 +34,57 @@ public void BasicModelProperties() Assert.Equal(4096, _model.ContextSize); Assert.Equal(4096, _model.EmbeddingSize); } + + [Fact] + public void AdvancedModelProperties() + { + var expected = new Dictionary + { + { "general.name", "LLaMA v2" }, + { "general.architecture", "llama" }, + { "general.quantization_version", "2" }, + { "general.file_type", "2" }, + + { "llama.context_length", "4096" }, + { "llama.rope.dimension_count", "128" }, + { "llama.embedding_length", "4096" }, + { "llama.block_count", "32" }, + { "llama.feed_forward_length", "11008" }, + { "llama.attention.head_count", "32" }, + { "llama.attention.head_count_kv", "32" }, + { "llama.attention.layer_norm_rms_epsilon", "0.000001" }, + + { "tokenizer.ggml.eos_token_id", "2" }, + { "tokenizer.ggml.model", "llama" }, + { "tokenizer.ggml.bos_token_id", "1" }, + { "tokenizer.ggml.unknown_token_id", "0" }, + }; + + var metaCount = NativeApi.llama_model_meta_count(_model.NativeHandle); + Assert.Equal(expected.Count, metaCount); + + Span buffer = stackalloc byte[128]; + for (var i = 0; i < expected.Count; i++) + { + unsafe + { + fixed (byte* ptr = buffer) + { + var length = NativeApi.llama_model_meta_key_by_index(_model.NativeHandle, i, ptr, 128); + Assert.True(length > 0); + var key = Encoding.UTF8.GetString(buffer[..length]); + + 
length = NativeApi.llama_model_meta_val_str_by_index(_model.NativeHandle, i, ptr, 128); + Assert.True(length > 0); + var val = Encoding.UTF8.GetString(buffer[..length]); + + _testOutputHelper.WriteLine($"{key} == {val}"); + + Assert.True(expected.ContainsKey(key)); + Assert.Equal(expected[key], val); + } + } + } + } } } \ No newline at end of file diff --git a/LLama.Unittest/LLama.Unittest.csproj b/LLama.Unittest/LLama.Unittest.csproj index 0532244df..8effd951a 100644 --- a/LLama.Unittest/LLama.Unittest.csproj +++ b/LLama.Unittest/LLama.Unittest.csproj @@ -8,6 +8,8 @@ enable false + + true diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 9085e5f7f..93600b299 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -3,41 +3,69 @@ true - - PreserveNewest - runtimes/win-x64/native/libllama.dll - - - PreserveNewest - runtimes/win-x64/native/cuda11/libllama.dll - - - PreserveNewest - runtimes/win-x64/native/cuda12/libllama.dll - - - PreserveNewest - runtimes/linux-x64/native/libllama.so - - - PreserveNewest - runtimes/linux-x64/native/cuda11/libllama.so - - - PreserveNewest - runtimes/linux-x64/native/cuda12/libllama.so - - - PreserveNewest - runtimes/osx-arm64/native/libllama.dylib - - - PreserveNewest - runtimes/osx-arm64/native/ggml-metal.metal - - - PreserveNewest - runtimes/osx-x64/native/libllama.dylib - + + + PreserveNewest + runtimes/win-x64/native/noavx/libllama.dll + + + PreserveNewest + runtimes/win-x64/native/avx/libllama.dll + + + PreserveNewest + runtimes/win-x64/native/avx2/libllama.dll + + + PreserveNewest + runtimes/win-x64/native/avx512/libllama.dll + + + PreserveNewest + runtimes/win-x64/native/cuda11/libllama.dll + + + PreserveNewest + runtimes/win-x64/native/cuda12/libllama.dll + + + + PreserveNewest + runtimes/linux-x64/native/noavx/libllama.so + + + PreserveNewest + runtimes/linux-x64/native/avx/libllama.so + + + PreserveNewest + runtimes/linux-x64/native/avx2/libllama.so + + + 
PreserveNewest + runtimes/linux-x64/native/avx512/libllama.so + + + PreserveNewest + runtimes/linux-x64/native/cuda11/libllama.so + + + PreserveNewest + runtimes/linux-x64/native/cuda12/libllama.so + + + + PreserveNewest + runtimes/osx-arm64/native/libllama.dylib + + + PreserveNewest + runtimes/osx-arm64/native/ggml-metal.metal + + + + PreserveNewest + runtimes/osx-x64/native/libllama.dylib + \ No newline at end of file diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 074a8e9fd..a4f97a004 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -302,6 +302,20 @@ public static int llama_tokenize(SafeLLamaContextHandle ctx, string text, Encodi [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern llama_token llama_token_nl(SafeLlamaModelHandle model); + /// + /// Returns -1 if unknown, 1 for true or 0 for false. + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_add_bos_token(SafeLlamaModelHandle model); + + /// + /// Returns -1 if unknown, 1 for true or 0 for false. 
+ /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_add_eos_token(SafeLlamaModelHandle model); + /// /// Print out timing information for this context /// @@ -348,18 +362,77 @@ public static int llama_tokenize(SafeLLamaContextHandle ctx, string text, Encodi public static extern int llama_n_embd(SafeLlamaModelHandle model); /// - /// Get the size of the model in bytes + /// Get the model's RoPE frequency scaling factor + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern float llama_rope_freq_scale_train(SafeLlamaModelHandle model); + + /// + /// Get metadata value as a string by key name + /// + /// + /// + /// + /// + /// The length of the string on success, or -1 on failure + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_model_meta_val_str(SafeLlamaModelHandle model, byte* key, byte* buf, long buf_size); + + /// + /// Get the number of metadata key/value pairs /// /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_model_meta_count(SafeLlamaModelHandle model); + + /// + /// Get metadata key name by index + /// + /// + /// + /// + /// + /// The length of the string on success, or -1 on failure + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_model_meta_key_by_index(SafeLlamaModelHandle model, int index, byte* buf, long buf_size); + + /// + /// Get metadata value as a string by index + /// + /// + /// + /// + /// + /// The length of the string on success, or -1 on failure + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_model_meta_val_str_by_index(SafeLlamaModelHandle model, int index, byte* buf, long buf_size); + + /// + /// Get a string describing the model type + /// + /// + /// + /// + /// The length of the string 
on success, or -1 on failure + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_model_desc(SafeLlamaModelHandle model, byte* buf, long buf_size); + + /// + /// Get the size of the model in bytes + /// + /// + /// The size of the model + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern ulong llama_model_size(SafeLlamaModelHandle model); /// /// Get the number of parameters in this model /// /// - /// + /// The number of parameters in the model [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern ulong llama_model_n_params(SafeLlamaModelHandle model); @@ -370,7 +443,7 @@ public static int llama_tokenize(SafeLLamaContextHandle ctx, string text, Encodi /// /// buffer to write string into /// size of the buffer - /// The length writte, or if the buffer is too small a negative that indicates the length required + /// The length written, or if the buffer is too small a negative that indicates the length required [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern int llama_token_to_piece(SafeLlamaModelHandle model, int llamaToken, byte* buffer, int length); diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec index 29466a1fe..e7ae5e58a 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec @@ -17,11 +17,21 @@ - - - - - + + + + + + + + + + + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec index d8876f4f3..4b7b6f046 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec @@ -17,8 +17,10 @@ - - + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.nuspec 
b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.nuspec index 5ffd8ef12..d915ba4c1 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.nuspec @@ -17,8 +17,10 @@ - - + + + + diff --git a/LLama/runtimes/deps/avx/libllama.dll b/LLama/runtimes/deps/avx/libllama.dll new file mode 100644 index 000000000..55d574843 Binary files /dev/null and b/LLama/runtimes/deps/avx/libllama.dll differ diff --git a/LLama/runtimes/deps/avx/libllama.so b/LLama/runtimes/deps/avx/libllama.so new file mode 100644 index 000000000..e9360b95b Binary files /dev/null and b/LLama/runtimes/deps/avx/libllama.so differ diff --git a/LLama/runtimes/deps/avx2/libllama.dll b/LLama/runtimes/deps/avx2/libllama.dll new file mode 100644 index 000000000..52330a971 Binary files /dev/null and b/LLama/runtimes/deps/avx2/libllama.dll differ diff --git a/LLama/runtimes/deps/avx2/libllama.so b/LLama/runtimes/deps/avx2/libllama.so new file mode 100644 index 000000000..9f84c424c Binary files /dev/null and b/LLama/runtimes/deps/avx2/libllama.so differ diff --git a/LLama/runtimes/deps/avx512/libllama.dll b/LLama/runtimes/deps/avx512/libllama.dll new file mode 100644 index 000000000..5f68f81b4 Binary files /dev/null and b/LLama/runtimes/deps/avx512/libllama.dll differ diff --git a/LLama/runtimes/deps/avx512/libllama.so b/LLama/runtimes/deps/avx512/libllama.so new file mode 100644 index 000000000..2791a7491 Binary files /dev/null and b/LLama/runtimes/deps/avx512/libllama.so differ diff --git a/LLama/runtimes/libllama-cuda11.dll b/LLama/runtimes/deps/cu11.7.1/libllama.dll similarity index 64% rename from LLama/runtimes/libllama-cuda11.dll rename to LLama/runtimes/deps/cu11.7.1/libllama.dll index ab4f4be28..8aa06f952 100644 Binary files a/LLama/runtimes/libllama-cuda11.dll and b/LLama/runtimes/deps/cu11.7.1/libllama.dll differ diff --git a/LLama/runtimes/libllama-cuda11.so b/LLama/runtimes/deps/cu11.7.1/libllama.so similarity index 61% rename from 
LLama/runtimes/libllama-cuda11.so rename to LLama/runtimes/deps/cu11.7.1/libllama.so index 146b30abd..4f98e823b 100644 Binary files a/LLama/runtimes/libllama-cuda11.so and b/LLama/runtimes/deps/cu11.7.1/libllama.so differ diff --git a/LLama/runtimes/libllama-cuda12.dll b/LLama/runtimes/deps/cu12.1.0/libllama.dll similarity index 63% rename from LLama/runtimes/libllama-cuda12.dll rename to LLama/runtimes/deps/cu12.1.0/libllama.dll index a51954b89..802e357e8 100644 Binary files a/LLama/runtimes/libllama-cuda12.dll and b/LLama/runtimes/deps/cu12.1.0/libllama.dll differ diff --git a/LLama/runtimes/libllama-cuda12.so b/LLama/runtimes/deps/cu12.1.0/libllama.so similarity index 60% rename from LLama/runtimes/libllama-cuda12.so rename to LLama/runtimes/deps/cu12.1.0/libllama.so index 615d9c704..5a794f8e7 100644 Binary files a/LLama/runtimes/libllama-cuda12.so and b/LLama/runtimes/deps/cu12.1.0/libllama.so differ diff --git a/LLama/runtimes/deps/libllama.dll b/LLama/runtimes/deps/libllama.dll new file mode 100644 index 000000000..a68c94185 Binary files /dev/null and b/LLama/runtimes/deps/libllama.dll differ diff --git a/LLama/runtimes/deps/libllama.so b/LLama/runtimes/deps/libllama.so new file mode 100644 index 000000000..d0ef8a591 Binary files /dev/null and b/LLama/runtimes/deps/libllama.so differ diff --git a/LLama/runtimes/osx-arm64/ggml-metal.metal b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal similarity index 96% rename from LLama/runtimes/osx-arm64/ggml-metal.metal rename to LLama/runtimes/deps/osx-arm64/ggml-metal.metal index 7c35f23a7..5d1357cd7 100644 --- a/LLama/runtimes/osx-arm64/ggml-metal.metal +++ b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal @@ -792,7 +792,7 @@ kernel void kernel_mul_mv_f32_f32( constant int64_t & ne0, constant int64_t & ne1, uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]]) { + uint tiisg[[thread_index_in_simdgroup]]) { const int64_t r0 = tgpig.x; const int64_t rb = tgpig.y*N_F32_F32; @@ -844,6 
+844,79 @@ kernel void kernel_mul_mv_f32_f32( } } +#define N_F16_F16 4 + +kernel void kernel_mul_mv_f16_f16( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = tgpig.x; + const int64_t rb = tgpig.y*N_F16_F16; + const int64_t im = tgpig.z; + + device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + if (ne00 < 128) { + for (int row = 0; row < N_F16_F16; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00; i += 32) { + sumf += (half) x[i] * (half) y[i]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } else { + device const half4 * x4 = (device const half4 *)x; + for (int row = 0; row < N_F16_F16; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12); + device const half4 * y4 = (device const half4 *) y; + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } +} + kernel void kernel_mul_mv_f16_f32_1row( device const char * src0, device const char * src1, @@ -1229,6 +1302,39 @@ 
kernel void kernel_rope( template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; +kernel void kernel_im2col_f16( + device const float * x, + device half * dst, + constant int32_t & ofs0, + constant int32_t & ofs1, + constant int32_t & IW, + constant int32_t & IH, + constant int32_t & CHW, + constant int32_t & s0, + constant int32_t & s1, + constant int32_t & p0, + constant int32_t & p1, + constant int32_t & d0, + constant int32_t & d1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0; + const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1; + + const int32_t offset_dst = + (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW + + (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]); + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = 0.0f; + } else { + const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1; + dst[offset_dst] = x[offset_src + iih * IW + iiw]; + } +} + kernel void kernel_cpy_f16_f16( device const half * src0, device half * dst, diff --git a/LLama/runtimes/deps/osx-arm64/libllama.dylib b/LLama/runtimes/deps/osx-arm64/libllama.dylib new file mode 100644 index 000000000..df57f7dfa Binary files /dev/null and b/LLama/runtimes/deps/osx-arm64/libllama.dylib differ diff --git a/LLama/runtimes/deps/osx-x64/libllama.dylib b/LLama/runtimes/deps/osx-x64/libllama.dylib new file mode 100644 index 000000000..ee6f29b47 Binary files /dev/null and b/LLama/runtimes/deps/osx-x64/libllama.dylib differ diff --git a/LLama/runtimes/libllama.dll b/LLama/runtimes/libllama.dll deleted file mode 100644 index d2cc2a7be..000000000 Binary files a/LLama/runtimes/libllama.dll and /dev/null differ diff --git a/LLama/runtimes/libllama.so b/LLama/runtimes/libllama.so 
deleted file mode 100644 index e5a01286a..000000000 Binary files a/LLama/runtimes/libllama.so and /dev/null differ diff --git a/LLama/runtimes/osx-arm64/libllama.dylib b/LLama/runtimes/osx-arm64/libllama.dylib deleted file mode 100644 index 54d7a9324..000000000 Binary files a/LLama/runtimes/osx-arm64/libllama.dylib and /dev/null differ diff --git a/LLama/runtimes/osx-x64/libllama.dylib b/LLama/runtimes/osx-x64/libllama.dylib deleted file mode 100644 index 37eb3cd43..000000000 Binary files a/LLama/runtimes/osx-x64/libllama.dylib and /dev/null differ