From 6a6135cc85f602b629ac513a1c24e7f122cae6e3 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 5 Aug 2025 13:30:08 -0700 Subject: [PATCH 01/15] Begin work on set_rows --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 126 ++++++++++++++++++++++++++- tests/test-backend-ops.cpp | 11 +-- 2 files changed, 131 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 5009e26a2065e..8ba416f0eb2bd 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -20,6 +20,7 @@ #ifdef GGML_WEBGPU_DEBUG # define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl +# define WEBGPU_DEBUG_BUF_ELEMS 32 #else # define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG @@ -125,6 +126,7 @@ struct webgpu_context_struct { wgpu::ComputePipeline memset_pipeline; wgpu::ComputePipeline mul_mat_pipeline; + wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; size_t memset_bytes_per_thread; @@ -139,6 +141,12 @@ struct webgpu_context_struct { std::vector staged_param_bufs; std::vector callback_futures; + +#ifdef GGML_WEBGPU_DEBUG + wgpu::Buffer debug_host_buf; + wgpu::Buffer debug_dev_buf; +#endif + }; typedef std::shared_ptr webgpu_context; @@ -283,6 +291,27 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, UINT64_MAX); } +#ifdef GGML_WEBGPU_DEBUG +// This function adds debugging information to shaders, as WebGPU does not support printing directly. +// To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and +// debug statements in the shader, and then call this function after encoding the commands. +static void ggml_backend_webgpu_debug(webgpu_context & ctx) { + wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize()); + wgpu::CommandBuffer commands = encoder.Finish(); + ctx->queue.Submit(1, &commands); + + ggml_backend_webgpu_map_buffer(ctx, ctx->debug_host_buf, wgpu::MapMode::Read, 0, ctx->debug_host_buf.GetSize()); + const uint32_t * debug_data = (const uint32_t *) ctx->debug_host_buf.GetConstMappedRange(); + std::cout << "debug data:"; + for (size_t i = 0; i < WEBGPU_DEBUG_BUF_ELEMS; i++) { + std::cout << " " << i << ": " << debug_data[i]; + } + std::cout << "\n"; + ctx->debug_host_buf.Unmap(); +} +#endif + static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx, wgpu::ComputePipeline & pipeline, std::vector params, @@ -429,6 +458,74 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x); } +static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { + // For set rows specifically, we need to check if src and idx are empty tensors. + if (ggml_is_empty(src) || ggml_is_empty(idx)) { + return; + } + + size_t src_offset = ggml_backend_webgpu_tensor_offset(src); + // assumes power of 2 offset alignment + size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + // align to minimum offset alignment + src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + size_t idx_offset = ggml_backend_webgpu_tensor_offset(idx); + size_t idx_misalignment = idx_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + idx_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst); + size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + + std::vector params = { + (uint32_t) (src_misalignment / ggml_type_size(src->type)), + (uint32_t) (idx_misalignment / ggml_type_size(idx->type)), + (uint32_t) (dst_misalignment / ggml_type_size(dst->type)), + // Convert byte-strides to element-strides + (uint32_t) (src->nb[1] / ggml_type_size(src->type)), + (uint32_t) (src->nb[2] / ggml_type_size(src->type)), + (uint32_t) (src->nb[3] / ggml_type_size(src->type)), + (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)), + (uint32_t) (idx->nb[1] / ggml_type_size(idx->type)), + (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)), + (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)), + (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)), + (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)), + // Shape of src + (uint32_t) src->ne[0], + (uint32_t) src->ne[1], + (uint32_t) src->ne[2], + (uint32_t) src->ne[3], + // broadcast shape of idx + (uint32_t) (src->ne[2] / idx->ne[1]), + (uint32_t) (src->ne[3] / idx->ne[2]) + }; + + std::vector entries = { + { .binding = 0, + .buffer = ggml_backend_webgpu_tensor_buf(src), + .offset = ggml_backend_webgpu_tensor_offset(src), + .size = ggml_nbytes(src) }, + { .binding = 1, + .buffer = ggml_backend_webgpu_tensor_buf(idx), + .offset = ggml_backend_webgpu_tensor_offset(idx), + .size = ggml_nbytes(idx) }, + { .binding = 2, + .buffer = ggml_backend_webgpu_tensor_buf(dst), + .offset = ggml_backend_webgpu_tensor_offset(dst), + .size = ggml_nbytes(dst) }, + { .binding = 3, + .buffer = ctx->debug_dev_buf, + .offset = 0, + .size = ctx->debug_dev_buf.GetSize() } + }; + + size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; + uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size; + ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x); + ggml_backend_webgpu_submit_queue(ctx); + ggml_backend_webgpu_debug(ctx); +} + static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { std::vector params = { (uint32_t) dst->ne[1], // number of rows in result (M) @@ -487,6 +584,11 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { ggml_webgpu_cpy(ctx, src0, node); break; } + case GGML_OP_SET_ROWS: + { + ggml_webgpu_set_rows(ctx, src0, src1, node); + break; + } case GGML_OP_MUL_MAT: { ggml_webgpu_mul_mat(ctx, src0, src1, node); @@ -771,6 +873,13 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); } +static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) { + std::vector constants(1); + constants[0].key = "wg_size"; + constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX; + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", constants); +} + static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) { std::vector constants(1); constants[0].key = "wg_size"; @@ -831,7 +940,22 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ggml_webgpu_init_memset_pipeline(webgpu_ctx); ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); + ggml_webgpu_init_set_rows_pipeline(webgpu_ctx); ggml_webgpu_init_cpy_pipeline(webgpu_ctx); + +#ifdef GGML_WEBGPU_DEBUG + // Initialize debug buffers + ggml_webgpu_create_buffer(webgpu_ctx->device, + webgpu_ctx->debug_host_buf, + WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, + "debug_host_buf"); + ggml_webgpu_create_buffer(webgpu_ctx->device, + webgpu_ctx->debug_dev_buf, + WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), + wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, + "debug_dev_buf"); +#endif webgpu_ctx->device_init = true; } @@ -882,7 +1006,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_VIEW: case GGML_OP_PERMUTE: return true; - case GGML_OP_CPY: + case GGML_OP_CPY | GGML_OP_SET_ROWS: return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_MUL_MAT: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d29779cd12b22..6eb240456e37a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1212,12 +1212,13 @@ struct test_case { double err = nmse(f1.data(), f2.data(), f1.size()); if (err > ud->max_err) { + //printf("Backends %s and %s mismatch: ", bn1, bn2); printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err); - //for (int i = 0; i < (int) f1.size(); i++) { - // printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]); - //} - //printf("\n"); - //exit(1); + for (int i = 0; i < (int) f1.size(); i++) { + printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]); + } + printf("\n"); + exit(1); ud->ok = false; } return true; From b2dbfcdcb143d6f572ec9ee5a36ee4494199f8a6 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 5 Aug 2025 16:33:15 -0700 Subject: [PATCH 02/15] Work on set rows --- .github/workflows/build.yml | 2 - ggml/src/ggml-webgpu/ggml-webgpu.cpp | 13 +--- .../ggml-webgpu/wgsl-shaders/set_rows.wgsl | 73 +++++++++++++++++++ tests/test-backend-ops.cpp | 12 +-- 4 files changed, 83 insertions(+), 17 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 63e40c3586285..3d4f837e24895 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -179,7 +179,6 @@ jobs: - name: Test id: cmake_test run: | - export LLAMA_SET_ROWS=0 cd build ctest -L main --verbose --timeout 900 @@ -438,7 +437,6 @@ jobs: - name: Test id: cmake_test run: | - export LLAMA_SET_ROWS=0 cd build # This is using llvmpipe and runs slower than other backends ctest -L main --verbose --timeout 3600 diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 8ba416f0eb2bd..b4c3ea08dc99d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -495,9 +495,9 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) src->ne[3], - // broadcast shape of idx - (uint32_t) (src->ne[2] / idx->ne[1]), - (uint32_t) (src->ne[3] / idx->ne[2]) + // Shape of idx + (uint32_t) (idx->ne[1]), + (uint32_t) (idx->ne[2]) }; std::vector entries = { @@ -512,18 +512,13 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t { .binding = 2, .buffer = ggml_backend_webgpu_tensor_buf(dst), .offset = ggml_backend_webgpu_tensor_offset(dst), - .size = ggml_nbytes(dst) }, - { .binding = 3, - .buffer = ctx->debug_dev_buf, - .offset = 0, - .size = ctx->debug_dev_buf.GetSize() } + .size = ggml_nbytes(dst) } }; size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size; ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x); ggml_backend_webgpu_submit_queue(ctx); - ggml_backend_webgpu_debug(ctx); } static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl new file mode 100644 index 0000000000000..9b8c2634e40cb --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl @@ -0,0 +1,73 @@ +enable f16; + +@group(0) @binding(0) +var src: array; + +@group(0) @binding(1) +var idx: array; + +@group(0) @binding(2) +var dst: array; + +struct Params { + offset_src: u32, // in elements + offset_idx: u32, // in elements + offset_dst: u32, // in elements + + // Strides (in elements) + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_idx0: u32, + stride_idx1: u32, + stride_idx2: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // Shape of src + ne0: u32, + n_rows: u32, + ne2: u32, + ne3: u32, + + // Shape of idx + idx1: u32, + idx2: u32, +}; + +@group(0) @binding(3) +var params: Params; + +override wg_size: u32; +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3) { + if (gid.x >= params.n_rows * params.ne2 * params.ne3) { + return; + } + var i = gid.x; + let i_src3 = i / (params.ne2 * params.n_rows); + let i_dst3 = i / (params.ne2 * 3); + + i = i % (params.ne2 * params.n_rows); + let i_src2 = i / params.n_rows; + let i_src1 = i % params.n_rows; + + let i_idx2 = i_src3 % params.idx2; + let i_idx1 = i_src2 % params.idx1; + let i_idx0 = i_src1; + + let idx_high = (params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2) * 2; + + let idx_high_val = idx[idx_high]; + let idx_low_val = idx[idx_high + 1]; + + let i_dst_row = params.offset_dst + idx_high_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; + let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; + + for (var i: u32 = 0; i < params.ne0; i++) { + dst[i_dst_row + i] = f16(src[i_src_row + i]); + } +} diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 6eb240456e37a..5b9dc7f59b1f1 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1213,12 +1213,12 @@ struct test_case { double err = nmse(f1.data(), f2.data(), f1.size()); if (err > ud->max_err) { //printf("Backends %s and %s mismatch: ", bn1, bn2); - printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err); - for (int i = 0; i < (int) f1.size(); i++) { - printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]); - } - printf("\n"); - exit(1); + //printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err); + //for (int i = 0; i < (int) f1.size(); i++) { + // printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]); + //} + //printf("\n"); + //exit(1); ud->ok = false; } return true; From 248f7a512f3e0f0adf728369b3b01037dc2f636b Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 6 Aug 2025 10:15:50 -0700 Subject: [PATCH 03/15] Add error buffers for reporting unsupported SET_ROWS indices --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 141 ++++++++++++------ .../ggml-webgpu/wgsl-shaders/set_rows.wgsl | 11 +- 2 files changed, 107 insertions(+), 45 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b4c3ea08dc99d..1a223ed939076 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -19,7 +19,7 @@ #include #ifdef GGML_WEBGPU_DEBUG -# define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl +# define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl # define WEBGPU_DEBUG_BUF_ELEMS 32 #else # define WEBGPU_LOG_DEBUG(msg) ((void) 0) @@ -27,11 +27,13 @@ /* Constants */ -#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16 -#define WEBGPU_MUL_MAT_WG_SIZE 64 -#define WEBGPU_NUM_PARAM_BUFS 100 -#define WEBGPU_PARAMS_BUF_SIZE_BYTES 256 -#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4 +#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16 +#define WEBGPU_MUL_MAT_WG_SIZE 64 +#define WEBGPU_NUM_PARAM_BUFS 100 +#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters +#define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32 +#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4 +#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4 /* End Constants */ @@ -55,46 +57,42 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, wgpu::BufferUsage usage, const char * label); -struct webgpu_param_bufs { +struct webgpu_pool_bufs { wgpu::Buffer host_buf; wgpu::Buffer dev_buf; }; // Holds a pool of parameter buffers for WebGPU operations -struct webgpu_param_buf_pool { - std::vector free; +struct webgpu_buf_pool { + std::vector free; std::mutex mutex; std::condition_variable cv; - void init(wgpu::Device device) { - for (int i = 0; i < WEBGPU_NUM_PARAM_BUFS; i++) { + void init(wgpu::Device device, + int num_bufs, + size_t buf_size, + wgpu::BufferUsage dev_buf_usage, + wgpu::BufferUsage host_buf_usage) { + for (int i = 0; i < num_bufs; i++) { wgpu::Buffer host_buf; wgpu::Buffer dev_buf; - ggml_webgpu_create_buffer(device, - host_buf, - WEBGPU_PARAMS_BUF_SIZE_BYTES, - wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite, - "ggml_webgpu_host_params_buf"); - ggml_webgpu_create_buffer(device, - dev_buf, - WEBGPU_PARAMS_BUF_SIZE_BYTES, - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, - "ggml_webgpu_dev_params_buf"); + ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_pool_buf"); + ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_pool_buf"); free.push_back({ host_buf, dev_buf }); } } - webgpu_param_bufs alloc_bufs() { + webgpu_pool_bufs alloc_bufs() { std::unique_lock lock(mutex); cv.wait(lock, [this] { return !free.empty(); }); - webgpu_param_bufs bufs = free.back(); + webgpu_pool_bufs bufs = free.back(); free.pop_back(); return bufs; } - void free_bufs(std::vector bufs) { + void free_bufs(std::vector bufs) { std::lock_guard lock(mutex); free.insert(free.end(), bufs.begin(), bufs.end()); cv.notify_all(); @@ -122,7 +120,8 @@ struct webgpu_context_struct { bool device_init = false; - webgpu_param_buf_pool param_buf_pool; + webgpu_buf_pool param_buf_pool; + webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; wgpu::ComputePipeline mul_mat_pipeline; @@ -138,7 +137,9 @@ struct webgpu_context_struct { std::vector staged_command_bufs; // Parameter buffers associated with the staged command buffers - std::vector staged_param_bufs; + std::vector staged_param_bufs; + // Buffers associated with set_rows operations, used to store potential errors + std::vector staged_set_row_error_bufs; std::vector callback_futures; @@ -146,7 +147,6 @@ struct webgpu_context_struct { wgpu::Buffer debug_host_buf; wgpu::Buffer debug_dev_buf; #endif - }; typedef std::shared_ptr webgpu_context; @@ -257,20 +257,55 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { return; } ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data()); + + // If there are SET_ROWS operations in this submission, copy their error buffers to the host. + if (ctx->staged_set_row_error_bufs.size() > 0) { + wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); + for (auto & error_bufs : ctx->staged_set_row_error_bufs) { + // Copy the error buffer to the host buffer + encoder.CopyBufferToBuffer(error_bufs.dev_buf, 0, error_bufs.host_buf, 0, error_bufs.host_buf.GetSize()); + } + wgpu::CommandBuffer commands = encoder.Finish(); + ctx->queue.Submit(1, &commands); + } + ctx->staged_command_bufs.clear(); - std::vector staged_param_bufs = std::move(ctx->staged_param_bufs); + std::vector staged_param_bufs = std::move(ctx->staged_param_bufs); + std::vector staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs); // Free the staged parameter buffers once the submission completes - wgpu::Future f = ctx->queue.OnSubmittedWorkDone( + wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone( wgpu::CallbackMode::AllowSpontaneous, [ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { if (status != wgpu::QueueWorkDoneStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data); } - // Free the staged parameter buffers + // Free the staged buffers ctx->param_buf_pool.free_bufs(staged_param_bufs); }); - ctx->callback_futures.push_back({ f }); + ctx->callback_futures.push_back({ p_f }); + + // Check for errrors in SET_ROWS operations + for (auto & error_bufs : staged_set_row_error_bufs) { + wgpu::Future f = error_bufs.host_buf.MapAsync( + wgpu::MapMode::Read, + 0, + error_bufs.host_buf.GetSize(), + wgpu::CallbackMode::AllowSpontaneous, + [ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", message.data); + } else { + const uint32_t * error_data = (const uint32_t *) error_bufs.host_buf.GetConstMappedRange(); + if (*error_data) { + GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported."); + } + // We can't unmap in here due to WebGPU reentrancy limitations. + ctx->set_rows_error_buf_pool.free_bufs({ error_bufs }); + } + }); + ctx->callback_futures.push_back({ f }); + } } static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, @@ -294,7 +329,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, #ifdef GGML_WEBGPU_DEBUG // This function adds debugging information to shaders, as WebGPU does not support printing directly. // To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and -// debug statements in the shader, and then call this function after encoding the commands. +// debug statements in the shader, and then call this function after encoding the commands and submitting them. static void ggml_backend_webgpu_debug(webgpu_context & ctx) { wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize()); @@ -318,7 +353,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & std::vector bind_group_entries, uint32_t wg_x, bool submit_and_wait = false) { - webgpu_param_bufs params_bufs = ctx->param_buf_pool.alloc_bufs(); + webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs(); ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize()); uint32_t * _params = (uint32_t *) params_bufs.host_buf.GetMappedRange(); @@ -464,6 +499,12 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t return; } + // allocate error bufs + webgpu_pool_bufs error_bufs = ctx->set_rows_error_buf_pool.alloc_bufs(); + if (error_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { + error_bufs.host_buf.Unmap(); + } + size_t src_offset = ggml_backend_webgpu_tensor_offset(src); // assumes power of 2 offset alignment size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); @@ -476,8 +517,7 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - std::vector params = { - (uint32_t) (src_misalignment / ggml_type_size(src->type)), + std::vector params = { (uint32_t) (src_misalignment / ggml_type_size(src->type)), (uint32_t) (idx_misalignment / ggml_type_size(idx->type)), (uint32_t) (dst_misalignment / ggml_type_size(dst->type)), // Convert byte-strides to element-strides @@ -497,28 +537,31 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t (uint32_t) src->ne[3], // Shape of idx (uint32_t) (idx->ne[1]), - (uint32_t) (idx->ne[2]) - }; + (uint32_t) (idx->ne[2]) }; std::vector entries = { { .binding = 0, .buffer = ggml_backend_webgpu_tensor_buf(src), .offset = ggml_backend_webgpu_tensor_offset(src), - .size = ggml_nbytes(src) }, + .size = ggml_nbytes(src) }, { .binding = 1, .buffer = ggml_backend_webgpu_tensor_buf(idx), .offset = ggml_backend_webgpu_tensor_offset(idx), - .size = ggml_nbytes(idx) }, + .size = ggml_nbytes(idx) }, { .binding = 2, .buffer = ggml_backend_webgpu_tensor_buf(dst), .offset = ggml_backend_webgpu_tensor_offset(dst), - .size = ggml_nbytes(dst) } + .size = ggml_nbytes(dst) }, + { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() } }; size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; - uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size; + uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size; + + std::lock_guard lock(ctx->mutex); + ctx->staged_set_row_error_bufs.push_back(error_bufs); + ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x); - ggml_backend_webgpu_submit_queue(ctx); } static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { @@ -872,7 +915,8 @@ static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) { std::vector constants(1); constants[0].key = "wg_size"; constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX; - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", constants); + ggml_webgpu_create_pipeline( + webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", constants); } static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) { @@ -931,7 +975,16 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); // Create buffer pool for shader parameters - webgpu_ctx->param_buf_pool.init(webgpu_ctx->device); + webgpu_ctx->param_buf_pool.init(webgpu_ctx->device, + WEBGPU_NUM_PARAM_BUFS, + WEBGPU_PARAMS_BUF_SIZE_BYTES, + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, + wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); + webgpu_ctx->set_rows_error_buf_pool.init(webgpu_ctx->device, + WEBGPU_NUM_SET_ROWS_ERROR_BUFS, + WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES, + wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage, + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); ggml_webgpu_init_memset_pipeline(webgpu_ctx); ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl index 9b8c2634e40cb..4bd6f94a2351f 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl @@ -9,6 +9,9 @@ var idx: array; @group(0) @binding(2) var dst: array; +@group(0) @binding(3) +var error: atomic; + struct Params { offset_src: u32, // in elements offset_idx: u32, // in elements @@ -38,7 +41,7 @@ struct Params { idx2: u32, }; -@group(0) @binding(3) +@group(0) @binding(4) var params: Params; override wg_size: u32; @@ -64,6 +67,12 @@ fn main(@builtin(global_invocation_id) gid: vec3) { let idx_high_val = idx[idx_high]; let idx_low_val = idx[idx_high + 1]; + if (idx_low_val != 0) { + // Upper bits of index are not zero, output will be incorrect + atomicStore(&error, 1); + return; + } + let i_dst_row = params.offset_dst + idx_high_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; From 4ad09861234eb0adba01490335d819f555847fb9 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 6 Aug 2025 12:25:55 -0700 Subject: [PATCH 04/15] Remove extra comments --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 1 - tests/test-backend-ops.cpp | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 1a223ed939076..ba1addc8d9f29 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -499,7 +499,6 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t return; } - // allocate error bufs webgpu_pool_bufs error_bufs = ctx->set_rows_error_buf_pool.alloc_bufs(); if (error_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { error_bufs.host_buf.Unmap(); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 5b9dc7f59b1f1..d29779cd12b22 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1212,8 +1212,7 @@ struct test_case { double err = nmse(f1.data(), f2.data(), f1.size()); if (err > ud->max_err) { - //printf("Backends %s and %s mismatch: ", bn1, bn2); - //printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err); + printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err); //for (int i = 0; i < (int) f1.size(); i++) { // printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]); //} From 6355137c76a15dfc979249c7e2af337084f1ada5 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Thu, 7 Aug 2025 13:48:09 -0700 Subject: [PATCH 05/15] Work on templating for different types in shaders --- ggml/src/ggml-webgpu/CMakeLists.txt | 4 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 82 +++++++++++++++---- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 69 +++++++++++----- .../{mul_mat.wgsl => mul_mat.tmpl.wgsl} | 35 +++++++- 4 files changed, 148 insertions(+), 42 deletions(-) rename ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.wgsl => mul_mat.tmpl.wgsl} (68%) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 79ef68b85a477..78a985a4d167a 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -20,8 +20,8 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E make_directory ${SHADER_OUTPUT_DIR} COMMAND ${CMAKE_COMMAND} -E env PYTHONIOENCODING=utf-8 ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py - --input "${SHADER_DIR}" - --output "${SHADER_HEADER}" + --input_dir "${SHADER_DIR}" + --output_file "${SHADER_HEADER}" DEPENDS ${WGSL_SHADER_FILES} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py VERBATIM ) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index ba1addc8d9f29..c9d5164674226 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -50,6 +50,13 @@ static uint64_t webgpu_tensor_offset(const ggml_tensor * tensor) { /* Struct definitions */ +struct webgpu_pipeline_info { + std::string name; + const char * shader_code; + ggml_type src0_type; + ggml_type src1_type; +}; + // Forward reference static void ggml_webgpu_create_buffer(wgpu::Device & device, wgpu::Buffer & buffer, @@ -124,7 +131,8 @@ struct webgpu_context_struct { webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline; + // [src0 0=fp32,1=fp16][src1 0=fp32,1=fp16] + wgpu::ComputePipeline mul_mat_pipeline[2][2]; wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; @@ -227,6 +235,15 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** End WebGPU object initializations */ +/** Utility Functions */ + +size_t ggml_webgpu_binding_size(ggml_tensor * t, size_t misalignment) { + return (ggml_nbytes(t) + misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & + ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); +} + +/** End Utility Functions */ + /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work @@ -479,13 +496,11 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor { .binding = 0, .buffer = ggml_backend_webgpu_tensor_buf(src), .offset = src_offset, - .size = (ggml_nbytes(src) + src_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & - ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) }, + .size = ggml_webgpu_binding_size(src, src_misalignment) }, { .binding = 1, .buffer = ggml_backend_webgpu_tensor_buf(dst), .offset = dst_offset, - .size = (ggml_nbytes(dst) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & - ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) } + .size = ggml_webgpu_binding_size(dst, dst_misalignment) } }; size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; @@ -542,15 +557,15 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t { .binding = 0, .buffer = ggml_backend_webgpu_tensor_buf(src), .offset = ggml_backend_webgpu_tensor_offset(src), - .size = ggml_nbytes(src) }, + .size = ggml_webgpu_binding_size(src, src_misalignment) }, { .binding = 1, .buffer = ggml_backend_webgpu_tensor_buf(idx), .offset = ggml_backend_webgpu_tensor_offset(idx), - .size = ggml_nbytes(idx) }, + .size = ggml_webgpu_binding_size(idx, idx_misalignment) }, { .binding = 2, .buffer = ggml_backend_webgpu_tensor_buf(dst), .offset = ggml_backend_webgpu_tensor_offset(dst), - .size = ggml_nbytes(dst) }, + .size = ggml_webgpu_binding_size(dst, dst_misalignment) }, { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() } }; @@ -564,7 +579,21 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t } static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { + size_t src0_offset = ggml_backend_webgpu_tensor_offset(src0); + size_t src0_misalignment = src0_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + // align to minimum offset alignment + src0_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + size_t src1_offset = ggml_backend_webgpu_tensor_offset(src1); + size_t src1_misalignment = src1_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + src1_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst); + size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + std::vector params = { + (uint32_t) (src0_misalignment / ggml_type_size(src0->type)), + (uint32_t) (src1_misalignment / ggml_type_size(src1->type)), + (uint32_t) (dst_misalignment / ggml_type_size(dst->type)), (uint32_t) dst->ne[1], // number of rows in result (M) (uint32_t) dst->ne[0], // number of columns in result (N) (uint32_t) src0->ne[0], // number of columns in src0/src1 (K) @@ -584,20 +613,20 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t { .binding = 0, .buffer = ggml_backend_webgpu_tensor_buf(src0), .offset = ggml_backend_webgpu_tensor_offset(src0), - .size = ggml_nbytes(src0) }, + .size = ggml_webgpu_binding_size(src0, src0_misalignment) }, { .binding = 1, .buffer = ggml_backend_webgpu_tensor_buf(src1), .offset = ggml_backend_webgpu_tensor_offset(src1), - .size = ggml_nbytes(src1) }, + .size = ggml_webgpu_binding_size(src1, src1_misalignment) }, { .binding = 2, .buffer = ggml_backend_webgpu_tensor_buf(dst), .offset = ggml_backend_webgpu_tensor_offset(dst), - .size = ggml_nbytes(dst) } + .size = ggml_webgpu_binding_size(dst, dst_misalignment) } }; uint32_t wg_x = (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE; - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline, params, entries, wg_x); + ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); } // Returns true if node has enqueued work into the queue, false otherwise @@ -907,7 +936,31 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); + webgpu_pipeline_info pipeline_infos[4] = { + { .name = "mul_mat_f32_f32", + .shader_code = wgsl_mul_mat_f32_f32, + .src0_type = GGML_TYPE_F32, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_f16_f16", + .shader_code = wgsl_mul_mat_f16_f16, + .src0_type = GGML_TYPE_F16, + .src1_type = GGML_TYPE_F16 }, + { .name = "mul_mat_f32_f16", + .shader_code = wgsl_mul_mat_f32_f16, + .src0_type = GGML_TYPE_F32, + .src1_type = GGML_TYPE_F16 }, + { .name = "mul_mat_f16_f32", + .shader_code = wgsl_mul_mat_f16_f32, + .src0_type = GGML_TYPE_F16, + .src1_type = GGML_TYPE_F32 } + }; + + for (auto & pipeline_info : pipeline_infos) { + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[pipeline_info.src0_type][pipeline_info.src1_type], + pipeline_info.shader_code, + pipeline_info.name.data()); + } } static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) { @@ -1056,7 +1109,8 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_CPY | GGML_OP_SET_ROWS: return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_MUL_MAT: - return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16); default: return false; } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py index 962dcd6b170ed..41933af61e522 100755 --- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -1,35 +1,60 @@ import os +import re +import ast import argparse - -def escape_triple_quotes(wgsl): - # Simple defense in case of embedded """ - return wgsl.replace('"""', '\\"""') - - -def to_cpp_string_literal(varname, content): - return f'const char* wgsl_{varname} = R"({content})";\n' - +variants_regex = re.compile(r'//\s*Variants:\s*\n(\[.*?\])', re.DOTALL) + +def remove_variants_block(template_text): + return re.sub(variants_regex, '', template_text) + +def extract_variants(template_text): + match = re.search(variants_regex, template_text) + if not match: + return None + return ast.literal_eval(match.group(1)) + +def write_shader(shader_name, shader_code, output_dir, outfile): + if output_dir: + wgsl_filename = os.path.join(output_dir, f"{shader_name}.wgsl") + with open(wgsl_filename, 'w', encoding='utf-8') as f_out: + f_out.write(shader_code) + outfile.write(f'const char* wgsl_{shader_name} = R"({shader_code})";\n') + outfile.write('\n') + +def generate_variants(shader_path, output_dir, outfile): + shader_base_name = shader_path.split("/")[-1].split(".")[0] + with open(shader_path, 'r', encoding='utf-8') as f: + shader_code = f.read() + variants = extract_variants(shader_code) + shader_code = remove_variants_block(shader_code) + if not variants: + write_shader(shader_base_name, shader_code, output_dir, outfile) + else: + for variant in variants: + shader_variant = shader_code + parts = [] + for key, val in variant.items(): + parts.append(val) + shader_variant = shader_variant.replace(key, val) + output_name = f"{shader_base_name}_" + "_".join(parts) + write_shader(output_name, shader_variant, output_dir, outfile) def main(): parser = argparse.ArgumentParser() - parser.add_argument('--input', required=True) - parser.add_argument('--output', required=True) + parser.add_argument('--input_dir', required=True) + parser.add_argument('--output_file', required=True) + parser.add_argument('--output_dir') args = parser.parse_args() - - with open(args.output, 'w', encoding='utf-8') as out: + if args.output_dir: + os.makedirs(args.output_dir, exist_ok=True) + with open(args.output_file, 'w', encoding='utf-8') as out: out.write("// Auto-generated shader embedding \n\n") - for fname in sorted(os.listdir(args.input)): + for fname in sorted(os.listdir(args.input_dir)): if not fname.endswith('.wgsl'): continue - shader_path = os.path.join(args.input, fname) - varname = os.path.splitext(fname)[0] - with open(shader_path, 'r', encoding='utf-8') as f: - content = f.read() - content = escape_triple_quotes(content) - out.write(to_cpp_string_literal(varname, content)) - out.write('\n') - + shader_path = os.path.join(args.input_dir, fname) + generate_variants(shader_path, args.output_dir, out) if __name__ == '__main__': main() diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl similarity index 68% rename from ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl rename to ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 054aab566f96b..2ffda79e01604 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -1,4 +1,30 @@ +// Variants: +[ + { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32" + }, + { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16" + }, + { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32" + }, + { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f16" + } +] + +// Shader Template: +enable f16; + struct MulMatParams { + offset_src0: u32, // in elements + offset_src1: u32, // in elements + offset_dst: u32, // in elements m: u32, n: u32, k: u32, @@ -16,8 +42,8 @@ struct MulMatParams { broadcast3: u32 }; -@group(0) @binding(0) var src0: array; // N rows, K columns -@group(0) @binding(1) var src1: array; // M rows, K columns (transposed) +@group(0) @binding(0) var src0: array; // N rows, K columns +@group(0) @binding(1) var src1: array; // M rows, K columns (transposed) @group(0) @binding(2) var dst: array; // M rows, N columns @group(0) @binding(3) var params: MulMatParams; @@ -50,7 +76,8 @@ fn main(@builtin(global_invocation_id) global_id: vec3) { for (var i: u32 = 0u; i < params.k; i = i + 1u) { let src0_idx = src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01 + i; let src1_idx = src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11 + i; - sum = sum + src0[src0_idx] * src1[src1_idx]; + sum = sum + f32(src0[params.offset_src0 + src0_idx]) * f32(src1[params.offset_src1 + src1_idx]); } - dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; + dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; + } From 831ea3c372873766f89c20a7f038f16b98df2bbe Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Thu, 7 Aug 2025 15:11:16 -0700 Subject: [PATCH 06/15] Work on shader type generation --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 141 +++++++++++---------------- 1 file changed, 58 insertions(+), 83 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index c9d5164674226..af932b1d4fdd3 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -235,15 +235,6 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** End WebGPU object initializations */ -/** Utility Functions */ - -size_t ggml_webgpu_binding_size(ggml_tensor * t, size_t misalignment) { - return (ggml_nbytes(t) + misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & - ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); -} - -/** End Utility Functions */ - /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work @@ -438,15 +429,6 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, true); } -static size_t ggml_backend_webgpu_tensor_offset(const ggml_tensor * tensor) { - return webgpu_tensor_offset(tensor) + tensor->view_offs; -} - -static wgpu::Buffer ggml_backend_webgpu_tensor_buf(const ggml_tensor * tensor) { - ggml_backend_webgpu_buffer_context * ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context; - return ctx->buffer; -} - /** End WebGPU Actions */ /** GGML Backend Interface */ @@ -464,19 +446,35 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { GGML_UNUSED(ctx); } +static size_t ggml_webgpu_tensor_offset(const ggml_tensor * tensor) { + return webgpu_tensor_offset(tensor) + tensor->view_offs; +} + +static wgpu::Buffer ggml_webgpu_tensor_buf(const ggml_tensor * tensor) { + ggml_backend_webgpu_buffer_context * ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context; + return ctx->buffer; +} +static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, ggml_tensor * t) { + size_t offset = ggml_webgpu_tensor_offset(t); + return offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); +} + +static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, ggml_tensor * t) { + size_t offset = ggml_webgpu_tensor_offset(t); + return offset & ~(ctx->limits.minStorageBufferOffsetAlignment - 1); +} + +static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor * t) { + return (ggml_nbytes(t) + ggml_webgpu_tensor_misalignment(ctx, t) + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & + ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); +} + static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { - size_t src_offset = ggml_backend_webgpu_tensor_offset(src); - // assumes power of 2 offset alignment - size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - // align to minimum offset alignment - src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst); - size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - uint32_t ne = (uint32_t) ggml_nelements(dst); + uint32_t ne = (uint32_t) ggml_nelements(dst); + std::vector params = { ne, - (uint32_t) (src_misalignment / ggml_type_size(src->type)), - (uint32_t) (dst_misalignment / ggml_type_size(dst->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), // Convert byte-strides to element-strides (uint32_t) (src->nb[0] / ggml_type_size(src->type)), (uint32_t) (src->nb[1] / ggml_type_size(src->type)), @@ -494,13 +492,13 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor std::vector entries = { { .binding = 0, - .buffer = ggml_backend_webgpu_tensor_buf(src), - .offset = src_offset, - .size = ggml_webgpu_binding_size(src, src_misalignment) }, + .buffer = ggml_webgpu_tensor_buf(src), + .offset = ggml_webgpu_tensor_align_offset(ctx, src), + .size = ggml_webgpu_tensor_binding_size(ctx, src) }, { .binding = 1, - .buffer = ggml_backend_webgpu_tensor_buf(dst), - .offset = dst_offset, - .size = ggml_webgpu_binding_size(dst, dst_misalignment) } + .buffer = ggml_webgpu_tensor_buf(dst), + .offset = ggml_webgpu_tensor_align_offset(ctx, dst), + .size = ggml_webgpu_tensor_binding_size(ctx, dst) } }; size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; @@ -519,21 +517,9 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t error_bufs.host_buf.Unmap(); } - size_t src_offset = ggml_backend_webgpu_tensor_offset(src); - // assumes power of 2 offset alignment - size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - // align to minimum offset alignment - src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - size_t idx_offset = ggml_backend_webgpu_tensor_offset(idx); - size_t idx_misalignment = idx_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - idx_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst); - size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - - std::vector params = { (uint32_t) (src_misalignment / ggml_type_size(src->type)), - (uint32_t) (idx_misalignment / ggml_type_size(idx->type)), - (uint32_t) (dst_misalignment / ggml_type_size(dst->type)), + std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), // Convert byte-strides to element-strides (uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / ggml_type_size(src->type)), @@ -555,18 +541,18 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t std::vector entries = { { .binding = 0, - .buffer = ggml_backend_webgpu_tensor_buf(src), - .offset = ggml_backend_webgpu_tensor_offset(src), - .size = ggml_webgpu_binding_size(src, src_misalignment) }, + .buffer = ggml_webgpu_tensor_buf(src), + .offset = ggml_webgpu_tensor_align_offset(ctx, src), + .size = ggml_webgpu_tensor_binding_size(ctx, src) }, { .binding = 1, - .buffer = ggml_backend_webgpu_tensor_buf(idx), - .offset = ggml_backend_webgpu_tensor_offset(idx), - .size = ggml_webgpu_binding_size(idx, idx_misalignment) }, + .buffer = ggml_webgpu_tensor_buf(idx), + .offset = ggml_webgpu_tensor_align_offset(ctx, idx), + .size = ggml_webgpu_tensor_binding_size(ctx, idx) }, { .binding = 2, - .buffer = ggml_backend_webgpu_tensor_buf(dst), - .offset = ggml_backend_webgpu_tensor_offset(dst), - .size = ggml_webgpu_binding_size(dst, dst_misalignment) }, - { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() } + .buffer = ggml_webgpu_tensor_buf(dst), + .offset = ggml_webgpu_tensor_align_offset(ctx, dst), + .size = ggml_webgpu_tensor_binding_size(ctx, dst) }, + { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() } }; size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; @@ -579,21 +565,10 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t } static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { - size_t src0_offset = ggml_backend_webgpu_tensor_offset(src0); - size_t src0_misalignment = src0_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - // align to minimum offset alignment - src0_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - size_t src1_offset = ggml_backend_webgpu_tensor_offset(src1); - size_t src1_misalignment = src1_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - src1_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst); - size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - std::vector params = { - (uint32_t) (src0_misalignment / ggml_type_size(src0->type)), - (uint32_t) (src1_misalignment / ggml_type_size(src1->type)), - (uint32_t) (dst_misalignment / ggml_type_size(dst->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), (uint32_t) dst->ne[1], // number of rows in result (M) (uint32_t) dst->ne[0], // number of columns in result (N) (uint32_t) src0->ne[0], // number of columns in src0/src1 (K) @@ -611,17 +586,17 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t std::vector entries = { { .binding = 0, - .buffer = ggml_backend_webgpu_tensor_buf(src0), - .offset = ggml_backend_webgpu_tensor_offset(src0), - .size = ggml_webgpu_binding_size(src0, src0_misalignment) }, + .buffer = ggml_webgpu_tensor_buf(src0), + .offset = ggml_webgpu_tensor_align_offset(ctx, src0), + .size = ggml_webgpu_tensor_binding_size(ctx, src0) }, { .binding = 1, - .buffer = ggml_backend_webgpu_tensor_buf(src1), - .offset = ggml_backend_webgpu_tensor_offset(src1), - .size = ggml_webgpu_binding_size(src1, src1_misalignment) }, + .buffer = ggml_webgpu_tensor_buf(src1), + .offset = ggml_webgpu_tensor_align_offset(ctx, src1), + .size = ggml_webgpu_tensor_binding_size(ctx, src1) }, { .binding = 2, - .buffer = ggml_backend_webgpu_tensor_buf(dst), - .offset = ggml_backend_webgpu_tensor_offset(dst), - .size = ggml_webgpu_binding_size(dst, dst_misalignment) } + .buffer = ggml_webgpu_tensor_buf(dst), + .offset = ggml_webgpu_tensor_align_offset(ctx, dst), + .size = ggml_webgpu_tensor_binding_size(ctx, dst) } }; uint32_t wg_x = From 688b51dbbcd70a731f74a9dab5d9906a45ec7b72 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 11 Aug 2025 16:39:12 -0700 Subject: [PATCH 07/15] Working q4_0 mul_mat and some templating for different types --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 19 ++-- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 83 +++++++++------- .../wgsl-shaders/mul_mat.tmpl.wgsl | 96 ++++++++++++++++--- 3 files changed, 142 insertions(+), 56 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index af932b1d4fdd3..bd9c5b4ede3a1 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -131,8 +131,7 @@ struct webgpu_context_struct { webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; - // [src0 0=fp32,1=fp16][src1 0=fp32,1=fp16] - wgpu::ComputePipeline mul_mat_pipeline[2][2]; + wgpu::ComputePipeline mul_mat_pipeline[3][2]; wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; @@ -339,13 +338,14 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, // To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and // debug statements in the shader, and then call this function after encoding the commands and submitting them. static void ggml_backend_webgpu_debug(webgpu_context & ctx) { + ggml_backend_webgpu_submit_queue(ctx); wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize()); wgpu::CommandBuffer commands = encoder.Finish(); ctx->queue.Submit(1, &commands); ggml_backend_webgpu_map_buffer(ctx, ctx->debug_host_buf, wgpu::MapMode::Read, 0, ctx->debug_host_buf.GetSize()); - const uint32_t * debug_data = (const uint32_t *) ctx->debug_host_buf.GetConstMappedRange(); + const float * debug_data = (const float *) ctx->debug_host_buf.GetConstMappedRange(); std::cout << "debug data:"; for (size_t i = 0; i < WEBGPU_DEBUG_BUF_ELEMS; i++) { std::cout << " " << i << ": " << debug_data[i]; @@ -911,7 +911,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - webgpu_pipeline_info pipeline_infos[4] = { + webgpu_pipeline_info pipeline_infos[5] = { { .name = "mul_mat_f32_f32", .shader_code = wgsl_mul_mat_f32_f32, .src0_type = GGML_TYPE_F32, @@ -927,7 +927,11 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { { .name = "mul_mat_f16_f32", .shader_code = wgsl_mul_mat_f16_f32, .src0_type = GGML_TYPE_F16, - .src1_type = GGML_TYPE_F32 } + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q4_0_f32", + .shader_code = wgsl_mul_mat_q4_0_f32, + .src0_type = GGML_TYPE_Q4_0, + .src1_type = GGML_TYPE_F32 } }; for (auto & pipeline_info : pipeline_infos) { @@ -1084,8 +1088,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_CPY | GGML_OP_SET_ROWS: return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_MUL_MAT: - return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && - (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16); + return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)) || + (op->src[0]->type == GGML_TYPE_Q4_0 && op->src[1]->type == GGML_TYPE_F32); default: return false; } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py index 41933af61e522..678b1f91e90b1 100755 --- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -3,58 +3,73 @@ import ast import argparse -variants_regex = re.compile(r'//\s*Variants:\s*\n(\[.*?\])', re.DOTALL) +def extract_block(text, name): + pattern = rf'#define\({name}\)\s*(.*?)#end\({name}\)' + match = re.search(pattern, text, re.DOTALL) + if not match: + raise ValueError(f"Missing block: {name}") + return match.group(1).strip() -def remove_variants_block(template_text): - return re.sub(variants_regex, '', template_text) +def parse_decls(decls_text): + decls = {} + for name, code in re.findall(r'#decl\((.*?)\)\s*(.*?)#enddecl\(\1\)', decls_text, re.DOTALL): + decls[name.strip()] = code.strip() + return decls -def extract_variants(template_text): - match = re.search(variants_regex, template_text) - if not match: - return None - return ast.literal_eval(match.group(1)) +def replace_placeholders(shader_text, replacements): + for key, val in replacements.items(): + pattern = rf'\b{re.escape(key)}\b' + shader_text = re.sub(pattern, str(val), shader_text) + return shader_text def write_shader(shader_name, shader_code, output_dir, outfile): if output_dir: wgsl_filename = os.path.join(output_dir, f"{shader_name}.wgsl") - with open(wgsl_filename, 'w', encoding='utf-8') as f_out: + with open(wgsl_filename, "w", encoding="utf-8") as f_out: f_out.write(shader_code) - outfile.write(f'const char* wgsl_{shader_name} = R"({shader_code})";\n') - outfile.write('\n') + outfile.write(f'const char* wgsl_{shader_name} = R"({shader_code})";\n\n') def generate_variants(shader_path, output_dir, outfile): shader_base_name = shader_path.split("/")[-1].split(".")[0] - with open(shader_path, 'r', encoding='utf-8') as f: - shader_code = f.read() - variants = extract_variants(shader_code) - shader_code = remove_variants_block(shader_code) - if not variants: - write_shader(shader_base_name, shader_code, output_dir, outfile) + + with open(shader_path, "r", encoding="utf-8") as f: + text = f.read() + + try: + variants = ast.literal_eval(extract_block(text, "VARIANTS")) + except ValueError: + write_shader(shader_base_name, text, output_dir, outfile) else: + decls_map = parse_decls(extract_block(text, "DECLS")) + shader_template = extract_block(text, "SHADER") + for variant in variants: - shader_variant = shader_code - parts = [] - for key, val in variant.items(): - parts.append(val) - shader_variant = shader_variant.replace(key, val) - output_name = f"{shader_base_name}_" + "_".join(parts) - write_shader(output_name, shader_variant, output_dir, outfile) + decls_key = variant["DECLS"] + if decls_key not in decls_map: + raise ValueError(f"DECLS key '{decls_key}' not found.") + decls_code = decls_map[decls_key] + "\n\n" + + shader_variant = replace_placeholders(shader_template, variant["REPLS"]) + final_shader = re.sub(rf'\bDECLS\b', decls_code, shader_variant) + + output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]]) + write_shader(output_name, final_shader, output_dir, outfile) def main(): parser = argparse.ArgumentParser() - parser.add_argument('--input_dir', required=True) - parser.add_argument('--output_file', required=True) - parser.add_argument('--output_dir') + parser.add_argument("--input_dir", required=True) + parser.add_argument("--output_file", required=True) + parser.add_argument("--output_dir") args = parser.parse_args() + if args.output_dir: os.makedirs(args.output_dir, exist_ok=True) - with open(args.output_file, 'w', encoding='utf-8') as out: - out.write("// Auto-generated shader embedding \n\n") + + with open(args.output_file, "w", encoding="utf-8") as out: + out.write("// Auto-generated shader embedding\n\n") for fname in sorted(os.listdir(args.input_dir)): - if not fname.endswith('.wgsl'): - continue - shader_path = os.path.join(args.input_dir, fname) - generate_variants(shader_path, args.output_dir, out) + if fname.endswith(".wgsl"): + generate_variants(os.path.join(args.input_dir, fname), args.output_dir, out) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 2ffda79e01604..b39d3a5537e9d 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -1,26 +1,90 @@ -// Variants: +#define(VARIANTS) + [ { - "SRC0_TYPE" : "f32", - "SRC1_TYPE" : "f32" + "REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32", + "BLOCK_SIZE" : 1 + }, + "DECLS" : "FLOAT" + }, + { + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16", + "BLOCK_SIZE" : 1 + }, + "DECLS" : "FLOAT" }, { - "SRC0_TYPE" : "f16", - "SRC1_TYPE" : "f16" + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "BLOCK_SIZE" : 1 + }, + "DECLS" : "FLOAT" }, { - "SRC0_TYPE" : "f16", - "SRC1_TYPE" : "f32" + "REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f16", + "BLOCK_SIZE" : 1 + }, + "DECLS" : "FLOAT" }, { - "SRC0_TYPE" : "f32", - "SRC1_TYPE" : "f16" + "REPLS": { + "SRC0_TYPE": "q4_0", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q4_0" } ] -// Shader Template: +#end(VARIANTS) + +#define(DECLS) + +#decl(FLOAT) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + return f32(src0[src0_idx_base + offset]) * f32(src1[src1_idx_base + offset]); +} +#enddecl(FLOAT) + +#decl(Q4_0) +struct q4_0 { + d: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q4_0: q4_0 = src0[src0_idx_base + offset]; + let d = f32(block_q4_0.d); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 4; j++) { + let q_packed = bitcast(vec2(block_q4_0.qs[2 * j], block_q4_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = (q_packed >> (k * 8)) & 0xFF; + let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d; + let q_lo = (f32(q_byte & 0xF) - 8.0f) * d; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * src1[src1_offset]; + sum += q_hi * src1[src1_offset + 16]; + } + } + return sum; +} +#enddecl(Q4_0) +#end(DECLS) + +#define(SHADER) + enable f16; +DECLS + struct MulMatParams { offset_src0: u32, // in elements offset_src1: u32, // in elements @@ -72,12 +136,14 @@ fn main(@builtin(global_invocation_id) global_id: vec3) { let row = dst2_rem / params.n; // output row let col = dst2_rem % params.n; // output column + let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01; + let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11; + var sum = 0.0; - for (var i: u32 = 0u; i < params.k; i = i + 1u) { - let src0_idx = src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01 + i; - let src1_idx = src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11 + i; - sum = sum + f32(src0[params.offset_src0 + src0_idx]) * f32(src1[params.offset_src1 + src1_idx]); + for (var i: u32 = 0u; i < params.k/BLOCK_SIZE; i = i + 1u) { + sum += multiply_add(src0_idx_base, src1_idx_base, i); } dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; - } + +#end(SHADER) From 1aa40f1a5dd9d3320d6721f58a11de8b90a3b7e4 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 11 Aug 2025 20:27:12 -0700 Subject: [PATCH 08/15] Add q4_0_f16 matmul and fix device init --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 163 +++++++++--------- .../wgsl-shaders/mul_mat.tmpl.wgsl | 13 +- 2 files changed, 91 insertions(+), 85 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index bd9c5b4ede3a1..eb5bc580bfb18 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -125,8 +125,6 @@ struct webgpu_context_struct { std::recursive_mutex mutex; - bool device_init = false; - webgpu_buf_pool param_buf_pool; webgpu_buf_pool set_rows_error_buf_pool; @@ -454,6 +452,7 @@ static wgpu::Buffer ggml_webgpu_tensor_buf(const ggml_tensor * tensor) { ggml_backend_webgpu_buffer_context * ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context; return ctx->buffer; } + static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, ggml_tensor * t) { size_t offset = ggml_webgpu_tensor_offset(t); return offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); @@ -911,7 +910,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - webgpu_pipeline_info pipeline_infos[5] = { + webgpu_pipeline_info pipeline_infos[6] = { { .name = "mul_mat_f32_f32", .shader_code = wgsl_mul_mat_f32_f32, .src0_type = GGML_TYPE_F32, @@ -928,10 +927,14 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { .shader_code = wgsl_mul_mat_f16_f32, .src0_type = GGML_TYPE_F16, .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q4_0_f32", - .shader_code = wgsl_mul_mat_q4_0_f32, - .src0_type = GGML_TYPE_Q4_0, - .src1_type = GGML_TYPE_F32 } + { .name = "mul_mat_q4_0_f32", + .shader_code = wgsl_mul_mat_q4_0_f32, + .src0_type = GGML_TYPE_Q4_0, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q4_0_f16", + .shader_code = wgsl_mul_mat_q4_0_f16, + .src0_type = GGML_TYPE_Q4_0, + .src1_type = GGML_TYPE_F16 } }; for (auto & pipeline_info : pipeline_infos) { @@ -965,79 +968,6 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; - // Multiple threads may try to initialize the device - std::lock_guard lock(webgpu_ctx->mutex); - if (!webgpu_ctx->device_init) { - // Initialize device - std::vector required_features = { wgpu::FeatureName::ShaderF16, - wgpu::FeatureName::ImplicitDeviceSynchronization }; - wgpu::DeviceDescriptor dev_desc; - dev_desc.requiredLimits = &webgpu_ctx->limits; - dev_desc.requiredFeatures = required_features.data(); - dev_desc.requiredFeatureCount = required_features.size(); - dev_desc.SetDeviceLostCallback( - wgpu::CallbackMode::AllowSpontaneous, - [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR( - "ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - dev_desc.SetUncapturedErrorCallback( - [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR( - "ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - webgpu_ctx->instance.WaitAny( - webgpu_ctx->adapter.RequestDevice( - &dev_desc, - wgpu::CallbackMode::AllowSpontaneous, - [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { - if (status != wgpu::RequestDeviceStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); - return; - } - webgpu_ctx->device = std::move(device); - }), - UINT64_MAX); - GGML_ASSERT(webgpu_ctx->device != nullptr); - - // Initialize (compute) queue - webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); - - // Create buffer pool for shader parameters - webgpu_ctx->param_buf_pool.init(webgpu_ctx->device, - WEBGPU_NUM_PARAM_BUFS, - WEBGPU_PARAMS_BUF_SIZE_BYTES, - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, - wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); - webgpu_ctx->set_rows_error_buf_pool.init(webgpu_ctx->device, - WEBGPU_NUM_SET_ROWS_ERROR_BUFS, - WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES, - wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage, - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); - - ggml_webgpu_init_memset_pipeline(webgpu_ctx); - ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); - ggml_webgpu_init_set_rows_pipeline(webgpu_ctx); - ggml_webgpu_init_cpy_pipeline(webgpu_ctx); - -#ifdef GGML_WEBGPU_DEBUG - // Initialize debug buffers - ggml_webgpu_create_buffer(webgpu_ctx->device, - webgpu_ctx->debug_host_buf, - WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, - "debug_host_buf"); - ggml_webgpu_create_buffer(webgpu_ctx->device, - webgpu_ctx->debug_dev_buf, - WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), - wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, - "debug_dev_buf"); -#endif - webgpu_ctx->device_init = true; - } - static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; backend_ctx.webgpu_ctx = webgpu_ctx; @@ -1088,9 +1018,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_CPY | GGML_OP_SET_ROWS: return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_MUL_MAT: - return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && - (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)) || - (op->src[0]->type == GGML_TYPE_Q4_0 && op->src[1]->type == GGML_TYPE_F32); + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16 || + op->src[0]->type == GGML_TYPE_Q4_0) && + (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16); default: return false; } @@ -1157,6 +1087,73 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t wgpu::AdapterInfo info{}; ctx->adapter.GetInfo(&info); + // Initialize device + std::vector required_features = { wgpu::FeatureName::ShaderF16, + wgpu::FeatureName::ImplicitDeviceSynchronization }; + wgpu::DeviceDescriptor dev_desc; + dev_desc.requiredLimits = &ctx->limits; + dev_desc.requiredFeatures = required_features.data(); + dev_desc.requiredFeatureCount = required_features.size(); + dev_desc.SetDeviceLostCallback( + wgpu::CallbackMode::AllowSpontaneous, + [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR( + "ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + dev_desc.SetUncapturedErrorCallback( + [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR( + "ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + ctx->instance.WaitAny(ctx->adapter.RequestDevice( + &dev_desc, + wgpu::CallbackMode::AllowSpontaneous, + [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { + if (status != wgpu::RequestDeviceStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); + return; + } + ctx->device = std::move(device); + }), + UINT64_MAX); + GGML_ASSERT(ctx->device != nullptr); + + // Initialize (compute) queue + ctx->queue = ctx->device.GetQueue(); + + // Create buffer pool for shader parameters + ctx->param_buf_pool.init(ctx->device, + WEBGPU_NUM_PARAM_BUFS, + WEBGPU_PARAMS_BUF_SIZE_BYTES, + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, + wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); + ctx->set_rows_error_buf_pool.init(ctx->device, + WEBGPU_NUM_SET_ROWS_ERROR_BUFS, + WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES, + wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage, + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); + + ggml_webgpu_init_memset_pipeline(ctx); + ggml_webgpu_init_mul_mat_pipeline(ctx); + ggml_webgpu_init_set_rows_pipeline(ctx); + ggml_webgpu_init_cpy_pipeline(ctx); + +#ifdef GGML_WEBGPU_DEBUG + // Initialize debug buffers + ggml_webgpu_create_buffer(ctx->device, + ctx->debug_host_buf, + WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, + "debug_host_buf"); + ggml_webgpu_create_buffer(ctx->device, + ctx->debug_dev_buf, + WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), + wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, + "debug_dev_buf"); +#endif + static ggml_backend_webgpu_device_context device_ctx; device_ctx.webgpu_ctx = ctx; device_ctx.device_name = GGML_WEBGPU_NAME; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index b39d3a5537e9d..64d462efa1e25 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -40,7 +40,16 @@ "BLOCK_SIZE": 32 }, "DECLS": "Q4_0" + }, + { + "REPLS": { + "SRC0_TYPE": "q4_0", + "SRC1_TYPE": "f16", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q4_0" } + ] #end(VARIANTS) @@ -70,8 +79,8 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d; let q_lo = (f32(q_byte & 0xF) - 8.0f) * d; let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; - sum += q_lo * src1[src1_offset]; - sum += q_hi * src1[src1_offset + 16]; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); } } return sum; From c3611f9d0a4d2392ca16adc6022803fb00c34aec Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 12 Aug 2025 17:40:34 -0700 Subject: [PATCH 09/15] Add matmul support for basic quantization types --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 71 +++++- .../wgsl-shaders/mul_mat.tmpl.wgsl | 218 +++++++++++++++++- 2 files changed, 281 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index eb5bc580bfb18..6cf07e8937125 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -129,7 +129,7 @@ struct webgpu_context_struct { webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline[3][2]; + wgpu::ComputePipeline mul_mat_pipeline[10][2]; wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; @@ -910,7 +910,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - webgpu_pipeline_info pipeline_infos[6] = { + webgpu_pipeline_info pipeline_infos[16] = { { .name = "mul_mat_f32_f32", .shader_code = wgsl_mul_mat_f32_f32, .src0_type = GGML_TYPE_F32, @@ -934,6 +934,46 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { { .name = "mul_mat_q4_0_f16", .shader_code = wgsl_mul_mat_q4_0_f16, .src0_type = GGML_TYPE_Q4_0, + .src1_type = GGML_TYPE_F16 }, + { .name = "mul_mat_q4_1_f32", + .shader_code = wgsl_mul_mat_q4_1_f32, + .src0_type = GGML_TYPE_Q4_1, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q4_1_f16", + .shader_code = wgsl_mul_mat_q4_1_f16, + .src0_type = GGML_TYPE_Q4_1, + .src1_type = GGML_TYPE_F16 }, + { .name = "mul_mat_q5_0_f32", + .shader_code = wgsl_mul_mat_q5_0_f32, + .src0_type = GGML_TYPE_Q5_0, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q5_0_f16", + .shader_code = wgsl_mul_mat_q5_0_f16, + .src0_type = GGML_TYPE_Q5_0, + .src1_type = GGML_TYPE_F16 }, + { .name = "mul_mat_q5_1_f32", + .shader_code = wgsl_mul_mat_q5_1_f32, + .src0_type = GGML_TYPE_Q5_1, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q5_1_f16", + .shader_code = wgsl_mul_mat_q5_1_f16, + .src0_type = GGML_TYPE_Q5_1, + .src1_type = GGML_TYPE_F16 }, + { .name = "mul_mat_q8_0_f32", + .shader_code = wgsl_mul_mat_q8_0_f32, + .src0_type = GGML_TYPE_Q8_0, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q8_0_f16", + .shader_code = wgsl_mul_mat_q8_0_f16, + .src0_type = GGML_TYPE_Q8_0, + .src1_type = GGML_TYPE_F16 }, + { .name = "mul_mat_q8_1_f32", + .shader_code = wgsl_mul_mat_q8_1_f32, + .src0_type = GGML_TYPE_Q8_1, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q8_1_f16", + .shader_code = wgsl_mul_mat_q8_1_f16, + .src0_type = GGML_TYPE_Q8_1, .src1_type = GGML_TYPE_F16 } }; @@ -1015,12 +1055,31 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_VIEW: case GGML_OP_PERMUTE: return true; - case GGML_OP_CPY | GGML_OP_SET_ROWS: + case GGML_OP_CPY: + case GGML_OP_SET_ROWS: return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_MUL_MAT: - return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16 || - op->src[0]->type == GGML_TYPE_Q4_0) && - (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16); + switch(op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + break; + default: + return false; + } + switch(op->src[1]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + break; + default: + return false; + } + return true; default: return false; } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 64d462efa1e25..7b7201ed14bd2 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -48,8 +48,87 @@ "BLOCK_SIZE": 32 }, "DECLS": "Q4_0" + }, + { + "REPLS": { + "SRC0_TYPE": "q4_1", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q4_1" + }, + { + "REPLS": { + "SRC0_TYPE": "q4_1", + "SRC1_TYPE": "f16", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q4_1" + }, + { + "REPLS": { + "SRC0_TYPE": "q5_0", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q5_0" + }, + { + "REPLS": { + "SRC0_TYPE": "q5_0", + "SRC1_TYPE": "f16", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q5_0" + }, + { + "REPLS": { + "SRC0_TYPE": "q5_1", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q5_1" + }, + { + "REPLS": { + "SRC0_TYPE": "q5_1", + "SRC1_TYPE": "f16", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q5_1" + }, + { + "REPLS": { + "SRC0_TYPE": "q8_0", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q8_0" + }, + { + "REPLS": { + "SRC0_TYPE": "q8_0", + "SRC1_TYPE": "f16", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q8_0" + }, + { + "REPLS": { + "SRC0_TYPE": "q8_1", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q8_1" + }, + { + "REPLS": { + "SRC0_TYPE": "q8_1", + "SRC1_TYPE": "f16", + "BLOCK_SIZE": 32 + }, + "DECLS": "Q8_1" } - ] #end(VARIANTS) @@ -69,7 +148,7 @@ struct q4_0 { }; fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_q4_0: q4_0 = src0[src0_idx_base + offset]; + let block_q4_0 = src0[src0_idx_base + offset]; let d = f32(block_q4_0.d); var sum: f32 = 0.0; for (var j: u32 = 0; j < 4; j++) { @@ -86,6 +165,141 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { return sum; } #enddecl(Q4_0) + +#decl(Q4_1) +struct q4_1 { + d: f16, + m: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q4_1 = src0[src0_idx_base + offset]; + let d = f32(block_q4_1.d); + let m = f32(block_q4_1.m); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 4; j++) { + let q_packed = block_q4_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = (q_packed >> (k * 8)) & 0xFF; + let q_hi = f32((q_byte >> 4) & 0xF) * d + m; + let q_lo = f32(q_byte & 0xF) * d + m; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q4_1) + +#decl(Q5_0) +struct q5_0 { + d: f16, + qh: array, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q5_0 = src0[src0_idx_base + offset]; + let d = f32(block_q5_0.d); + var sum: f32 = 0.0; + let qh_packed = bitcast(vec2(block_q5_0.qh[0], block_q5_0.qh[1])); + for (var j: u32 = 0; j < 4; j++) { + let q_packed = bitcast(vec2(block_q5_0.qs[2 * j], block_q5_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = (q_packed >> (k * 8)) & 0xFF; + let qh_hi = (qh_packed >> (j * 4 + k + 12)) & 0x10; + let q_hi = (f32(((q_byte >> 4) & 0xF) | qh_hi) - 16.0) * d; + let qh_lo = ((qh_packed >> (j * 4 + k)) << 4) & 0x10; + let q_lo = (f32((q_byte & 0xF) | qh_lo) - 16.0) * d; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q5_0) + +#decl(Q5_1) +struct q5_1 { + d: f16, + m: f16, + qh: u32, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q5_1 = src0[src0_idx_base + offset]; + let d = f32(block_q5_1.d); + let m = f32(block_q5_1.m); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 4; j++) { + let q_packed = block_q5_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = (q_packed >> (k * 8)) & 0xFF; + let qh_hi = (block_q5_1.qh >> (j * 4 + k + 12)) & 0x10; + let q_hi = f32(((q_byte >> 4) & 0xF) | qh_hi) * d + m; + let qh_lo = ((block_q5_1.qh >> (j * 4 + k)) << 4) & 0x10; + let q_lo = f32((q_byte & 0xF) | qh_lo) * d + m; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q5_1) + +#decl(Q8_0) +struct q8_0 { + d: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q8_0 = src0[src0_idx_base + offset]; + let d = f32(block_q8_0.d); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 8; j++) { + let q_packed = bitcast(vec2(block_q8_0.qs[2 * j], block_q8_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = bitcast((((q_packed >> (k * 8)) & 0xFF) << 24)) >> 24; // sign-extend + let q_val = f32(q_byte) * d; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_val * f32(src1[src1_offset]); + } + } + return sum; +} +#enddecl(Q8_0) + +#decl(Q8_1) +struct q8_1 { + d: f16, + m: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q8_1 = src0[src0_idx_base + offset]; + let d = f32(block_q8_1.d); + let m = f32(block_q8_1.m); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 8; j++) { + let q_packed = block_q8_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = bitcast((((q_packed >> (k * 8)) & 0xFF) << 24)) >> 24; // sign-extend + let q_val = f32(q_byte) * d + m; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_val * f32(src1[src1_offset]); + } + } + return sum; +} +#enddecl(Q8_1) + #end(DECLS) #define(SHADER) From de4da871837087c89b8adddf748c16464e9d5de2 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 13 Aug 2025 17:16:51 -0700 Subject: [PATCH 10/15] Add q2_k and q3_k quantization --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 77 +++----- .../wgsl-shaders/mul_mat.tmpl.wgsl | 172 ++++++++++++------ 2 files changed, 144 insertions(+), 105 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 6cf07e8937125..02f48442f12b4 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -129,7 +129,7 @@ struct webgpu_context_struct { webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline[10][2]; + wgpu::ComputePipeline mul_mat_pipeline[12][2]; wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; @@ -910,7 +910,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - webgpu_pipeline_info pipeline_infos[16] = { + webgpu_pipeline_info pipeline_infos[10] = { { .name = "mul_mat_f32_f32", .shader_code = wgsl_mul_mat_f32_f32, .src0_type = GGML_TYPE_F32, @@ -919,10 +919,6 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { .shader_code = wgsl_mul_mat_f16_f16, .src0_type = GGML_TYPE_F16, .src1_type = GGML_TYPE_F16 }, - { .name = "mul_mat_f32_f16", - .shader_code = wgsl_mul_mat_f32_f16, - .src0_type = GGML_TYPE_F32, - .src1_type = GGML_TYPE_F16 }, { .name = "mul_mat_f16_f32", .shader_code = wgsl_mul_mat_f16_f32, .src0_type = GGML_TYPE_F16, @@ -931,50 +927,30 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { .shader_code = wgsl_mul_mat_q4_0_f32, .src0_type = GGML_TYPE_Q4_0, .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q4_0_f16", - .shader_code = wgsl_mul_mat_q4_0_f16, - .src0_type = GGML_TYPE_Q4_0, - .src1_type = GGML_TYPE_F16 }, { .name = "mul_mat_q4_1_f32", .shader_code = wgsl_mul_mat_q4_1_f32, .src0_type = GGML_TYPE_Q4_1, .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q4_1_f16", - .shader_code = wgsl_mul_mat_q4_1_f16, - .src0_type = GGML_TYPE_Q4_1, - .src1_type = GGML_TYPE_F16 }, { .name = "mul_mat_q5_0_f32", .shader_code = wgsl_mul_mat_q5_0_f32, .src0_type = GGML_TYPE_Q5_0, .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q5_0_f16", - .shader_code = wgsl_mul_mat_q5_0_f16, - .src0_type = GGML_TYPE_Q5_0, - .src1_type = GGML_TYPE_F16 }, { .name = "mul_mat_q5_1_f32", .shader_code = wgsl_mul_mat_q5_1_f32, .src0_type = GGML_TYPE_Q5_1, .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q5_1_f16", - .shader_code = wgsl_mul_mat_q5_1_f16, - .src0_type = GGML_TYPE_Q5_1, - .src1_type = GGML_TYPE_F16 }, { .name = "mul_mat_q8_0_f32", .shader_code = wgsl_mul_mat_q8_0_f32, .src0_type = GGML_TYPE_Q8_0, .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q8_0_f16", - .shader_code = wgsl_mul_mat_q8_0_f16, - .src0_type = GGML_TYPE_Q8_0, - .src1_type = GGML_TYPE_F16 }, - { .name = "mul_mat_q8_1_f32", - .shader_code = wgsl_mul_mat_q8_1_f32, - .src0_type = GGML_TYPE_Q8_1, + { .name = "mul_mat_q2_k_f32", + .shader_code = wgsl_mul_mat_q2_k_f32, + .src0_type = GGML_TYPE_Q2_K, .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q8_1_f16", - .shader_code = wgsl_mul_mat_q8_1_f16, - .src0_type = GGML_TYPE_Q8_1, - .src1_type = GGML_TYPE_F16 } + { .name = "mul_mat_q3_k_f32", + .shader_code = wgsl_mul_mat_q3_k_f32, + .src0_type = GGML_TYPE_Q3_K, + .src1_type = GGML_TYPE_F32 } }; for (auto & pipeline_info : pipeline_infos) { @@ -1058,28 +1034,29 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_CPY: case GGML_OP_SET_ROWS: return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_MUL_MAT: - switch(op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - break; - default: - return false; - } + case GGML_OP_MUL_MAT: { switch(op->src[1]->type) { - case GGML_TYPE_F32: case GGML_TYPE_F16: - break; + return op->src[0]->type == GGML_TYPE_F16; + case GGML_TYPE_F32: + switch(op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + return true; + default: + return false; + } default: return false; } - return true; + } default: return false; } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 7b7201ed14bd2..2c94a8fea36d1 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -25,14 +25,6 @@ }, "DECLS" : "FLOAT" }, - { - "REPLS": { - "SRC0_TYPE" : "f32", - "SRC1_TYPE" : "f16", - "BLOCK_SIZE" : 1 - }, - "DECLS" : "FLOAT" - }, { "REPLS": { "SRC0_TYPE": "q4_0", @@ -41,14 +33,6 @@ }, "DECLS": "Q4_0" }, - { - "REPLS": { - "SRC0_TYPE": "q4_0", - "SRC1_TYPE": "f16", - "BLOCK_SIZE": 32 - }, - "DECLS": "Q4_0" - }, { "REPLS": { "SRC0_TYPE": "q4_1", @@ -57,14 +41,6 @@ }, "DECLS": "Q4_1" }, - { - "REPLS": { - "SRC0_TYPE": "q4_1", - "SRC1_TYPE": "f16", - "BLOCK_SIZE": 32 - }, - "DECLS": "Q4_1" - }, { "REPLS": { "SRC0_TYPE": "q5_0", @@ -73,14 +49,6 @@ }, "DECLS": "Q5_0" }, - { - "REPLS": { - "SRC0_TYPE": "q5_0", - "SRC1_TYPE": "f16", - "BLOCK_SIZE": 32 - }, - "DECLS": "Q5_0" - }, { "REPLS": { "SRC0_TYPE": "q5_1", @@ -89,14 +57,6 @@ }, "DECLS": "Q5_1" }, - { - "REPLS": { - "SRC0_TYPE": "q5_1", - "SRC1_TYPE": "f16", - "BLOCK_SIZE": 32 - }, - "DECLS": "Q5_1" - }, { "REPLS": { "SRC0_TYPE": "q8_0", @@ -107,27 +67,19 @@ }, { "REPLS": { - "SRC0_TYPE": "q8_0", - "SRC1_TYPE": "f16", - "BLOCK_SIZE": 32 - }, - "DECLS": "Q8_0" - }, - { - "REPLS": { - "SRC0_TYPE": "q8_1", + "SRC0_TYPE": "q2_k", "SRC1_TYPE": "f32", - "BLOCK_SIZE": 32 + "BLOCK_SIZE": 256 }, - "DECLS": "Q8_1" + "DECLS": "Q2_K" }, { "REPLS": { - "SRC0_TYPE": "q8_1", - "SRC1_TYPE": "f16", - "BLOCK_SIZE": 32 + "SRC0_TYPE": "q3_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 }, - "DECLS": "Q8_1" + "DECLS": "Q3_K" } ] @@ -300,6 +252,116 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { } #enddecl(Q8_1) +#decl(Q2_K) +// 16 blocks of 16 elements each +struct q2_k { + scales: array, + qs: array, + d: f16, + dmin: f16 +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + // 2 halves of the block (128 elements each) + for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { + // 4 groups (each group has 2 blocks of 16 elements) + for (var shift: u32 = 0; shift < 8; shift += 2) { + // 2 blocks + for (var k: u32 = 0; k < 32; k += 16) { + let sc = (block.scales[is / 4] >> ((is % 4) * 8)) & 0xFF; + is++; + let dl = d * f32(sc & 0xF); + let ml = m * f32(sc >> 4); + for (var l: u32 = 0u; l < 16; l++) { + let q_idx = q_b_idx + k + l; + let q_byte = (block.qs[q_idx / 4] >> ((q_idx % 4) * 8)) & 0xFF; + let qs_val = (q_byte >> shift) & 3; + sum += (f32(qs_val) * dl - ml) * src1[src1_i]; + src1_i++; + } + } + } + } + return sum; +} + +#enddecl(Q2_K) + +#decl(Q3_K) +// 16 blocks of 16 elements each +struct q3_k { + hmask: array, + qs: array, + scales: array, // 6-bit quantized values + d: f16 +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + + // extract 6-bit scales, which consist of 4-bits from first 8 bytes of scale, + // and 2-bits from the last 4 bytes + let kmask1: u32 = 0x03030303; + let kmask2: u32 = 0x0f0f0f0f; + var scale_vals: array; + for (var i: u32 = 0; i < 4; i++) { + scale_vals[i] = bitcast(vec2(block.scales[2 * i], block.scales[2 * i + 1])); + } + var tmp: u32 = scale_vals[2]; + scale_vals[2] = ((scale_vals[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + scale_vals[3] = ((scale_vals[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4); + scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + // convert half-precision floats to packed 32-bit integers + var hmask_vals: array; + for (var i: u32 = 0; i < 8; i++) { + hmask_vals[i] = bitcast(vec2(block.hmask[2 * i], block.hmask[2 * i + 1])); + } + var qs_vals: array; + for (var i: u32 = 0; i < 16; i++) { + qs_vals[i] = bitcast(vec2(block.qs[2 * i], block.qs[2 * i + 1])); + } + + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + var m: u32 = 1; + // 2 halves of the block (128 elements each) + for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { + // 4 groups (each group has 2 blocks of 16 elements) + for (var shift: u32 = 0; shift < 8; shift += 2) { + // 2 blocks + for (var k: u32 = 0; k < 32; k += 16) { + let sc = (scale_vals[is / 4] >> ((is % 4) * 8)) & 0xFF; + is++; + let dl = d * (f32(sc) - 32.0); + for (var l: u32 = 0u; l < 16u; l++) { + let q_idx = q_b_idx + k + l; + let hm_idx = k + l; + let q_byte = (qs_vals[q_idx / 4] >> ((q_idx % 4) * 8)) & 0xFF; + let hmask_byte = (hmask_vals[hm_idx / 4] >> ((hm_idx % 4) * 8)) & 0xFF; + let hm = select(4.0, 0.0, (hmask_byte & m) != 0); + let qs_val = (q_byte >> shift) & 3; + sum += ((f32(qs_val) - hm) * dl) * src1[src1_i]; + src1_i++; + } + } + m <<= 1; + } + } + return sum; +} + +#enddecl(Q3_K) + #end(DECLS) #define(SHADER) From d76e562b37829bf46e6f53f0b5b5374f6ce4e31c Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Thu, 14 Aug 2025 10:31:15 -0700 Subject: [PATCH 11/15] Add rest of k-quants --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 19 +- .../wgsl-shaders/mul_mat.tmpl.wgsl | 201 +++++++++++++++++- 2 files changed, 217 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 02f48442f12b4..7fe436ab5af13 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -129,7 +129,7 @@ struct webgpu_context_struct { webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline[12][2]; + wgpu::ComputePipeline mul_mat_pipeline[15][2]; wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; @@ -910,7 +910,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - webgpu_pipeline_info pipeline_infos[10] = { + webgpu_pipeline_info pipeline_infos[13] = { { .name = "mul_mat_f32_f32", .shader_code = wgsl_mul_mat_f32_f32, .src0_type = GGML_TYPE_F32, @@ -950,6 +950,18 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { { .name = "mul_mat_q3_k_f32", .shader_code = wgsl_mul_mat_q3_k_f32, .src0_type = GGML_TYPE_Q3_K, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q4_k_f32", + .shader_code = wgsl_mul_mat_q4_k_f32, + .src0_type = GGML_TYPE_Q4_K, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q5_k_f32", + .shader_code = wgsl_mul_mat_q5_k_f32, + .src0_type = GGML_TYPE_Q5_K, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_q6_k_f32", + .shader_code = wgsl_mul_mat_q6_k_f32, + .src0_type = GGML_TYPE_Q6_K, .src1_type = GGML_TYPE_F32 } }; @@ -1049,6 +1061,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_TYPE_Q8_0: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: return true; default: return false; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 2c94a8fea36d1..8c1d5bab5d8b7 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -80,6 +80,30 @@ "BLOCK_SIZE": 256 }, "DECLS": "Q3_K" + }, + { + "REPLS": { + "SRC0_TYPE": "q4_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": "Q4_K" + }, + { + "REPLS": { + "SRC0_TYPE": "q5_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": "Q5_K" + }, + { + "REPLS": { + "SRC0_TYPE": "q6_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": "Q6_K" } ] @@ -320,7 +344,7 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4); scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); - // convert half-precision floats to packed 32-bit integers + // convert arrays of f16 -> u32 var hmask_vals: array; for (var i: u32 = 0; i < 8; i++) { hmask_vals[i] = bitcast(vec2(block.hmask[2 * i], block.hmask[2 * i + 1])); @@ -362,6 +386,181 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { #enddecl(Q3_K) +#decl(Q4_K) +// 8 blocks of 32 elements each +struct q4_k { + d: f16, + dmin: f16, + scales: array, + qs: array +}; + +fn get_scale_min(is: u32, scales: array) -> vec2 { + if (is < 4) { + let sc_byte = (scales[is / 4] >> ((is % 4) * 8)) & 0xFF; + let min_byte = (scales[(is + 4) / 4] >> ((is % 4) * 8)) & 0xFF; + return vec2(f32(sc_byte & 63), f32(min_byte & 63)); + } else { + let sc_min_lo = (scales[(is + 4) / 4] >> (((is + 4) % 4) * 8)) & 0xFF; + let sc_hi = (scales[(is - 4) / 4] >> (((is - 4) % 4) * 8)) & 0xFF; + let min_hi = (scales[is / 4] >> ((is % 4) * 8)) & 0xFF; + let sc = (sc_min_lo & 0xF) | ((sc_hi >> 6) << 4); + let m = (sc_min_lo >> 4) | ((min_hi >> 6) << 4); + return vec2(f32(sc), f32(m)); + } +} + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + // 2 blocks each iteration + for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { + for (var shift: u32 = 0; shift < 8; shift += 4) { + let scale_min = get_scale_min(is, block.scales); + is++; + let dl = d * scale_min.x; + let ml = m * scale_min.y; + for (var l: u32 = 0; l < 32; l++) { + let q_idx = q_b_idx + l; + let q_byte = (block.qs[q_idx / 4] >> ((q_idx % 4) * 8)) & 0xFF; + let qs_val = (q_byte >> shift) & 0xF; + sum += (f32(qs_val) * dl - ml) * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(Q4_K) + +#decl(Q5_K) +// 8 blocks of 32 elements each +struct q5_k { + d: f16, + dmin: f16, + scales: array, + qh: array, + qs: array +}; + +fn get_scale_min(is: u32, scales: array) -> vec2 { + if (is < 4) { + let sc_byte = (scales[is / 4] >> ((is % 4) * 8)) & 0xFF; + let min_byte = (scales[(is + 4) / 4] >> ((is % 4) * 8)) & 0xFF; + return vec2(f32(sc_byte & 63), f32(min_byte & 63)); + } else { + let sc_min_lo = (scales[(is + 4) / 4] >> (((is + 4) % 4) * 8)) & 0xFF; + let sc_hi = (scales[(is - 4) / 4] >> (((is - 4) % 4) * 8)) & 0xFF; + let min_hi = (scales[is / 4] >> ((is % 4) * 8)) & 0xFF; + let sc = (sc_min_lo & 0xF) | ((sc_hi >> 6) << 4); + let m = (sc_min_lo >> 4) | ((min_hi >> 6) << 4); + return vec2(f32(sc), f32(m)); + } +} + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + var u: u32 = 1; + // 2 blocks each iteration + for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { + for (var shift: u32 = 0; shift < 8; shift += 4) { + let scale_min = get_scale_min(is, block.scales); + is++; + let dl = d * scale_min.x; + let ml = m * scale_min.y; + for (var l: u32 = 0; l < 32; l++) { + let q_idx = q_b_idx + l; + let q_byte = (block.qs[q_idx / 4] >> ((q_idx % 4) * 8)) & 0xFF; + let qh_byte = (block.qh[l / 4] >> ((l % 4) * 8)) & 0xFF; + let qs_val = (q_byte >> shift) & 0xF; + let qh_val = select(0.0, 16.0, (qh_byte & u) != 0); + sum += ((f32(qs_val) + qh_val) * dl - ml) * src1[src1_i]; + src1_i++; + } + u <<= 1; + } + } + return sum; +} + +#enddecl(Q5_K) + +#decl(Q6_K) +// 16 blocks of 16 elements each +struct q6_k { + ql: array, + qh: array, + scales: array, + d: f16 +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + + // convert arrays of f16 -> u32 + var ql_vals: array; + for (var i: u32 = 0; i < 32; i++) { + ql_vals[i] = bitcast(vec2(block.ql[2 * i], block.ql[2 * i + 1])); + } + var qh_vals: array; + for (var i: u32 = 0; i < 16; i++) { + qh_vals[i] = bitcast(vec2(block.qh[2 * i], block.qh[2 * i + 1])); + } + var scale_vals: array; + for (var i: u32 = 0; i < 4; i++) { + scale_vals[i] = bitcast(vec2(block.scales[2 * i], block.scales[2 * i + 1])); + } + + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var qh_b_idx: u32 = 0; + var sc_b_idx: u32 = 0; + for (var ql_b_idx: u32 = 0; ql_b_idx < 128; ql_b_idx += 64) { + for (var l: u32 = 0; l < 32; l++) { + let ql13_b = (ql_vals[(ql_b_idx + l) / 4] >> (((ql_b_idx + l) % 4) * 8)) & 0xFF; + let ql24_b = (ql_vals[(ql_b_idx + l + 32) / 4] >> (((ql_b_idx + l + 32) % 4) * 8)) & 0xFF; + let qh_b = ((qh_vals[(qh_b_idx + l) / 4] >> (((qh_b_idx + l) % 4) * 8))) & 0xFF; + + let q1 = f32((ql13_b & 0xF) | ((qh_b & 3) << 4)) - 32.0; + let q2 = f32((ql24_b & 0xF) | (((qh_b >> 2) & 3) << 4)) - 32.0; + let q3 = f32((ql13_b >> 4) | (((qh_b >> 4) & 3) << 4)) - 32.0; + let q4 = f32((ql24_b >> 4) | (((qh_b >> 6) & 3) << 4)) - 32.0; + + let is = l/16; + let is1 = sc_b_idx + is; + let sc1 = bitcast(((scale_vals[is1 / 4] >> ((is1 % 4) * 8)) & 0xFF) << 24) >> 24; + let is2 = sc_b_idx + is + 2; + let sc2 = bitcast(((scale_vals[is2 / 4] >> ((is2 % 4) * 8)) & 0xFF) << 24) >> 24; + let is3 = sc_b_idx + is + 4; + let sc3 = bitcast(((scale_vals[is3 / 4] >> ((is3 % 4) * 8)) & 0xFF) << 24) >> 24; + let is4 = sc_b_idx + is + 6; + let sc4 = bitcast(((scale_vals[is4 / 4] >> ((is4 % 4) * 8)) & 0xFF) << 24) >> 24; + + sum += d * f32(sc1) * q1 * src1[src1_i + l]; + sum += d * f32(sc2) * q2 * src1[src1_i + l + 32]; + sum += d * f32(sc3) * q3 * src1[src1_i + l + 64]; + sum += d * f32(sc4) * q4 * src1[src1_i + l + 96]; + } + src1_i += 128; + qh_b_idx += 32; + sc_b_idx += 8; + } + return sum; +} + +#enddecl(Q6_K) + #end(DECLS) #define(SHADER) From e2380e2582753cfac9cb89bb8585364becce2b52 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Fri, 15 Aug 2025 08:49:37 -0700 Subject: [PATCH 12/15] Get firt i-quant working --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 20 +- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 10 +- .../wgsl-shaders/mul_mat.tmpl.wgsl | 270 ++++++++++++++---- tests/test-backend-ops.cpp | 9 + 4 files changed, 238 insertions(+), 71 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 7fe436ab5af13..1d4dde165bf89 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -20,7 +20,7 @@ #ifdef GGML_WEBGPU_DEBUG # define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl -# define WEBGPU_DEBUG_BUF_ELEMS 32 +# define WEBGPU_DEBUG_BUF_ELEMS 33 #else # define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG @@ -129,7 +129,7 @@ struct webgpu_context_struct { webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline[15][2]; + wgpu::ComputePipeline mul_mat_pipeline[17][2]; wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; @@ -595,12 +595,17 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t { .binding = 2, .buffer = ggml_webgpu_tensor_buf(dst), .offset = ggml_webgpu_tensor_align_offset(ctx, dst), - .size = ggml_webgpu_tensor_binding_size(ctx, dst) } + .size = ggml_webgpu_tensor_binding_size(ctx, dst) }, +// { .binding = 3, +// .buffer = ctx->debug_dev_buf, +// .offset = 0, +// .size = ctx->debug_dev_buf.GetSize() } }; uint32_t wg_x = (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE; ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); + //ggml_backend_webgpu_debug(ctx); } // Returns true if node has enqueued work into the queue, false otherwise @@ -910,7 +915,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - webgpu_pipeline_info pipeline_infos[13] = { + webgpu_pipeline_info pipeline_infos[14] = { { .name = "mul_mat_f32_f32", .shader_code = wgsl_mul_mat_f32_f32, .src0_type = GGML_TYPE_F32, @@ -962,7 +967,11 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { { .name = "mul_mat_q6_k_f32", .shader_code = wgsl_mul_mat_q6_k_f32, .src0_type = GGML_TYPE_Q6_K, - .src1_type = GGML_TYPE_F32 } + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_iq2_xxs_f32", + .shader_code = wgsl_mul_mat_iq2_xxs_f32, + .src0_type = GGML_TYPE_IQ2_XXS, + .src1_type = GGML_TYPE_F32 } }; for (auto & pipeline_info : pipeline_infos) { @@ -1064,6 +1073,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ2_XXS: return true; default: return false; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py index 678b1f91e90b1..2985622c892e0 100755 --- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -44,10 +44,12 @@ def generate_variants(shader_path, output_dir, outfile): shader_template = extract_block(text, "SHADER") for variant in variants: - decls_key = variant["DECLS"] - if decls_key not in decls_map: - raise ValueError(f"DECLS key '{decls_key}' not found.") - decls_code = decls_map[decls_key] + "\n\n" + decls = variant["DECLS"] + decls_code = "" + for key in decls: + if key not in decls_map: + raise ValueError(f"DECLS key '{key}' not found.") + decls_code += decls_map[key] + "\n\n" shader_variant = replace_placeholders(shader_template, variant["REPLS"]) final_shader = re.sub(rf'\bDECLS\b', decls_code, shader_variant) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 8c1d5bab5d8b7..3fbbbc28d54f9 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -7,7 +7,7 @@ "SRC1_TYPE" : "f32", "BLOCK_SIZE" : 1 }, - "DECLS" : "FLOAT" + "DECLS" : ["FLOAT"] }, { "REPLS": { @@ -15,7 +15,7 @@ "SRC1_TYPE" : "f16", "BLOCK_SIZE" : 1 }, - "DECLS" : "FLOAT" + "DECLS" : ["FLOAT"] }, { "REPLS": { @@ -23,7 +23,7 @@ "SRC1_TYPE" : "f32", "BLOCK_SIZE" : 1 }, - "DECLS" : "FLOAT" + "DECLS" : ["FLOAT"] }, { "REPLS": { @@ -31,7 +31,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 32 }, - "DECLS": "Q4_0" + "DECLS": ["BYTE_HELPERS", "Q4_0"] }, { "REPLS": { @@ -39,7 +39,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 32 }, - "DECLS": "Q4_1" + "DECLS": ["BYTE_HELPERS", "Q4_1"] }, { "REPLS": { @@ -47,7 +47,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 32 }, - "DECLS": "Q5_0" + "DECLS": ["BYTE_HELPERS", "Q5_0"] }, { "REPLS": { @@ -55,7 +55,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 32 }, - "DECLS": "Q5_1" + "DECLS": ["BYTE_HELPERS", "Q5_1"] }, { "REPLS": { @@ -63,7 +63,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 32 }, - "DECLS": "Q8_0" + "DECLS": ["BYTE_HELPERS", "Q8_0"] }, { "REPLS": { @@ -71,7 +71,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": "Q2_K" + "DECLS": ["BYTE_HELPERS", "Q2_K"] }, { "REPLS": { @@ -79,7 +79,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": "Q3_K" + "DECLS": ["BYTE_HELPERS", "Q3_K"] }, { "REPLS": { @@ -87,7 +87,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": "Q4_K" + "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q4_K"] }, { "REPLS": { @@ -95,7 +95,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": "Q5_K" + "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q5_K"] }, { "REPLS": { @@ -103,7 +103,15 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": "Q6_K" + "DECLS": ["BYTE_HELPERS", "Q6_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq2_xxs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ2_XXS"] } ] @@ -111,6 +119,18 @@ #define(DECLS) +#decl(BYTE_HELPERS) + +fn get_byte(value: u32, index: u32) -> u32 { + return (value >> (index * 8)) & 0xFF; +} + +fn get_byte_i32(value: u32, index: u32) -> i32 { + return bitcast(((value >> (index * 8)) & 0xFF) << 24) >> 24; +} + +#enddecl(BYTE_HELPERS) + #decl(FLOAT) fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { return f32(src0[src0_idx_base + offset]) * f32(src1[src1_idx_base + offset]); @@ -130,7 +150,7 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { for (var j: u32 = 0; j < 4; j++) { let q_packed = bitcast(vec2(block_q4_0.qs[2 * j], block_q4_0.qs[2 * j + 1])); for (var k: u32 = 0; k < 4; k++) { - let q_byte = (q_packed >> (k * 8)) & 0xFF; + let q_byte = get_byte(q_packed, k); let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d; let q_lo = (f32(q_byte & 0xF) - 8.0f) * d; let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; @@ -157,7 +177,7 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { for (var j: u32 = 0; j < 4; j++) { let q_packed = block_q4_1.qs[j]; for (var k: u32 = 0; k < 4; k++) { - let q_byte = (q_packed >> (k * 8)) & 0xFF; + let q_byte = get_byte(q_packed, k); let q_hi = f32((q_byte >> 4) & 0xF) * d + m; let q_lo = f32(q_byte & 0xF) * d + m; let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; @@ -184,7 +204,7 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { for (var j: u32 = 0; j < 4; j++) { let q_packed = bitcast(vec2(block_q5_0.qs[2 * j], block_q5_0.qs[2 * j + 1])); for (var k: u32 = 0; k < 4; k++) { - let q_byte = (q_packed >> (k * 8)) & 0xFF; + let q_byte = get_byte(q_packed, k); let qh_hi = (qh_packed >> (j * 4 + k + 12)) & 0x10; let q_hi = (f32(((q_byte >> 4) & 0xF) | qh_hi) - 16.0) * d; let qh_lo = ((qh_packed >> (j * 4 + k)) << 4) & 0x10; @@ -214,7 +234,7 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { for (var j: u32 = 0; j < 4; j++) { let q_packed = block_q5_1.qs[j]; for (var k: u32 = 0; k < 4; k++) { - let q_byte = (q_packed >> (k * 8)) & 0xFF; + let q_byte = get_byte(q_packed, k); let qh_hi = (block_q5_1.qh >> (j * 4 + k + 12)) & 0x10; let q_hi = f32(((q_byte >> 4) & 0xF) | qh_hi) * d + m; let qh_lo = ((block_q5_1.qh >> (j * 4 + k)) << 4) & 0x10; @@ -241,7 +261,7 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { for (var j: u32 = 0; j < 8; j++) { let q_packed = bitcast(vec2(block_q8_0.qs[2 * j], block_q8_0.qs[2 * j + 1])); for (var k: u32 = 0; k < 4; k++) { - let q_byte = bitcast((((q_packed >> (k * 8)) & 0xFF) << 24)) >> 24; // sign-extend + let q_byte = get_byte_i32(q_packed, k); let q_val = f32(q_byte) * d; let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; sum += q_val * f32(src1[src1_offset]); @@ -266,7 +286,7 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { for (var j: u32 = 0; j < 8; j++) { let q_packed = block_q8_1.qs[j]; for (var k: u32 = 0; k < 4; k++) { - let q_byte = bitcast((((q_packed >> (k * 8)) & 0xFF) << 24)) >> 24; // sign-extend + let q_byte = get_byte_i32(q_packed, k); let q_val = f32(q_byte) * d + m; let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; sum += q_val * f32(src1[src1_offset]); @@ -298,13 +318,13 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { for (var shift: u32 = 0; shift < 8; shift += 2) { // 2 blocks for (var k: u32 = 0; k < 32; k += 16) { - let sc = (block.scales[is / 4] >> ((is % 4) * 8)) & 0xFF; + let sc = get_byte(block.scales[is / 4], is % 4); is++; let dl = d * f32(sc & 0xF); let ml = m * f32(sc >> 4); for (var l: u32 = 0u; l < 16; l++) { let q_idx = q_b_idx + k + l; - let q_byte = (block.qs[q_idx / 4] >> ((q_idx % 4) * 8)) & 0xFF; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); let qs_val = (q_byte >> shift) & 3; sum += (f32(qs_val) * dl - ml) * src1[src1_i]; src1_i++; @@ -364,14 +384,14 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { for (var shift: u32 = 0; shift < 8; shift += 2) { // 2 blocks for (var k: u32 = 0; k < 32; k += 16) { - let sc = (scale_vals[is / 4] >> ((is % 4) * 8)) & 0xFF; + let sc = get_byte(scale_vals[is / 4], is % 4); is++; let dl = d * (f32(sc) - 32.0); for (var l: u32 = 0u; l < 16u; l++) { let q_idx = q_b_idx + k + l; let hm_idx = k + l; - let q_byte = (qs_vals[q_idx / 4] >> ((q_idx % 4) * 8)) & 0xFF; - let hmask_byte = (hmask_vals[hm_idx / 4] >> ((hm_idx % 4) * 8)) & 0xFF; + let q_byte = get_byte(qs_vals[q_idx / 4], q_idx % 4); + let hmask_byte = get_byte(hmask_vals[hm_idx / 4], hm_idx % 4); let hm = select(4.0, 0.0, (hmask_byte & m) != 0); let qs_val = (q_byte >> shift) & 3; sum += ((f32(qs_val) - hm) * dl) * src1[src1_i]; @@ -386,30 +406,34 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { #enddecl(Q3_K) -#decl(Q4_K) -// 8 blocks of 32 elements each -struct q4_k { - d: f16, - dmin: f16, - scales: array, - qs: array -}; +#decl(Q45_K_SCALE_MIN) fn get_scale_min(is: u32, scales: array) -> vec2 { if (is < 4) { - let sc_byte = (scales[is / 4] >> ((is % 4) * 8)) & 0xFF; - let min_byte = (scales[(is + 4) / 4] >> ((is % 4) * 8)) & 0xFF; + let sc_byte = get_byte(scales[is / 4], is % 4); + let min_byte = get_byte(scales[(is + 4) / 4], is % 4); return vec2(f32(sc_byte & 63), f32(min_byte & 63)); } else { - let sc_min_lo = (scales[(is + 4) / 4] >> (((is + 4) % 4) * 8)) & 0xFF; - let sc_hi = (scales[(is - 4) / 4] >> (((is - 4) % 4) * 8)) & 0xFF; - let min_hi = (scales[is / 4] >> ((is % 4) * 8)) & 0xFF; + let sc_min_lo = get_byte(scales[(is + 4) / 4], (is + 4) % 4); + let sc_hi = get_byte(scales[(is - 4) / 4], (is - 4) % 4); + let min_hi = get_byte(scales[is / 4], is % 4); let sc = (sc_min_lo & 0xF) | ((sc_hi >> 6) << 4); let m = (sc_min_lo >> 4) | ((min_hi >> 6) << 4); return vec2(f32(sc), f32(m)); } } +#enddecl(Q45_K_SCALE_MIN) + +#decl(Q4_K) +// 8 blocks of 32 elements each +struct q4_k { + d: f16, + dmin: f16, + scales: array, + qs: array +}; + fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { let block = src0[src0_idx_base + offset]; let d = f32(block.d); @@ -426,7 +450,7 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { let ml = m * scale_min.y; for (var l: u32 = 0; l < 32; l++) { let q_idx = q_b_idx + l; - let q_byte = (block.qs[q_idx / 4] >> ((q_idx % 4) * 8)) & 0xFF; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); let qs_val = (q_byte >> shift) & 0xF; sum += (f32(qs_val) * dl - ml) * src1[src1_i]; src1_i++; @@ -448,21 +472,6 @@ struct q5_k { qs: array }; -fn get_scale_min(is: u32, scales: array) -> vec2 { - if (is < 4) { - let sc_byte = (scales[is / 4] >> ((is % 4) * 8)) & 0xFF; - let min_byte = (scales[(is + 4) / 4] >> ((is % 4) * 8)) & 0xFF; - return vec2(f32(sc_byte & 63), f32(min_byte & 63)); - } else { - let sc_min_lo = (scales[(is + 4) / 4] >> (((is + 4) % 4) * 8)) & 0xFF; - let sc_hi = (scales[(is - 4) / 4] >> (((is - 4) % 4) * 8)) & 0xFF; - let min_hi = (scales[is / 4] >> ((is % 4) * 8)) & 0xFF; - let sc = (sc_min_lo & 0xF) | ((sc_hi >> 6) << 4); - let m = (sc_min_lo >> 4) | ((min_hi >> 6) << 4); - return vec2(f32(sc), f32(m)); - } -} - fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { let block = src0[src0_idx_base + offset]; let d = f32(block.d); @@ -480,8 +489,8 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { let ml = m * scale_min.y; for (var l: u32 = 0; l < 32; l++) { let q_idx = q_b_idx + l; - let q_byte = (block.qs[q_idx / 4] >> ((q_idx % 4) * 8)) & 0xFF; - let qh_byte = (block.qh[l / 4] >> ((l % 4) * 8)) & 0xFF; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qh_byte = get_byte(block.qh[l / 4], l % 4); let qs_val = (q_byte >> shift) & 0xF; let qh_val = select(0.0, 16.0, (qh_byte & u) != 0); sum += ((f32(qs_val) + qh_val) * dl - ml) * src1[src1_i]; @@ -528,9 +537,9 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { var sc_b_idx: u32 = 0; for (var ql_b_idx: u32 = 0; ql_b_idx < 128; ql_b_idx += 64) { for (var l: u32 = 0; l < 32; l++) { - let ql13_b = (ql_vals[(ql_b_idx + l) / 4] >> (((ql_b_idx + l) % 4) * 8)) & 0xFF; - let ql24_b = (ql_vals[(ql_b_idx + l + 32) / 4] >> (((ql_b_idx + l + 32) % 4) * 8)) & 0xFF; - let qh_b = ((qh_vals[(qh_b_idx + l) / 4] >> (((qh_b_idx + l) % 4) * 8))) & 0xFF; + let ql13_b = get_byte(ql_vals[(ql_b_idx + l) / 4], (ql_b_idx + l) % 4); + let ql24_b = get_byte(ql_vals[(ql_b_idx + l + 32) / 4], (ql_b_idx + l + 32) % 4); + let qh_b = get_byte(qh_vals[(qh_b_idx + l) / 4], (qh_b_idx + l) % 4); let q1 = f32((ql13_b & 0xF) | ((qh_b & 3) << 4)) - 32.0; let q2 = f32((ql24_b & 0xF) | (((qh_b >> 2) & 3) << 4)) - 32.0; @@ -539,13 +548,13 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { let is = l/16; let is1 = sc_b_idx + is; - let sc1 = bitcast(((scale_vals[is1 / 4] >> ((is1 % 4) * 8)) & 0xFF) << 24) >> 24; + let sc1 = get_byte_i32(scale_vals[is1 / 4], is1 % 4); let is2 = sc_b_idx + is + 2; - let sc2 = bitcast(((scale_vals[is2 / 4] >> ((is2 % 4) * 8)) & 0xFF) << 24) >> 24; + let sc2 = get_byte_i32(scale_vals[is2 / 4], is2 % 4); let is3 = sc_b_idx + is + 4; - let sc3 = bitcast(((scale_vals[is3 / 4] >> ((is3 % 4) * 8)) & 0xFF) << 24) >> 24; + let sc3 = get_byte_i32(scale_vals[is3 / 4], is3 % 4); let is4 = sc_b_idx + is + 6; - let sc4 = bitcast(((scale_vals[is4 / 4] >> ((is4 % 4) * 8)) & 0xFF) << 24) >> 24; + let sc4 = get_byte_i32(scale_vals[is4 / 4], is4 % 4); sum += d * f32(sc1) * q1 * src1[src1_i + l]; sum += d * f32(sc2) * q2 * src1[src1_i + l + 32]; @@ -561,6 +570,142 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { #enddecl(Q6_K) +#decl(IQ2_TABLES) +const kmask_iq2xs : array = array( + 0x08040201u, // 1, 2, 4, 8 + 0x80402010u // 16, 32, 64, 128 +); + +const kmask_iq2xs_u32 : array = array( + 1, 2, 4, 8, 16, 32, 64, 128 +); + +const ksigns_iq2xs: array = array( + 0x03828100,0x87060584,0x8b0a0988,0x0f8e8d0c, + 0x93121190,0x17969514,0x1b9a9918,0x9f1e1d9c, + 0xa32221a0,0x27a6a524,0x2baaa928,0xaf2e2dac, + 0x33b2b130,0xb73635b4,0xbb3a39b8,0x3fbebd3c, + 0xc34241c0,0x47c6c544,0x4bcac948,0xcf4e4dcc, + 0x53d2d150,0xd75655d4,0xdb5a59d8,0x5fdedd5c, + 0x63e2e160,0xe76665e4,0xeb6a69e8,0x6feeed6c, + 0xf37271f0,0x77f6f574,0x7bfaf978,0xff7e7dfc +); + +const ksigns_iq2xs_u32 : array = array( + 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, + 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, + 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, + 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, + 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, + 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, + 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, + 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255 +); + +const iq2xxs_grid = array( + 0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808, + 0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x082b0808, 0x08080808, + 0x082b082b, 0x08080808, 0x082b2b08, 0x08080808, 0x082b2b2b, 0x08080808, 0x19080819, 0x08080808, + 0x19081908, 0x08080808, 0x19190808, 0x08080808, 0x19192b08, 0x08080808, 0x192b0819, 0x08080808, + 0x192b1908, 0x08080808, 0x2b080808, 0x08080808, 0x2b08082b, 0x08080808, 0x2b082b2b, 0x08080808, + 0x2b2b082b, 0x08080808, 0x08080819, 0x08080819, 0x08081908, 0x08080819, 0x08190808, 0x08080819, + 0x08191919, 0x08080819, 0x19080808, 0x08080819, 0x2b081908, 0x08080819, 0x2b192b08, 0x08080819, + 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, 0x082b082b, 0x0808082b, 0x2b08082b, 0x0808082b, + 0x08080819, 0x08081908, 0x08081908, 0x08081908, 0x08190808, 0x08081908, 0x082b0819, 0x08081908, + 0x082b1908, 0x08081908, 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19082b08, 0x08081908, + 0x192b0808, 0x08081908, 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b190808, 0x08081908, + 0x2b2b1908, 0x08081908, 0x08080808, 0x08081919, 0x0808082b, 0x08081919, 0x08082b08, 0x08081919, + 0x082b0808, 0x08081919, 0x1908192b, 0x08081919, 0x192b2b19, 0x08081919, 0x2b080808, 0x08081919, + 0x2b190819, 0x08081919, 0x08082b19, 0x0808192b, 0x08190808, 0x0808192b, 0x19080808, 0x0808192b, + 0x2b081908, 0x0808192b, 0x2b2b1908, 0x0808192b, 0x08080808, 0x08082b08, 0x08081919, 0x08082b08, + 0x08082b08, 0x08082b08, 0x08191908, 0x08082b08, 0x082b2b08, 0x08082b08, 0x19080819, 0x08082b08, + 0x19081908, 0x08082b08, 0x19190808, 0x08082b08, 0x1919082b, 0x08082b08, 0x2b082b08, 0x08082b08, + 0x08081908, 0x08082b19, 0x19080808, 0x08082b19, 0x0808082b, 0x08082b2b, 0x08191908, 0x08082b2b, + 0x08080819, 0x08190808, 0x08081908, 0x08190808, 0x08190808, 0x08190808, 0x082b0819, 0x08190808, + 0x19080808, 0x08190808, 0x192b0808, 0x08190808, 0x2b081908, 0x08190808, 0x2b190808, 0x08190808, + 0x2b191919, 0x08190808, 0x08080808, 0x08190819, 0x08082b08, 0x08190819, 0x082b0808, 0x08190819, + 0x19190808, 0x08190819, 0x19192b2b, 0x08190819, 0x2b080808, 0x08190819, 0x082b1908, 0x0819082b, + 0x19081919, 0x0819082b, 0x08080808, 0x08191908, 0x08082b08, 0x08191908, 0x082b0808, 0x08191908, + 0x082b1919, 0x08191908, 0x19082b19, 0x08191908, 0x2b080808, 0x08191908, 0x08192b08, 0x08191919, + 0x192b082b, 0x08191919, 0x08080808, 0x0819192b, 0x0819192b, 0x0819192b, 0x08080819, 0x08192b08, + 0x08081908, 0x08192b08, 0x08190808, 0x08192b08, 0x19080808, 0x08192b08, 0x2b080819, 0x08192b08, + 0x08080808, 0x08192b19, 0x08081919, 0x08192b19, 0x2b2b0808, 0x08192b19, 0x19190819, 0x08192b2b, + 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08082b2b, 0x082b0808, 0x19081908, 0x082b0808, + 0x192b0819, 0x082b0808, 0x2b080808, 0x082b0808, 0x2b08082b, 0x082b0808, 0x082b2b19, 0x082b0819, + 0x19082b08, 0x082b0819, 0x08080808, 0x082b082b, 0x0808082b, 0x082b082b, 0x08080819, 0x082b1908, + 0x08081908, 0x082b1908, 0x08190808, 0x082b1908, 0x19080808, 0x082b1908, 0x1919192b, 0x082b1908, + 0x08080808, 0x082b1919, 0x19080819, 0x082b1919, 0x192b1908, 0x082b1919, 0x2b190808, 0x082b192b, + 0x08082b08, 0x082b2b08, 0x082b0808, 0x082b2b08, 0x2b191908, 0x082b2b08, 0x19081908, 0x082b2b2b, + 0x08080819, 0x19080808, 0x08081908, 0x19080808, 0x08190808, 0x19080808, 0x08192b08, 0x19080808, + 0x082b0819, 0x19080808, 0x082b1908, 0x19080808, 0x19080808, 0x19080808, 0x19082b08, 0x19080808, + 0x1919192b, 0x19080808, 0x192b0808, 0x19080808, 0x2b080819, 0x19080808, 0x2b081908, 0x19080808, + 0x2b190808, 0x19080808, 0x08080808, 0x19080819, 0x082b0808, 0x19080819, 0x192b0819, 0x19080819, + 0x2b080808, 0x19080819, 0x2b081919, 0x19080819, 0x08080819, 0x1908082b, 0x08190808, 0x1908082b, + 0x19082b08, 0x1908082b, 0x1919192b, 0x1908082b, 0x192b2b08, 0x1908082b, 0x08080808, 0x19081908, + 0x08082b08, 0x19081908, 0x082b0808, 0x19081908, 0x2b080808, 0x19081908, 0x2b192b19, 0x19081908, + 0x0819082b, 0x19081919, 0x082b1908, 0x19081919, 0x08080808, 0x1908192b, 0x08080819, 0x19082b08, + 0x08081908, 0x19082b08, 0x08190808, 0x19082b08, 0x19080808, 0x19082b08, 0x19081919, 0x19082b08, + 0x08080808, 0x19082b19, 0x19192b08, 0x19082b19, 0x192b0819, 0x19082b19, 0x2b08082b, 0x19082b19, + 0x19081919, 0x19082b2b, 0x2b190808, 0x19082b2b, 0x08080808, 0x19190808, 0x08082b08, 0x19190808, + 0x08190819, 0x19190808, 0x08192b19, 0x19190808, 0x082b0808, 0x19190808, 0x2b080808, 0x19190808, + 0x2b082b08, 0x19190808, 0x08081908, 0x19190819, 0x1908082b, 0x19190819, 0x2b2b1908, 0x19190819, + 0x2b190819, 0x1919082b, 0x2b190808, 0x19191908, 0x2b19082b, 0x19191908, 0x08082b2b, 0x19191919, + 0x08080819, 0x1919192b, 0x19191908, 0x1919192b, 0x08080808, 0x19192b08, 0x08190819, 0x19192b08, + 0x08192b19, 0x19192b08, 0x192b1908, 0x19192b08, 0x19080808, 0x19192b19, 0x08082b08, 0x19192b2b, + 0x08081908, 0x192b0808, 0x08190808, 0x192b0808, 0x19080808, 0x192b0808, 0x192b2b08, 0x192b0808, + 0x08080808, 0x192b0819, 0x19191919, 0x192b0819, 0x08192b08, 0x192b082b, 0x192b0808, 0x192b082b, + 0x08080808, 0x192b1908, 0x08081919, 0x192b1908, 0x08190808, 0x192b1919, 0x0819082b, 0x192b1919, + 0x2b081908, 0x192b1919, 0x1908082b, 0x192b2b08, 0x08080808, 0x2b080808, 0x0808082b, 0x2b080808, + 0x08082b2b, 0x2b080808, 0x19080819, 0x2b080808, 0x2b08082b, 0x2b080808, 0x08081908, 0x2b080819, + 0x08192b08, 0x2b080819, 0x19080808, 0x2b080819, 0x08190819, 0x2b08082b, 0x08080819, 0x2b081908, + 0x08081908, 0x2b081908, 0x08190808, 0x2b081908, 0x08191919, 0x2b081908, 0x19080808, 0x2b081908, + 0x192b0808, 0x2b081908, 0x08080808, 0x2b081919, 0x1908192b, 0x2b081919, 0x2b191908, 0x2b081919, + 0x08082b19, 0x2b08192b, 0x19080808, 0x2b08192b, 0x192b0808, 0x2b08192b, 0x0808082b, 0x2b082b08, + 0x08081908, 0x2b082b19, 0x08190819, 0x2b082b2b, 0x08081908, 0x2b190808, 0x08190808, 0x2b190808, + 0x082b1908, 0x2b190808, 0x19080808, 0x2b190808, 0x2b2b0819, 0x2b190808, 0x0819192b, 0x2b190819, + 0x2b080808, 0x2b190819, 0x19081919, 0x2b19082b, 0x08080808, 0x2b191908, 0x082b082b, 0x2b191908, + 0x19081908, 0x2b191908, 0x19190819, 0x2b191919, 0x2b080819, 0x2b192b08, 0x082b0808, 0x2b192b19, + 0x0808082b, 0x2b2b0808, 0x19190808, 0x2b2b0808, 0x2b081919, 0x2b2b0808, 0x08082b19, 0x2b2b0819, + 0x08080808, 0x2b2b082b, 0x08192b08, 0x2b2b1908, 0x19190808, 0x2b2b2b08, 0x08081908, 0x2b2b2b19 +); + +#enddecl(IQ2_TABLES) + +#decl(IQ2_XXS) +struct iq2_xxs { + d: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 32; ib += 4) { + var aux: array; + aux[0] = bitcast(vec2(block.qs[ib], block.qs[ib + 1])); + aux[1] = bitcast(vec2(block.qs[ib + 2], block.qs[ib + 3])); + let db = d * (0.5 + f32(aux[1] >> 28)) * 0.25; + for (var l: u32 = 0; l < 4; l++) { + let ig = get_byte(aux[0], l) * 8; + let is = (aux[1] >> (7 * l)) & 127; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + //let signs = ksigns_iq2xs_u32[is]; + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2xxs_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + //let m = select(1.0, -1.0, (kmask_iq2xs_u32[j] & signs) != 0); + sum += db * f32(g) * m * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(IQ2_XXS) + #end(DECLS) #define(SHADER) @@ -593,6 +738,7 @@ struct MulMatParams { @group(0) @binding(0) var src0: array; // N rows, K columns @group(0) @binding(1) var src1: array; // M rows, K columns (transposed) @group(0) @binding(2) var dst: array; // M rows, N columns +//@group(0) @binding(3) var debug: array; @group(0) @binding(3) var params: MulMatParams; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d29779cd12b22..8920f409d2bcc 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -979,6 +979,15 @@ struct test_case { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { init_tensor_uniform(t); } + // print first 32 elements of each tensor +// for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { +// if (strcmp(ggml_get_name(t), "a") == 0) { +// std::vector values = tensor_to_float(t); +// for (int i = 0; i < 32; i++) { +// printf("%s[%d] = %f\n", ggml_get_name(t), i, values[i]); +// } +// } +// } } virtual size_t op_size(ggml_tensor * t) { From 2a3b9ee26e5c647a8e01f2b35e31524c5a859ae3 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sat, 16 Aug 2025 18:18:21 -0700 Subject: [PATCH 13/15] Closer to supporting all i-quants --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 24 +- .../wgsl-shaders/mul_mat.tmpl.wgsl | 743 +++++++++++++++++- 2 files changed, 740 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 1d4dde165bf89..b1669995b18f4 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -129,7 +129,7 @@ struct webgpu_context_struct { webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline[17][2]; + wgpu::ComputePipeline mul_mat_pipeline[24][2]; wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; @@ -915,7 +915,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - webgpu_pipeline_info pipeline_infos[14] = { + webgpu_pipeline_info pipeline_infos[18] = { { .name = "mul_mat_f32_f32", .shader_code = wgsl_mul_mat_f32_f32, .src0_type = GGML_TYPE_F32, @@ -971,6 +971,22 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { { .name = "mul_mat_iq2_xxs_f32", .shader_code = wgsl_mul_mat_iq2_xxs_f32, .src0_type = GGML_TYPE_IQ2_XXS, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_iq2_xs_f32", + .shader_code = wgsl_mul_mat_iq2_xs_f32, + .src0_type = GGML_TYPE_IQ2_XS, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_iq2_s_f32", + .shader_code = wgsl_mul_mat_iq2_s_f32, + .src0_type = GGML_TYPE_IQ2_S, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_iq3_xxs_f32", + .shader_code = wgsl_mul_mat_iq3_xxs_f32, + .src0_type = GGML_TYPE_IQ3_XXS, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_iq3_s_f32", + .shader_code = wgsl_mul_mat_iq3_s_f32, + .src0_type = GGML_TYPE_IQ3_S, .src1_type = GGML_TYPE_F32 } }; @@ -1074,6 +1090,10 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: return true; default: return false; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 3fbbbc28d54f9..b97198e86b29f 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -112,6 +112,38 @@ "BLOCK_SIZE": 256 }, "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ2_XXS"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq2_xs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ2_XS"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq2_s", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ2_S"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq3_xxs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ3_XSS"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq3_s", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ3_S"] } ] @@ -576,10 +608,6 @@ const kmask_iq2xs : array = array( 0x80402010u // 16, 32, 64, 128 ); -const kmask_iq2xs_u32 : array = array( - 1, 2, 4, 8, 16, 32, 64, 128 -); - const ksigns_iq2xs: array = array( 0x03828100,0x87060584,0x8b0a0988,0x0f8e8d0c, 0x93121190,0x17969514,0x1b9a9918,0x9f1e1d9c, @@ -590,17 +618,9 @@ const ksigns_iq2xs: array = array( 0x63e2e160,0xe76665e4,0xeb6a69e8,0x6feeed6c, 0xf37271f0,0x77f6f574,0x7bfaf978,0xff7e7dfc ); +#enddecl(IQ2_TABLES) -const ksigns_iq2xs_u32 : array = array( - 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, - 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, - 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, - 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, - 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, - 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, - 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, - 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255 -); +#decl(IQ2_XXS) const iq2xxs_grid = array( 0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808, @@ -669,9 +689,6 @@ const iq2xxs_grid = array( 0x08080808, 0x2b2b082b, 0x08192b08, 0x2b2b1908, 0x19190808, 0x2b2b2b08, 0x08081908, 0x2b2b2b19 ); -#enddecl(IQ2_TABLES) - -#decl(IQ2_XXS) struct iq2_xxs { d: f16, qs: array @@ -683,19 +700,16 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { var src1_i = src1_idx_base + offset * 256; var sum = 0.0; for (var ib: u32 = 0; ib < 32; ib += 4) { - var aux: array; - aux[0] = bitcast(vec2(block.qs[ib], block.qs[ib + 1])); - aux[1] = bitcast(vec2(block.qs[ib + 2], block.qs[ib + 3])); - let db = d * (0.5 + f32(aux[1] >> 28)) * 0.25; + let aux0 = bitcast(vec2(block.qs[ib], block.qs[ib + 1])); + let aux1 = bitcast(vec2(block.qs[ib + 2], block.qs[ib + 3])); + let db = d * (0.5 + f32(aux1 >> 28)) * 0.25; for (var l: u32 = 0; l < 4; l++) { - let ig = get_byte(aux[0], l) * 8; - let is = (aux[1] >> (7 * l)) & 127; + let ig = get_byte(aux0, l) * 8; + let is = (aux1 >> (7 * l)) & 127; let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); - //let signs = ksigns_iq2xs_u32[is]; for (var j: u32 = 0; j < 8; j++) { let g = get_byte(iq2xxs_grid[(ig + j) / 4], (ig + j) % 4); let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); - //let m = select(1.0, -1.0, (kmask_iq2xs_u32[j] & signs) != 0); sum += db * f32(g) * m * src1[src1_i]; src1_i++; } @@ -706,6 +720,685 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { #enddecl(IQ2_XXS) +#decl(IQ2_XS) +const iq2xs_grid = array( + 0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808, + 0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x0819192b, 0x08080808, + 0x08192b19, 0x08080808, 0x082b0808, 0x08080808, 0x082b082b, 0x08080808, 0x082b1919, 0x08080808, + 0x082b2b08, 0x08080808, 0x19080819, 0x08080808, 0x19081908, 0x08080808, 0x1908192b, 0x08080808, + 0x19082b19, 0x08080808, 0x19190808, 0x08080808, 0x1919082b, 0x08080808, 0x19191919, 0x08080808, + 0x19192b08, 0x08080808, 0x192b0819, 0x08080808, 0x192b1908, 0x08080808, 0x2b080808, 0x08080808, + 0x2b08082b, 0x08080808, 0x2b081919, 0x08080808, 0x2b082b08, 0x08080808, 0x2b190819, 0x08080808, + 0x2b191908, 0x08080808, 0x2b192b19, 0x08080808, 0x2b2b0808, 0x08080808, 0x08080819, 0x08080819, + 0x08081908, 0x08080819, 0x0808192b, 0x08080819, 0x08082b19, 0x08080819, 0x08190808, 0x08080819, + 0x0819082b, 0x08080819, 0x08191919, 0x08080819, 0x08192b08, 0x08080819, 0x08192b2b, 0x08080819, + 0x082b0819, 0x08080819, 0x082b1908, 0x08080819, 0x19080808, 0x08080819, 0x1908082b, 0x08080819, + 0x19081919, 0x08080819, 0x19082b08, 0x08080819, 0x19190819, 0x08080819, 0x19191908, 0x08080819, + 0x192b0808, 0x08080819, 0x192b2b08, 0x08080819, 0x2b080819, 0x08080819, 0x2b081908, 0x08080819, + 0x2b190808, 0x08080819, 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, 0x08081919, 0x0808082b, + 0x08082b08, 0x0808082b, 0x08190819, 0x0808082b, 0x08191908, 0x0808082b, 0x082b0808, 0x0808082b, + 0x19080819, 0x0808082b, 0x19081908, 0x0808082b, 0x19190808, 0x0808082b, 0x19191919, 0x0808082b, + 0x2b080808, 0x0808082b, 0x2b082b2b, 0x0808082b, 0x08080819, 0x08081908, 0x08081908, 0x08081908, + 0x0808192b, 0x08081908, 0x08082b19, 0x08081908, 0x08190808, 0x08081908, 0x0819082b, 0x08081908, + 0x08191919, 0x08081908, 0x08192b08, 0x08081908, 0x082b0819, 0x08081908, 0x082b1908, 0x08081908, + 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19081919, 0x08081908, 0x19082b08, 0x08081908, + 0x19190819, 0x08081908, 0x19191908, 0x08081908, 0x1919192b, 0x08081908, 0x192b0808, 0x08081908, + 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b190808, 0x08081908, 0x08080808, 0x08081919, + 0x0808082b, 0x08081919, 0x08081919, 0x08081919, 0x08082b08, 0x08081919, 0x08190819, 0x08081919, + 0x08191908, 0x08081919, 0x082b0808, 0x08081919, 0x19080819, 0x08081919, 0x19081908, 0x08081919, + 0x19190808, 0x08081919, 0x192b0819, 0x08081919, 0x2b080808, 0x08081919, 0x08080819, 0x0808192b, + 0x08081908, 0x0808192b, 0x08190808, 0x0808192b, 0x082b192b, 0x0808192b, 0x19080808, 0x0808192b, + 0x1908082b, 0x0808192b, 0x2b081908, 0x0808192b, 0x08080808, 0x08082b08, 0x0808082b, 0x08082b08, + 0x08081919, 0x08082b08, 0x08082b08, 0x08082b08, 0x08082b2b, 0x08082b08, 0x08190819, 0x08082b08, + 0x08191908, 0x08082b08, 0x082b0808, 0x08082b08, 0x082b1919, 0x08082b08, 0x19080819, 0x08082b08, + 0x19081908, 0x08082b08, 0x19190808, 0x08082b08, 0x19192b08, 0x08082b08, 0x2b080808, 0x08082b08, + 0x2b2b0808, 0x08082b08, 0x2b2b2b2b, 0x08082b08, 0x08080819, 0x08082b19, 0x08081908, 0x08082b19, + 0x08190808, 0x08082b19, 0x19080808, 0x08082b19, 0x2b080819, 0x08082b19, 0x2b082b19, 0x08082b19, + 0x08080808, 0x08082b2b, 0x082b0808, 0x08082b2b, 0x082b2b08, 0x08082b2b, 0x2b19192b, 0x08082b2b, + 0x2b2b0808, 0x08082b2b, 0x08080819, 0x08190808, 0x08081908, 0x08190808, 0x0808192b, 0x08190808, + 0x08082b19, 0x08190808, 0x08190808, 0x08190808, 0x0819082b, 0x08190808, 0x08191919, 0x08190808, + 0x08192b08, 0x08190808, 0x082b0819, 0x08190808, 0x082b1908, 0x08190808, 0x19080808, 0x08190808, + 0x1908082b, 0x08190808, 0x19081919, 0x08190808, 0x19082b08, 0x08190808, 0x19190819, 0x08190808, + 0x19191908, 0x08190808, 0x192b0808, 0x08190808, 0x192b2b2b, 0x08190808, 0x2b080819, 0x08190808, + 0x2b081908, 0x08190808, 0x2b190808, 0x08190808, 0x08080808, 0x08190819, 0x0808082b, 0x08190819, + 0x08081919, 0x08190819, 0x08082b08, 0x08190819, 0x08190819, 0x08190819, 0x08191908, 0x08190819, + 0x082b0808, 0x08190819, 0x19080819, 0x08190819, 0x19081908, 0x08190819, 0x19190808, 0x08190819, + 0x2b080808, 0x08190819, 0x2b191908, 0x08190819, 0x2b19192b, 0x08190819, 0x08080819, 0x0819082b, + 0x08081908, 0x0819082b, 0x0808192b, 0x0819082b, 0x08190808, 0x0819082b, 0x19080808, 0x0819082b, + 0x192b0808, 0x0819082b, 0x08080808, 0x08191908, 0x0808082b, 0x08191908, 0x08081919, 0x08191908, + 0x08082b08, 0x08191908, 0x08190819, 0x08191908, 0x08191908, 0x08191908, 0x082b0808, 0x08191908, + 0x19080819, 0x08191908, 0x19081908, 0x08191908, 0x19082b19, 0x08191908, 0x19190808, 0x08191908, + 0x192b1908, 0x08191908, 0x2b080808, 0x08191908, 0x08080819, 0x08191919, 0x08081908, 0x08191919, + 0x08190808, 0x08191919, 0x19080808, 0x08191919, 0x08080808, 0x0819192b, 0x08191908, 0x0819192b, + 0x19082b19, 0x0819192b, 0x08080819, 0x08192b08, 0x08081908, 0x08192b08, 0x08190808, 0x08192b08, + 0x0819082b, 0x08192b08, 0x19080808, 0x08192b08, 0x19191908, 0x08192b08, 0x2b08192b, 0x08192b08, + 0x08080808, 0x08192b19, 0x08081919, 0x08192b19, 0x192b192b, 0x08192b19, 0x19190819, 0x08192b2b, + 0x2b2b2b19, 0x08192b2b, 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08081919, 0x082b0808, + 0x08082b08, 0x082b0808, 0x08082b2b, 0x082b0808, 0x08190819, 0x082b0808, 0x08191908, 0x082b0808, + 0x082b0808, 0x082b0808, 0x19080819, 0x082b0808, 0x19081908, 0x082b0808, 0x19190808, 0x082b0808, + 0x2b080808, 0x082b0808, 0x2b2b0808, 0x082b0808, 0x08080819, 0x082b0819, 0x08081908, 0x082b0819, + 0x08190808, 0x082b0819, 0x19080808, 0x082b0819, 0x19082b08, 0x082b0819, 0x192b1919, 0x082b0819, + 0x08080808, 0x082b082b, 0x082b082b, 0x082b082b, 0x2b080808, 0x082b082b, 0x2b2b2b08, 0x082b082b, + 0x08080819, 0x082b1908, 0x08081908, 0x082b1908, 0x08190808, 0x082b1908, 0x082b2b19, 0x082b1908, + 0x19080808, 0x082b1908, 0x08080808, 0x082b1919, 0x19080819, 0x082b1919, 0x1919082b, 0x082b1919, + 0x2b192b19, 0x082b1919, 0x08080819, 0x082b192b, 0x08192b2b, 0x082b192b, 0x2b2b192b, 0x082b192b, + 0x08080808, 0x082b2b08, 0x08082b08, 0x082b2b08, 0x08082b2b, 0x082b2b08, 0x082b0808, 0x082b2b08, + 0x19191919, 0x082b2b08, 0x2b082b08, 0x082b2b08, 0x2b2b082b, 0x082b2b08, 0x192b2b08, 0x082b2b19, + 0x2b190808, 0x082b2b19, 0x08082b08, 0x082b2b2b, 0x082b0808, 0x082b2b2b, 0x2b08082b, 0x082b2b2b, + 0x2b082b08, 0x082b2b2b, 0x2b082b2b, 0x082b2b2b, 0x08080819, 0x19080808, 0x08081908, 0x19080808, + 0x0808192b, 0x19080808, 0x08082b19, 0x19080808, 0x08190808, 0x19080808, 0x0819082b, 0x19080808, + 0x08191919, 0x19080808, 0x08192b08, 0x19080808, 0x082b0819, 0x19080808, 0x082b1908, 0x19080808, + 0x19080808, 0x19080808, 0x1908082b, 0x19080808, 0x19081919, 0x19080808, 0x19082b08, 0x19080808, + 0x19082b2b, 0x19080808, 0x19190819, 0x19080808, 0x19191908, 0x19080808, 0x192b0808, 0x19080808, + 0x192b1919, 0x19080808, 0x2b080819, 0x19080808, 0x2b081908, 0x19080808, 0x2b190808, 0x19080808, + 0x08080808, 0x19080819, 0x0808082b, 0x19080819, 0x08081919, 0x19080819, 0x08082b08, 0x19080819, + 0x08190819, 0x19080819, 0x08191908, 0x19080819, 0x082b0808, 0x19080819, 0x19080819, 0x19080819, + 0x19081908, 0x19080819, 0x19190808, 0x19080819, 0x2b080808, 0x19080819, 0x2b081919, 0x19080819, + 0x2b2b082b, 0x19080819, 0x08080819, 0x1908082b, 0x08081908, 0x1908082b, 0x08190808, 0x1908082b, + 0x0819082b, 0x1908082b, 0x082b2b19, 0x1908082b, 0x19080808, 0x1908082b, 0x08080808, 0x19081908, + 0x0808082b, 0x19081908, 0x08081919, 0x19081908, 0x08082b08, 0x19081908, 0x08190819, 0x19081908, + 0x08191908, 0x19081908, 0x08192b19, 0x19081908, 0x082b0808, 0x19081908, 0x19080819, 0x19081908, + 0x19081908, 0x19081908, 0x19190808, 0x19081908, 0x2b080808, 0x19081908, 0x2b191908, 0x19081908, + 0x08080819, 0x19081919, 0x08081908, 0x19081919, 0x08190808, 0x19081919, 0x082b1908, 0x19081919, + 0x19080808, 0x19081919, 0x2b192b2b, 0x19081919, 0x08080808, 0x1908192b, 0x08082b2b, 0x1908192b, + 0x19081908, 0x1908192b, 0x19190808, 0x1908192b, 0x08080819, 0x19082b08, 0x08081908, 0x19082b08, + 0x08190808, 0x19082b08, 0x19080808, 0x19082b08, 0x19081919, 0x19082b08, 0x19191908, 0x19082b08, + 0x192b082b, 0x19082b08, 0x08080808, 0x19082b19, 0x08190819, 0x19082b19, 0x19081908, 0x19082b19, + 0x19190808, 0x19082b19, 0x192b2b19, 0x19082b19, 0x08081908, 0x19082b2b, 0x08080808, 0x19190808, + 0x0808082b, 0x19190808, 0x08081919, 0x19190808, 0x08082b08, 0x19190808, 0x08190819, 0x19190808, + 0x08191908, 0x19190808, 0x082b0808, 0x19190808, 0x082b2b08, 0x19190808, 0x19080819, 0x19190808, + 0x19081908, 0x19190808, 0x19190808, 0x19190808, 0x2b080808, 0x19190808, 0x08080819, 0x19190819, + 0x08081908, 0x19190819, 0x08190808, 0x19190819, 0x08191919, 0x19190819, 0x19080808, 0x19190819, + 0x1908082b, 0x19190819, 0x08080808, 0x1919082b, 0x19081908, 0x1919082b, 0x2b2b2b2b, 0x1919082b, + 0x08080819, 0x19191908, 0x08081908, 0x19191908, 0x08190808, 0x19191908, 0x082b0819, 0x19191908, + 0x19080808, 0x19191908, 0x192b0808, 0x19191908, 0x2b080819, 0x19191908, 0x2b2b0819, 0x19191908, + 0x08080808, 0x19191919, 0x08082b08, 0x19191919, 0x2b080808, 0x19191919, 0x2b082b08, 0x19191919, + 0x082b0819, 0x1919192b, 0x192b2b08, 0x1919192b, 0x2b2b0819, 0x1919192b, 0x08080808, 0x19192b08, + 0x08191908, 0x19192b08, 0x19080819, 0x19192b08, 0x19190808, 0x19192b08, 0x2b192b19, 0x19192b08, + 0x08192b2b, 0x19192b19, 0x19080808, 0x19192b19, 0x1908082b, 0x19192b19, 0x2b081919, 0x19192b2b, + 0x08080819, 0x192b0808, 0x08081908, 0x192b0808, 0x08190808, 0x192b0808, 0x19080808, 0x192b0808, + 0x19191908, 0x192b0808, 0x192b082b, 0x192b0808, 0x2b08192b, 0x192b0808, 0x2b2b2b19, 0x192b0808, + 0x08080808, 0x192b0819, 0x082b1908, 0x192b082b, 0x19082b2b, 0x192b082b, 0x2b19082b, 0x192b082b, + 0x08080808, 0x192b1908, 0x0819192b, 0x192b1908, 0x08190808, 0x192b1919, 0x19080808, 0x192b1919, + 0x19081919, 0x192b1919, 0x2b2b1908, 0x192b1919, 0x08080819, 0x192b2b08, 0x192b2b2b, 0x192b2b08, + 0x082b1919, 0x192b2b19, 0x0808192b, 0x192b2b2b, 0x19191908, 0x192b2b2b, 0x192b082b, 0x192b2b2b, + 0x08080808, 0x2b080808, 0x0808082b, 0x2b080808, 0x08081919, 0x2b080808, 0x08082b08, 0x2b080808, + 0x08190819, 0x2b080808, 0x08191908, 0x2b080808, 0x082b0808, 0x2b080808, 0x082b2b2b, 0x2b080808, + 0x19080819, 0x2b080808, 0x19081908, 0x2b080808, 0x19190808, 0x2b080808, 0x2b080808, 0x2b080808, + 0x2b08082b, 0x2b080808, 0x2b2b2b08, 0x2b080808, 0x2b2b2b2b, 0x2b080808, 0x08080819, 0x2b080819, + 0x08081908, 0x2b080819, 0x0808192b, 0x2b080819, 0x08190808, 0x2b080819, 0x19080808, 0x2b080819, + 0x19190819, 0x2b080819, 0x19192b19, 0x2b080819, 0x08080808, 0x2b08082b, 0x082b0808, 0x2b08082b, + 0x2b080808, 0x2b08082b, 0x2b08082b, 0x2b08082b, 0x2b2b0808, 0x2b08082b, 0x2b2b2b08, 0x2b08082b, + 0x08080819, 0x2b081908, 0x08081908, 0x2b081908, 0x08190808, 0x2b081908, 0x0819082b, 0x2b081908, + 0x08191919, 0x2b081908, 0x19080808, 0x2b081908, 0x192b0808, 0x2b081908, 0x2b082b19, 0x2b081908, + 0x08080808, 0x2b081919, 0x19081908, 0x2b081919, 0x2b2b1919, 0x2b081919, 0x08192b08, 0x2b08192b, + 0x192b2b2b, 0x2b08192b, 0x08080808, 0x2b082b08, 0x08082b08, 0x2b082b08, 0x082b1919, 0x2b082b08, + 0x19192b2b, 0x2b082b08, 0x2b080808, 0x2b082b08, 0x2b08082b, 0x2b082b08, 0x2b2b2b08, 0x2b082b08, + 0x0808192b, 0x2b082b19, 0x082b082b, 0x2b082b2b, 0x2b080808, 0x2b082b2b, 0x2b082b08, 0x2b082b2b, + 0x2b19192b, 0x2b082b2b, 0x2b2b2b08, 0x2b082b2b, 0x08080819, 0x2b190808, 0x08081908, 0x2b190808, + 0x08190808, 0x2b190808, 0x19080808, 0x2b190808, 0x1919192b, 0x2b190808, 0x2b081908, 0x2b190808, + 0x08080808, 0x2b190819, 0x082b082b, 0x2b190819, 0x192b1908, 0x2b190819, 0x1919192b, 0x2b19082b, + 0x2b082b19, 0x2b19082b, 0x08080808, 0x2b191908, 0x08081919, 0x2b191908, 0x19081908, 0x2b191908, + 0x19190808, 0x2b191908, 0x19192b08, 0x2b191908, 0x082b2b19, 0x2b191919, 0x2b190808, 0x2b191919, + 0x2b19082b, 0x2b191919, 0x19080819, 0x2b19192b, 0x19190819, 0x2b192b08, 0x2b2b192b, 0x2b192b08, + 0x19082b19, 0x2b192b19, 0x08191919, 0x2b192b2b, 0x192b0808, 0x2b192b2b, 0x08080808, 0x2b2b0808, + 0x0808082b, 0x2b2b0808, 0x08082b08, 0x2b2b0808, 0x08082b2b, 0x2b2b0808, 0x082b0808, 0x2b2b0808, + 0x082b2b2b, 0x2b2b0808, 0x2b2b0808, 0x2b2b0808, 0x19190819, 0x2b2b0819, 0x19192b19, 0x2b2b0819, + 0x2b2b192b, 0x2b2b0819, 0x08080808, 0x2b2b082b, 0x0808082b, 0x2b2b082b, 0x08082b08, 0x2b2b082b, + 0x082b2b2b, 0x2b2b082b, 0x2b080808, 0x2b2b082b, 0x2b2b0808, 0x2b2b082b, 0x19080808, 0x2b2b1908, + 0x2b191919, 0x2b2b1908, 0x192b1919, 0x2b2b192b, 0x2b192b08, 0x2b2b192b, 0x08082b2b, 0x2b2b2b08, + 0x082b0808, 0x2b2b2b08, 0x082b082b, 0x2b2b2b08, 0x082b2b08, 0x2b2b2b08, 0x2b2b0808, 0x2b2b2b08, + 0x2b2b2b08, 0x2b2b2b08, 0x08081908, 0x2b2b2b19, 0x2b081908, 0x2b2b2b19, 0x2b08192b, 0x2b2b2b19, + 0x082b2b08, 0x2b2b2b2b, 0x082b2b2b, 0x2b2b2b2b, 0x2b190819, 0x2b2b2b2b, 0x2b2b2b2b, 0x2b2b2b2b +); + +struct iq2_xs { + d: f16, + qs: array, + scales: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var scale_vals = array( + bitcast(vec2(block.scales[0], block.scales[1])), + bitcast(vec2(block.scales[2], block.scales[3])) + ); + var sum = 0.0; + for (var ib: u32 = 0; ib < 32; ib += 4) { + let s = get_byte(scale_vals[ib / 16], (ib % 16) / 4); + let db = array( + d * (0.5 + f32(s & 0xF)) * 0.25, + d * (0.5 + f32(s >> 4)) * 0.25 + ); + for (var l: u32 = 0; l < 4; l++) { + let qs_val = bitcast(vec2(block.qs[ib + l], 0.0)); + let ig = (qs_val & 511) * 8; + let is = qs_val >> 9; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + let dl = db[l/2]; + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2xs_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + sum += dl * f32(g) * m * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(IQ2_XS) + +#decl(IQ2_S) + +const iq2s_grid = array( + 0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808, + 0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x0819192b, 0x08080808, + 0x08192b19, 0x08080808, 0x082b0808, 0x08080808, 0x082b082b, 0x08080808, 0x082b1919, 0x08080808, + 0x082b2b08, 0x08080808, 0x19080819, 0x08080808, 0x19081908, 0x08080808, 0x1908192b, 0x08080808, + 0x19082b19, 0x08080808, 0x19190808, 0x08080808, 0x1919082b, 0x08080808, 0x19191919, 0x08080808, + 0x19192b08, 0x08080808, 0x192b0819, 0x08080808, 0x192b1908, 0x08080808, 0x192b192b, 0x08080808, + 0x192b2b19, 0x08080808, 0x2b080808, 0x08080808, 0x2b08082b, 0x08080808, 0x2b081919, 0x08080808, + 0x2b082b08, 0x08080808, 0x2b190819, 0x08080808, 0x2b191908, 0x08080808, 0x2b2b0808, 0x08080808, + 0x2b2b1919, 0x08080808, 0x2b2b2b2b, 0x08080808, 0x08080819, 0x08080819, 0x08081908, 0x08080819, + 0x0808192b, 0x08080819, 0x08082b19, 0x08080819, 0x08190808, 0x08080819, 0x0819082b, 0x08080819, + 0x08191919, 0x08080819, 0x08192b08, 0x08080819, 0x082b0819, 0x08080819, 0x082b1908, 0x08080819, + 0x19080808, 0x08080819, 0x1908082b, 0x08080819, 0x19081919, 0x08080819, 0x19082b08, 0x08080819, + 0x19190819, 0x08080819, 0x19191908, 0x08080819, 0x1919192b, 0x08080819, 0x19192b19, 0x08080819, + 0x192b0808, 0x08080819, 0x192b1919, 0x08080819, 0x192b2b08, 0x08080819, 0x2b080819, 0x08080819, + 0x2b081908, 0x08080819, 0x2b190808, 0x08080819, 0x2b19082b, 0x08080819, 0x2b191919, 0x08080819, + 0x2b2b0819, 0x08080819, 0x2b2b1908, 0x08080819, 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, + 0x08081919, 0x0808082b, 0x08082b08, 0x0808082b, 0x08190819, 0x0808082b, 0x08191908, 0x0808082b, + 0x082b0808, 0x0808082b, 0x082b2b2b, 0x0808082b, 0x19080819, 0x0808082b, 0x19081908, 0x0808082b, + 0x1908192b, 0x0808082b, 0x19082b19, 0x0808082b, 0x19190808, 0x0808082b, 0x19191919, 0x0808082b, + 0x2b080808, 0x0808082b, 0x2b081919, 0x0808082b, 0x2b082b2b, 0x0808082b, 0x2b191908, 0x0808082b, + 0x2b2b082b, 0x0808082b, 0x08080819, 0x08081908, 0x08081908, 0x08081908, 0x0808192b, 0x08081908, + 0x08082b19, 0x08081908, 0x08190808, 0x08081908, 0x0819082b, 0x08081908, 0x08191919, 0x08081908, + 0x08192b08, 0x08081908, 0x082b0819, 0x08081908, 0x082b1908, 0x08081908, 0x082b192b, 0x08081908, + 0x082b2b19, 0x08081908, 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19081919, 0x08081908, + 0x19082b08, 0x08081908, 0x19082b2b, 0x08081908, 0x19190819, 0x08081908, 0x19191908, 0x08081908, + 0x1919192b, 0x08081908, 0x19192b19, 0x08081908, 0x192b0808, 0x08081908, 0x192b082b, 0x08081908, + 0x192b1919, 0x08081908, 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b08192b, 0x08081908, + 0x2b082b19, 0x08081908, 0x2b190808, 0x08081908, 0x2b191919, 0x08081908, 0x2b192b08, 0x08081908, + 0x2b2b0819, 0x08081908, 0x2b2b1908, 0x08081908, 0x08080808, 0x08081919, 0x0808082b, 0x08081919, + 0x08081919, 0x08081919, 0x08082b08, 0x08081919, 0x08082b2b, 0x08081919, 0x08190819, 0x08081919, + 0x08191908, 0x08081919, 0x0819192b, 0x08081919, 0x08192b19, 0x08081919, 0x082b0808, 0x08081919, + 0x082b1919, 0x08081919, 0x082b2b08, 0x08081919, 0x19080819, 0x08081919, 0x19081908, 0x08081919, + 0x1908192b, 0x08081919, 0x19082b19, 0x08081919, 0x19190808, 0x08081919, 0x1919082b, 0x08081919, + 0x19191919, 0x08081919, 0x19192b08, 0x08081919, 0x192b0819, 0x08081919, 0x192b1908, 0x08081919, + 0x2b080808, 0x08081919, 0x2b08082b, 0x08081919, 0x2b081919, 0x08081919, 0x2b082b08, 0x08081919, + 0x2b190819, 0x08081919, 0x2b191908, 0x08081919, 0x2b2b0808, 0x08081919, 0x08080819, 0x0808192b, + 0x08081908, 0x0808192b, 0x0808192b, 0x0808192b, 0x08082b19, 0x0808192b, 0x08190808, 0x0808192b, + 0x08191919, 0x0808192b, 0x19080808, 0x0808192b, 0x19081919, 0x0808192b, 0x19082b08, 0x0808192b, + 0x19190819, 0x0808192b, 0x19191908, 0x0808192b, 0x192b0808, 0x0808192b, 0x2b080819, 0x0808192b, + 0x2b081908, 0x0808192b, 0x2b190808, 0x0808192b, 0x08080808, 0x08082b08, 0x0808082b, 0x08082b08, + 0x08081919, 0x08082b08, 0x08082b08, 0x08082b08, 0x08190819, 0x08082b08, 0x08191908, 0x08082b08, + 0x0819192b, 0x08082b08, 0x08192b19, 0x08082b08, 0x082b0808, 0x08082b08, 0x082b1919, 0x08082b08, + 0x082b2b2b, 0x08082b08, 0x19080819, 0x08082b08, 0x19081908, 0x08082b08, 0x1908192b, 0x08082b08, + 0x19082b19, 0x08082b08, 0x19190808, 0x08082b08, 0x1919082b, 0x08082b08, 0x19191919, 0x08082b08, + 0x19192b08, 0x08082b08, 0x192b0819, 0x08082b08, 0x192b1908, 0x08082b08, 0x2b080808, 0x08082b08, + 0x2b081919, 0x08082b08, 0x2b191908, 0x08082b08, 0x2b2b2b2b, 0x08082b08, 0x08080819, 0x08082b19, + 0x08081908, 0x08082b19, 0x08190808, 0x08082b19, 0x0819082b, 0x08082b19, 0x08191919, 0x08082b19, + 0x08192b08, 0x08082b19, 0x082b0819, 0x08082b19, 0x19080808, 0x08082b19, 0x19081919, 0x08082b19, + 0x19082b08, 0x08082b19, 0x19190819, 0x08082b19, 0x19191908, 0x08082b19, 0x192b0808, 0x08082b19, + 0x2b080819, 0x08082b19, 0x2b190808, 0x08082b19, 0x08080808, 0x08082b2b, 0x08190819, 0x08082b2b, + 0x08191908, 0x08082b2b, 0x082b082b, 0x08082b2b, 0x082b2b08, 0x08082b2b, 0x082b2b2b, 0x08082b2b, + 0x19190808, 0x08082b2b, 0x2b192b19, 0x08082b2b, 0x08080819, 0x08190808, 0x08081908, 0x08190808, + 0x0808192b, 0x08190808, 0x08082b19, 0x08190808, 0x08190808, 0x08190808, 0x0819082b, 0x08190808, + 0x08191919, 0x08190808, 0x08192b08, 0x08190808, 0x082b0819, 0x08190808, 0x082b1908, 0x08190808, + 0x082b192b, 0x08190808, 0x19080808, 0x08190808, 0x1908082b, 0x08190808, 0x19081919, 0x08190808, + 0x19082b08, 0x08190808, 0x19190819, 0x08190808, 0x19191908, 0x08190808, 0x1919192b, 0x08190808, + 0x19192b19, 0x08190808, 0x192b0808, 0x08190808, 0x192b082b, 0x08190808, 0x192b1919, 0x08190808, + 0x192b2b08, 0x08190808, 0x2b080819, 0x08190808, 0x2b081908, 0x08190808, 0x2b08192b, 0x08190808, + 0x2b190808, 0x08190808, 0x2b191919, 0x08190808, 0x2b192b08, 0x08190808, 0x2b2b0819, 0x08190808, + 0x2b2b1908, 0x08190808, 0x08080808, 0x08190819, 0x0808082b, 0x08190819, 0x08081919, 0x08190819, + 0x08082b08, 0x08190819, 0x08082b2b, 0x08190819, 0x08190819, 0x08190819, 0x08191908, 0x08190819, + 0x0819192b, 0x08190819, 0x08192b19, 0x08190819, 0x082b0808, 0x08190819, 0x082b082b, 0x08190819, + 0x082b1919, 0x08190819, 0x082b2b08, 0x08190819, 0x19080819, 0x08190819, 0x19081908, 0x08190819, + 0x1908192b, 0x08190819, 0x19082b19, 0x08190819, 0x19190808, 0x08190819, 0x1919082b, 0x08190819, + 0x19191919, 0x08190819, 0x19192b08, 0x08190819, 0x192b0819, 0x08190819, 0x192b1908, 0x08190819, + 0x2b080808, 0x08190819, 0x2b08082b, 0x08190819, 0x2b081919, 0x08190819, 0x2b082b08, 0x08190819, + 0x2b190819, 0x08190819, 0x2b191908, 0x08190819, 0x08080819, 0x0819082b, 0x08081908, 0x0819082b, + 0x08082b19, 0x0819082b, 0x08190808, 0x0819082b, 0x08191919, 0x0819082b, 0x082b0819, 0x0819082b, + 0x082b1908, 0x0819082b, 0x19080808, 0x0819082b, 0x19081919, 0x0819082b, 0x19190819, 0x0819082b, + 0x19191908, 0x0819082b, 0x2b080819, 0x0819082b, 0x2b081908, 0x0819082b, 0x2b190808, 0x0819082b, + 0x08080808, 0x08191908, 0x0808082b, 0x08191908, 0x08081919, 0x08191908, 0x08082b08, 0x08191908, + 0x08190819, 0x08191908, 0x08191908, 0x08191908, 0x0819192b, 0x08191908, 0x08192b19, 0x08191908, + 0x082b0808, 0x08191908, 0x082b1919, 0x08191908, 0x082b2b08, 0x08191908, 0x19080819, 0x08191908, + 0x19081908, 0x08191908, 0x1908192b, 0x08191908, 0x19082b19, 0x08191908, 0x19190808, 0x08191908, + 0x1919082b, 0x08191908, 0x19191919, 0x08191908, 0x19192b08, 0x08191908, 0x192b0819, 0x08191908, + 0x192b1908, 0x08191908, 0x2b080808, 0x08191908, 0x2b08082b, 0x08191908, 0x2b081919, 0x08191908, + 0x2b082b08, 0x08191908, 0x2b190819, 0x08191908, 0x2b191908, 0x08191908, 0x2b2b0808, 0x08191908, + 0x08080819, 0x08191919, 0x08081908, 0x08191919, 0x0808192b, 0x08191919, 0x08082b19, 0x08191919, + 0x08190808, 0x08191919, 0x0819082b, 0x08191919, 0x08191919, 0x08191919, 0x08192b08, 0x08191919, + 0x082b0819, 0x08191919, 0x082b1908, 0x08191919, 0x19080808, 0x08191919, 0x1908082b, 0x08191919, + 0x19081919, 0x08191919, 0x19082b08, 0x08191919, 0x19190819, 0x08191919, 0x19191908, 0x08191919, + 0x192b0808, 0x08191919, 0x2b080819, 0x08191919, 0x2b081908, 0x08191919, 0x2b190808, 0x08191919, + 0x08080808, 0x0819192b, 0x08081919, 0x0819192b, 0x08082b08, 0x0819192b, 0x08190819, 0x0819192b, + 0x08191908, 0x0819192b, 0x082b0808, 0x0819192b, 0x19080819, 0x0819192b, 0x19081908, 0x0819192b, + 0x19190808, 0x0819192b, 0x2b080808, 0x0819192b, 0x2b2b2b2b, 0x0819192b, 0x08080819, 0x08192b08, + 0x08081908, 0x08192b08, 0x0808192b, 0x08192b08, 0x08082b19, 0x08192b08, 0x08190808, 0x08192b08, + 0x08191919, 0x08192b08, 0x08192b08, 0x08192b08, 0x082b0819, 0x08192b08, 0x19080808, 0x08192b08, + 0x1908082b, 0x08192b08, 0x19081919, 0x08192b08, 0x19082b08, 0x08192b08, 0x19190819, 0x08192b08, + 0x19191908, 0x08192b08, 0x192b0808, 0x08192b08, 0x2b080819, 0x08192b08, 0x2b081908, 0x08192b08, + 0x08080808, 0x08192b19, 0x0808082b, 0x08192b19, 0x08081919, 0x08192b19, 0x08082b08, 0x08192b19, + 0x08190819, 0x08192b19, 0x08191908, 0x08192b19, 0x082b0808, 0x08192b19, 0x19080819, 0x08192b19, + 0x19081908, 0x08192b19, 0x19190808, 0x08192b19, 0x192b2b19, 0x08192b19, 0x2b2b082b, 0x08192b19, + 0x08081908, 0x08192b2b, 0x08190808, 0x08192b2b, 0x19080808, 0x08192b2b, 0x1919192b, 0x08192b2b, + 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08081919, 0x082b0808, 0x08082b08, 0x082b0808, + 0x08190819, 0x082b0808, 0x08191908, 0x082b0808, 0x0819192b, 0x082b0808, 0x08192b19, 0x082b0808, + 0x082b0808, 0x082b0808, 0x082b1919, 0x082b0808, 0x082b2b2b, 0x082b0808, 0x19080819, 0x082b0808, + 0x19081908, 0x082b0808, 0x19190808, 0x082b0808, 0x1919082b, 0x082b0808, 0x19191919, 0x082b0808, + 0x192b1908, 0x082b0808, 0x2b080808, 0x082b0808, 0x2b082b2b, 0x082b0808, 0x2b191908, 0x082b0808, + 0x2b2b2b2b, 0x082b0808, 0x08080819, 0x082b0819, 0x08081908, 0x082b0819, 0x08190808, 0x082b0819, + 0x0819082b, 0x082b0819, 0x08191919, 0x082b0819, 0x082b0819, 0x082b0819, 0x19080808, 0x082b0819, + 0x1908082b, 0x082b0819, 0x19081919, 0x082b0819, 0x19190819, 0x082b0819, 0x19191908, 0x082b0819, + 0x192b0808, 0x082b0819, 0x2b080819, 0x082b0819, 0x2b081908, 0x082b0819, 0x2b190808, 0x082b0819, + 0x08080808, 0x082b082b, 0x08082b2b, 0x082b082b, 0x082b082b, 0x082b082b, 0x082b2b08, 0x082b082b, + 0x082b2b2b, 0x082b082b, 0x19081908, 0x082b082b, 0x19190808, 0x082b082b, 0x2b082b08, 0x082b082b, + 0x2b082b2b, 0x082b082b, 0x2b2b2b08, 0x082b082b, 0x08080819, 0x082b1908, 0x08081908, 0x082b1908, + 0x0808192b, 0x082b1908, 0x08082b19, 0x082b1908, 0x08190808, 0x082b1908, 0x08191919, 0x082b1908, + 0x08192b08, 0x082b1908, 0x082b0819, 0x082b1908, 0x082b1908, 0x082b1908, 0x19080808, 0x082b1908, + 0x1908082b, 0x082b1908, 0x19081919, 0x082b1908, 0x19082b08, 0x082b1908, 0x19190819, 0x082b1908, + 0x19191908, 0x082b1908, 0x192b0808, 0x082b1908, 0x2b080819, 0x082b1908, 0x2b081908, 0x082b1908, + 0x2b190808, 0x082b1908, 0x08080808, 0x082b1919, 0x08081919, 0x082b1919, 0x08082b08, 0x082b1919, + 0x08190819, 0x082b1919, 0x08191908, 0x082b1919, 0x082b0808, 0x082b1919, 0x19080819, 0x082b1919, + 0x19081908, 0x082b1919, 0x19190808, 0x082b1919, 0x192b192b, 0x082b1919, 0x2b080808, 0x082b1919, + 0x08080819, 0x082b192b, 0x08081908, 0x082b192b, 0x08190808, 0x082b192b, 0x19080808, 0x082b192b, + 0x19192b19, 0x082b192b, 0x08080808, 0x082b2b08, 0x08081919, 0x082b2b08, 0x08190819, 0x082b2b08, + 0x08191908, 0x082b2b08, 0x19080819, 0x082b2b08, 0x19081908, 0x082b2b08, 0x19190808, 0x082b2b08, + 0x2b082b2b, 0x082b2b08, 0x2b2b2b2b, 0x082b2b08, 0x08080819, 0x082b2b19, 0x08081908, 0x082b2b19, + 0x08190808, 0x082b2b19, 0x2b191919, 0x082b2b19, 0x08082b2b, 0x082b2b2b, 0x082b082b, 0x082b2b2b, + 0x192b1908, 0x082b2b2b, 0x2b082b08, 0x082b2b2b, 0x2b082b2b, 0x082b2b2b, 0x08080819, 0x19080808, + 0x08081908, 0x19080808, 0x0808192b, 0x19080808, 0x08082b19, 0x19080808, 0x08190808, 0x19080808, + 0x0819082b, 0x19080808, 0x08191919, 0x19080808, 0x08192b08, 0x19080808, 0x08192b2b, 0x19080808, + 0x082b0819, 0x19080808, 0x082b1908, 0x19080808, 0x082b192b, 0x19080808, 0x19080808, 0x19080808, + 0x1908082b, 0x19080808, 0x19081919, 0x19080808, 0x19082b08, 0x19080808, 0x19082b2b, 0x19080808, + 0x19190819, 0x19080808, 0x19191908, 0x19080808, 0x1919192b, 0x19080808, 0x19192b19, 0x19080808, + 0x192b0808, 0x19080808, 0x192b082b, 0x19080808, 0x192b1919, 0x19080808, 0x2b080819, 0x19080808, + 0x2b081908, 0x19080808, 0x2b190808, 0x19080808, 0x2b191919, 0x19080808, 0x2b192b08, 0x19080808, + 0x2b2b0819, 0x19080808, 0x2b2b1908, 0x19080808, 0x08080808, 0x19080819, 0x0808082b, 0x19080819, + 0x08081919, 0x19080819, 0x08082b08, 0x19080819, 0x08190819, 0x19080819, 0x08191908, 0x19080819, + 0x0819192b, 0x19080819, 0x08192b19, 0x19080819, 0x082b0808, 0x19080819, 0x082b082b, 0x19080819, + 0x082b1919, 0x19080819, 0x19080819, 0x19080819, 0x19081908, 0x19080819, 0x1908192b, 0x19080819, + 0x19082b19, 0x19080819, 0x19190808, 0x19080819, 0x1919082b, 0x19080819, 0x19191919, 0x19080819, + 0x19192b08, 0x19080819, 0x192b0819, 0x19080819, 0x192b1908, 0x19080819, 0x2b080808, 0x19080819, + 0x2b08082b, 0x19080819, 0x2b081919, 0x19080819, 0x2b082b08, 0x19080819, 0x2b190819, 0x19080819, + 0x2b191908, 0x19080819, 0x2b2b0808, 0x19080819, 0x08080819, 0x1908082b, 0x08081908, 0x1908082b, + 0x08190808, 0x1908082b, 0x0819082b, 0x1908082b, 0x08191919, 0x1908082b, 0x08192b08, 0x1908082b, + 0x082b1908, 0x1908082b, 0x19080808, 0x1908082b, 0x19081919, 0x1908082b, 0x19082b08, 0x1908082b, + 0x19190819, 0x1908082b, 0x19191908, 0x1908082b, 0x192b0808, 0x1908082b, 0x2b080819, 0x1908082b, + 0x2b081908, 0x1908082b, 0x08080808, 0x19081908, 0x0808082b, 0x19081908, 0x08081919, 0x19081908, + 0x08082b08, 0x19081908, 0x08082b2b, 0x19081908, 0x08190819, 0x19081908, 0x08191908, 0x19081908, + 0x0819192b, 0x19081908, 0x08192b19, 0x19081908, 0x082b0808, 0x19081908, 0x082b082b, 0x19081908, + 0x082b1919, 0x19081908, 0x082b2b08, 0x19081908, 0x19080819, 0x19081908, 0x19081908, 0x19081908, + 0x1908192b, 0x19081908, 0x19082b19, 0x19081908, 0x19190808, 0x19081908, 0x1919082b, 0x19081908, + 0x19191919, 0x19081908, 0x19192b08, 0x19081908, 0x192b0819, 0x19081908, 0x192b1908, 0x19081908, + 0x2b080808, 0x19081908, 0x2b08082b, 0x19081908, 0x2b081919, 0x19081908, 0x2b082b08, 0x19081908, + 0x2b190819, 0x19081908, 0x2b191908, 0x19081908, 0x2b2b0808, 0x19081908, 0x08080819, 0x19081919, + 0x08081908, 0x19081919, 0x0808192b, 0x19081919, 0x08082b19, 0x19081919, 0x08190808, 0x19081919, + 0x0819082b, 0x19081919, 0x08191919, 0x19081919, 0x08192b08, 0x19081919, 0x082b0819, 0x19081919, + 0x082b1908, 0x19081919, 0x19080808, 0x19081919, 0x1908082b, 0x19081919, 0x19081919, 0x19081919, + 0x19082b08, 0x19081919, 0x19190819, 0x19081919, 0x19191908, 0x19081919, 0x192b0808, 0x19081919, + 0x192b2b2b, 0x19081919, 0x2b080819, 0x19081919, 0x2b081908, 0x19081919, 0x2b190808, 0x19081919, + 0x08080808, 0x1908192b, 0x0808082b, 0x1908192b, 0x08081919, 0x1908192b, 0x08082b08, 0x1908192b, + 0x08190819, 0x1908192b, 0x08191908, 0x1908192b, 0x082b0808, 0x1908192b, 0x19080819, 0x1908192b, + 0x19081908, 0x1908192b, 0x19190808, 0x1908192b, 0x2b080808, 0x1908192b, 0x2b2b1919, 0x1908192b, + 0x08080819, 0x19082b08, 0x08081908, 0x19082b08, 0x08082b19, 0x19082b08, 0x08190808, 0x19082b08, + 0x0819082b, 0x19082b08, 0x08191919, 0x19082b08, 0x08192b08, 0x19082b08, 0x082b0819, 0x19082b08, + 0x082b1908, 0x19082b08, 0x19080808, 0x19082b08, 0x1908082b, 0x19082b08, 0x19081919, 0x19082b08, + 0x19082b08, 0x19082b08, 0x19190819, 0x19082b08, 0x19191908, 0x19082b08, 0x192b0808, 0x19082b08, + 0x2b081908, 0x19082b08, 0x2b190808, 0x19082b08, 0x08080808, 0x19082b19, 0x0808082b, 0x19082b19, + 0x08081919, 0x19082b19, 0x08082b08, 0x19082b19, 0x08190819, 0x19082b19, 0x08191908, 0x19082b19, + 0x082b0808, 0x19082b19, 0x19080819, 0x19082b19, 0x19081908, 0x19082b19, 0x19190808, 0x19082b19, + 0x2b080808, 0x19082b19, 0x2b19192b, 0x19082b19, 0x08080819, 0x19082b2b, 0x08081908, 0x19082b2b, + 0x08190808, 0x19082b2b, 0x19080808, 0x19082b2b, 0x08080808, 0x19190808, 0x0808082b, 0x19190808, + 0x08081919, 0x19190808, 0x08082b08, 0x19190808, 0x08190819, 0x19190808, 0x08191908, 0x19190808, + 0x0819192b, 0x19190808, 0x08192b19, 0x19190808, 0x082b0808, 0x19190808, 0x082b082b, 0x19190808, + 0x082b1919, 0x19190808, 0x082b2b08, 0x19190808, 0x19080819, 0x19190808, 0x19081908, 0x19190808, + 0x1908192b, 0x19190808, 0x19082b19, 0x19190808, 0x19190808, 0x19190808, 0x1919082b, 0x19190808, + 0x19191919, 0x19190808, 0x19192b08, 0x19190808, 0x192b0819, 0x19190808, 0x192b1908, 0x19190808, + 0x2b080808, 0x19190808, 0x2b08082b, 0x19190808, 0x2b081919, 0x19190808, 0x2b082b08, 0x19190808, + 0x2b190819, 0x19190808, 0x2b191908, 0x19190808, 0x08080819, 0x19190819, 0x08081908, 0x19190819, + 0x0808192b, 0x19190819, 0x08082b19, 0x19190819, 0x08190808, 0x19190819, 0x0819082b, 0x19190819, + 0x08191919, 0x19190819, 0x08192b08, 0x19190819, 0x082b0819, 0x19190819, 0x082b1908, 0x19190819, + 0x19080808, 0x19190819, 0x1908082b, 0x19190819, 0x19081919, 0x19190819, 0x19082b08, 0x19190819, + 0x19190819, 0x19190819, 0x19191908, 0x19190819, 0x192b0808, 0x19190819, 0x2b080819, 0x19190819, + 0x2b081908, 0x19190819, 0x2b190808, 0x19190819, 0x08080808, 0x1919082b, 0x08081919, 0x1919082b, + 0x08082b08, 0x1919082b, 0x08190819, 0x1919082b, 0x08191908, 0x1919082b, 0x082b0808, 0x1919082b, + 0x19080819, 0x1919082b, 0x19081908, 0x1919082b, 0x19190808, 0x1919082b, 0x192b2b19, 0x1919082b, + 0x2b080808, 0x1919082b, 0x08080819, 0x19191908, 0x08081908, 0x19191908, 0x0808192b, 0x19191908, + 0x08082b19, 0x19191908, 0x08190808, 0x19191908, 0x0819082b, 0x19191908, 0x08191919, 0x19191908, + 0x08192b08, 0x19191908, 0x082b0819, 0x19191908, 0x082b1908, 0x19191908, 0x19080808, 0x19191908, + 0x1908082b, 0x19191908, 0x19081919, 0x19191908, 0x19082b08, 0x19191908, 0x19190819, 0x19191908, + 0x19191908, 0x19191908, 0x192b0808, 0x19191908, 0x2b080819, 0x19191908, 0x2b081908, 0x19191908, + 0x2b190808, 0x19191908, 0x08080808, 0x19191919, 0x0808082b, 0x19191919, 0x08081919, 0x19191919, + 0x08082b08, 0x19191919, 0x08190819, 0x19191919, 0x08191908, 0x19191919, 0x082b0808, 0x19191919, + 0x19080819, 0x19191919, 0x19081908, 0x19191919, 0x19190808, 0x19191919, 0x2b080808, 0x19191919, + 0x08080819, 0x1919192b, 0x08081908, 0x1919192b, 0x08190808, 0x1919192b, 0x082b192b, 0x1919192b, + 0x19080808, 0x1919192b, 0x08080808, 0x19192b08, 0x0808082b, 0x19192b08, 0x08081919, 0x19192b08, + 0x08082b08, 0x19192b08, 0x08190819, 0x19192b08, 0x08191908, 0x19192b08, 0x082b0808, 0x19192b08, + 0x19080819, 0x19192b08, 0x19081908, 0x19192b08, 0x19190808, 0x19192b08, 0x19192b2b, 0x19192b08, + 0x2b080808, 0x19192b08, 0x08080819, 0x19192b19, 0x08081908, 0x19192b19, 0x08190808, 0x19192b19, + 0x19080808, 0x19192b19, 0x08080808, 0x19192b2b, 0x08192b19, 0x19192b2b, 0x2b081919, 0x19192b2b, + 0x2b2b2b08, 0x19192b2b, 0x08080819, 0x192b0808, 0x08081908, 0x192b0808, 0x0808192b, 0x192b0808, + 0x08190808, 0x192b0808, 0x0819082b, 0x192b0808, 0x08191919, 0x192b0808, 0x08192b08, 0x192b0808, + 0x082b0819, 0x192b0808, 0x082b1908, 0x192b0808, 0x19080808, 0x192b0808, 0x19081919, 0x192b0808, + 0x19082b08, 0x192b0808, 0x19190819, 0x192b0808, 0x19191908, 0x192b0808, 0x192b0808, 0x192b0808, + 0x2b081908, 0x192b0808, 0x2b190808, 0x192b0808, 0x08080808, 0x192b0819, 0x0808082b, 0x192b0819, + 0x08081919, 0x192b0819, 0x08082b08, 0x192b0819, 0x08190819, 0x192b0819, 0x08191908, 0x192b0819, + 0x082b0808, 0x192b0819, 0x19080819, 0x192b0819, 0x19081908, 0x192b0819, 0x19190808, 0x192b0819, + 0x2b080808, 0x192b0819, 0x2b192b19, 0x192b0819, 0x08081908, 0x192b082b, 0x08190808, 0x192b082b, + 0x19080808, 0x192b082b, 0x1919192b, 0x192b082b, 0x2b2b0819, 0x192b082b, 0x08080808, 0x192b1908, + 0x08081919, 0x192b1908, 0x08082b08, 0x192b1908, 0x08190819, 0x192b1908, 0x08191908, 0x192b1908, + 0x082b0808, 0x192b1908, 0x19080819, 0x192b1908, 0x19081908, 0x192b1908, 0x19190808, 0x192b1908, + 0x2b080808, 0x192b1908, 0x08080819, 0x192b1919, 0x08081908, 0x192b1919, 0x08190808, 0x192b1919, + 0x19080808, 0x192b1919, 0x19082b2b, 0x192b1919, 0x192b2b08, 0x192b1919, 0x2b19082b, 0x192b1919, + 0x08080808, 0x192b192b, 0x2b191908, 0x192b192b, 0x08080819, 0x192b2b08, 0x08081908, 0x192b2b08, + 0x08190808, 0x192b2b08, 0x192b1919, 0x192b2b08, 0x2b192b08, 0x192b2b08, 0x08080808, 0x192b2b19, + 0x082b2b2b, 0x192b2b19, 0x1908082b, 0x192b2b2b, 0x2b2b0819, 0x192b2b2b, 0x08080808, 0x2b080808, + 0x0808082b, 0x2b080808, 0x08081919, 0x2b080808, 0x08082b08, 0x2b080808, 0x08190819, 0x2b080808, + 0x08191908, 0x2b080808, 0x08192b19, 0x2b080808, 0x082b0808, 0x2b080808, 0x082b1919, 0x2b080808, + 0x19080819, 0x2b080808, 0x19081908, 0x2b080808, 0x19190808, 0x2b080808, 0x1919082b, 0x2b080808, + 0x19191919, 0x2b080808, 0x19192b08, 0x2b080808, 0x192b0819, 0x2b080808, 0x2b080808, 0x2b080808, + 0x2b081919, 0x2b080808, 0x2b190819, 0x2b080808, 0x2b191908, 0x2b080808, 0x08080819, 0x2b080819, + 0x08081908, 0x2b080819, 0x08082b19, 0x2b080819, 0x08190808, 0x2b080819, 0x0819082b, 0x2b080819, + 0x08191919, 0x2b080819, 0x08192b08, 0x2b080819, 0x082b0819, 0x2b080819, 0x082b1908, 0x2b080819, + 0x19080808, 0x2b080819, 0x1908082b, 0x2b080819, 0x19081919, 0x2b080819, 0x19082b08, 0x2b080819, + 0x19190819, 0x2b080819, 0x19191908, 0x2b080819, 0x2b080819, 0x2b080819, 0x2b081908, 0x2b080819, + 0x2b190808, 0x2b080819, 0x2b2b2b19, 0x2b080819, 0x08080808, 0x2b08082b, 0x08081919, 0x2b08082b, + 0x08082b2b, 0x2b08082b, 0x08190819, 0x2b08082b, 0x08191908, 0x2b08082b, 0x19080819, 0x2b08082b, + 0x19081908, 0x2b08082b, 0x19190808, 0x2b08082b, 0x08080819, 0x2b081908, 0x08081908, 0x2b081908, + 0x0808192b, 0x2b081908, 0x08082b19, 0x2b081908, 0x08190808, 0x2b081908, 0x0819082b, 0x2b081908, + 0x08191919, 0x2b081908, 0x08192b08, 0x2b081908, 0x082b0819, 0x2b081908, 0x19080808, 0x2b081908, + 0x1908082b, 0x2b081908, 0x19081919, 0x2b081908, 0x19082b08, 0x2b081908, 0x19190819, 0x2b081908, + 0x19191908, 0x2b081908, 0x192b0808, 0x2b081908, 0x2b080819, 0x2b081908, 0x2b081908, 0x2b081908, + 0x2b190808, 0x2b081908, 0x08080808, 0x2b081919, 0x0808082b, 0x2b081919, 0x08081919, 0x2b081919, + 0x08082b08, 0x2b081919, 0x08190819, 0x2b081919, 0x08191908, 0x2b081919, 0x082b0808, 0x2b081919, + 0x19080819, 0x2b081919, 0x19081908, 0x2b081919, 0x19190808, 0x2b081919, 0x2b080808, 0x2b081919, + 0x2b082b2b, 0x2b081919, 0x08080819, 0x2b08192b, 0x08081908, 0x2b08192b, 0x08190808, 0x2b08192b, + 0x082b2b19, 0x2b08192b, 0x19080808, 0x2b08192b, 0x08080808, 0x2b082b08, 0x08081919, 0x2b082b08, + 0x08190819, 0x2b082b08, 0x08191908, 0x2b082b08, 0x19080819, 0x2b082b08, 0x19081908, 0x2b082b08, + 0x19190808, 0x2b082b08, 0x2b2b082b, 0x2b082b08, 0x08080819, 0x2b082b19, 0x08081908, 0x2b082b19, + 0x19080808, 0x2b082b19, 0x192b1919, 0x2b082b19, 0x082b082b, 0x2b082b2b, 0x19192b08, 0x2b082b2b, + 0x19192b2b, 0x2b082b2b, 0x2b08082b, 0x2b082b2b, 0x2b2b082b, 0x2b082b2b, 0x08080819, 0x2b190808, + 0x08081908, 0x2b190808, 0x08082b19, 0x2b190808, 0x08190808, 0x2b190808, 0x0819082b, 0x2b190808, + 0x08191919, 0x2b190808, 0x08192b08, 0x2b190808, 0x082b1908, 0x2b190808, 0x19080808, 0x2b190808, + 0x1908082b, 0x2b190808, 0x19081919, 0x2b190808, 0x19082b08, 0x2b190808, 0x19190819, 0x2b190808, + 0x19191908, 0x2b190808, 0x192b0808, 0x2b190808, 0x2b080819, 0x2b190808, 0x2b081908, 0x2b190808, + 0x2b190808, 0x2b190808, 0x08080808, 0x2b190819, 0x08081919, 0x2b190819, 0x08190819, 0x2b190819, + 0x08191908, 0x2b190819, 0x19080819, 0x2b190819, 0x19081908, 0x2b190819, 0x19190808, 0x2b190819, + 0x19192b2b, 0x2b190819, 0x08080819, 0x2b19082b, 0x08081908, 0x2b19082b, 0x08190808, 0x2b19082b, + 0x19080808, 0x2b19082b, 0x2b2b192b, 0x2b19082b, 0x08080808, 0x2b191908, 0x0808082b, 0x2b191908, + 0x08081919, 0x2b191908, 0x08082b08, 0x2b191908, 0x08190819, 0x2b191908, 0x08191908, 0x2b191908, + 0x082b0808, 0x2b191908, 0x19080819, 0x2b191908, 0x19081908, 0x2b191908, 0x19190808, 0x2b191908, + 0x2b080808, 0x2b191908, 0x2b19192b, 0x2b191908, 0x08080819, 0x2b191919, 0x08081908, 0x2b191919, + 0x08190808, 0x2b191919, 0x19080808, 0x2b191919, 0x2b192b08, 0x2b191919, 0x2b2b0819, 0x2b191919, + 0x08080808, 0x2b19192b, 0x1908192b, 0x2b19192b, 0x192b1908, 0x2b19192b, 0x08080819, 0x2b192b08, + 0x08081908, 0x2b192b08, 0x08190808, 0x2b192b08, 0x082b192b, 0x2b192b08, 0x19080808, 0x2b192b08, + 0x2b2b2b19, 0x2b192b08, 0x08080808, 0x2b192b19, 0x19082b19, 0x2b192b19, 0x1919082b, 0x2b192b19, + 0x2b190808, 0x2b192b2b, 0x08080808, 0x2b2b0808, 0x08081919, 0x2b2b0808, 0x08082b2b, 0x2b2b0808, + 0x08191908, 0x2b2b0808, 0x082b082b, 0x2b2b0808, 0x082b2b2b, 0x2b2b0808, 0x19080819, 0x2b2b0808, + 0x19081908, 0x2b2b0808, 0x19190808, 0x2b2b0808, 0x2b2b082b, 0x2b2b0808, 0x2b2b2b2b, 0x2b2b0808, + 0x19080808, 0x2b2b0819, 0x192b1919, 0x2b2b0819, 0x0808082b, 0x2b2b082b, 0x08082b2b, 0x2b2b082b, + 0x082b082b, 0x2b2b082b, 0x082b2b08, 0x2b2b082b, 0x082b2b2b, 0x2b2b082b, 0x2b08082b, 0x2b2b082b, + 0x2b082b08, 0x2b2b082b, 0x2b082b2b, 0x2b2b082b, 0x2b2b2b08, 0x2b2b082b, 0x08080819, 0x2b2b1908, + 0x08081908, 0x2b2b1908, 0x08190808, 0x2b2b1908, 0x19080808, 0x2b2b1908, 0x2b082b19, 0x2b2b1908, + 0x2b2b1908, 0x2b2b1908, 0x08080808, 0x2b2b1919, 0x08192b19, 0x2b2b1919, 0x19190819, 0x2b2b192b, + 0x08082b2b, 0x2b2b2b08, 0x082b2b08, 0x2b2b2b08, 0x2b2b082b, 0x2b2b2b08, 0x19191908, 0x2b2b2b19, + 0x2b08192b, 0x2b2b2b19, 0x08082b08, 0x2b2b2b2b, 0x08082b2b, 0x2b2b2b2b, 0x082b0808, 0x2b2b2b2b, + 0x082b082b, 0x2b2b2b2b, 0x082b2b08, 0x2b2b2b2b, 0x2b082b08, 0x2b2b2b2b, 0x2b2b2b2b, 0x2b2b2b2b +); + +struct iq2_s { + d: f16, + qs: array, + qh: array, + scales: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var qs_vals : array; + for (var i: u32 = 0; i < 16; i++) { + qs_vals[i] = bitcast(vec2(block.qs[i * 2], block.qs[i * 2 + 1])); + } + var qh_vals = array( + bitcast(vec2(block.qh[0], block.qh[1])), + bitcast(vec2(block.qh[2], block.qh[3])) + ); + var scale_vals = array( + bitcast(vec2(block.scales[0], block.scales[1])), + bitcast(vec2(block.scales[2], block.scales[3])) + ); + var sum = 0.0; + for (var ib: u32 = 0; ib < 8; ib ++) { + let s = get_byte(scale_vals[ib / 4], ib % 4); + let db = array( + d * (0.5 + f32(s & 0xF)) * 0.25, + d * (0.5 + f32(s >> 4)) * 0.25 + ); + let qs_w = qs_vals[ib]; + for (var l: u32 = 0; l < 4; l++) { + let qh_b = (get_byte(qh_vals[ib / 4], ib % 4) << (8 - 2 * l)) & 0x300; + let ig = (get_byte(qs_w, l) | qh_b) * 8; + let signs = get_byte(qs_vals[ib + 8], l); + let dl = db[l/2]; + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2s_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + sum += dl * f32(g) * m * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + + +#enddecl(IQ2_S) + +#decl(IQ3_XSS) + +const iq3xxs_grid = array( + 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, + 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, + 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, + 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, + 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, + 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, + 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, + 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, + 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, + 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, + 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, + 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, + 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, + 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, + 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, + 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, + 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, + 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, + 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, + 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, + 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, + 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, + 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, + 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, + 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, + 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, + 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, + 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, + 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, + 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, + 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, + 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04 +); + +struct iq3_xxs { + d: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 16; ib += 2) { + let sc_sign = bitcast(vec2(block.qs[ib + 32], block.qs[ib + 33])); + let db = d * (0.5 + f32(sc_sign >> 28)) * 0.5; + for (var l: u32 = 0; l < 4; l++) { + let is = (sc_sign >> (7 * l)) & 127; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + let ig_val = bitcast(vec2(block.qs[ib * 2 + l], 0.0)); + let ig1 = get_byte(ig_val, 0); + let ig2 = get_byte(ig_val, 1); + for (var j: u32 = 0; j < 4; j++) { + let g1 = get_byte(iq3xxs_grid[ig1], j); + let g2 = get_byte(iq3xxs_grid[ig2], j); + let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0); + let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0); + sum += db * f32(g1) * m1 * src1[src1_i]; + sum += db * f32(g2) * m2 * src1[src1_i + 4]; + src1_i++; + } + src1_i += 4; + } + } + return sum; +} + +#enddecl(IQ3_XSS) + +#decl(IQ3_S) + +const iq3s_grid = array( + 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305, + 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905, + 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09, + 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b, + 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b, + 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d, + 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03, + 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505, + 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03, + 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901, + 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d, + 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303, + 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501, + 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105, + 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505, + 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101, + 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707, + 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b, + 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01, + 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f, + 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305, + 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103, + 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509, + 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503, + 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b, + 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f, + 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f, + 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f, + 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109, + 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f, + 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509, + 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501, + 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303, + 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f, + 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907, + 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703, + 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03, + 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01, + 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01, + 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903, + 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505, + 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b, + 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107, + 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509, + 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303, + 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103, + 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05, + 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b, + 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f, + 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701, + 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909, + 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305, + 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d, + 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b, + 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d, + 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307, + 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09, + 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309, + 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709, + 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f, + 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303, + 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503, + 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b, + 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101 +); + +struct iq3_s { + d: f16, + qs: array, + qh: array, + signs: array, + scales: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var qh_vals = array( + bitcast(vec2(block.qh[0], block.qh[1])), + bitcast(vec2(block.qh[2], block.qh[3])) + ); + var sign_vals: array; + for (var i: u32 = 0; i < 8; i++) { + sign_vals[i] = bitcast(vec2(block.signs[i * 2], block.signs[i * 2 + 1])); + } + var scale_vals = bitcast(vec2(block.scales[0], block.scales[1])); + var sum = 0.0; + for (var ib: u32 = 0; ib < 4; ib++) { + let s = get_byte(scale_vals, ib); + let db = array( + d * (1.0 + 2.0 * f32(s & 0xF)), + d * (1.0 + 2.0 * f32(s >> 4)) + ); + for (var k: u32 = 0; k < 2; k++) { + let dl = db[k]; + let qh_byte = get_byte(qh_vals[ib / 2], (ib % 2) * 2 + k); + let sign_w = sign_vals[ib * 2 + k]; + for (var l: u32 = 0; l < 4; l++) { + let signs = get_byte(sign_w, l); + let ig_val = bitcast(vec2(block.qs[ib * 8 + k * 4 + l], 0.0)); + let ig1 = get_byte(ig_val, 0) | ((qh_byte << ((8 - (2 * l)))) & 256); + let ig2 = get_byte(ig_val, 1) | ((qh_byte << ((7 - (2 * l)))) & 256); + for (var j: u32 = 0; j < 4; j++) { + let g1 = get_byte(iq3s_grid[ig1], j); + let g2 = get_byte(iq3s_grid[ig2], j); + let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0); + let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0); + sum += dl * f32(g1) * m1 * src1[src1_i]; + sum += dl * f32(g2) * m2 * src1[src1_i + 4]; + src1_i++; + } + src1_i += 4; + } + } + } + return sum; +} +#enddecl(IQ3_S) + #end(DECLS) #define(SHADER) From 57c26b17c2d39d1df9a68c649f50c19b113f711b Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 19 Aug 2025 14:30:09 -0700 Subject: [PATCH 14/15] Support rest of i-quants --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 24 +- .../wgsl-shaders/mul_mat.tmpl.wgsl | 337 +++++++++++++++++- 2 files changed, 352 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b1669995b18f4..c0dc55ff2edbe 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -129,7 +129,7 @@ struct webgpu_context_struct { webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline[24][2]; + wgpu::ComputePipeline mul_mat_pipeline[30][2]; wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; @@ -915,7 +915,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - webgpu_pipeline_info pipeline_infos[18] = { + webgpu_pipeline_info pipeline_infos[22] = { { .name = "mul_mat_f32_f32", .shader_code = wgsl_mul_mat_f32_f32, .src0_type = GGML_TYPE_F32, @@ -987,6 +987,22 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { { .name = "mul_mat_iq3_s_f32", .shader_code = wgsl_mul_mat_iq3_s_f32, .src0_type = GGML_TYPE_IQ3_S, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_iq1_s_f32", + .shader_code = wgsl_mul_mat_iq1_s_f32, + .src0_type = GGML_TYPE_IQ1_S, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_iq1_m_f32", + .shader_code = wgsl_mul_mat_iq1_m_f32, + .src0_type = GGML_TYPE_IQ1_M, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_iq4_nl_f32", + .shader_code = wgsl_mul_mat_iq4_nl_f32, + .src0_type = GGML_TYPE_IQ4_NL, + .src1_type = GGML_TYPE_F32 }, + { .name = "mul_mat_iq4_xs_f32", + .shader_code = wgsl_mul_mat_iq4_xs_f32, + .src0_type = GGML_TYPE_IQ4_XS, .src1_type = GGML_TYPE_F32 } }; @@ -1094,6 +1110,10 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: return true; default: return false; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index b97198e86b29f..0f3adfef6e0b6 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -111,7 +111,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ2_XXS"] + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XXS"] }, { "REPLS": { @@ -119,7 +119,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ2_XS"] + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XS"] }, { "REPLS": { @@ -127,7 +127,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ2_S"] + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_S"] }, { "REPLS": { @@ -135,7 +135,7 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ3_XSS"] + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_XSS"] }, { "REPLS": { @@ -143,7 +143,39 @@ "SRC1_TYPE": "f32", "BLOCK_SIZE": 256 }, - "DECLS": ["BYTE_HELPERS", "IQ2_TABLES", "IQ3_S"] + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_S"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq1_s", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ1_TABLE","IQ1_S"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq1_m", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ1_TABLE","IQ1_M"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq4_nl", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32, + }, + "DECLS": ["BYTE_HELPERS", "IQ4_TABLE", "IQ4_NL"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq4_xs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256, + }, + "DECLS": ["BYTE_HELPERS", "IQ4_TABLE", "IQ4_XS"] } ] @@ -602,7 +634,7 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { #enddecl(Q6_K) -#decl(IQ2_TABLES) +#decl(IQ23_TABLES) const kmask_iq2xs : array = array( 0x08040201u, // 1, 2, 4, 8 0x80402010u // 16, 32, 64, 128 @@ -618,7 +650,7 @@ const ksigns_iq2xs: array = array( 0x63e2e160,0xe76665e4,0xeb6a69e8,0x6feeed6c, 0xf37271f0,0x77f6f574,0x7bfaf978,0xff7e7dfc ); -#enddecl(IQ2_TABLES) +#enddecl(IQ23_TABLES) #decl(IQ2_XXS) @@ -1399,6 +1431,297 @@ fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { } #enddecl(IQ3_S) +#decl(IQ1_TABLE) + +const IQ1_DELTA: f32 = 0.125; + +const iq1_grid = array( + 0xfffdffff, 0xfff7fff0, 0xffccfff5, 0xffdfffc0, 0xffd7ffdd, 0xff30ffd5, 0xff03ff0c, 0xff10ff01, + 0xff7dff7f, 0xff75ff77, 0xff5fff40, 0xff57ff5d, 0xfcf3ff55, 0xfcccfcf0, 0xfcc1fcc3, 0xfcc5fcc4, + 0xfc3cfcd0, 0xfc34fc31, 0xfc00fc0d, 0xfc1cfc05, 0xfc11fc13, 0xfc70fc17, 0xfc43fc4c, 0xfc50fc41, + 0xfdfdfdff, 0xfdf5fdf7, 0xfddffdc0, 0xfdd7fddd, 0xfd30fdd5, 0xfd04fd0c, 0xfd14fd13, 0xfd7dfd7f, + 0xfd75fd77, 0xfd40fd4c, 0xfd5ffd44, 0xfd57fd5d, 0xf3ccfd55, 0xf3c1f3c3, 0xf33cf3d0, 0xf300f334, + 0xf313f305, 0xf34cf310, 0xf350f344, 0xf0f3f0fc, 0xf0f1f0f0, 0xf0c7f0c0, 0xf0d4f0c5, 0xf030f03f, + 0xf00ff035, 0xf003f00c, 0xf001f000, 0xf01ff004, 0xf010f01d, 0xf015f017, 0xf04cf07c, 0xf047f040, + 0xf05cf045, 0xf050f053, 0xf054f051, 0xf1c4f1c3, 0xf133f13c, 0xf10df10f, 0xf107f100, 0xf11cf11f, + 0xf114f111, 0xf14cf170, 0xf144f143, 0xf7fdf7ff, 0xf7f5f7f7, 0xf7dff7c0, 0xf7d7f7dd, 0xf730f7d5, + 0xf701f70c, 0xf77ff710, 0xf777f77d, 0xf740f775, 0xf75df75f, 0xf755f757, 0xf4ccf4f0, 0xf4c4f4c3, + 0xf4d0f4d3, 0xf40ff43c, 0xf400f40c, 0xf413f41c, 0xf44cf414, 0xf441f443, 0xf450f444, 0xf5fdf5ff, + 0xf5f5f5f7, 0xf5dff5c0, 0xf5d7f5dd, 0xf530f5d5, 0xf504f50c, 0xf510f51c, 0xf57df57f, 0xf577f570, + 0xf540f575, 0xf55df55f, 0xf555f557, 0xcfcccfcf, 0xcfc4cfc3, 0xcfd0cfd3, 0xcf33cf3c, 0xcf00cf0f, + 0xcf1ccf07, 0xcf10cf13, 0xcf4ccf14, 0xcf41cf43, 0xcf50cf5c, 0xccf3ccfc, 0xccf4ccf1, 0xcccdcccf, + 0xccc7ccc0, 0xccd3ccdc, 0xcc30ccd4, 0xcc0fcc35, 0xcc0dcc0c, 0xcc00cc03, 0xcc04cc01, 0xcc10cc1f, + 0xcc4dcc73, 0xcc5ccc40, 0xcdcccc53, 0xcdc1cdc3, 0xcd3fcdd0, 0xcd34cd31, 0xcd00cd0d, 0xcd05cd07, + 0xcd11cd13, 0xcd4ccd70, 0xcd41cd43, 0xc3fccd50, 0xc3f4c3f1, 0xc3c0c3c3, 0xc3c4c3c7, 0xc3d1c3dc, + 0xc330c33c, 0xc337c331, 0xc30cc335, 0xc300c303, 0xc304c301, 0xc310c31d, 0xc373c317, 0xc34fc374, + 0xc340c343, 0xc344c347, 0xc35cc345, 0xc350c353, 0xc0fdc354, 0xc0f5c0f0, 0xc0c3c0cc, 0xc0c1c0c0, + 0xc0dfc0c4, 0xc0d0c0dd, 0xc0d5c0d7, 0xc033c03c, 0xc031c030, 0xc00dc00c, 0xc000c003, 0xc004c001, + 0xc01cc005, 0xc010c013, 0xc014c011, 0xc07dc07f, 0xc070c073, 0xc075c077, 0xc04cc04f, 0xc040c043, + 0xc044c041, 0xc05fc045, 0xc050c05d, 0xc1f3c1fc, 0xc1f1c1f0, 0xc1c1c1c0, 0xc1c5c1c7, 0xc1d1c1dc, + 0xc13dc13f, 0xc130c133, 0xc135c137, 0xc100c10c, 0xc107c101, 0xc11cc104, 0xc110c113, 0xc114c117, + 0xc171c115, 0xc14dc175, 0xc153c140, 0xc7ccc154, 0xc7d0c7c1, 0xc733c73c, 0xc734c731, 0xc700c70f, + 0xc705c707, 0xc71cc71f, 0xc711c713, 0xc770c714, 0xc743c74c, 0xc4cfc750, 0xc4c0c4cd, 0xc4dcc4c5, + 0xc43dc4d0, 0xc430c433, 0xc40cc437, 0xc400c403, 0xc404c401, 0xc41fc405, 0xc415c410, 0xc44cc474, + 0xc440c44d, 0xc45cc447, 0xc454c451, 0xc5c1c5f4, 0xc5d1c5d3, 0xc531c533, 0xc50fc534, 0xc500c50d, + 0xc51cc507, 0xc514c511, 0xc54cc570, 0xc545c541, 0xdffddfff, 0xdff5dff7, 0xdfdfdfc0, 0xdfd0dfdd, + 0xdfd5dfd7, 0xdf0cdf30, 0xdf1cdf04, 0xdf7fdf10, 0xdf77df7d, 0xdf40df75, 0xdf5ddf5f, 0xdf57df50, + 0xdcf0df55, 0xdcc3dccc, 0xdcd0dcc4, 0xdc33dc3d, 0xdc00dc34, 0xdc05dc07, 0xdc13dc1c, 0xdc11dc10, + 0xdc4fdc70, 0xdc44dc41, 0xddfcdc50, 0xddf5ddf7, 0xddc0ddcc, 0xdddddddf, 0xddd5ddd7, 0xdd0cdd30, + 0xdd04dd01, 0xdd7cdd10, 0xdd75dd77, 0xdd40dd4c, 0xdd5ddd5f, 0xdd55dd57, 0xd3c3d3f0, 0xd3c4d3c1, + 0xd333d3d0, 0xd331d330, 0xd30dd334, 0xd307d300, 0xd311d305, 0xd34cd370, 0xd344d343, 0xd350d35c, + 0xd0c0d0f4, 0xd0d4d0dc, 0xd030d03f, 0xd00cd037, 0xd000d003, 0xd01dd004, 0xd017d010, 0xd04fd074, + 0xd040d043, 0xd045d047, 0xd053d05c, 0xd054d051, 0xd1cfd1f0, 0xd1c4d1cd, 0xd13cd1d0, 0xd100d134, + 0xd11cd11f, 0xd173d114, 0xd14fd171, 0xd7ffd145, 0xd7f7d7fd, 0xd7c0d7f5, 0xd7ddd7df, 0xd7d5d7d7, + 0xd70cd730, 0xd710d703, 0xd77dd77f, 0xd775d777, 0xd75dd75f, 0xd755d757, 0xd4ccd4f4, 0xd4c4d4c3, + 0xd431d4d0, 0xd40dd434, 0xd41cd400, 0xd411d413, 0xd470d414, 0xd441d44f, 0xd453d444, 0xd5ffd450, + 0xd5f7d5fd, 0xd5dfd5f5, 0xd5d7d5dd, 0xd530d5d5, 0xd501d50c, 0xd510d504, 0xd57dd57f, 0xd575d577, + 0xd55fd540, 0xd557d55d, 0x3ff0d555, 0x3fc13fcc, 0x3f343fd0, 0x3f003f0d, 0x3f053f07, 0x3f133f1c, + 0x3f433f11, 0x3f5c3f44, 0x3cff3f51, 0x3cf33cfc, 0x3cf43cf1, 0x3cc03ccd, 0x3cc73cc1, 0x3cdc3cc5, + 0x3cd43cd1, 0x3c373c30, 0x3c0c3c35, 0x3c003c03, 0x3c043c01, 0x3c103c05, 0x3c153c17, 0x3c733c7c, + 0x3c4f3c71, 0x3c403c4d, 0x3c5c3c5f, 0x3df03c5d, 0x3dc33dcc, 0x3dd03dc1, 0x3d0d3d3c, 0x3d053d00, + 0x3d143d13, 0x3d433d74, 0x33fc3d50, 0x33c433c0, 0x333033d4, 0x33353337, 0x3303330c, 0x33013300, + 0x331d331c, 0x33173310, 0x337c3315, 0x33743371, 0x334d334f, 0x335f3340, 0x3354335c, 0x30fd30fc, + 0x30f530f0, 0x30c330cc, 0x30c130c0, 0x30df30c4, 0x30d530d0, 0x3033303c, 0x30313030, 0x300f3034, + 0x3003300c, 0x30013000, 0x30043007, 0x3013301c, 0x30113010, 0x307d3014, 0x30703073, 0x304c3077, + 0x30403043, 0x30443041, 0x30503045, 0x30553057, 0x31f031fc, 0x31c331f4, 0x31c731c0, 0x31dc31c5, + 0x31d431d3, 0x313d313f, 0x31373130, 0x310c310f, 0x3100310d, 0x31043101, 0x3110311d, 0x317c3117, + 0x31753170, 0x31403143, 0x3153315c, 0x37f03151, 0x37c037cc, 0x37d037c5, 0x3734373d, 0x3700370f, + 0x371c3707, 0x37113713, 0x37703714, 0x3743374c, 0x37443741, 0x34fc3750, 0x34f134f0, 0x34cf34f5, + 0x34c034c3, 0x34dc34c7, 0x34d134d3, 0x3430343f, 0x340c3435, 0x3403340d, 0x34013400, 0x341f3404, + 0x3410341d, 0x34153411, 0x34743471, 0x3440344d, 0x34473441, 0x3453345c, 0x34543451, 0x353335c1, + 0x35343531, 0x35073500, 0x35133505, 0x35433514, 0x0ffc3550, 0x0ff00ff3, 0x0ff40ff1, 0x0fc00fcd, + 0x0fdc0fc5, 0x0fd40fd3, 0x0f300f3f, 0x0f0c0f37, 0x0f000f03, 0x0f040f01, 0x0f170f10, 0x0f740f71, + 0x0f470f40, 0x0f5c0f5f, 0x0f540f51, 0x0cf70cf0, 0x0cf50cf4, 0x0cc30ccc, 0x0cc10cc0, 0x0cc40cc7, + 0x0cd00cdf, 0x0cd70cd1, 0x0c3c0cd5, 0x0c300c33, 0x0c340c31, 0x0c0c0c0f, 0x0c030c0d, 0x0c010c00, + 0x0c040c07, 0x0c1c0c05, 0x0c100c13, 0x0c140c11, 0x0c700c7d, 0x0c430c4c, 0x0c410c40, 0x0c5f0c44, + 0x0c550c50, 0x0df10dfc, 0x0dc00dcd, 0x0ddc0dc5, 0x0d3d0dd3, 0x0d350d30, 0x0d030d0c, 0x0d010d00, + 0x0d1d0d04, 0x0d700d10, 0x0d4d0d4f, 0x0d440d40, 0x0d530d45, 0x03f003f3, 0x03c303cc, 0x03c103c0, + 0x03c403c7, 0x03d003dc, 0x03d503d7, 0x0333033c, 0x03310330, 0x03350334, 0x030c030f, 0x03000303, + 0x03070301, 0x03050304, 0x031d031c, 0x03100313, 0x03140311, 0x0377037f, 0x034c0375, 0x03400343, + 0x03440341, 0x0353035c, 0x03550350, 0x00fd00fc, 0x00f000f3, 0x00f400f1, 0x00cc00cf, 0x00c300cd, + 0x00c100c0, 0x00c500c4, 0x00d300dc, 0x00d100d0, 0x003f00d4, 0x003d003c, 0x00300033, 0x00370031, + 0x000f0034, 0x000d000c, 0x00000003, 0x00070001, 0x00050004, 0x001c001f, 0x00100013, 0x00170011, + 0x00150014, 0x0073007c, 0x00740070, 0x004f0075, 0x0043004c, 0x00410040, 0x00440047, 0x0053005c, + 0x00510050, 0x01ff0054, 0x01fd01fc, 0x01f101f3, 0x01f401f7, 0x01c301cc, 0x01c701c0, 0x01df01c4, + 0x01dd01dc, 0x01d001d3, 0x01d701d1, 0x013c01d4, 0x01310130, 0x01340137, 0x010f0135, 0x010d010c, + 0x01000103, 0x01070101, 0x01050104, 0x0113011c, 0x01140110, 0x0170017d, 0x01770171, 0x01750174, + 0x0140014c, 0x015d0145, 0x01510150, 0x01540157, 0x07f007f3, 0x07f407f1, 0x07c007cf, 0x07dc07c7, + 0x073007d5, 0x07350737, 0x0703070c, 0x07010700, 0x07040707, 0x071d071f, 0x07100713, 0x0774077d, + 0x074d074f, 0x07470740, 0x0754075c, 0x04fd04fc, 0x04f504f0, 0x04c304cc, 0x04c104c0, 0x04d004c4, + 0x0433043c, 0x04310430, 0x040f0434, 0x040d040c, 0x04000403, 0x04070401, 0x04050404, 0x0413041c, + 0x04110410, 0x047c0414, 0x04740470, 0x0443044c, 0x04410440, 0x04440447, 0x05f30450, 0x05c005f7, + 0x05df05c5, 0x05d105d0, 0x053005d4, 0x05340537, 0x0500050c, 0x05070501, 0x051d0504, 0x05170510, + 0x057c0515, 0x054d0575, 0x05410540, 0x05450547, 0x1ff0055c, 0x1fc11fc3, 0x1fd01fc4, 0x1f0f1f33, + 0x1f011f00, 0x1f051f07, 0x1f131f1c, 0x1f141f11, 0x1f411f7c, 0x1cfc1f50, 0x1cf11cf3, 0x1ccd1cf4, + 0x1cdc1cc0, 0x1cd11cdd, 0x1c301cd4, 0x1c0c1c34, 0x1c011c00, 0x1c101c04, 0x1c151c11, 0x1c751c73, + 0x1c401c4d, 0x1c511c5c, 0x1dcc1c54, 0x1dc41dc1, 0x1d3c1d3f, 0x1d001d31, 0x1d071d01, 0x1d701d1f, + 0x1d411d4c, 0x13cc1d50, 0x13c013cd, 0x13c513c1, 0x13d113dc, 0x133f13d4, 0x1330133d, 0x13351337, + 0x1303130c, 0x13011300, 0x13051304, 0x131d131f, 0x13731310, 0x13741370, 0x134d134f, 0x13401343, + 0x13471341, 0x135c1345, 0x13541353, 0x10f710f0, 0x10cc10f5, 0x10c110c0, 0x103310c4, 0x10311030, + 0x100f1034, 0x1003100c, 0x10011000, 0x101c1004, 0x10101013, 0x10141011, 0x10741071, 0x104c1075, + 0x10411040, 0x10451044, 0x1050105d, 0x10571051, 0x11f411fd, 0x11df11c0, 0x11d711d1, 0x113f11d4, + 0x11371130, 0x110c1135, 0x11001103, 0x11071101, 0x111f1105, 0x11171110, 0x117d117f, 0x11751170, + 0x11411143, 0x11441147, 0x1153115f, 0x11551151, 0x17c417c1, 0x173c17d0, 0x1700170d, 0x171c1705, + 0x17701714, 0x1747174c, 0x14fc1751, 0x14cf14f3, 0x14dc14c0, 0x14d114d3, 0x143f14d4, 0x1430143c, + 0x14371431, 0x1403140c, 0x14011400, 0x141f1404, 0x14151410, 0x1473147d, 0x14401475, 0x1453145c, + 0x14541450, 0x15c115cc, 0x153c15c7, 0x15341533, 0x1500150f, 0x15051507, 0x15101513, 0x15711514, + 0x15471543, 0x15511545, 0x7ffd7fff, 0x7ff57ff7, 0x7fdd7fdf, 0x7fd57fd7, 0x7f0f7f30, 0x7f037f0c, + 0x7f047f01, 0x7f7f7f10, 0x7f777f7d, 0x7f407f75, 0x7f5d7f5f, 0x7f557f57, 0x7ccc7cf0, 0x7cc17cc3, + 0x7cd07cc4, 0x7c337c3c, 0x7c0f7c34, 0x7c007c0d, 0x7c077c01, 0x7c137c04, 0x7c147c11, 0x7c747c70, + 0x7c417c43, 0x7c507c44, 0x7dfd7dff, 0x7df57df7, 0x7ddf7dc0, 0x7dd77ddd, 0x7d0c7dd5, 0x7d047d03, + 0x7d7f7d10, 0x7d777d7d, 0x7d407d75, 0x7d5d7d5f, 0x7d557d57, 0x73c473c3, 0x7333733c, 0x7300730c, + 0x731c7305, 0x73147313, 0x73447343, 0x70f470fc, 0x70c070cd, 0x70d170c5, 0x703f70d4, 0x7030703c, + 0x700c7037, 0x70007003, 0x70047001, 0x70107005, 0x70177011, 0x707c7015, 0x70717073, 0x704f7074, + 0x7040704d, 0x70517047, 0x71c171cc, 0x71d071c4, 0x7133713c, 0x71357134, 0x7100710f, 0x71057104, + 0x7111711c, 0x71707115, 0x7145714c, 0x77ff7153, 0x77f777fd, 0x77c077f5, 0x77dd77df, 0x77d577d7, + 0x7730773c, 0x7703770c, 0x77107704, 0x777f7714, 0x7777777d, 0x77407775, 0x775d775f, 0x77557757, + 0x74f174f0, 0x74c374cc, 0x74d074c1, 0x7433743c, 0x74347431, 0x740d740f, 0x74057400, 0x7413741c, + 0x74417470, 0x74507444, 0x75fd75ff, 0x75f575f7, 0x75df75c0, 0x75d775dd, 0x753075d5, 0x7503750c, + 0x757f7501, 0x7577757d, 0x75407575, 0x755d755f, 0x75557557, 0x4fcc4ff0, 0x4fc74fc1, 0x4fd04fc4, + 0x4f314f3c, 0x4f004f34, 0x4f054f07, 0x4f154f14, 0x4f4c4f70, 0x4f414f43, 0x4f504f44, 0x4cf34cfc, + 0x4cf44cf1, 0x4cc04ccf, 0x4cc54cc7, 0x4cd34cdc, 0x4cd44cd1, 0x4c304c3f, 0x4c0c4c0f, 0x4c004c03, + 0x4c044c01, 0x4c104c1d, 0x4c714c73, 0x4c404c4d, 0x4c5c4c47, 0x4c514c53, 0x4df04c54, 0x4dc34dcc, + 0x4dd04dc4, 0x4d314d33, 0x4d0f4d34, 0x4d004d0d, 0x4d114d07, 0x4d704d14, 0x4d414d43, 0x43fc4d54, + 0x43f143f3, 0x43c043cf, 0x43d143c7, 0x4335433f, 0x4303430c, 0x43014300, 0x43044307, 0x431c431f, + 0x4310431d, 0x43714373, 0x4343434d, 0x43474340, 0x4354435c, 0x40f040ff, 0x40f540f7, 0x40cc40cf, + 0x40c040c3, 0x40c440c1, 0x40d040dc, 0x40d540d4, 0x4033403c, 0x40314030, 0x400f4034, 0x400d400c, + 0x40004003, 0x40074001, 0x40054004, 0x4013401c, 0x40114010, 0x407c4014, 0x40774070, 0x404d404c, + 0x40404043, 0x40444041, 0x405f4045, 0x4050405d, 0x40554057, 0x41f341fc, 0x41c041cf, 0x41df41c4, + 0x41d441d1, 0x41374130, 0x410c4134, 0x4100410d, 0x41044101, 0x41174110, 0x4173417d, 0x41754174, + 0x4143414d, 0x41534140, 0x41544151, 0x47c147f0, 0x47d047c4, 0x4731473c, 0x470d470f, 0x47014700, + 0x47134705, 0x47704710, 0x4741474c, 0x47504744, 0x44f144f3, 0x44cf44f4, 0x44c044cd, 0x44c544c7, + 0x44dc44df, 0x44d144d3, 0x443d443f, 0x44374430, 0x440c4435, 0x44004403, 0x44044401, 0x4410441d, + 0x44154411, 0x4473447c, 0x444d444f, 0x44454440, 0x4451445c, 0x45c045f0, 0x453345d0, 0x45344531, + 0x4500450f, 0x451c4507, 0x454c4570, 0x45404543, 0x5fff4541, 0x5ff75ffd, 0x5fc05ff5, 0x5fdd5fdf, + 0x5fd55fd7, 0x5f0c5f30, 0x5f015f03, 0x5f7f5f04, 0x5f775f7d, 0x5f405f75, 0x5f5d5f5f, 0x5f555f57, + 0x5cf45cf0, 0x5cc35ccc, 0x5cc45cc1, 0x5c315cc5, 0x5c0c5c34, 0x5c075c00, 0x5c1c5c05, 0x5c705c13, + 0x5c4d5c4f, 0x5c445c41, 0x5df75dfd, 0x5dcf5df5, 0x5ddd5dc4, 0x5dd55dd7, 0x5d0c5d30, 0x5d045d01, + 0x5d7f5d10, 0x5d775d7d, 0x5d405d75, 0x5d5d5d5f, 0x5d555d57, 0x53d053c4, 0x5333533c, 0x5303530f, + 0x53075300, 0x531c5305, 0x53115310, 0x53145317, 0x50f15370, 0x50cf50f4, 0x50c050cd, 0x50d150c7, + 0x503d50d4, 0x500c5030, 0x50005003, 0x50045001, 0x50155010, 0x5073507c, 0x50715070, 0x504d5074, + 0x50475040, 0x51cc51f0, 0x51c551c1, 0x51d051dc, 0x51315133, 0x510d5135, 0x51015100, 0x511f5107, + 0x5171511d, 0x5140514f, 0x51445141, 0x5153515c, 0x57ff5151, 0x57f757fd, 0x57df57f5, 0x57d757dd, + 0x570c57d5, 0x57015703, 0x577f5704, 0x5777577d, 0x57405775, 0x575d575f, 0x57555757, 0x54c354f0, + 0x54dc54c4, 0x543c54d0, 0x5400540f, 0x541c5405, 0x54145411, 0x5441544f, 0x55fd55ff, 0x55f555f7, + 0x55dd55df, 0x55d555d7, 0x5503550c, 0x557f5501, 0x5577557d, 0x55405575, 0x555d555f, 0x55555557 +); + +#enddecl(IQ1_TABLE) + +#decl(IQ1_S) + +struct iq1_s { + d: f16, + qs: array, + qh: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 8; ib++) { + let qh = bitcast(vec2(block.qh[ib], 0.0)); + let dl = d * (2 * f32((qh >> 12) & 7) + 1); + let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000) != 0); + let qs_w = bitcast(vec2(block.qs[ib * 2], block.qs[ib * 2 + 1])); + for (var l: u32 = 0; l < 4; l++) { + let ig = (get_byte(qs_w, l) | (((qh >> (3 * l)) & 7) << 8)) * 8; + for (var j: u32 = 0; j < 8; j++) { + let gw = iq1_grid[(ig + j) / 16]; + let g = (gw >> (((ig + j) % 16) * 2)) & 3; + let gs = bitcast(g << 30) >> 30; + sum += dl * (f32(gs) + delta) * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(IQ1_S) + +#decl(IQ1_M) + +struct iq1_m { + qs: array, + qh: array, + scales: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + + let scale = ((block.scales[0] >> 12) & 0xF) | ((block.scales[0] >> 24) & 0x00F0) | ((block.scales[1] >> 4) & 0x0F00) | ((block.scales[1] >> 16) & 0xF000); + let d = f32(bitcast>(scale).x); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 8; ib++) { + let sw = (block.scales[ib / 4] >> (16 * ((ib / 2) % 2))) & 0xFFFF; + let s1 : u32 = (sw >> (6 * (ib % 2))) & 0x7; + let s2 : u32 = (sw >> (6 * (ib % 2) + 3)) & 0x7; + var dl = array( + d * f32(2 * s1 + 1), + d * f32(2 * s2 + 1) + ); + + let qh = block.qh[ib / 2] >> (16 * (ib % 2)); + var idx = array( + get_byte(block.qs[ib], 0) | ((qh << 8) & 0x700), + get_byte(block.qs[ib], 1) | ((qh << 4) & 0x700), + get_byte(block.qs[ib], 2) | ((qh) & 0x700), + get_byte(block.qs[ib], 3) | ((qh >> 4) & 0x700) + ); + var delta = array( + select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x08) != 0), + select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x80) != 0), + select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x08) != 0), + select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x80) != 0) + ); + for (var l: u32 = 0; l < 4; l++) { + let ig = idx[l] * 8; + for (var j: u32 = 0; j < 8; j++) { + let gw = iq1_grid[(ig + j) / 16]; + let g = (gw >> (((ig + j) % 16) * 2)) & 3; + let gs = bitcast(g << 30) >> 30; + sum += dl[l/2] * (f32(gs) + delta[l]) * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(IQ1_M) + +#decl(IQ4_TABLE) + +const kvalues_iq4nl = array( + -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113 +); + +#enddecl(IQ4_TABLE) + +#decl(IQ4_NL) + +struct iq4_nl { + d: f16, + qs: array, +} + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 32; + var sum = 0.0; + var qs: array; + for (var i: u32 = 0; i < 4; i++) { + qs[i] = bitcast(vec2(block.qs[i * 2], block.qs[i * 2 + 1])); + } + for (var j: u32 = 0; j < 16; j++) { + let qsb = get_byte(qs[j / 4], j % 4); + sum += d * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i]; + sum += d * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16]; + src1_i++; + } + return sum; +} + +#enddecl(IQ4_NL) + +#decl(IQ4_XS) + +struct iq4_xs { + d: f16, + scales_h: f16, + scales_l: u32, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let scales_h = bitcast(vec2(block.scales_h, 0.0)); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 8; ib++) { + let ls = ((get_byte(block.scales_l, ib / 2) >> (4 * (ib % 2))) & 0xF) | (((scales_h >> (2 * ib)) & 3) << 4); + let dl = d * (f32(ls) - 32.0); + for (var j: u32 = 0; j < 16; j++) { + let iqs = ib * 16 + j; + let qsb = get_byte(block.qs[iqs / 4], iqs % 4); + sum += dl * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i]; + sum += dl * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16]; + src1_i++; + } + src1_i += 16; + } + return sum; +} + +#enddecl(IQ4_XS) + #end(DECLS) #define(SHADER) From 51252f024391662f5aa5977a1d541017bfd7a176 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 19 Aug 2025 15:35:45 -0700 Subject: [PATCH 15/15] Cleanup code --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 209 ++++++++---------- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 3 +- .../wgsl-shaders/mul_mat.tmpl.wgsl | 15 +- tests/test-backend-ops.cpp | 9 - 4 files changed, 103 insertions(+), 133 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 404a34c8e8b85..d9a195cccac14 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -50,13 +50,6 @@ static uint64_t webgpu_tensor_offset(const ggml_tensor * tensor) { /* Struct definitions */ -struct webgpu_pipeline_info { - std::string name; - const char * shader_code; - ggml_type src0_type; - ggml_type src1_type; -}; - // Forward reference static void ggml_webgpu_create_buffer(wgpu::Device & device, wgpu::Buffer & buffer, @@ -571,12 +564,12 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t (uint32_t) dst->ne[1], // number of rows in result (M) (uint32_t) dst->ne[0], // number of columns in result (N) (uint32_t) src0->ne[0], // number of columns in src0/src1 (K) - (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 1 - (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 1 - (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 2 - (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 2 - (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 3 - (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 3 + (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 1 + (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 1 + (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 2 + (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 2 + (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 3 + (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 3 (uint32_t) src0->ne[2], // batch size in dimension 2 (uint32_t) src0->ne[3], // batch size in dimension 3 (uint32_t) (src1->ne[2] / src0->ne[2]), // broadcast in dimension 2 @@ -596,16 +589,11 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t .buffer = ggml_webgpu_tensor_buf(dst), .offset = ggml_webgpu_tensor_align_offset(ctx, dst), .size = ggml_webgpu_tensor_binding_size(ctx, dst) }, - // { .binding = 3, - // .buffer = ctx->debug_dev_buf, - // .offset = 0, - // .size = ctx->debug_dev_buf.GetSize() } }; uint32_t wg_x = (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE; ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); - //ggml_backend_webgpu_debug(ctx); } // Returns true if node has enqueued work into the queue, false otherwise @@ -915,103 +903,94 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - webgpu_pipeline_info pipeline_infos[22] = { - { .name = "mul_mat_f32_f32", - .shader_code = wgsl_mul_mat_f32_f32, - .src0_type = GGML_TYPE_F32, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_f16_f16", - .shader_code = wgsl_mul_mat_f16_f16, - .src0_type = GGML_TYPE_F16, - .src1_type = GGML_TYPE_F16 }, - { .name = "mul_mat_f16_f32", - .shader_code = wgsl_mul_mat_f16_f32, - .src0_type = GGML_TYPE_F16, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q4_0_f32", - .shader_code = wgsl_mul_mat_q4_0_f32, - .src0_type = GGML_TYPE_Q4_0, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q4_1_f32", - .shader_code = wgsl_mul_mat_q4_1_f32, - .src0_type = GGML_TYPE_Q4_1, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q5_0_f32", - .shader_code = wgsl_mul_mat_q5_0_f32, - .src0_type = GGML_TYPE_Q5_0, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q5_1_f32", - .shader_code = wgsl_mul_mat_q5_1_f32, - .src0_type = GGML_TYPE_Q5_1, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q8_0_f32", - .shader_code = wgsl_mul_mat_q8_0_f32, - .src0_type = GGML_TYPE_Q8_0, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q2_k_f32", - .shader_code = wgsl_mul_mat_q2_k_f32, - .src0_type = GGML_TYPE_Q2_K, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q3_k_f32", - .shader_code = wgsl_mul_mat_q3_k_f32, - .src0_type = GGML_TYPE_Q3_K, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q4_k_f32", - .shader_code = wgsl_mul_mat_q4_k_f32, - .src0_type = GGML_TYPE_Q4_K, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q5_k_f32", - .shader_code = wgsl_mul_mat_q5_k_f32, - .src0_type = GGML_TYPE_Q5_K, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_q6_k_f32", - .shader_code = wgsl_mul_mat_q6_k_f32, - .src0_type = GGML_TYPE_Q6_K, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_iq2_xxs_f32", - .shader_code = wgsl_mul_mat_iq2_xxs_f32, - .src0_type = GGML_TYPE_IQ2_XXS, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_iq2_xs_f32", - .shader_code = wgsl_mul_mat_iq2_xs_f32, - .src0_type = GGML_TYPE_IQ2_XS, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_iq2_s_f32", - .shader_code = wgsl_mul_mat_iq2_s_f32, - .src0_type = GGML_TYPE_IQ2_S, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_iq3_xxs_f32", - .shader_code = wgsl_mul_mat_iq3_xxs_f32, - .src0_type = GGML_TYPE_IQ3_XXS, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_iq3_s_f32", - .shader_code = wgsl_mul_mat_iq3_s_f32, - .src0_type = GGML_TYPE_IQ3_S, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_iq1_s_f32", - .shader_code = wgsl_mul_mat_iq1_s_f32, - .src0_type = GGML_TYPE_IQ1_S, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_iq1_m_f32", - .shader_code = wgsl_mul_mat_iq1_m_f32, - .src0_type = GGML_TYPE_IQ1_M, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_iq4_nl_f32", - .shader_code = wgsl_mul_mat_iq4_nl_f32, - .src0_type = GGML_TYPE_IQ4_NL, - .src1_type = GGML_TYPE_F32 }, - { .name = "mul_mat_iq4_xs_f32", - .shader_code = wgsl_mul_mat_iq4_xs_f32, - .src0_type = GGML_TYPE_IQ4_XS, - .src1_type = GGML_TYPE_F32 } - }; - - for (auto & pipeline_info : pipeline_infos) { - ggml_webgpu_create_pipeline(webgpu_ctx->device, - webgpu_ctx->mul_mat_pipeline[pipeline_info.src0_type][pipeline_info.src1_type], - pipeline_info.shader_code, - pipeline_info.name.data()); - } + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32], + wgsl_mul_mat_f32_f32, + "mul_mat_f32_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16], + wgsl_mul_mat_f16_f16, + "mul_mat_f16_f16"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32], + wgsl_mul_mat_f16_f32, + "mul_mat_f16_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32], + wgsl_mul_mat_q4_0_f32, + "mul_mat_q4_0_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32], + wgsl_mul_mat_q4_1_f32, + "mul_mat_q4_1_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32], + wgsl_mul_mat_q5_0_f32, + "mul_mat_q5_0_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32], + wgsl_mul_mat_q5_1_f32, + "mul_mat_q5_1_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32], + wgsl_mul_mat_q8_0_f32, + "mul_mat_q8_0_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32], + wgsl_mul_mat_q2_k_f32, + "mul_mat_q2_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32], + wgsl_mul_mat_q3_k_f32, + "mul_mat_q3_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32], + wgsl_mul_mat_q4_k_f32, + "mul_mat_q4_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32], + wgsl_mul_mat_q5_k_f32, + "mul_mat_q5_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32], + wgsl_mul_mat_q6_k_f32, + "mul_mat_q6_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32], + wgsl_mul_mat_iq2_xxs_f32, + "mul_mat_iq2_xxs_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32], + wgsl_mul_mat_iq2_xs_f32, + "mul_mat_iq2_xs_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32], + wgsl_mul_mat_iq2_s_f32, + "mul_mat_iq2_s_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32], + wgsl_mul_mat_iq3_xxs_f32, + "mul_mat_iq3_xxs_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32], + wgsl_mul_mat_iq3_s_f32, + "mul_mat_iq3_s_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32], + wgsl_mul_mat_iq1_s_f32, + "mul_mat_iq1_s_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32], + wgsl_mul_mat_iq1_m_f32, + "mul_mat_iq1_m_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32], + wgsl_mul_mat_iq4_nl_f32, + "mul_mat_iq4_nl_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32], + wgsl_mul_mat_iq4_xs_f32, + "mul_mat_iq4_xs_f32"); } static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py index 2985622c892e0..d43fa30cfdefb 100755 --- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -18,7 +18,8 @@ def parse_decls(decls_text): def replace_placeholders(shader_text, replacements): for key, val in replacements.items(): - pattern = rf'\b{re.escape(key)}\b' + # Match {{KEY}} literally, where KEY is escaped + pattern = r'{{\s*' + re.escape(key) + r'\s*}}' shader_text = re.sub(pattern, str(val), shader_text) return shader_text diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 0f3adfef6e0b6..79465c298d726 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -1731,13 +1731,13 @@ enable f16; DECLS struct MulMatParams { - offset_src0: u32, // in elements - offset_src1: u32, // in elements - offset_dst: u32, // in elements + offset_src0: u32, // in elements/blocks + offset_src1: u32, // in elements/blocks + offset_dst: u32, // in elements/blocks m: u32, n: u32, k: u32, - // all strides are in elements + // all strides are in elements/blocks stride_01: u32, stride_11: u32, stride_02: u32, @@ -1751,10 +1751,9 @@ struct MulMatParams { broadcast3: u32 }; -@group(0) @binding(0) var src0: array; // N rows, K columns -@group(0) @binding(1) var src1: array; // M rows, K columns (transposed) +@group(0) @binding(0) var src0: array<{{SRC0_TYPE}}>; // N rows, K columns +@group(0) @binding(1) var src1: array<{{SRC1_TYPE}}>; // M rows, K columns (transposed) @group(0) @binding(2) var dst: array; // M rows, N columns -//@group(0) @binding(3) var debug: array; @group(0) @binding(3) var params: MulMatParams; @@ -1786,7 +1785,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3) { let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11; var sum = 0.0; - for (var i: u32 = 0u; i < params.k/BLOCK_SIZE; i = i + 1u) { + for (var i: u32 = 0u; i < params.k/{{BLOCK_SIZE}}; i = i + 1u) { sum += multiply_add(src0_idx_base, src1_idx_base, i); } dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 348840c6c2702..4623605f66281 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -979,15 +979,6 @@ struct test_case { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { init_tensor_uniform(t); } - // print first 32 elements of each tensor -// for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { -// if (strcmp(ggml_get_name(t), "a") == 0) { -// std::vector values = tensor_to_float(t); -// for (int i = 0; i < 32; i++) { -// printf("%s[%d] = %f\n", ggml_get_name(t), i, values[i]); -// } -// } -// } } virtual size_t op_size(ggml_tensor * t) {