diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index d9a195cccac14..369ddd84e917a 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -823,7 +823,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b wgpu::Buffer buf; ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, - size, + (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1), wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, "allocated_buffer"); diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl index cb7c8c3e09e91..194d2d6f58c77 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -19,20 +19,20 @@ fn main(@builtin(global_invocation_id) gid: vec3) { let start = params.offset; let end = params.offset + params.size; - for (var j: u32 = 0u; j < bytes_per_thread; j = j + 1u) { + for (var j: u32 = 0u; j < bytes_per_thread; j += 4) { let byte_index = start + i + j; - if (byte_index + 4u <= end) { - output_buffer[(byte_index >> 2u)] = params.value; + if (byte_index + 4 <= end) { + output_buffer[byte_index >> 2] = params.value; } else { // Handle tail (unaligned) - for (var k: u32 = 0u; k < 4u; k = k + 1u) { + for (var k: u32 = 0; k < 4; k++) { let idx = byte_index + k; if (idx < end) { - let word_idx = idx >> 2u; - let byte_offset = (idx & 3u) * 8u; - let mask = ~(0xffu << byte_offset); + let word_idx = idx >> 2; + let bit_offset = (idx & 3) * 8u; + let mask = ~(0xffu << bit_offset); let existing = output_buffer[word_idx]; - output_buffer[word_idx] = (existing & mask) | ((params.value & 0xffu) << byte_offset); + output_buffer[word_idx] = (existing & mask) | (params.value & (0xffu << bit_offset)); } } }