@@ -385,6 +385,14 @@ enum shader_reduction_mode {
385385
386386static constexpr uint32_t num_argsort_pipelines = 11;
387387static constexpr uint32_t max_argsort_cols = 1 << (num_argsort_pipelines-1);
388+ static constexpr uint32_t num_topk_moe_pipelines = 10;
389+ 
390+ static constexpr std::array topk_moe_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE,  GGML_OP_ARGSORT,
391+                                            GGML_OP_VIEW,     GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
392+                                            GGML_OP_SUM_ROWS, GGML_OP_DIV,      GGML_OP_RESHAPE };
393+ static constexpr std::array topk_moe     { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE,  GGML_OP_ARGSORT,
394+                                            GGML_OP_VIEW,     GGML_OP_GET_ROWS };
395+ 
388396
389397struct vk_device_struct {
390398    std::recursive_mutex mutex;
@@ -598,6 +606,9 @@ struct vk_device_struct {
598606
599607    vk_pipeline pipeline_flash_attn_split_k_reduce;
600608
609+     // [2] is {!norm, norm}
610+     vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2];
611+ 
601612    std::vector<vk_pipeline_ref> all_pipelines;
602613
603614    std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
@@ -941,6 +952,11 @@ struct vk_op_multi_add_push_constants {
941952static_assert(MAX_PARAMETER_COUNT == 12);
942953static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
943954
// Push constants handed to the fused top-k MoE pipelines.
// NOTE(review): the field order presumably has to mirror the push-constant
// block declared in the topk_moe shader - confirm before reordering.
struct vk_op_topk_moe_push_constants {
    uint32_t n_rows;         // filled from logits->ne[1] by ggml_vk_topk_moe
    uint32_t n_expert_used;  // filled from weights->ne[1] by ggml_vk_topk_moe
};
// Vulkan only guarantees 128 bytes of push-constant space; guard the size the
// same way the neighbouring push-constant structs are guarded.
static_assert(sizeof(vk_op_topk_moe_push_constants) <= 128);
959+ 
944960struct vk_op_add_id_push_constants {
945961    uint32_t ne0;
946962    uint32_t ne1;
@@ -3722,6 +3738,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
37223738    ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
37233739    ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
37243740
3741+     for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
3742+         ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][0], "topk_moe_f32_"+std::to_string(i),   topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0}, 1, true, true);
3743+         ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][1], "topk_moe_f32_"+std::to_string(i),   topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1}, 1, true, true);
3744+     }
3745+ 
37253746    for (auto &c : compiles) {
37263747        c.wait();
37273748    }
@@ -8004,6 +8025,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
80048025        GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
80058026        GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32);
80068027
8028+         if (ctx->num_additional_fused_ops) {
8029+             uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
8030+             GGML_ASSERT(idx < num_topk_moe_pipelines);
8031+             bool with_norm = ctx->num_additional_fused_ops == topk_moe_norm.size() - 1;
8032+             return ctx->device->pipeline_topk_moe[idx][with_norm];
8033+         }
8034+ 
80078035        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
80088036            return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_wg512 : ctx->device->pipeline_soft_max_f32;
80098037        }
@@ -9589,6 +9617,87 @@ static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& sub
95899617    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] }, dryrun);
95909618}
95919619
9620+ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
9621+ 
9622+     bool with_norm = ctx->num_additional_fused_ops == topk_moe_norm.size() - 1;
9623+     ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
9624+     ggml_tensor * weights = with_norm ? cgraph->nodes[node_idx + 8] : cgraph->nodes[node_idx + 4];
9625+     ggml_tensor * ids = cgraph->nodes[node_idx + 3];
9626+ 
9627+     GGML_ASSERT(logits->type == GGML_TYPE_F32);
9628+     GGML_ASSERT(weights->type == GGML_TYPE_F32);
9629+     GGML_ASSERT(ids->type == GGML_TYPE_I32);
9630+ 
9631+     const int n_experts = logits->ne[0];
9632+     const int n_rows    = logits->ne[1];
9633+     const int n_expert_used = weights->ne[1];
9634+ 
9635+     GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
9636+ 
9637+     vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, cgraph->nodes[node_idx], GGML_OP_SOFT_MAX);
9638+ 
9639+     if (dryrun) {
9640+         ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
9641+         return;
9642+     }
9643+ 
9644+     ggml_backend_vk_buffer_context * logits_buf_ctx = (ggml_backend_vk_buffer_context *)logits->buffer->context;
9645+     ggml_backend_vk_buffer_context * weights_buf_ctx = (ggml_backend_vk_buffer_context *)weights->buffer->context;
9646+     ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
9647+ 
9648+     vk_buffer d_logits = nullptr;
9649+     size_t logits_buf_offset = 0;
9650+     vk_buffer d_weights = nullptr;
9651+     size_t weights_buf_offset = 0;
9652+     vk_buffer d_ids = nullptr;
9653+     size_t ids_buf_offset = 0;
9654+ 
9655+     bool logits_uma = false;
9656+     bool weights_uma = false;
9657+     bool ids_uma = false;
9658+ 
9659+     if (ctx->device->uma) {
9660+         ggml_vk_host_get(ctx->device, logits->data, d_logits, logits_buf_offset);
9661+         ggml_vk_host_get(ctx->device, weights->data, d_weights, weights_buf_offset);
9662+         ggml_vk_host_get(ctx->device, ids->data, d_ids, ids_buf_offset);
9663+         logits_uma = d_logits != nullptr;
9664+         weights_uma = d_weights != nullptr;
9665+         ids_uma = d_ids != nullptr;
9666+     }
9667+ 
9668+     if (!logits_uma) {
9669+         d_logits = logits_buf_ctx->dev_buffer;
9670+         logits_buf_offset = vk_tensor_offset(logits) + logits->view_offs;
9671+         GGML_ASSERT(d_logits != nullptr);
9672+     }
9673+     if (!weights_uma) {
9674+         d_weights = weights_buf_ctx->dev_buffer;
9675+         weights_buf_offset = vk_tensor_offset(weights) + weights->view_offs;
9676+         GGML_ASSERT(d_weights != nullptr);
9677+     }
9678+     if (!ids_uma) {
9679+         d_ids = ids_buf_ctx->dev_buffer;
9680+         ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
9681+         GGML_ASSERT(d_ids != nullptr);
9682+     }
9683+ 
9684+     vk_op_topk_moe_push_constants pc;
9685+     pc.n_rows = n_rows;
9686+     pc.n_expert_used = n_expert_used;
9687+ 
9688+     GGML_ASSERT(n_expert_used <= n_experts);
9689+ 
9690+     const uint32_t rows_per_block = 4;
9691+     std::array<uint32_t, 3> elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 };
9692+ 
9693+     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
9694+         {
9695+             ggml_vk_subbuffer(ctx, d_logits, logits_buf_offset),
9696+             ggml_vk_subbuffer(ctx, d_weights, weights_buf_offset),
9697+             ggml_vk_subbuffer(ctx, d_ids, ids_buf_offset),
9698+         }, pc, elements);
9699+ }
9700+ 
95929701static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool backprop, bool dryrun = false) {
95939702    const int n_dims        = ((int32_t *) dst->op_params)[1];
95949703    const int mode          = ((int32_t *) dst->op_params)[2];
@@ -11174,11 +11283,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
1117411283            ctx->unsynced_nodes_read.clear();
1117511284            ggml_vk_sync_buffers(ctx, compute_ctx);
1117611285        }
11177-         // Add the last fused node and all fused source nodes to the unsynchronized list.
11178-         const ggml_tensor * last_node = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
11179-         ctx->unsynced_nodes_written.push_back(last_node);
11286+         // Add all fused nodes to the unsynchronized lists.
1118011287        for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
1118111288            const ggml_tensor *cur_node = cgraph->nodes[node_idx + i];
11289+             // Multiple outputs could be written, e.g. in topk_moe. Add them all to the list.
11290+             ctx->unsynced_nodes_written.push_back(cur_node);
1118211291            for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) {
1118311292                if (!cur_node->src[j]) {
1118411293                    continue;
@@ -11345,7 +11454,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
1134511454
1134611455        break;
1134711456    case GGML_OP_SOFT_MAX:
11348-         ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node, dryrun);
11457+         if (ctx->num_additional_fused_ops) {
11458+             ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun);
11459+         } else {
11460+             ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node, dryrun);
11461+         }
1134911462
1135011463        break;
1135111464    case GGML_OP_SOFT_MAX_BACK:
@@ -12141,6 +12254,120 @@ static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, st
1214112254    return true;
1214212255}
1214312256
12257+ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph,
12258+                                       int node_idx, bool with_norm) {
12259+ 
12260+     if (with_norm) {
12261+         if (node_idx + (int)topk_moe_norm.size() > cgraph->n_nodes) {
12262+             return false;
12263+         }
12264+         for (size_t i = 0; i < topk_moe_norm.size(); ++i) {
12265+             if (cgraph->nodes[node_idx + i]->op != topk_moe_norm[i]) {
12266+                 return false;
12267+             }
12268+         }
12269+     } else {
12270+         if (node_idx + (int)topk_moe.size() > cgraph->n_nodes) {
12271+             return false;
12272+         }
12273+         for (size_t i = 0; i < topk_moe.size(); ++i) {
12274+             if (cgraph->nodes[node_idx + i]->op != topk_moe[i]) {
12275+                 return false;
12276+             }
12277+         }
12278+     }
12279+ 
12280+     const ggml_tensor * softmax =  cgraph->nodes[node_idx + 0];
12281+     const ggml_tensor * weights = with_norm ? cgraph->nodes[node_idx + 8] : cgraph->nodes[node_idx + 4];
12282+ 
12283+     const float * op_params = (const float *)softmax->op_params;
12284+ 
12285+     float scale = op_params[0];
12286+     float max_bias = op_params[1];
12287+ 
12288+     if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
12289+         return false;
12290+     }
12291+ 
12292+     if (scale != 1.0f || max_bias != 0.0f) {
12293+         return false;
12294+     }
12295+ 
12296+     // don't fuse when masks or sinks are present
12297+     if (softmax->src[1] || softmax->src[2]) {
12298+         return false;
12299+     }
12300+ 
12301+     const int n_expert = softmax->ne[0];
12302+     // n_expert must be a power of 2
12303+     if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) {
12304+         return false;
12305+     }
12306+ 
12307+     // Check that the nodes don't have any unexpected uses
12308+     const ggml_tensor * reshape1 =  cgraph->nodes[node_idx + 1];
12309+     const ggml_tensor * argsort =   cgraph->nodes[node_idx + 2];
12310+     const ggml_tensor * view =      cgraph->nodes[node_idx + 3];
12311+     const ggml_tensor * get_rows =  cgraph->nodes[node_idx + 4];
12312+     const ggml_tensor * reshape5 =  with_norm ? cgraph->nodes[node_idx + 5] : nullptr;
12313+     const ggml_tensor * sum_rows =  with_norm ? cgraph->nodes[node_idx + 6] : nullptr;
12314+     const ggml_tensor * div =       with_norm ? cgraph->nodes[node_idx + 7] : nullptr;
12315+     const ggml_tensor * reshape8 =  with_norm ? cgraph->nodes[node_idx + 8] : nullptr;
12316+ 
12317+     // softmax is used by reshape and argsort
12318+     if (ggml_node_get_use_count(cgraph, node_idx) != 2 ||
12319+         reshape1->src[0] != softmax ||
12320+         argsort->src[0] != softmax) {
12321+         return false;
12322+     }
12323+     // reshape is used by get_rows
12324+     if (ggml_node_get_use_count(cgraph, node_idx + 1) != 1 ||
12325+         get_rows->src[0] != reshape1) {
12326+         return false;
12327+     }
12328+     // argsort is used by view
12329+     if (ggml_node_get_use_count(cgraph, node_idx + 2) != 1 ||
12330+         view->src[0] != argsort) {
12331+         return false;
12332+     }
12333+     // view is written (via argsort), we can skip checking it
12334+ 
12335+     if (with_norm) {
12336+         // get_rows is used by reshape
12337+         if (ggml_node_get_use_count(cgraph, node_idx + 4) != 1 ||
12338+             reshape5->src[0] != get_rows) {
12339+             return false;
12340+         }
12341+ 
12342+         // reshape is used by sum_rows and div
12343+         if (ggml_node_get_use_count(cgraph, node_idx + 5) != 2 ||
12344+             sum_rows->src[0] != reshape5 ||
12345+             div->src[0] != reshape5) {
12346+             return false;
12347+         }
12348+ 
12349+         // sum_rows is used by div
12350+         if (ggml_node_get_use_count(cgraph, node_idx + 6) != 1 ||
12351+             div->src[1] != sum_rows) {
12352+             return false;
12353+         }
12354+ 
12355+         // div/reshape are written
12356+         if (reshape8->src[0] != div) {
12357+             return false;
12358+         }
12359+     }
12360+ 
12361+     if (!ctx->device->subgroup_arithmetic ||
12362+         !ctx->device->subgroup_shuffle ||
12363+         !ctx->device->subgroup_require_full_support ||
12364+         ctx->device->disable_fusion) {
12365+         return false;
12366+     }
12367+ 
12368+     return true;
12369+ }
12370+ 
1214412371static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) {
1214512372
1214612373    const ggml_tensor *first_node = cgraph->nodes[node_idx];
@@ -12216,6 +12443,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1221612443                ctx->num_additional_fused_ops = num_adds - 1;
1221712444            } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
1221812445                ctx->num_additional_fused_ops = 1;
12446+             } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, true)) {
12447+                 ctx->num_additional_fused_ops = topk_moe_norm.size() - 1;
12448+             } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, false)) {
12449+                 ctx->num_additional_fused_ops = topk_moe.size() - 1;
1221912450            }
1222012451        }
1222112452        ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
@@ -12313,17 +12544,21 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1231312544                ctx->num_additional_fused_ops = num_adds - 1;
1231412545            } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
1231512546                ctx->num_additional_fused_ops = 1;
12547+             } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, true)) {
12548+                 ctx->num_additional_fused_ops = topk_moe_norm.size() - 1;
12549+             } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, false)) {
12550+                 ctx->num_additional_fused_ops = topk_moe.size() - 1;
1231612551            }
1231712552        }
1231812553
1231912554        // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
1232012555        bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
1232112556        bool submit = (submitted_nodes >= nodes_per_submit) ||
1232212557                      (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
12323-                       (i + ctx->num_additional_fused_ops = = last_node) ||
12558+                       (i + ctx->num_additional_fused_ops > = last_node) ||
1232412559                      (almost_ready && !ctx->almost_ready_fence_pending);
1232512560
12326-         bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i + ctx->num_additional_fused_ops = = last_node, almost_ready, submit);
12561+         bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i + ctx->num_additional_fused_ops > = last_node, almost_ready, submit);
1232712562
1232812563        if (vk_perf_logger_enabled) {
1232912564            if (ctx->compute_ctx.expired()) {
@@ -12444,6 +12679,25 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
1244412679    while (first_unused < graph->n_nodes) {
1244512680        std::vector<int> current_set;
1244612681
12682+         // Avoid reordering topk_moe_norm
12683+         if (first_unused + (int)topk_moe_norm.size() <= graph->n_nodes) {
12684+             bool is_topk_moe_norm = true;
12685+             for (size_t j = 0; j < topk_moe_norm.size(); ++j) {
12686+                 if (graph->nodes[first_unused + j]->op != topk_moe_norm[j] || used[first_unused + j]) {
12687+                     is_topk_moe_norm = false;
12688+                 }
12689+             }
12690+             if (is_topk_moe_norm) {
12691+                 for (size_t j = 0; j < topk_moe_norm.size(); ++j) {
12692+                     new_order.push_back(graph->nodes[first_unused + j]);
12693+                     used[first_unused + j] = true;
12694+                 }
12695+                 while (first_unused < graph->n_nodes && used[first_unused]) {
12696+                     first_unused++;
12697+                 }
12698+                 continue;
12699+             }
12700+         }
1244712701        // First, grab the next unused node.
1244812702        current_set.push_back(first_unused);
1244912703
0 commit comments