Merged
26 changes: 0 additions & 26 deletions ggml/src/ggml-cuda/ggml-cuda.cu
@@ -27,7 +27,6 @@
#include "ggml-cuda/mmq.cuh"
#include "ggml-cuda/mmvf.cuh"
#include "ggml-cuda/mmvq.cuh"
#include "ggml-cuda/moe-expert-reduce.cuh"
#include "ggml-cuda/norm.cuh"
#include "ggml-cuda/opt-step-adamw.cuh"
#include "ggml-cuda/opt-step-sgd.cuh"
@@ -3197,31 +3196,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}

if (node->op == GGML_OP_MUL) {
int current_node = i + 1;
int num_views = 0;
int num_adds = 0;
while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
num_views++;
current_node++;
}

while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&
num_adds < num_views - 1) {
Collaborator

This logic doesn't add up, what was the intention here?

Collaborator (Author)

What do you mean? n_expert_used views followed by n_expert_used - 1 adds.

Collaborator

Well, it's a little awkward when n_expert_used is 1 (i.e. no add, but a cont instead), and I'm not sure you're catering for that?

Collaborator

I think this also means it will kick in on just mul + view.

Collaborator (Author) @am17an, Nov 8, 2025

Good point, I'll fix that once I figure out the other problems in this kernel (namely related to the lifetime of the weights buffer).

num_adds++;
current_node++;
}

if (num_adds == num_views - 1 && num_views > 0) {
ggml_tensor * dst_node = cgraph->nodes[current_node - 1];
if (ggml_cuda_should_use_moe_expert_reduce(cgraph, i, current_node)) {
ggml_cuda_op_moe_expert_reduce(*cuda_ctx, node->src[0], node->src[1], dst_node);
i += num_views + num_adds;
continue;
}
}
}

if (node->op == GGML_OP_ADD) {
int n_fuse = 0;
ggml_op ops[8];
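To make the edge cases raised in the review thread above concrete, here is a minimal, self-contained sketch of the pattern check with the guard the reviewers ask for. The Op enum, the flat node vector, and matches_moe_expert_reduce are mock stand-ins invented for illustration, not the real ggml graph types or the removed fusion code; the point is the num_views >= 2 requirement, which the removed check (num_views > 0) lacked, so it could also trigger on the n_expert_used == 1 shape (mul + single view + cont, no adds) and on a bare mul + view sequence.

#include <cstdio>
#include <vector>

// Mock op kinds standing in for GGML_OP_MUL / GGML_OP_VIEW / GGML_OP_ADD / GGML_OP_CONT.
enum class Op { MUL, VIEW, ADD, CONT, OTHER };

// Match MUL -> VIEW x N -> ADD x (N - 1) starting at node i, with N >= 2,
// i.e. at least two expert views joined by at least one add.
static bool matches_moe_expert_reduce(const std::vector<Op> & nodes, size_t i) {
    if (i >= nodes.size() || nodes[i] != Op::MUL) {
        return false;
    }
    size_t cur = i + 1;
    int num_views = 0;
    while (cur < nodes.size() && nodes[cur] == Op::VIEW) {
        num_views++;
        cur++;
    }
    int num_adds = 0;
    while (cur < nodes.size() && nodes[cur] == Op::ADD && num_adds < num_views - 1) {
        num_adds++;
        cur++;
    }
    // Guard: require at least two views, so a single-expert (mul + view + cont)
    // graph and a plain mul + view sequence are not fused.
    return num_views >= 2 && num_adds == num_views - 1;
}

int main() {
    const std::vector<Op> multi  = { Op::MUL, Op::VIEW, Op::VIEW, Op::VIEW, Op::ADD, Op::ADD }; // n_expert_used == 3
    const std::vector<Op> single = { Op::MUL, Op::VIEW, Op::CONT };                             // n_expert_used == 1
    const std::vector<Op> bare   = { Op::MUL, Op::VIEW, Op::OTHER };                            // plain mul + view

    std::printf("multi:  %d\n", matches_moe_expert_reduce(multi, 0));  // 1
    std::printf("single: %d\n", matches_moe_expert_reduce(single, 0)); // 0
    std::printf("bare:   %d\n", matches_moe_expert_reduce(bare, 0));   // 0
    return 0;
}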
168 changes: 0 additions & 168 deletions ggml/src/ggml-cuda/moe-expert-reduce.cu

This file was deleted.

11 changes: 0 additions & 11 deletions ggml/src/ggml-cuda/moe-expert-reduce.cuh

This file was deleted.

58 changes: 0 additions & 58 deletions tests/test-backend-ops.cpp
@@ -4882,60 +4882,6 @@ struct test_topk_moe: public test_case {
}
};

struct test_moe_expert_reduce : public test_case {
const int64_t n_embd;
const int64_t n_tokens;
const int64_t n_expert_used;

test_moe_expert_reduce(int64_t n_embd = 64, int64_t n_tokens = 5, int64_t n_expert_used = 4)
: n_embd(n_embd), n_tokens(n_tokens), n_expert_used(n_expert_used) {
GGML_ASSERT(n_expert_used > 1);
}

std::string vars() override {
return VARS_TO_STR3(n_embd, n_tokens, n_expert_used);
}

std::string op_desc(ggml_tensor * t) override {
GGML_UNUSED(t);
return "MOE_EXPERT_REDUCE";
}

bool run_whole_graph() override { return true; }

ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * experts = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_expert_used, n_tokens);
ggml_set_name(experts, "experts");

ggml_tensor * weights = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, n_expert_used, n_tokens);
ggml_set_name(weights, "weights");

ggml_tensor * weighted = ggml_mul(ctx, experts, weights);
ggml_set_name(weighted, "weighted_experts");

std::vector<ggml_tensor *> expert_views(n_expert_used);
for (int64_t i = 0; i < n_expert_used; ++i) {
expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[2], i * weighted->nb[1]);

std::string name = "expert_view_" + std::to_string(i);
ggml_set_name(expert_views[i], name.c_str());
ggml_build_forward_expand(gf, expert_views[i]);
}

ggml_tensor * moe_out = expert_views[0];
for (int64_t i = 1; i < n_expert_used; ++i) {
moe_out = ggml_add(ctx, moe_out, expert_views[i]);

std::string name = "expert_add_" + std::to_string(i - 1);
ggml_set_name(moe_out, name.c_str());
}

ggml_set_name(moe_out, "moe_out");

return moe_out;
}
};

struct test_mul_mat_vec_fusion : public test_case {
const ggml_type type;
const ggml_glu_op glu_op;
@@ -7415,10 +7361,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_topk_moe({ 8, 22, 1, 1 }, 4, /*with_norm*/ false, /*delayed_softmax*/ true));
test_cases.emplace_back(new test_topk_moe({ 32, 22, 1, 1 }, 8, /*with_norm*/ false, /*delayed_softmax*/ true));

test_cases.emplace_back(new test_moe_expert_reduce(1024, 5, 4));
test_cases.emplace_back(new test_moe_expert_reduce(80, 3, 6));
test_cases.emplace_back(new test_moe_expert_reduce(80, 3, 7));

#if 0
// these tests are disabled to save execution time, but they can be handy for debugging
test_cases.emplace_back(new test_llama(2, true));
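For reference, the removed test above builds the unfused graph for this reduction: the experts tensor is multiplied by the per-expert weights, one 2D view is taken per expert, and the views are summed with a chain of adds. A minimal CPU sketch of the same computation, using plain buffers rather than ggml tensors (an illustration of the math only, not the deleted CUDA kernel):

#include <cstdio>
#include <vector>

// moe_out[t][d] = sum over experts e of weights[t][e] * experts[t][e][d]
static std::vector<float> moe_expert_reduce_ref(const std::vector<float> & experts, // [n_tokens][n_expert_used][n_embd]
                                                const std::vector<float> & weights, // [n_tokens][n_expert_used]
                                                int n_embd, int n_expert_used, int n_tokens) {
    std::vector<float> out((size_t) n_tokens * n_embd, 0.0f);
    for (int t = 0; t < n_tokens; ++t) {
        for (int e = 0; e < n_expert_used; ++e) {
            const float w = weights[(size_t) t * n_expert_used + e];
            for (int d = 0; d < n_embd; ++d) {
                out[(size_t) t * n_embd + d] += w * experts[((size_t) t * n_expert_used + e) * n_embd + d];
            }
        }
    }
    return out;
}

int main() {
    const int n_embd = 4, n_expert_used = 2, n_tokens = 1;
    const std::vector<float> experts = { 1, 2, 3, 4,  10, 20, 30, 40 }; // one token, two experts
    const std::vector<float> weights = { 0.25f, 0.75f };

    const std::vector<float> out = moe_expert_reduce_ref(experts, weights, n_embd, n_expert_used, n_tokens);
    for (float v : out) {
        std::printf("%g ", v); // prints: 7.75 15.5 23.25 31
    }
    std::printf("\n");
    return 0;
}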