Wan MoE: Automatic expert routing based on timestep boundary

stduhpf · stduhpf · commit 8b5581ca5078 · 2025-08-29T22:23:41.000+02:00
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -112,10 +112,12 @@ struct SDParams {
     bool chroma_use_dit_mask = true;
     bool chroma_use_t5_mask  = false;
     int chroma_t5_mask_pad   = 1;
+    float boundary           = 0.875; 
 
     SDParams() {
         sd_sample_params_init(&sample_params);
         sd_sample_params_init(&high_noise_sample_params);
+        high_noise_sample_params.sample_steps = -1;
     }
 };
 
@@ -240,7 +242,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
     printf("  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
     printf("                                     (high noise) sampling method (default: \"euler_a\")\n");
-    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: 20)\n");
+    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)\n");
     printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
     printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
     printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)\n");
@@ -271,6 +273,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --chroma-t5-mask-pad  PAD_SIZE     t5 mask pad size of chroma\n");
     printf("  --video-frames                     video frames (default: 1)\n");
     printf("  --fps                              fps (default: 24)\n");
+    printf("  --moe-boundary BOUNDARY            Timestep boundary for Wan2.2 MoE model. (default: 0.875)\n");
+    printf("                                     Only enabled if `--high-noise-steps` is set to -1\n");
     printf("  -v, --verbose                      print extra info\n");
 }
 
@@ -493,6 +497,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--strength", "", &params.strength},
         {"", "--style-ratio", "", &params.style_ratio},
         {"", "--control-strength", "", &params.control_strength},
+        {"", "--moe-boundary", "", &params.boundary},
     };
 
     options.bool_options = {
@@ -753,8 +758,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
     }
 
     if (params.high_noise_sample_params.sample_steps <= 0) {
-        fprintf(stderr, "error: the high_noise_sample_steps must be greater than 0\n");
-        exit(1);
+        params.high_noise_sample_params.sample_steps = -1;
     }
 
     if (params.strength < 0.f || params.strength > 1.f) {
@@ -1181,6 +1185,7 @@ int main(int argc, const char* argv[]) {
             params.strength,
             params.seed,
             params.video_frames,
+            params.boundary
         };
 
         results = generate_video(sd_ctx, &vid_gen_params, &num_results);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -1727,11 +1727,13 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
     memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
     sd_sample_params_init(&sd_vid_gen_params->sample_params);
     sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
-    sd_vid_gen_params->width        = 512;
-    sd_vid_gen_params->height       = 512;
-    sd_vid_gen_params->strength     = 0.75f;
-    sd_vid_gen_params->seed         = -1;
-    sd_vid_gen_params->video_frames = 6;
+    sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;
+    sd_vid_gen_params->width                                 = 512;
+    sd_vid_gen_params->height                                = 512;
+    sd_vid_gen_params->strength                              = 0.75f;
+    sd_vid_gen_params->seed                                  = -1;
+    sd_vid_gen_params->video_frames                          = 6;
+    sd_vid_gen_params->boundary                              = 0.875f;
 }
 
 struct sd_ctx_t {
@@ -2381,7 +2383,18 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
         high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
     }
 
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps + high_noise_sample_steps);
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps);
+
+    if(high_noise_sample_steps < 0) {
+        // timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
+        for (size_t i = 0; i < sigmas.size(); ++i) {
+            if (sigmas[i] < sd_vid_gen_params->boundary) {
+                high_noise_sample_steps = i;
+                break;
+            }
+        }
+        LOG_DEBUG("Switching from high noise model at step %d", high_noise_sample_steps);
+    }
 
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(100 * 1024) * 1024;  // 100 MB
diff --git a/stable-diffusion.h b/stable-diffusion.h
@@ -207,6 +207,7 @@ typedef struct {
     float strength;
     int64_t seed;
     int video_frames;
+    float boundary;
 } sd_vid_gen_params_t;
 
 typedef struct sd_ctx_t sd_ctx_t;