Skip to content

Commit b3d334c

Browse files
yiliu30 and Yi4Liu
authored and committed
Fix PatchedMoeMatmul and Get num_experts from Module (#202)
Fix `PatchedMoeMatmul` and Get `num_experts` from Module --------- Signed-off-by: Yi Liu <[email protected]> Co-authored-by: Yi Liu <[email protected]>
1 parent 3d9a24c commit b3d334c

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

neural_compressor/torch/algorithms/fp8_quant/_core/measure.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,10 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N
143143
patched_types.add(type(mod))
144144

145145
set_hqt_config(mod, top_level_config) # set config in the module, as it consumed by the patched module
146+
if mod_type == "dynamic_moe" and hasattr(mod, "num_experts"):
147+
# override default number of outputs for dynamic moe
148+
mod_types[mod_type].num_outputs = mod.num_experts+1
149+
logger.warning(f"Dynamic moe num_outputs set to {mod.num_experts+1}")
146150
mod_extra_config = (
147151
init_measure_object(
148152
mod,

neural_compressor/torch/algorithms/fp8_quant/_core/patching_common.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,14 @@ def create_mod_info_recursion(parent):
5252

5353
create_mod_info_recursion(model)
5454

55-
56-
INC_DYNAMIC_MOE_EXPERTS = int(os.environ.get("INC_DYNAMIC_MOE_EXPERTS", "8"))
5755

5856
_mod_types = {
5957
"linear": ModuleType(1, ["weight"], 1, False),
6058
"matmul": ModuleType(2, [], 1, False),
6159
"kv_cache": ModuleType(1, [], 1, False),
6260
"softmax": ModuleType(1, [], 1, True),
6361
"fused_sdpa": ModuleType(3, [], 2, True),
64-
"dynamic_moe": ModuleType(1, [], 1 + INC_DYNAMIC_MOE_EXPERTS, True),
62+
"dynamic_moe": ModuleType(1, [], 1 + 8, True),
6563
}
6664

6765

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,7 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
630630
if (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE):
631631
measure_input((torch.tensor(0),), observer=self._mod_extra_config.inputs)
632632
else:
633-
self.weight = self.weight.squeeze()
633+
self.weight = torch.nn.Parameter(self.weight.squeeze(), requires_grad=False)
634634

635635
def forward_qdq(self, input, *args, **kwargs):
636636
qinput = self.quant_input(input)

0 commit comments

Comments (0)