
Commit d726f92

yiliu30 authored and XuehaoSun committed
[SW-228576] Add Dynamic Quant Support For FusedMoE (#243)
Signed-off-by: Yi Liu <[email protected]>
1 parent 833c107 commit d726f92

8 files changed (+75, -22 lines)

neural_compressor/torch/algorithms/fp8_quant/_core/patching_common.py

Lines changed: 2 additions & 1 deletion
@@ -83,7 +83,8 @@ def create_mod_info_recursion(parent):
     "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul),
     "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul),
     "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear),
-    "FusedMoE": ModuleInfo("matmul", PatchedMixtralMoE, False),
+    # Note: `no_quantize_op` indicates that this module is patched but does not require measurement or quantization.
+    "FusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False),
     "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
     "VllmMixtureOfExpertsOp": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOp),
     "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/hpu/hpu_quantized_func_wrapper.py

Lines changed: 30 additions & 13 deletions
@@ -33,8 +33,8 @@ class QuantizedHpuFuncWrapperBase(QuantizedFuncWrapperBase, metaclass=ABCMeta):
     Concrete class may override base class methods in case custom op logic is unique, see examples in concrete
     classes below.
     """
-    def __init__(self, scale_format):
-        self._quantized_func_ = self.get_quantized_func(scale_format)
+    def __init__(self, scale_format, is_dynamic=False):
+        self._quantized_func_ = self.get_quantized_func(scale_format, is_dynamic)

     @abstractmethod
     def get_default_quantized_func(self):
@@ -45,19 +45,32 @@ def get_scalar_quantized_func(self):
             return self.get_default_quantized_func()
         return self.get_default_quantized_func().scalar

-    def get_quantized_func(self, scale_format):
-        if scale_format == ScaleFormat.SCALAR:
-            return self.get_scalar_quantized_func()
-        elif scale_format == ScaleFormat.CONST:
-            return self.get_default_quantized_func()
+    def get_dynamic_scalar_quantized_func(self):
+        # By default, dynamic scalar quantized function is the same as scalar quantized function.
+        return self.get_scalar_quantized_func()
+
+    def get_dynamic_quantized_func(self):
+        # By default, dynamic quantized function is the same as default quantized function.
+        return self.get_default_quantized_func()
+
+    def get_quantized_func(self, scale_format, is_dynamic=False):
+        if scale_format not in [ScaleFormat.SCALAR, ScaleFormat.CONST]:
+            raise ValueError("Unsupported scale format - {}".format(scale_format))
+        if is_dynamic:
+            if scale_format == ScaleFormat.SCALAR:
+                return self.get_dynamic_scalar_quantized_func()
+            else:
+                return self.get_dynamic_quantized_func()
         else:
-            raise ValueError("Unexpected scale format - {}".format(scale_format))
+            if scale_format == ScaleFormat.SCALAR:
+                return self.get_scalar_quantized_func()
+            else:
+                return self.get_default_quantized_func()

     def __call__(self, *args, **kwargs):
         return self._quantized_func_(*args, **kwargs)


-
 class QuantizedHpuMatmul(QuantizedHpuFuncWrapperBase):
     def get_default_quantized_func(self):
         return torch.ops.hpu.fp8_gemm_v2
@@ -134,7 +147,6 @@ def get_default_quantized_func(self):
     def get_scalar_quantized_func(self):
         return torch.ops.hpu.mixture_of_experts.fp8_scalars

-
 class QuantizedHPUCastToFP8(QuantizedHpuFuncWrapperBase):
     def get_default_quantized_func(self):
         return torch.ops.hpu.cast_to_fp8_v2
@@ -144,18 +156,23 @@ def __call__(self, *args, **kwargs):

 class QuantizedHPUCastFromFP8(QuantizedHpuFuncWrapperBase):

-    def __init__(self, scale_format):
-        super().__init__(scale_format)
-
     def get_default_quantized_func(self):
         return torch.ops.hpu.cast_from_fp8

+
 class QuantizedHpuDynamicMoeFusedWeights(QuantizedHpuFuncWrapperBase):
     def get_default_quantized_func(self):
         return torch.ops.hpu.mixture_of_experts.fp8_fused_weights
+
     def get_scalar_quantized_func(self):
         return torch.ops.hpu.mixture_of_experts.fp8_fused_weights_scalars

+    def get_dynamic_scalar_quantized_func(self):
+        return torch.ops.hpu.mixture_of_experts.fp8_fused_weights_scalars_dynamic
+
+    def get_dynamic_quantized_func(self):
+        return torch.ops.hpu.mixture_of_experts.fp8_fused_weights_dynamic
+

 _OP_TYPE_HPU_QUANTIZED_WRAPPER_CLASSES = {OP_TYPE.LINEAR_GEMM : QuantizedHpuMatmul,
                                           OP_TYPE.MATMUL_GEMM: QuantizedHpuMatmul,
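
The base-class change above adds an is_dynamic axis on top of the existing scale-format dispatch, so (scale_format, is_dynamic) now selects one of up to four backend kernels, with the dynamic getters falling back to the static ones unless a subclass such as QuantizedHpuDynamicMoeFusedWeights overrides them. The self-contained sketch below reproduces that selection logic with plain Python callables standing in for the torch.ops.hpu.* kernels, so it runs without Gaudi hardware; the class and lambda names are illustrative only.

# Sketch of the four-way dispatch; callables stand in for the HPU kernels.
from enum import Enum

class ScaleFormat(Enum):
    CONST = "const"
    SCALAR = "scalar"

class FuncWrapperSketch:
    def __init__(self, scale_format, is_dynamic=False):
        self._fn = self.get_quantized_func(scale_format, is_dynamic)

    def get_default_quantized_func(self):
        return lambda x: f"static_const({x})"

    def get_scalar_quantized_func(self):
        return lambda x: f"static_scalar({x})"

    def get_dynamic_quantized_func(self):
        # Falls back to the static kernel unless a subclass overrides it,
        # mirroring the base-class defaults introduced in this commit.
        return self.get_default_quantized_func()

    def get_dynamic_scalar_quantized_func(self):
        return self.get_scalar_quantized_func()

    def get_quantized_func(self, scale_format, is_dynamic=False):
        if scale_format not in (ScaleFormat.SCALAR, ScaleFormat.CONST):
            raise ValueError(f"Unsupported scale format - {scale_format}")
        if is_dynamic:
            return (self.get_dynamic_scalar_quantized_func()
                    if scale_format == ScaleFormat.SCALAR else self.get_dynamic_quantized_func())
        return (self.get_scalar_quantized_func()
                if scale_format == ScaleFormat.SCALAR else self.get_default_quantized_func())

    def __call__(self, *args, **kwargs):
        return self._fn(*args, **kwargs)

class DynamicMoeSketch(FuncWrapperSketch):
    # A subclass with a dedicated dynamic kernel, analogous to
    # QuantizedHpuDynamicMoeFusedWeights overriding the dynamic getters.
    def get_dynamic_scalar_quantized_func(self):
        return lambda x: f"dynamic_scalar({x})"

print(FuncWrapperSketch(ScaleFormat.SCALAR, is_dynamic=True)("x"))  # static_scalar(x), via the fallback
print(DynamicMoeSketch(ScaleFormat.SCALAR, is_dynamic=True)("x"))   # dynamic_scalar(x)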

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/quantized_func_wrapper.py

Lines changed: 2 additions & 2 deletions
@@ -73,10 +73,10 @@ def initialize(cls, device_quantized_func_wrapper_dict):


     @classmethod
-    def get_quantized_func_wrapper_object(cls, op_type, scale_format):
+    def get_quantized_func_wrapper_object(cls, op_type, scale_format, is_dynamic=False):
         if op_type not in cls.__quantized_func_wrapper_instances:
             quantized_wrapper_class = cls.__device_func_wrappers_mapping[op_type]
-            cls.__quantized_func_wrapper_instances[op_type] = quantized_wrapper_class(scale_format)
+            cls.__quantized_func_wrapper_instances[op_type] = quantized_wrapper_class(scale_format, is_dynamic)

         return cls.__quantized_func_wrapper_instances[op_type]
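
For context, the factory above is a per-process cache: the first request for an op type constructs the wrapper with the given scale_format and (now) is_dynamic, and later requests for the same op type return that cached instance, since the cache is keyed by op_type alone (visible in the diff). A small standalone sketch of that behavior, with generic names in place of the real wrapper classes:

# Standalone sketch of the per-op-type cache; names are generic stand-ins.
class WrapperFactorySketch:
    _instances = {}   # op_type -> wrapper instance (created once)
    _mapping = {}     # op_type -> wrapper class

    @classmethod
    def initialize(cls, mapping):
        cls._mapping = dict(mapping)

    @classmethod
    def get_wrapper(cls, op_type, scale_format, is_dynamic=False):
        if op_type not in cls._instances:
            cls._instances[op_type] = cls._mapping[op_type](scale_format, is_dynamic)
        return cls._instances[op_type]

class DummyWrapper:
    def __init__(self, scale_format, is_dynamic=False):
        self.scale_format, self.is_dynamic = scale_format, is_dynamic

WrapperFactorySketch.initialize({"dynamic_moe_fused_weights": DummyWrapper})
a = WrapperFactorySketch.get_wrapper("dynamic_moe_fused_weights", "scalar", is_dynamic=True)
b = WrapperFactorySketch.get_wrapper("dynamic_moe_fused_weights", "scalar")  # cached instance is reused
print(a is b, a.is_dynamic)  # True True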

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/quantized_func_wrapper_api.py

Lines changed: 2 additions & 2 deletions
@@ -24,8 +24,8 @@
 Functions to interact with QuantizedFuncWrapperFactory singleton object
 """

-def get_quantized_func_wrapper(op_type, scale_format):
-    return QuantizedFuncWrapperFactory.get_quantized_func_wrapper_object(op_type, scale_format)
+def get_quantized_func_wrapper(op_type, scale_format, is_dynamic=False):
+    return QuantizedFuncWrapperFactory.get_quantized_func_wrapper_object(op_type, scale_format, is_dynamic)


 def init_quantized_func_wrapper_factory():

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/xpu/xpu_quantized_func_wrapper.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ class QuantizedXPUFuncWrapperBase(QuantizedFuncWrapperBase, metaclass=ABCMeta):
     """
     Placeholder for base class for XPU (Falcon/Jaguar Shores) quantized func wrapper.
     """
-    def __init__(self, scale_format):
+    def __init__(self, scale_format, is_dynamic=False):
         self._quantized_func_ = self.get_default_quantized_func()

 class QuantizedXPUMatmul(QuantizedXPUFuncWrapperBase):

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py

Lines changed: 1 addition & 1 deletion
@@ -382,7 +382,7 @@ def __init__(self, config, mod, measurement, params, module_type):
         num_of_experts = 8

         self.inputs_scales_creators = [
-            self.scales_method_factory.get_scale_method(QuantTensorName.INPUT)
+            self.scales_method_factory.get_scale_method(QuantTensorName.INPUT, is_dynamic=self.is_dynamic)
             for i in range(num_of_inputs + num_of_experts)
         ]
         self.output_scales_creators.append(self.scales_method_factory.get_scale_method(QuantTensorName.OUTPUT))
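
The one-line change above threads the quantizer's is_dynamic flag into the factory that builds the per-input scale creators. The sketch below is only an illustration of the underlying distinction, not the library's scale methods: a static creator returns a scale fixed from offline measurement, while a dynamic creator derives the scale from the tensor it receives at call time (the 448.0 FP8-E4M3 max is likewise just an assumed constant).

# Illustrative contrast between static and dynamic scale creation; not the
# library's scale-method implementations.
import torch

FP8_E4M3_MAX = 448.0  # assumed FP8 range, used only for this sketch

def make_static_scale(measured_absmax):
    scale = torch.tensor(measured_absmax / FP8_E4M3_MAX)
    return lambda tensor: scale                                 # same scale for every call

def make_dynamic_scale():
    return lambda tensor: tensor.abs().amax() / FP8_E4M3_MAX    # recomputed per tensor

def get_scale_method_sketch(is_dynamic, measured_absmax=1.0):
    return make_dynamic_scale() if is_dynamic else make_static_scale(measured_absmax)

x = torch.randn(4, 8)
print(get_scale_method_sketch(is_dynamic=False, measured_absmax=6.0)(x))  # constant
print(get_scale_method_sketch(is_dynamic=True)(x))                        # depends on x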

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 36 additions & 1 deletion
@@ -734,14 +734,20 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
         self.experts_max = self.orig_mod.experts_max if hasattr(self.orig_mod, "experts_max") else 7
         self.experts_used = self.local_num_experts if hasattr(self.orig_mod, "local_num_experts") else self.num_experts
         if self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]:
-            self.dynamic_moe_op = get_quantized_func_wrapper(OP_TYPE.DYNAMIC_MOE_FUSED_WEIGHTS, self.scale_format)
+
             self.quant_input = self._mod_extra_config.inputs[0]
             self.register_scale("scale_input", mod_extra_config.scale.inputs[0], self.scale_format)
             self.register_scale(
                 "scale_intermediate",
                 [mod_extra_config.scale.inputs[x] for x in range(1, self.experts_used+1)],
                 self.scale_format,
             )
+            self.is_dynamic_quantization = isinstance(self.quant_input, QuantDynamicInput)
+            self.dynamic_moe_op = get_quantized_func_wrapper(
+                OP_TYPE.DYNAMIC_MOE_FUSED_WEIGHTS, scale_format=self.scale_format, is_dynamic=self.is_dynamic_quantization
+            )
+            if self.is_dynamic_quantization:
+                self.forward = self.forward_dynamic_quant

     def forward_quant(self,
                       hidden_states,
@@ -772,6 +778,35 @@ def forward_quant(self,
         )
         return output

+    def forward_dynamic_quant(
+        self, hidden_states, expert_routing_table, router_weights, permuted_weights=True, layer=None, activation="silu"
+    ):
+        # This is the dynamic version of the forward_quant method.
+        # Compared to the `forward_quant` method, the main differences are:
+        # 1) The `quant_input` is of type `QuantDynamicInput`.
+        # 2) There is no need to pass the `d_scale_intermediate_hidden_states` to the dynamic moe op.
+        experts_range = range(self.num_experts)
+        w1_list = [self.w13_list[i].weight for i in experts_range]
+        w2_list = [self.w2_list[i].weight for i in experts_range]
+        scale_w1 = [self.w13_list[i].scale_weight for i in experts_range]
+        scale_w2 = [self.w2_list[i].scale_weight for i in experts_range]
+        qinput_fp8, input_scale = self.quant_input(hidden_states)
+        output = self.dynamic_moe_op(
+            hidden_states=qinput_fp8,
+            expert_routing_table=expert_routing_table,
+            router_weights=router_weights,
+            w12=w1_list,
+            w3=w2_list,
+            d_scale_w12=scale_w1,
+            d_scale_w3=scale_w2,
+            d_scale_hidden_states=input_scale,
+            permuted_weights=False,
+            activation=activation,
+            experts_min=self.experts_min,
+            experts_max=self.experts_max
+        )
+        return output
+
     def forward_measure(self,
                         hidden_states,
                         expert_routing_table,
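
Relative to forward_quant, the new forward_dynamic_quant path gets both the FP8 activations and their scale back from a QuantDynamicInput, forwards that per-call scale as d_scale_hidden_states, and no longer passes the pre-measured intermediate scales. The hardware-free sketch below only mimics that calling-convention difference; the stub classes and the 448.0 range are assumptions, and the real module dispatches to the torch.ops.hpu.mixture_of_experts.* kernels shown earlier.

# Hardware-free sketch of the static vs. dynamic input-quantization convention;
# class names and the FP8 range are stand-ins for illustration only.
import torch

class QuantStaticInputSketch:
    """Scale measured offline; returns only the quantized tensor."""
    def __init__(self, scale):
        self.scale = torch.tensor(scale)
    def __call__(self, x):
        return x / self.scale

class QuantDynamicInputSketch:
    """Scale computed from the tensor itself; returns (quantized, scale)."""
    def __call__(self, x):
        scale = x.abs().amax() / 448.0
        return x / scale, scale

hidden_states = torch.randn(2, 8)

# Static path: the input scale is a registered buffer known before inference.
q_static = QuantStaticInputSketch(scale=0.05)(hidden_states)

# Dynamic path: the quantizer also hands back the per-call scale, which
# forward_dynamic_quant passes on to the MoE kernel as d_scale_hidden_states.
q_dynamic, input_scale = QuantDynamicInputSketch()(hidden_states)
print(q_static.shape, q_dynamic.shape, float(input_scale))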

neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ class DeviceForScalesType(Enum):

 # TODO [SW-217813]: support dynamic quantization in all ops and remove
 # TODO [SW-228723]: get a better way to list all linear ops, like set in ModuleInfo if supports dynamic
-supported_dynamic_ops = ["linear", "row_parallel_linear"]
+supported_dynamic_ops = ["linear", "row_parallel_linear", "no_quantize_op", "dynamic_moe"]
 def is_supported_dynamic_op(op_type):
     ret = op_type.lower() in [op.lower() for op in supported_dynamic_ops]
     logger.trace("Checking if %s is supported for dynamic quantization: %s", op_type, ret)
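
Since logger.trace comes from the package's own logging helper, here is a self-contained, runnable copy of the extended check that uses the standard logging module instead; the membership test itself is unchanged from the code above.

# Runnable copy of the check above; standard logging stands in for logger.trace.
import logging

logger = logging.getLogger("fp8_quant_sketch")

supported_dynamic_ops = ["linear", "row_parallel_linear", "no_quantize_op", "dynamic_moe"]

def is_supported_dynamic_op(op_type):
    ret = op_type.lower() in [op.lower() for op in supported_dynamic_ops]
    logger.debug("Checking if %s is supported for dynamic quantization: %s", op_type, ret)
    return ret

print(is_supported_dynamic_op("Dynamic_MoE"))     # True  (comparison is case-insensitive)
print(is_supported_dynamic_op("no_quantize_op"))  # True  (FusedMoE's new op type)
print(is_supported_dynamic_op("matmul"))          # False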
