
Commit 42b1485

yiliu30 and Yi4Liu authored
[SW-221594] Re-quantize the Official DeepSeek FP8 Model (#187)
Building on the vLLM WoQ path, this PR adds support for re-quantizing FP8 weights with per-tensor or per-channel scaling.

Co-authored-by: Yi Liu <[email protected]>
1 parent f12e7f0 commit 42b1485
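For orientation, here is a minimal standalone sketch of what "re-quantizing FP8 weights with per-tensor or per-channel scaling" means in this PR. This is not the library's API; `requantize_fp8_weight`, `FP8_MAX`, and the argument names are illustrative only, and the snippet assumes a PyTorch build with float8 dtypes.

```python
import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for E4M3

def requantize_fp8_weight(w_fp8: torch.Tensor, orig_scale: torch.Tensor, per_channel: bool = False):
    """Illustrative only: dequantize an FP8 checkpoint weight, then re-quantize it with a new scale."""
    # 1) Recover the high-precision weight using the scale shipped in the checkpoint.
    w_hp = w_fp8.to(torch.bfloat16) * orig_scale
    # 2) Derive a fresh scale from the observed amax.
    if per_channel:
        amax = w_hp.abs().amax(dim=1, keepdim=True)  # one scale per output channel
    else:
        amax = w_hp.abs().amax()                     # a single per-tensor scale
    new_scale = amax.float() / FP8_MAX
    # 3) Re-quantize to FP8 with the new scale.
    w_requant = (w_hp / new_scale).to(torch.float8_e4m3fn)
    return w_requant, new_scale
```

The diffs below wire this idea into the existing measurement and quantization flow by dequantizing original FP8 weights on the fly before observers and quantizers see them.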

File tree

8 files changed (+127, -23 lines)

neural_compressor/torch/algorithms/fp8_quant/_core/common.py

Lines changed: 7 additions & 1 deletion
@@ -20,13 +20,19 @@
 import torch
 from enum import Enum, auto
 from functools import lru_cache
-
 from .._quant_common.quant_config import get_hqt_config
 from ..utils.logger import logger
 from neural_compressor.torch.algorithms.fp8_quant.model_configs import ModuleConfig

 UNMEASURED_MODELS = "UnmeasuredModels"

+def dequant_original_fp8_weight_if_needed(mod: torch.nn.Module, param: torch.Tensor) -> torch.Tensor:
+    if param.dtype in [torch.float8_e4m3fn]:
+        if hasattr(mod, "get_dequant_weights_func"):
+            dequant_weights_func = mod.get_dequant_weights_func()
+            if dequant_weights_func is not None:
+                param = dequant_weights_func(mod)
+    return param

 class QuantTensorType(Enum):
     MEASUREMENTS = auto()
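A sketch of the duck-typed contract this helper relies on; `OfficialFP8Linear` below is a hypothetical stand-in, not part of the patch. The only requirement on the original module is a `get_dequant_weights_func()` method that returns a callable mapping the module to its high-precision weight.

```python
import torch
from neural_compressor.torch.algorithms.fp8_quant._core.common import dequant_original_fp8_weight_if_needed

class OfficialFP8Linear(torch.nn.Module):
    """Hypothetical stand-in for a layer that stores an FP8 weight plus its checkpoint scale."""
    def __init__(self, out_features=4, in_features=8):
        super().__init__()
        hp_weight = torch.randn(out_features, in_features)
        self.weight_scale = (hp_weight.abs().amax() / torch.finfo(torch.float8_e4m3fn).max).to(torch.bfloat16)
        self.weight = torch.nn.Parameter((hp_weight / self.weight_scale).to(torch.float8_e4m3fn),
                                         requires_grad=False)

    def get_dequant_weights_func(self):
        # The returned callable receives the module itself, matching `dequant_weights_func(mod)` above.
        return lambda mod: mod.weight.to(torch.bfloat16) * mod.weight_scale

mod = OfficialFP8Linear()
hp = dequant_original_fp8_weight_if_needed(mod, mod.weight)  # BF16 weight; non-FP8 params pass through unchanged
```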

neural_compressor/torch/algorithms/fp8_quant/_core/measure.py

Lines changed: 2 additions & 0 deletions
@@ -31,6 +31,7 @@
     OBSERVER_PARAMS,
     IMOD_DICT,
 )
+from neural_compressor.torch.algorithms.fp8_quant._core.common import dequant_original_fp8_weight_if_needed
 cur_accelerator = auto_detect_accelerator()


@@ -162,6 +163,7 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N
         if pmod._mod_extra_config:
             for param_name in pmod._mod_extra_config.params:
                 param = getattr(pmod, param_name)
+                param = dequant_original_fp8_weight_if_needed(pmod.orig_mod, param)
                 if config["measure_on_hpu"]:
                     param = param.to(cur_accelerator.name())
                 pmod._mod_extra_config.params[param_name].measure(param)
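The new `dequant_original_fp8_weight_if_needed` call matters here because observers need to see the weight in its real range: the raw FP8 codes only span the FP8 grid and ignore the checkpoint scale. A small illustration (assumes a PyTorch build with float8 dtypes):

```python
import torch

w_hp = torch.randn(4, 8) * 3.0
scale = w_hp.abs().amax() / torch.finfo(torch.float8_e4m3fn).max
w_fp8 = (w_hp / scale).to(torch.float8_e4m3fn)

print(w_fp8.to(torch.float32).abs().amax())             # ~448: the FP8 code range, not the weight's amax
print((w_fp8.to(torch.float32) * scale).abs().amax())   # ~w_hp.abs().amax(): what the observer should measure
```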

neural_compressor/torch/algorithms/fp8_quant/_core/patching_common.py

Lines changed: 7 additions & 2 deletions
@@ -13,7 +13,7 @@
 # limitations under the License.

 import importlib.util
-
+import os
 from ..model_configs import ModuleInfo, ModuleType
 from .._quant_common.helper_modules import *
 from ..utils.logger import logger
@@ -52,14 +52,17 @@ def create_mod_info_recursion(parent):

     create_mod_info_recursion(model)

+
+INC_DYNAMIC_MOE_EXPERTS = int(os.environ.get("INC_DYNAMIC_MOE_EXPERTS", "8"))
+
 _mod_types = {
     "linear": ModuleType(1, ["weight"], 1, False),
     "row_parallel_linear": ModuleType(1, ["weight"], 2, True),
     "matmul": ModuleType(2, [], 1, False),
     "kv_cache": ModuleType(1, [], 1, False),
     "softmax": ModuleType(1, [], 1, True),
     "fused_sdpa": ModuleType(3, [], 2, True),
-    "dynamic_moe": ModuleType(1, [], 9, True),
+    "dynamic_moe": ModuleType(1, [], 1 + INC_DYNAMIC_MOE_EXPERTS, True),
 }


@@ -80,10 +83,12 @@ def create_mod_info_recursion(parent):
     "Softmax": ModuleInfo("softmax", PatchedSoftmax),
     "ModuleFusedSDPA": ModuleInfo("fused_sdpa", PatchedModuleFusedSDPA),
     "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul),
+    "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul),
     "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear),
     "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False),
     "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
     "VllmMixtureOfExpertsOp": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOp),
+    "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
 }
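Usage note: the measured output count for a `dynamic_moe` op is `1 + INC_DYNAMIC_MOE_EXPERTS` (one routed output plus one intermediate amax per expert), so the previous hard-coded 9 corresponds to the default of 8 experts. Models whose fused-MoE op exposes a different expert count per rank can override it, as long as the variable is set before this module is imported:

```python
import os

# Hypothetical example: a deployment whose fused-MoE op carries 16 experts per rank.
# Must be set before neural_compressor's patching code is imported, since the value
# is read once at import time.
os.environ["INC_DYNAMIC_MOE_EXPERTS"] = "16"
```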

neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,7 @@
 from .measure import load_measurements
 from .scale import scale_method_mapping, load_layer_scales, prepare_layer_scales
 from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
+from neural_compressor.torch.algorithms.fp8_quant._core.common import dequant_original_fp8_weight_if_needed


 cur_accelerator = auto_detect_accelerator()
@@ -74,9 +75,12 @@ def quantize_params(mod, mod_extra_config):
         param = getattr(mod, param_name)
         if param.dtype == torch.float16:
             param = param.to(torch.bfloat16)
+        param = dequant_original_fp8_weight_if_needed(mod, param)
         quantized_param = quantizer(param.to(cur_accelerator.name()))
         delattr(mod, param_name)
         setattr(mod, param_name, nn.Parameter(quantized_param))
+        # Note: when re-quantizing FP8 weights, we need to set `updated_fp8_weight` to True
+        mod.updated_fp8_weight = True
         quantized_param = getattr(mod, param_name)
         quantized_param.requires_grad_(False)
         cur_accelerator.synchronize()

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py

Lines changed: 4 additions & 1 deletion
@@ -18,6 +18,7 @@
 from .scale_method_factory import ScaleMethodFactory, QuantTensorName
 from ..common import ModuleConfig, QuantTensorType
 from ..quant_dequant import DequantOutput, QuantDequant, QuantDequantNone, QuantInput, QuantDynamicInput
+from neural_compressor.torch.algorithms.fp8_quant._core.common import dequant_original_fp8_weight_if_needed


 class BaseOpQuantizer:
@@ -100,9 +101,11 @@ def get_scales_module_config(self):
         input_scales = self.calc_input_scales(num_of_inputs=1)
         output_measurement = self.measurement.outputs[0] if self.measurement is not None else []
         rescaled_weight = self.mod.weight if hasattr(self.mod, 'weight') else None
+        if rescaled_weight is not None:
+            rescaled_weight = dequant_original_fp8_weight_if_needed(self.mod, rescaled_weight)
         if self.weight_ich_scale_calc is not None:
             weight_scales_in_ch = self.weight_ich_scale_calc.calc_scales(input_scales[0], QuantTensorType.CONST)
-            rescaled_weight = torch.div(self.mod.weight, weight_scales_in_ch.reshape([1, -1]))
+            rescaled_weight = torch.div(rescaled_weight, weight_scales_in_ch.reshape([1, -1]))
         weights_scales_out_ch = self.weight_och_scale_calc.calc_scales(rescaled_weight, QuantTensorType.CONST)
         params_config = (
             {"weight": weights_scales_out_ch}

neural_compressor/torch/algorithms/fp8_quant/_core/utils.py

Lines changed: 8 additions & 4 deletions
@@ -23,6 +23,7 @@
 from .common import is_runtime_scale_patching

 import os
+import re
 import habana_frameworks.torch.utils.experimental as htexp


@@ -42,8 +43,11 @@ def print_init_info(config):
     logger.info("neural_compressor_pt Configuration = %s", config)


-def is_substr(substr_list, target):
-    return any([x in target for x in substr_list])
+def is_re_match(substr_list, target):
+    for substr in substr_list:
+        if re.search(substr, target):
+            return True
+    return False


 def should_quantize(config, mod_type, name):
@@ -57,12 +61,12 @@ def mod_is_not_blocked(mod_type, config):
         return (mod_type in allowlist_tuple)
     def allowlist_is_empty_or_allows_mod(mod_type, name, config):
         def mod_is_in_allowlist_config(mod_type, name, config):
-            return ((mod_type in config.cfg["allowlist"]["types"]) or (is_substr(config.cfg["allowlist"]["names"], name)))
+            return ((mod_type in config.cfg["allowlist"]["types"]) or (is_re_match(config.cfg["allowlist"]["names"], name)))
         def is_allowlist_completely_empty(config):
             return ((len(config.cfg["allowlist"]["names"]) == 0) and len(config.cfg["allowlist"]["types"]) == 0)
         return (mod_is_in_allowlist_config(mod_type, name, config) or is_allowlist_completely_empty(config))
     def name_is_not_blocked(name, config):
-        return (not is_substr(config.cfg["blocklist"]["names"], name))
+        return (not is_re_match(config.cfg["blocklist"]["names"], name))
     def is_static_scale_method(config):
         return config.cfg["scale_method"] not in _dynamic_scale_methods
     def quantize_dynamic_op(config, mod_type):
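Behavior change worth noting: allowlist/blocklist `names` entries are now treated as regular expressions via `re.search` rather than plain substrings, so a pattern like `mlp\.gate\b` can match `mlp.gate` without also matching `mlp.gate_up_proj`. The example below mirrors the unit test added at the end of this commit:

```python
import re

patterns = ["lm_head", r"mlp\.gate\b"]

print(any(re.search(p, "layer.1.mlp.gate") for p in patterns))          # True: \b matches after "gate"
print(any(re.search(p, "layer.1.mlp.gate_up_proj") for p in patterns))  # False: "_" is a word character
# The old substring check ("mlp.gate" in name) would have matched both names.
```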

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

File mode changed: 100644 → 100755
Lines changed: 76 additions & 15 deletions
@@ -22,7 +22,8 @@
 from .quant_config import QuantMode, get_hqt_config
 from ..patched_module_base import PatchedModuleBase, get_call_wrapper
 from .._core.scale_handler import get_scale_dtype, ScaleFormat
-
+from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
+cur_accelerator = auto_detect_accelerator()

 class BMM(nn.Module):
     def __init__(self):
@@ -75,9 +76,13 @@ def get_current_repr(cls_instance, *member_names):
         if not first_name:
             curr_repr += ", "
         cur_attr = getattr(cls_instance, name)
-        # currently, only scale is called here.
-        dtype = get_scale_dtype(cur_attr)
-        curr_repr += f"{name} dtype={dtype}"
+        if isinstance(cur_attr, list) and len(cur_attr) > 1:
+            dtype = get_scale_dtype(cur_attr[0])
+            curr_repr += f"{name} type=list of {dtype}, length={len(cur_attr)}"
+        else:
+            # currently, only scale is called here.
+            dtype = get_scale_dtype(cur_attr)
+            curr_repr += f"{name} dtype={dtype}"
         first_name = False
     return curr_repr

@@ -398,6 +403,7 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
         allreduce_quantization_enable = get_hqt_config(mod).cfg["row_parallel_linear_allreduce_quantization"]
         if self.quantization_mode in (QuantMode.MEASURE, QuantMode.SHAPE):
             self.forward = self.forward_measure_reduce if self.reduce_results and self.tp_size > 1 else self.forward_measure_no_reduce
+
         elif self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]:
             if self.fake_quant or self.use_qdq:
                 self.forward = self.forward_qdq
@@ -467,7 +473,7 @@ def forward_quant_reduce_in_hp(self, input):
     def measure_input_and_matmul(self, input):
         resolved_input = self.resolve_input(input)
         measure_input((resolved_input,), observer=self._mod_extra_config.inputs)
-        return torch.matmul(resolved_input, self.weight.transpose(-1, -2))
+        return self.orig_mod.quant_method.apply(self.orig_mod, resolved_input)

     def forward_measure_no_reduce(self, input):
         output = self.measure_input_and_matmul(input)
@@ -567,7 +573,7 @@ def forward_quant(self, input):

     def forward_measure(self, input):
         measure_input((input,), observer=self._mod_extra_config.inputs)
-        output = torch.matmul(input, self.weight.transpose(-1, -2))
+        output = self.orig_mod.quant_method.apply(self.orig_mod, input)
         measure_output((output,), self._mod_extra_config.outputs)
         output, output_bias = self.add_bias(output)
         if self.gather_output:
@@ -695,6 +701,8 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
         init_linear(self, mod_extra_config)
         if (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE):
             measure_input((torch.tensor(0),), observer=self._mod_extra_config.inputs)
+        else:
+            self.weight = self.weight.squeeze()

     def forward_qdq(self, input, *args, **kwargs):
         qinput = self.quant_input(input)
@@ -820,6 +828,9 @@ def extra_repr(self) -> str:
 class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
     def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
         super().__init__(mod, parent, mod_extra_config, *args, **kwargs)
+        # Get `experts_min` and `experts_max` from the original module if they exist
+        self.experts_min = self.orig_mod.experts_min if hasattr(self.orig_mod, "experts_min") else 0
+        self.experts_max = self.orig_mod.experts_max if hasattr(self.orig_mod, "experts_max") else 7
         if self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]:
             self.dynamic_moe_op = get_quantized_func_wrapper(OP_TYPE.DYNAMIC_MOE_FUSED_WEIGHTS, self.scale_format)
             self.quant_input = self._mod_extra_config.inputs[0]
@@ -837,8 +848,8 @@ def forward_quant(self,
                       permuted_weights=True,
                       activation="silu"):
         experts_range = range(self.num_experts)
-        w1_list = [self.w13_list[i].weight.squeeze() for i in experts_range]
-        w2_list = [self.w2_list[i].weight.squeeze() for i in experts_range]
+        w1_list = [self.w13_list[i].weight for i in experts_range]
+        w2_list = [self.w2_list[i].weight for i in experts_range]
         scale_w1 = [self.w13_list[i].scale_weight for i in experts_range]
         scale_w2 = [self.w2_list[i].scale_weight for i in experts_range]
         qinput = self.quant_input(hidden_states)
@@ -854,8 +865,8 @@ def forward_quant(self,
             d_scale_intermediate_hidden_states=self.scale_intermediate,
             permuted_weights=False,
             activation=activation,
-            experts_min=0,
-            experts_max=7
+            experts_min=self.experts_min,
+            experts_max=self.experts_max,
         )
         return output
@@ -877,8 +888,8 @@ def forward_measure(self,
             w3=w2_list,
             permuted_weights=permuted_weights,
             activation=activation,
-            experts_min=0,
-            experts_max=7,
+            experts_min=self.experts_min,
+            experts_max=self.experts_max,
             measurement_mode=True,
         )
         output_measure_list = [output]
@@ -888,15 +899,65 @@ def forward_measure(self,
         return output

     def extra_repr(self) -> str:
-        member_names = ["scale_input"]
-        for x in range(1, self.num_experts+1):
-            member_names.append("scale_intermediate["+str(x)+"]")
+        member_names = ["scale_input", "scale_intermediate"]
+        # for x in range(1, self.num_experts+1):
+        #     member_names.append("scale_intermediate["+str(x)+"]")
         return extra_representation(
             self.extra_repr_org(),
             self.class_name_org,
             get_current_repr(self, *member_names),
         )

+class PatchedVllmMixtureOfExpertsOpFP8(PatchedVllmMixtureOfExpertsOp):
+    """The patched module for the VllmMixtureOfExpertsOp module with FP8 weights.
+
+    Some models, such as DeepSeek R1/V3, ship with FP8 weights that need to be re-quantized.
+
+    The main difference from PatchedVllmMixtureOfExpertsOp is that the weights are FP8:
+    - At the measurement stage, we dequantize the weights to BF16.
+    - At the quantization stage, we use the same `forward_quant` method as PatchedVllmMixtureOfExpertsOp.
+    """
+
+    def forward_measure(
+        self,
+        x,
+        topk_ids,
+        topk_weights,
+    ):
+        hidden_states = x
+        measure_input((hidden_states,), observer=self._mod_extra_config.inputs)
+        min_expert = self.experts_min
+        max_expert = self.experts_max
+        w13_list_slice = []
+        w2_list_slice = []
+        for j in range(self.num_experts):
+            w13_list_slice.append(self.w13_list[j].get_dequant_weight())
+            w2_list_slice.append(self.w2_list[j].get_dequant_weight())
+
+        output, intermidiate_amax = torch.ops.hpu.mixture_of_experts.fp8_measurement_fused_weights(
+            hidden_states=x,
+            expert_routing_table=topk_ids.to(torch.int64),
+            router_weights=topk_weights.to(x.dtype),
+            w12=w13_list_slice,
+            w3=w2_list_slice,
+            permuted_weights=True,
+            activation="silu",
+            experts_min=min_expert,
+            experts_max=max_expert,
+            measurement_mode=True,
+        )
+        output_measure_list = [output]
+        for i in range(self.num_experts):
+            output_measure_list.append(intermidiate_amax[i])
+        measure_output(output_measure_list, self._mod_extra_config.outputs)
+        return output
+
+
+class PatchedMoeFP8Matmul(PatchedMoeMatmul):
+    """The patched module for the MoeMatmul module with FP8 weights."""
+
+    def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
+        super().__init__(mod, parent, mod_extra_config, *args, **kwargs)
+        self.get_dequant_weight = self.orig_mod.get_dequant_weight

 class PatchedKVCache(PatchedModuleBase):
     # Module to patch KVCache module from llama model
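Taken together, the FP8-aware patched classes assume a small duck-typed surface on the original vLLM-side modules. The stubs below are hypothetical (attribute and method names are inferred from the calls in the hunks above, not taken from vLLM's source) and only show what the patched code reads: per-expert `weight`, `scale_weight`, and `get_dequant_weight()`, plus optional `experts_min`/`experts_max` with fallbacks of 0 and 7.

```python
import torch

class MoeMatmulFP8Stub(torch.nn.Module):
    """Hypothetical per-expert matmul exposing what PatchedMoeFP8Matmul and the FP8 MoE op use."""
    def __init__(self, weight_fp8: torch.Tensor, scale: torch.Tensor):
        super().__init__()
        self.weight = torch.nn.Parameter(weight_fp8, requires_grad=False)  # FP8 storage
        self.scale_weight = scale                                          # checkpoint weight scale

    def get_dequant_weight(self) -> torch.Tensor:
        # forward_measure collects these high-precision tensors for the measurement kernel.
        return (self.weight.to(torch.bfloat16) * self.scale_weight).to(torch.bfloat16)

class VllmMixtureOfExpertsOpFP8Stub(torch.nn.Module):
    """Hypothetical fused-MoE op; only attributes read by the patched classes are shown."""
    def __init__(self, w13_experts, w2_experts, experts_min=0, experts_max=7):
        super().__init__()
        self.w13_list = torch.nn.ModuleList(w13_experts)
        self.w2_list = torch.nn.ModuleList(w2_experts)
        self.num_experts = len(w13_experts)
        self.experts_min = experts_min  # optional on the real module; patched code falls back to 0
        self.experts_max = experts_max  # optional on the real module; patched code falls back to 7
```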
(new unit-test file; path not shown in this view)

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+import unittest
+import re
+from typing import List
+from neural_compressor.torch.algorithms.fp8_quant._core.utils import is_re_match
+
+
+class TestUtils(unittest.TestCase):
+    def test_is_re_match_found(self):
+        substr_list = ["lm_head", "mlp\\.gate\\b"]
+        target = "layer.1.mlp.gate"
+        self.assertTrue(is_re_match(substr_list, target))
+        target2 = "model.lm_head"
+        self.assertTrue(is_re_match(substr_list, target2))
+
+    def test_is_re_match_not_found(self):
+        substr_list = ["lm_head", "mlp\\.gate\\b"]
+        target = "layer.1.mlp.gate_up_proj"
+        self.assertFalse(is_re_match(substr_list, target))
+
