
Commit 8bb9758

mengniwang95 (Mengni Wang) and co-author authored
[SW-233731] Support FP8 QDQ quant on CPU (#239)
Supported module types: Linear, Conv2d, EmbeddingBag (weight-only quant)
Validated scheme: per-tensor, sym, E4M3
Validated models: DLRM, ViT

---------

Signed-off-by: Mengni Wang <[email protected]>
Signed-off-by: Mengni Wang <[email protected]>
Co-authored-by: Mengni Wang <[email protected]>
1 parent 9d13736 commit 8bb9758
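
For context, the end-to-end flow this commit enables on CPU might look like the sketch below. It is illustrative only: prepare, convert, and finalize_calibration are the INC 3.x PyTorch entry points, but the FP8Config keyword names used here (mode, fp8_config, use_qdq, dump_stats_path) simply mirror the JSON configuration keys handled in quant_config.py and are assumptions, not a verified signature. In practice the measurement and quantization passes usually run as two separate jobs sharing dump_stats_path.

# Illustrative sketch, not the library's documented example; FP8Config kwargs are assumptions.
import torch
from neural_compressor.torch.quantization import FP8Config, prepare, convert, finalize_calibration

class TinyDLRMBlock(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = torch.nn.EmbeddingBag(1000, 16, mode="sum")   # weight-only FP8 quant
        self.fc = torch.nn.Linear(16, 1)                          # FP8 QDQ quant

    def forward(self, ids, offsets):
        return self.fc(self.emb(ids, offsets))

model = TinyDLRMBlock()
ids, offsets = torch.randint(0, 1000, (8,)), torch.tensor([0, 4])

# 1) measurement pass on CPU
measure_cfg = FP8Config(mode="MEASURE", fp8_config="E4M3", dump_stats_path="./fp8_stats")
model = prepare(model, measure_cfg)
model(ids, offsets)
finalize_calibration(model)

# 2) quantization pass: per this commit, CPU only supports QDQ mode
quant_cfg = FP8Config(mode="QUANTIZE", fp8_config="E4M3", use_qdq=True, dump_stats_path="./fp8_stats")
model = convert(model, quant_cfg)
print(model(ids, offsets))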

File tree

13 files changed: +332 −30 lines

neural_compressor/torch/algorithms/fp8_quant/_core/common.py

Lines changed: 4 additions & 1 deletion
@@ -22,6 +22,9 @@
 from functools import lru_cache
 from ..utils.logger import logger
 from neural_compressor.torch.algorithms.fp8_quant.model_configs import ModuleConfig
+from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
+
+cur_device = auto_detect_accelerator().current_device_name()
 
 UNMEASURED_MODELS = "UnmeasuredModels"
 
@@ -161,7 +164,7 @@ def load_scales(fname, target_format):
     return d
 
 
-def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, device="hpu"):
+def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, device=cur_device):
    scales_temp = {k: scales_obj[k].__dict__ for k in scales_obj}
    scales_temp = format_functions_rec((scales_file_format, torch.Tensor))(scales_temp)
    scales_temp = rec_fn(scales_temp, lambda x: x.to(dtype=hp_dtype, device=device))
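
The change above replaces a hard-coded device="hpu" default with the accelerator detected once at import time. A minimal sketch of that pattern follows; scales_to_tensors is a hypothetical stand-in for convert_scales_to_tensors_dict, while auto_detect_accelerator and current_device_name are the real helpers used in the diff.

# Minimal sketch of the device-default pattern: detect the accelerator once, then use it
# as the default target for scale tensors so the same code works on "cpu" and "hpu".
import torch
from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator

cur_device = auto_detect_accelerator().current_device_name()  # e.g. "cpu" or "hpu"

def scales_to_tensors(scales: dict, hp_dtype=torch.bfloat16, device=cur_device):
    # illustrative helper: convert plain-float scales to tensors on the detected device
    return {name: torch.tensor(value, dtype=hp_dtype, device=device) for name, value in scales.items()}

print(scales_to_tensors({"model.fc.weight": 0.125}))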

neural_compressor/torch/algorithms/fp8_quant/_core/patching_common.py

Lines changed: 19 additions & 0 deletions
@@ -61,6 +61,7 @@ def create_mod_info_recursion(parent):
     "softmax": ModuleType(1, [], 1, True),
     "fused_sdpa": ModuleType(3, [], 2, True),
     "dynamic_moe": ModuleType(1, [], 1 + 8, True),
+    "embedding": ModuleType(1, ["weight"], 1, False),
 }
 
 
@@ -126,13 +127,31 @@ def _import_xpu_modules():
                                         "Matmul": ModuleInfo("matmul", PatchedMatmul),})
     PATCHED_MODULE_TYPES_TABLE["xpu"].update({"linear": _mod_types["linear"]})
 
+
+@functools.lru_cache(maxsize=None)
+def _import_cpu_modules():
+    from neural_compressor.torch.algorithms.fp8_quant.patched_module_base import (
+        PATCHED_MODULE_TABLE, PATCHED_MODULE_TYPES_TABLE
+    )
+    cur_accelerator = auto_detect_accelerator()
+    if not cur_accelerator.current_device_name().startswith("cpu"):
+        return
+    PATCHED_MODULE_TABLE["cpu"].update({"Linear": ModuleInfo("linear", PatchedLinear),
+                                        "Conv2d": ModuleInfo("linear", PatchedConv2d),
+                                        "EmbeddingBag": ModuleInfo("embedding", PatchedEmbeddingBag),
+                                        })
+    PATCHED_MODULE_TYPES_TABLE["cpu"].update({"linear": _mod_types["linear"], "embedding": _mod_types["embedding"]})
+
+
 @functools.lru_cache(maxsize=None)
 def _import_device_modules():
     cur_accelerator_type = auto_detect_accelerator().get_inc_accelerator_type()
     if cur_accelerator_type.value > INCAcceleratorType.GAUDI_MIN.value:
         _import_hpu_modules()
     elif cur_accelerator_type == INCAcceleratorType.XPU:
         _import_xpu_modules()
+    elif cur_accelerator_type == INCAcceleratorType.CPU:
+        _import_cpu_modules()
     else:
         logger.warning("No HPU or XPU devices were detected. No Patched Modules available.")
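
_import_cpu_modules follows the same registration pattern as the existing HPU/XPU importers: a per-device table maps original module class names to the patched replacement and its quantizer type. Below is a toy, self-contained version of that pattern for illustration only; ModuleInfo, PATCHED_MODULE_TABLE, and register_cpu_modules here are stand-ins, not the library's actual classes or tables.

# Toy re-implementation of the per-device patched-module registry pattern (illustrative).
from collections import namedtuple
import torch

ModuleInfo = namedtuple("ModuleInfo", ["type", "patched_module"])
PATCHED_MODULE_TABLE = {"cpu": {}, "hpu": {}, "xpu": {}}

class PatchedLinear: ...
class PatchedConv2d: ...
class PatchedEmbeddingBag: ...

def register_cpu_modules(device_name: str) -> None:
    if not device_name.startswith("cpu"):
        return
    PATCHED_MODULE_TABLE["cpu"].update({
        "Linear": ModuleInfo("linear", PatchedLinear),
        "Conv2d": ModuleInfo("linear", PatchedConv2d),          # Conv2d reuses the linear quantizer
        "EmbeddingBag": ModuleInfo("embedding", PatchedEmbeddingBag),
    })

register_cpu_modules("cpu")
# look up the patch class for a module encountered while walking the model
print(PATCHED_MODULE_TABLE["cpu"][type(torch.nn.Linear(4, 4)).__name__].patched_module)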

neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py

Lines changed: 15 additions & 12 deletions
@@ -47,13 +47,14 @@ def __init__(self, lp_dtype, hp_dtype="", *args, **kwargs):
         self.qdq_init()
 
     def qdq_init(self):
-        import habana_frameworks.torch.utils.experimental as htexp
-        if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2 and self.lp_dtype == torch.float8_e4m3fn:
-            self.quant_min = int(torch.finfo(torch.float8_e4m3fnuz).min)
-            self.quant_max = int(torch.finfo(torch.float8_e4m3fnuz).max)
-        else:
-            self.quant_min = int(torch.finfo(self.lp_dtype).min)
-            self.quant_max = int(torch.finfo(self.lp_dtype).max)
+        self.quant_min = int(torch.finfo(self.lp_dtype).min)
+        self.quant_max = int(torch.finfo(self.lp_dtype).max)
+
+        if cur_accelerator.current_device_name() == "hpu":
+            import habana_frameworks.torch.utils.experimental as htexp
+            if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2 and self.lp_dtype == torch.float8_e4m3fn:
+                self.quant_min = int(torch.finfo(torch.float8_e4m3fnuz).min)
+                self.quant_max = int(torch.finfo(torch.float8_e4m3fnuz).max)
 
         if self.scale_format == ScaleFormat.CONST:
             self.zero_point = nn.Parameter(torch.tensor(0.))
@@ -98,7 +99,8 @@ def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
                 else quantize_per_tensor_to_fp8
             )
 
-        self.cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)
+        else:
+            self.cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)
 
     def forward(self, x):
         return self.cast_to_op(x, self.scale_inv, False, False, self.lp_dtype)
@@ -156,8 +158,8 @@ def __init__(self, scale, lp_dtype, hp_dtype, *args, **kwargs):
                 if self.scale_format == ScaleFormat.CONST and self.scale.numel() > 1
                 else dequantize_per_tensor_from_fp8
             )
-
-        self.cast_from_op = get_quantized_func_wrapper(OP_TYPE.CAST_FROM_FP8, self.scale_format)
+        else:
+            self.cast_from_op = get_quantized_func_wrapper(OP_TYPE.CAST_FROM_FP8, self.scale_format)
 
     def forward(self, x):
         return self.cast_from_op(x, self.scale, self.hp_dtype)
@@ -185,8 +187,9 @@ def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
         super(QuantDequant, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
         self.register_scale("scale_inv", scale_inv, self.scale_format)
         self.register_scale("scale", 1 / scale_inv, self.scale_format)
-        self.cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)
-        self.cast_from_op = get_quantized_func_wrapper(OP_TYPE.CAST_FROM_FP8, self.scale_format)
+        if not self.use_qdq:
+            self.cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)
+            self.cast_from_op = get_quantized_func_wrapper(OP_TYPE.CAST_FROM_FP8, self.scale_format)
 
     def forward(self, x, *args, **kwargs):
         y = self.cast_to_op(x, self.scale_inv, False, False, self.lp_dtype)
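
The reordered qdq_init now derives the quantization range from the low-precision dtype first and only applies the Gaudi2-specific float8_e4m3fnuz override when the detected device is HPU. The ranges involved can be checked directly on any PyTorch build with FP8 dtypes:

# FP8 ranges used by qdq_init: standard E4M3 vs. the fnuz variant used as a Gaudi2 override.
import torch

print(int(torch.finfo(torch.float8_e4m3fn).min), int(torch.finfo(torch.float8_e4m3fn).max))      # -448 448
print(int(torch.finfo(torch.float8_e4m3fnuz).min), int(torch.finfo(torch.float8_e4m3fnuz).max))  # -240 240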

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/quantized_func_wrapper_api.py

Lines changed: 4 additions & 1 deletion
@@ -37,9 +37,12 @@ def init_quantized_func_wrapper_factory():
     elif device_name == "xpu":
         from .xpu.xpu_quantized_func_wrapper import init_xpu_quantized_func_wrapper_factory
         init_xpu_quantized_func_wrapper_factory()
+    elif device_name == "cpu":
+        # only support QDQ now
+        pass
     else:
         raise ValueError("Unknown device type - {}".format(device_name))
 
 
 def clear_quantized_func_wrapper_factory():
-    QuantizedFuncWrapperFactory.clear()
\ No newline at end of file
+    QuantizedFuncWrapperFactory.clear()

neural_compressor/torch/algorithms/fp8_quant/_core/scale_handler.py

Lines changed: 6 additions & 3 deletions
@@ -16,6 +16,9 @@
 import types
 from .._quant_common.quant_config import ScaleFormat
 from .common import is_runtime_scale_patching
+from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
+
+cur_device = auto_detect_accelerator().current_device_name()
 
 
 def add_scale_registry(patched_mod):
@@ -83,7 +86,7 @@ def get_scale_dtype(scale):
     raise Exception(f"Unexpected scale instance type: {type(scale).__name__}, expected Torch.tensor or float number")
 
 
-def get_param_scales_from_scalar(patched_mod, prefix, dtype=torch.bfloat16, device=torch.device('hpu')):
+def get_param_scales_from_scalar(patched_mod, prefix, dtype=torch.bfloat16, device=cur_device):
    """Get all scales in param_list, used for saving scalar scales"""
    scale_dict = {}
    for name in patched_mod.scale_members:
@@ -95,7 +98,7 @@ def get_param_scales_from_scalar(patched_mod, prefix, dtype=torch.bfloat16, devi
     return scale_dict
 
 
-def get_param_scales_from_list(patched_mod, prefix, dtype=torch.bfloat16, device=torch.device('hpu')):
+def get_param_scales_from_list(patched_mod, prefix, dtype=torch.bfloat16, device=cur_device):
    """Get all scales in param_list, used for saving scalar scales"""
    scale_dict = {}
    for name in patched_mod.scale_members:
@@ -141,7 +144,7 @@ def set_param_scales_into_list(patched_mod, state_dict):
 def get_state_dict(patched_mod, *args, destination=None, prefix='', keep_vars=False):
     """replace torch.nn.Module.state_dict"""
     cur_state_dict = torch.nn.Module.state_dict(patched_mod, *args, destination=destination, prefix=prefix, keep_vars=keep_vars)
-    device = torch.device('hpu')
+    device = cur_device
     dtype = patched_mod.hp_dtype
     if patched_mod.scale_format == ScaleFormat.SCALAR:
         scale_dict = get_param_scales_from_scalar(patched_mod, prefix, dtype=dtype, device=device)
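
Note that the replaced defaults now pass a device-name string rather than a torch.device object; torch accepts either form, which is what lets cur_device substitute for torch.device('hpu') unchanged. A quick sanity sketch:

# torch accepts the plain device-name string returned by current_device_name().
import torch

cur_device = "cpu"  # stand-in for auto_detect_accelerator().current_device_name()
scale = torch.tensor(0.125, dtype=torch.bfloat16, device=cur_device)
print(scale.device == torch.device(cur_device))  # True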

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py

Lines changed: 69 additions & 2 deletions
@@ -20,6 +20,8 @@
 from ..quant_dequant import DequantOutput, QuantDequant, QuantDequantNone, QuantInput, QuantDynamicInput
 from ...utils.logger import logger
 from neural_compressor.torch.algorithms.fp8_quant._core.common import dequant_original_fp8_weight_if_needed
+from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
+cur_device = auto_detect_accelerator().current_device_name()
 
 
 class BaseOpQuantizer:
@@ -114,7 +116,7 @@ def get_scales_module_config(self):
         rescaled_weight = self.mod.weight if hasattr(self.mod, 'weight') else None
         if self.scales_method_factory.scale_method_config_map[QuantTensorName.WEIGHT_IN_CH].scale_value_type != ScaleValueType.DUMMY_SCALES:
             # Calculating weight in hpu to support scale calculation CGUID torch.ops.hpu.calculate_scale_for_cast
-            rescaled_weight = rescaled_weight.to("hpu")
+            rescaled_weight = rescaled_weight.to(cur_device)
         if rescaled_weight is not None:
             rescaled_weight = dequant_original_fp8_weight_if_needed(self.mod, rescaled_weight)
         if self.weight_ich_scale_calc is not None:
@@ -420,13 +422,78 @@ def scales_module_config_to_q_and_dq(self, module):
 
 
 
+class EmbeddingOpQuantizer(BaseOpQuantizer):
+
+    def __init__(self, config, mod, measurement, params, module_type):
+        super().__init__(config, mod, measurement, params, module_type)
+        self.inputs_scales_creators.append(self.scales_method_factory.get_scale_method(QuantTensorName.INPUT))
+        self.weight_och_scale_calc = self.scales_method_factory.get_scale_method(QuantTensorName.WEIGHT_OUT_CH)
+        self.weight_ich_scale_calc = self.scales_method_factory.get_scale_method(QuantTensorName.WEIGHT_IN_CH)
+        self.output_scales_creators.append(self.scales_method_factory.get_scale_method(QuantTensorName.OUTPUT))
+
+    def get_scales_module_config(self):
+        weight = self.mod.weight if hasattr(self.mod, 'weight') else None
+        input_scales = self.calc_input_scales(num_of_inputs=1)
+
+        if self.weight_ich_scale_calc is not None:
+            weight_scales_in_ch = self.weight_ich_scale_calc.calc_scales(input_scales[0], QuantTensorType.CONST)
+            weight = torch.div(weight, weight_scales_in_ch.reshape([1, -1]))
+        weights_scales_out_ch = self.weight_och_scale_calc.calc_scales(weight, QuantTensorType.CONST)
+
+        params_config = (
+            {"weight": weights_scales_out_ch}
+            if (self.weight_ich_scale_calc is None)
+            else {"weight": {0: weights_scales_out_ch, 1: weight_scales_in_ch}}
+        )
+        return ModuleConfig(
+            (),
+            (),
+            params_config,
+        )
+
+    def init_weight_config(self, scales, scales_inv, lp_dtype, hp_dtype, scale_format, use_qdq, fake_quant):
+        if use_qdq:
+            # to ensure the weights to be loaded to the device in fp8
+            weight_config = [
+                QuantInput(scales_inv, lp_dtype, hp_dtype, scale_format=scale_format, use_qdq=use_qdq),
+                DequantOutput(scales, lp_dtype, hp_dtype, scale_format=scale_format, use_qdq=use_qdq),
+            ]
+        else:
+            raise ValueError("For FP8 quantization, {} only supports QDQ mode now!".format(self.mod.__class__.__name__))
+        return weight_config
+
+    def init_weights_from_module(self, params_config):
+        if isinstance(params_config, dict):
+            self.weight_och_scale_calc.scale = params_config[0]
+            self.weight_ich_scale_calc.scale = params_config[1]
+        else:
+            self.weight_och_scale_calc.scale = params_config
+
+    def scales_module_config_to_q_and_dq(self, module):
+        self.init_scales_from_module_config(module)
+        self.init_weights_from_module(module.params["weight"])
+        scale_format, use_qdq, fake_quant, lp_dtype, hp_dtype = self.get_module_configuration()
+        weight_config = self.init_weight_config(
+            self.weight_och_scale_calc.scale,
+            self.weight_och_scale_calc.calc_invert_scales(),
+            lp_dtype,
+            hp_dtype,
+            scale_format,
+            use_qdq,
+            fake_quant,
+        )
+        params_config = {"weight": weight_config}
+        return ModuleConfig([], [], params_config)
+
+
 ops_quantizer_map = {"linear": LinearOpQuantizer,
                      "matmul": MatmulOpQuantizer,
                      "fused_sdpa": FsdpaOpQuantizer,
                      "softmax": SoftmaxOpQuantizer,
                      "kv_cache": KVCacheOpQuantizer,
                      "dynamic_moe": DynamicMoeOpQuantizer,
-                     "row_parallel_linear": RowParallelLinearOpQuantizer
+                     "row_parallel_linear": RowParallelLinearOpQuantizer,
+                     "embedding": EmbeddingOpQuantizer,
                      }
 
 def get_op_quantizer(config, mod, measurement, params, module_type):
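
Conceptually, EmbeddingOpQuantizer emits a QuantInput/DequantOutput pair for the weight only. Under the validated per-tensor, symmetric, E4M3 scheme that boils down to the numerics sketched below; qdq_weight_per_tensor_e4m3 is a hypothetical helper for illustration, not one of the scale-method classes used above.

# Sketch of per-tensor symmetric E4M3 weight QDQ, assuming absolute-max scaling.
import torch

def qdq_weight_per_tensor_e4m3(weight: torch.Tensor, hp_dtype=torch.bfloat16):
    fp8_max = torch.finfo(torch.float8_e4m3fn).max                # 448.0
    scale = weight.abs().max().float() / fp8_max                  # per-tensor symmetric scale
    qweight = (weight.float() / scale).to(torch.float8_e4m3fn)    # Q: cast to FP8 (QuantInput role)
    return (qweight.float() * scale).to(hp_dtype)                 # DQ: back to hp dtype (DequantOutput role)

w = torch.randn(1000, 16)
print(qdq_weight_per_tensor_e4m3(w).dtype)  # torch.bfloat16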

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scales_method.py

Lines changed: 1 addition & 1 deletion
@@ -136,7 +136,7 @@ def calc_scales(self, tensor, tensor_type, **additional_kwargs):
 # used when running with dummy measurement (prepare_model_with_dummy_measurement)
 class DummyScales(ScalesMethod):
     def calc_scales(self, tensor, tensor_type, **additional_kwargs):
-        self.scale = torch.tensor(1.0).to("hpu")
+        self.scale = torch.tensor(1.0).to(self.device)
         return self.scale
 
 
neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 48 additions & 0 deletions
@@ -167,6 +167,7 @@ def init_linear(self, mod_extra_config):
             self.quant_input = self._mod_extra_config.inputs[0]
             self.dequant_output = self._mod_extra_config.outputs[0]
 
+
         # When offloading weights to disk using device_map, the module forward is overridden.
         # __dict__.update call again overrides the PatchedLinear forward with the forward that device_map planted.
         # So need to set PatchedLinear forward to be the right forward.
@@ -585,6 +586,53 @@ def forward_measure(self, input):
         return output
 
 
+class PatchedEmbeddingBag(PatchedModuleBase):
+    def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
+        super().__init__(mod, parent, mod_extra_config, *args, **kwargs)
+        if self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]:
+            if self.use_qdq:
+                self.dequant_weights = self._mod_extra_config.params["weight"][1]
+                if isinstance(mod_extra_config.scale.params["weight"], (torch.Tensor, float)):
+                    self.register_scale("scale_weight", mod_extra_config.scale.params["weight"], self.scale_format)
+                elif isinstance(mod_extra_config.scale.params["weight"], dict):
+                    # PCQ weight is calculated with actual weight [0] and ones [1]
+                    # only ScaleFormat.CONST is supported for per-channel scale now.
+                    self.register_scale("scale_weight", mod_extra_config.scale.params["weight"][0], ScaleFormat.CONST)
+            else:
+                raise ValueError("EmbeddingBag is only supported QDQ mode now!")
+
+    def forward_qdq(self, input, offsets, *args, **kwargs):
+        qweight = self.dequant_weights(self.weight, )
+
+        return torch.nn.functional.embedding_bag(
+            input=input,
+            offsets=offsets,
+            weight=qweight,
+            max_norm=self.max_norm,
+            norm_type=self.norm_type,
+            scale_grad_by_freq=self.scale_grad_by_freq,
+            mode=self.mode,
+            sparse=self.sparse,
+            include_last_offset=self.include_last_offset,
+            padding_idx=self.padding_idx,
+            *args,
+            **kwargs,
+        )
+
+    def forward_measure(self, input, *args, **kwargs):
+        measure_input((input,), observer=self._mod_extra_config.inputs)
+        output = self.orig_mod(input, *args, **kwargs)
+        measure_output((output,), self._mod_extra_config.outputs)
+        return output
+
+    def extra_repr(self) -> str:
+        return extra_representation(
+            self.extra_repr_org(),
+            self.class_name_org,
+            get_current_repr(self, "scale_weight"),
+        )
+
+
 # patched vllm FusedMoE module removing the bf16 weights of all experts
 # measure and quant of the weights is done per expert using PatchedMoeMatmul
 # therefore it is configured: ModuleInfo.should_measure_and_quant = False
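
forward_qdq dequantizes the stored FP8 weight and forwards it to torch.nn.functional.embedding_bag together with the module's original attributes. A standalone toy run of that call path is sketched below; the scalar scale and manual multiply stand in for the DequantOutput op, and the tensors are made up.

# Toy illustration of the PatchedEmbeddingBag.forward_qdq data flow (not the library's code).
import torch
import torch.nn.functional as F

fp8_weight = torch.randn(1000, 16).to(torch.float8_e4m3fn)  # weight kept in FP8
scale = torch.tensor(0.02)                                   # assumed per-tensor scale

qweight = fp8_weight.float() * scale                         # DequantOutput equivalent
ids = torch.randint(0, 1000, (8,))
offsets = torch.tensor([0, 4])
out = F.embedding_bag(input=ids, weight=qweight, offsets=offsets, mode="sum")
print(out.shape)  # torch.Size([2, 16])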

neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py

Lines changed: 11 additions & 1 deletion
@@ -220,6 +220,9 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
         validate_and_populate_scale_method(scale_method_config)
 
 
+        if auto_detect_accelerator().current_device_name() == "cpu" and not measured_global_config["use_qdq"]:
+            raise ValueError("For FP8 quantization, only QDQ mode is supported on CPU device.")
+
         # If seperate_measure_files is True (default value), then it is assumed that there are multiple distinct measure and scale files
         # and they are stored in / loaded from paths with the correct index as a suffix. Else, only one is searched for.
         measured_global_config["local_rank"] = (
@@ -230,6 +233,11 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
         logger.debug("setting device for scales config")
         Fp8cfg.set_gaudi_device_for_scales(custom_config, measured_global_config, scale_method_config)
 
+        if auto_detect_accelerator().current_device_name() == "cpu" and \
+            check_scale_method_fields(scale_method_config, granularity_weight=ScaleGranularity.PCS, reducer=any):
+            # for PCQ, there is some issue in dequantize_per_channel op on CPU device
+            raise ValueError("Don't support FP8 PCQ (Per Channel Quantization) on CPU device now")
+
         if measured_global_config["scale_format"] == ScaleFormat.SCALAR:
             if check_scale_method_fields(scale_method_config, granularity_weight=ScaleGranularity.PCS, reducer=any) or \
                 check_scale_method_fields(scale_method_config, granularity_activation=ScaleGranularity.PCS, reducer=any):
@@ -242,6 +250,8 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
         dynamic_quantization = measured_global_config["dynamic_quantization"]
         # TODO [SW-217814]: get dynamic methods in a better way, or support file handling in dynamic mode
         if dynamic_quantization:
+            if auto_detect_accelerator().current_device_name() == "cpu":
+                raise ValueError("Currently CPU device doesn't support dynamic quantization")
             logger.info(f"NOTE: Using dynamic scale method, only supported ops will be quantized.")
             if measured_global_config["scale_format"] == ScaleFormat.SCALAR:
                 measured_global_config["scale_format"] = ScaleFormat.CONST
@@ -364,4 +374,4 @@ def _read_config_from_file(config_path: str) -> Mapping[str, str]:
     except JSONDecodeError as e:
         config_json.close()
         raise Exception(f"Got exception: {e}. QUANT PACKAGE: Can't load {config_path}!")
-    return config
\ No newline at end of file
+    return config
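
Taken together, the new guards in Fp8cfg.parse restrict the CPU path to QDQ, reject per-channel (PCS/PCQ) weight scales, and reject dynamic quantization. A reduced sketch of that logic follows; check_cpu_constraints and the flat dict keys below are stand-ins for the real parse flow and check_scale_method_fields.

# Reduced, illustrative version of the CPU-specific configuration guards.
def check_cpu_constraints(cfg: dict, device_name: str = "cpu") -> None:
    if device_name != "cpu":
        return
    if not cfg.get("use_qdq", False):
        raise ValueError("For FP8 quantization, only QDQ mode is supported on CPU device.")
    if cfg.get("weight_granularity") == "PCS":   # stand-in for check_scale_method_fields(...)
        raise ValueError("Don't support FP8 PCQ (Per Channel Quantization) on CPU device now")
    if cfg.get("dynamic_quantization", False):
        raise ValueError("Currently CPU device doesn't support dynamic quantization")

check_cpu_constraints({"use_qdq": True, "dynamic_quantization": False})  # passes silently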
