Commit 744bf4c

mengniwang95 (Mengni Wang) and a co-author authored
[SW-233731] Use torchao op for CPU QDQ and abstract QDQ calling (#264)
Abstract QDQ calling.
Fix QDQ model print issue.
Use torchao op for CPU QDQ (HPU doesn't have this accuracy issue).

---------

Signed-off-by: Mengni Wang <[email protected]>
Co-authored-by: Mengni Wang <[email protected]>
1 parent 8bb9758 commit 744bf4c

File tree

10 files changed (+245, -54 lines)
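
Taken together, the change replaces direct imports of the fp_utils QDQ helpers with a per-device wrapper lookup keyed by an op type. Below is a minimal, self-contained sketch of that calling pattern; the names mirror the diff, but this standalone registry is illustrative only and is not the project's actual factory.

from enum import Enum, auto


class OP_TYPE(Enum):
    # Members added by this commit, alongside the existing CAST_* types.
    QUANT = auto()
    DEQUANT = auto()
    QUANT_PC = auto()
    DEQUANT_PC = auto()
    CAST_TO_FP8 = auto()
    CAST_FROM_FP8 = auto()


# Toy stand-in for QuantizedFuncWrapperFactory: each backend (hpu / xpu / cpu)
# registers its own OP_TYPE -> wrapper-class mapping at init time, and callers
# only ever ask for an op type plus a scale format.
_WRAPPER_CLASSES = {}


def initialize(op_type_to_cls):
    _WRAPPER_CLASSES.update(op_type_to_cls)


def get_quantized_func_wrapper(op_type, scale_format):
    # The real factory caches wrapper instances; this sketch constructs one per call.
    return _WRAPPER_CLASSES[op_type](scale_format)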


neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py

Lines changed: 22 additions & 29 deletions

@@ -25,13 +25,6 @@
from .._core.scale_handler import add_scale_registry, get_scale_dtype
from .._quant_common.quant_config import ScaleFormat
from .common import QuantTensorType
-from .fp_utils import (
-    quantize_per_tensor_to_fp8,
-    dequantize_per_tensor_from_fp8,
-    quantize_per_channel_to_fp8,
-    dequantize_per_channel_from_fp8,
-    invert_scale,
-)
from .scale_handler import create_scale_tensor


@@ -93,17 +86,13 @@ def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
        self.register_scale("scale_inv", scale_inv, self.scale_format)
        if self.use_qdq:
            self.register_scale("scale", 1 / self.scale_inv, self.scale_format)
-            self.quantize_op = (
-                quantize_per_channel_to_fp8
-                if self.scale_format == ScaleFormat.CONST and self.scale.numel() > 1
-                else quantize_per_tensor_to_fp8
-            )
-
+            op_type = OP_TYPE.QUANT_PC if self.scale_format == ScaleFormat.CONST and self.scale.numel() > 1 else OP_TYPE.QUANT
        else:
-            self.cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)
+            op_type = OP_TYPE.CAST_TO_FP8
+        self.quantize_op = get_quantized_func_wrapper(op_type, self.scale_format)

    def forward(self, x):
-        return self.cast_to_op(x, self.scale_inv, False, False, self.lp_dtype)
+        return self.quantize_op(x, self.scale_inv, False, False, self.lp_dtype)

    def forward_qdq(self, x):
        return self.quantize_op(

@@ -153,16 +142,13 @@ def __init__(self, scale, lp_dtype, hp_dtype, *args, **kwargs):
        super(DequantOutput, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.register_scale("scale", scale, self.scale_format)
        if self.use_qdq:
-            self.dequantize_op = (
-                dequantize_per_channel_from_fp8
-                if self.scale_format == ScaleFormat.CONST and self.scale.numel() > 1
-                else dequantize_per_tensor_from_fp8
-            )
+            op_type = OP_TYPE.DEQUANT_PC if self.scale_format == ScaleFormat.CONST and self.scale.numel() > 1 else OP_TYPE.DEQUANT
        else:
-            self.cast_from_op = get_quantized_func_wrapper(OP_TYPE.CAST_FROM_FP8, self.scale_format)
+            op_type = OP_TYPE.CAST_FROM_FP8
+        self.dequantize_op = get_quantized_func_wrapper(op_type, self.scale_format)

    def forward(self, x):
-        return self.cast_from_op(x, self.scale, self.hp_dtype)
+        return self.dequantize_op(x, self.scale, self.hp_dtype)

    def forward_qdq(self, x):
        return self.dequantize_op(

@@ -187,30 +173,37 @@ def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
        super(QuantDequant, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.register_scale("scale_inv", scale_inv, self.scale_format)
        self.register_scale("scale", 1 / scale_inv, self.scale_format)
-        if not self.use_qdq:
-            self.cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)
-            self.cast_from_op = get_quantized_func_wrapper(OP_TYPE.CAST_FROM_FP8, self.scale_format)
+        self.quantize_op = (
+            get_quantized_func_wrapper(OP_TYPE.QUANT, self.scale_format)
+            if self.use_qdq
+            else get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)
+        )
+        self.dequantize_op = (
+            get_quantized_func_wrapper(OP_TYPE.DEQUANT, self.scale_format)
+            if self.use_qdq
+            else get_quantized_func_wrapper(OP_TYPE.CAST_FROM_FP8, self.scale_format)
+        )

    def forward(self, x, *args, **kwargs):
-        y = self.cast_to_op(x, self.scale_inv, False, False, self.lp_dtype)
+        y = self.quantize_op(x, self.scale_inv, False, False, self.lp_dtype)
        # mark_step is needed so fuser won't remove 2 consecutive casts.
        # will be removed once SW-196431 is implemented
        # Call cur_accelerator.synchronize() which will call mark_step() as well
        cur_accelerator.synchronize()
-        z = self.cast_from_op(y, self.scale, self.hp_dtype)
+        z = self.dequantize_op(y, self.scale, self.hp_dtype)
        cur_accelerator.synchronize()
        return z

    def forward_qdq(self, x, *args, **kwargs):
-        y = quantize_per_tensor_to_fp8(
+        y = self.quantize_op(
            x,
            scale=self.scale,
            zero_point=self.zero_point,
            quant_min=self.quant_min,
            quant_max=self.quant_max,
            dtype=self.lp_dtype,
        )
-        z = dequantize_per_tensor_from_fp8(
+        z = self.dequantize_op(
            y,
            scale=self.scale,
            zero_point=self.zero_point,
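
The selection logic in QuantInput.__init__ above can be condensed as follows. This is an illustrative standalone restatement, not the module itself; OP_TYPE and ScaleFormat are passed in as parameters to keep the sketch self-contained.

def pick_quant_op_type(use_qdq, scale_format, scale, OP_TYPE, ScaleFormat):
    """Return the op type QuantInput resolves through get_quantized_func_wrapper."""
    if use_qdq:
        # A CONST-format scale with more than one element means per-channel QDQ.
        per_channel = scale_format == ScaleFormat.CONST and scale.numel() > 1
        return OP_TYPE.QUANT_PC if per_channel else OP_TYPE.QUANT
    # Without QDQ the original FP8 cast path is kept.
    return OP_TYPE.CAST_TO_FP8
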
Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/cpu/cpu_quantized_func_wrapper.py

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..quantized_func_wrapper import QuantizedFuncWrapperBase, OP_TYPE, QuantizedFuncWrapperFactory
+
+import torch
+import torchao
+
+from abc import ABCMeta
+
+
+class QuantizedCPUFuncWrapperBase(QuantizedFuncWrapperBase, metaclass=ABCMeta):
+    """
+    Placeholder for base class for CPU quantized func wrapper.
+    """
+    def __init__(self, scale_format, is_dynamic=False):
+        self._quantized_func_ = self.get_default_quantized_func()
+
+
+class QuantizedCPUQuant(QuantizedCPUFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.torchao.quantize_affine_float8
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn):
+        return self._quantized_func_(tensor=input, scale=scale, float8_dtype=dtype)
+
+
+class QuantizedCPUQuantPC(QuantizedCPUFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.torchao.quantize_affine_float8
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn):
+        return self._quantized_func_(tensor=input, scale=scale.view((-1, 1)), float8_dtype=dtype)
+
+
+class QuantizedCPUDeQuant(QuantizedCPUFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.torchao.dequantize_affine_float8
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn, out_dtype=torch.bfloat16):
+        return self._quantized_func_(tensor=input, scale=scale, output_dtype=out_dtype)
+
+
+class QuantizedCPUDeQuantPC(QuantizedCPUFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.torchao.dequantize_affine_float8
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn, out_dtype=torch.bfloat16):
+        return self._quantized_func_(tensor=input, scale=scale.view((1, -1)), output_dtype=out_dtype)
+
+
+_OP_TYPE_CPU_QUANTIZED_WRAPPER_CLASSES = {
+    OP_TYPE.QUANT: QuantizedCPUQuant,
+    OP_TYPE.DEQUANT: QuantizedCPUDeQuant,
+    OP_TYPE.QUANT_PC: QuantizedCPUQuantPC,
+    OP_TYPE.DEQUANT_PC: QuantizedCPUDeQuantPC,
+}
+
+
+def init_cpu_quantized_func_wrapper_factory():
+    QuantizedFuncWrapperFactory.initialize(_OP_TYPE_CPU_QUANTIZED_WRAPPER_CLASSES)
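
The CPU wrappers above delegate to torchao's FP8 affine ops. A minimal round-trip sketch follows, assuming a torchao build that registers torch.ops.torchao.quantize_affine_float8 / dequantize_affine_float8 on import; the tensor shape and scale value are made up for illustration.

import torch
import torchao  # assumed to register the torch.ops.torchao.* custom ops on import

x = torch.randn(4, 8, dtype=torch.bfloat16)
scale = torch.tensor(0.05)  # per-tensor scale; the *_PC wrappers reshape a per-channel scale instead

# Quantize to FP8 on CPU and dequantize back, mirroring QuantizedCPUQuant / QuantizedCPUDeQuant.
q = torch.ops.torchao.quantize_affine_float8(tensor=x, scale=scale, float8_dtype=torch.float8_e4m3fn)
dq = torch.ops.torchao.dequantize_affine_float8(tensor=q, scale=scale, output_dtype=torch.bfloat16)
print(q.dtype, dq.dtype)  # torch.float8_e4m3fn torch.bfloat16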

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/hpu/hpu_quantized_func_wrapper.py

Lines changed: 52 additions & 0 deletions

@@ -174,6 +174,54 @@ def get_dynamic_quantized_func(self):
        return torch.ops.hpu.mixture_of_experts.fp8_fused_weights_dynamic


+class QuantizedHPUQuant(QuantizedHpuFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.quantized_decomposed.quantize_per_tensor
+
+    def get_scalar_quantized_func(self):
+        return self.get_default_quantized_func()
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn):
+        return self._quantized_func_(input, scale, zero_point, quant_min, quant_max, dtype=dtype)
+
+
+class QuantizedHPUDeQuant(QuantizedHpuFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.quantized_decomposed.dequantize_per_tensor
+
+    def get_scalar_quantized_func(self):
+        return self.get_default_quantized_func()
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn, out_dtype=torch.bfloat16):
+        return self._quantized_func_(input, scale, zero_point, quant_min, quant_max, dtype=dtype, out_dtype=out_dtype)
+
+
+class QuantizedHPUQuantPC(QuantizedHpuFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.quantized_decomposed.quantize_per_channel
+
+    def get_scalar_quantized_func(self):
+        return self.get_default_quantized_func()
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn):
+        return self._quantized_func_(input, scale, zero_point, axis, quant_min, quant_max, dtype=dtype)
+
+
+class QuantizedHPUDeQuantPC(QuantizedHpuFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.quantized_decomposed.dequantize_per_channel
+
+    def get_scalar_quantized_func(self):
+        return self.get_default_quantized_func()
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn, out_dtype=torch.bfloat16):
+        return self._quantized_func_(input, scale, zero_point, axis, quant_min, quant_max, dtype=dtype, out_dtype=out_dtype)
+
+
_OP_TYPE_HPU_QUANTIZED_WRAPPER_CLASSES = {OP_TYPE.LINEAR_GEMM : QuantizedHpuMatmul,
                                          OP_TYPE.MATMUL_GEMM: QuantizedHpuMatmul,
                                          OP_TYPE.SOFTMAX : QuantizedHpuSoftmax,

@@ -183,6 +231,10 @@ def get_dynamic_quantized_func(self):
                                          OP_TYPE.CAST_FROM_FP8 : QuantizedHPUCastFromFP8,
                                          OP_TYPE.DYNAMIC_MOE: QuantizedHpuDynamicMoe,
                                          OP_TYPE.DYNAMIC_MOE_FUSED_WEIGHTS: QuantizedHpuDynamicMoeFusedWeights,
+                                          OP_TYPE.QUANT: QuantizedHPUQuant,
+                                          OP_TYPE.DEQUANT: QuantizedHPUDeQuant,
+                                          OP_TYPE.QUANT_PC: QuantizedHPUQuantPC,
+                                          OP_TYPE.DEQUANT_PC: QuantizedHPUDeQuantPC,
                                          }

def init_hpu_quantized_func_wrapper_factory():

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/quantized_func_wrapper.py

Lines changed: 4 additions & 0 deletions

@@ -27,6 +27,10 @@ class OP_TYPE(Enum):
    CAST_FROM_FP8 = auto()
    DYNAMIC_MOE = auto()
    DYNAMIC_MOE_FUSED_WEIGHTS = auto()
+    QUANT = auto()
+    DEQUANT = auto()
+    QUANT_PC = auto()
+    DEQUANT_PC = auto()


class QuantizedFuncWrapperBase(ABC):

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/quantized_func_wrapper_api.py

Lines changed: 2 additions & 2 deletions

@@ -38,8 +38,8 @@ def init_quantized_func_wrapper_factory():
        from .xpu.xpu_quantized_func_wrapper import init_xpu_quantized_func_wrapper_factory
        init_xpu_quantized_func_wrapper_factory()
    elif device_name == "cpu":
-        # only support QDQ now
-        pass
+        from .cpu.cpu_quantized_func_wrapper import init_cpu_quantized_func_wrapper_factory
+        init_cpu_quantized_func_wrapper_factory()
    else:
        raise ValueError("Unknown device type - {}".format(device_name))
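
With the CPU branch filled in, a CPU-only run now registers the torchao-backed wrappers when the factory is initialized. A hedged usage sketch; the import path follows the file names shown in this commit.

from neural_compressor.torch.algorithms.fp8_quant._core.quantized_func_wrappers.quantized_func_wrapper_api import (
    init_quantized_func_wrapper_factory,
)

# Dispatches on the detected device; "cpu" now maps to init_cpu_quantized_func_wrapper_factory
# instead of the previous no-op pass.
init_quantized_func_wrapper_factory()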

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/xpu/xpu_quantized_func_wrapper.py

Lines changed: 52 additions & 1 deletion

@@ -63,11 +63,62 @@ def get_default_quantized_func(self):
        return torch.ops.torch_ipex.cast_from_fp8


+class QuantizedXPUQuant(QuantizedXpuFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.quantized_decomposed.quantize_per_tensor
+
+    def get_scalar_quantized_func(self):
+        return self.get_default_quantized_func()
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn):
+        return self._quantized_func_(input, scale, zero_point, quant_min, quant_max, dtype=dtype)
+
+
+class QuantizedXPUDeQuant(QuantizedXpuFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.quantized_decomposed.dequantize_per_tensor
+
+    def get_scalar_quantized_func(self):
+        return self.get_default_quantized_func()
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn, out_dtype=torch.bfloat16):
+        return self._quantized_func_(input, scale, zero_point, quant_min, quant_max, dtype=dtype, out_dtype=out_dtype)
+
+
+class QuantizedXPUQuantPC(QuantizedXpuFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.quantized_decomposed.quantize_per_channel
+
+    def get_scalar_quantized_func(self):
+        return self.get_default_quantized_func()
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn):
+        return self._quantized_func_(input, scale, zero_point, axis, quant_min, quant_max, dtype=dtype)
+
+
+class QuantizedXPUDeQuantPC(QuantizedXpuFuncWrapperBase):
+
+    def get_default_quantized_func(self):
+        return torch.ops.quantized_decomposed.dequantize_per_channel
+
+    def get_scalar_quantized_func(self):
+        return self.get_default_quantized_func()
+
+    def __call__(self, input, scale, zero_point=None, axis=0, quant_min=None, quant_max=None, dtype=torch.float8_e4m3fn, out_dtype=torch.bfloat16):
+        return self._quantized_func_(input, scale, zero_point, axis, quant_min, quant_max, dtype=dtype, out_dtype=out_dtype)
+
_OP_TYPE_XPU_QUANTIZED_WRAPPER_CLASSES = {
    OP_TYPE.LINEAR_GEMM : QuantizedXPUMatmul,
    OP_TYPE.MATMUL_GEMM : QuantizedXPUMatmul,
    OP_TYPE.CAST_TO_FP8 : QuantizedXPUCastToFP8Base,
-    OP_TYPE.CAST_FROM_FP8 : QuantizedXPUCastFromFP8Base
+    OP_TYPE.CAST_FROM_FP8 : QuantizedXPUCastFromFP8Base,
+    OP_TYPE.QUANT: QuantizedXPUQuant,
+    OP_TYPE.DEQUANT: QuantizedXPUDeQuant,
+    OP_TYPE.QUANT_PC: QuantizedXPUQuantPC,
+    OP_TYPE.DEQUANT_PC: QuantizedXPUDeQuantPC,
}

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 5 additions & 5 deletions

@@ -138,7 +138,7 @@ def extra_repr(self) -> str:
        return extra_representation(
            self.extra_repr_org(),
            self.class_name_org,
-            get_current_repr(self, "scale_input", "scale_other"),
+            get_current_repr(self, "scale_input", "scale_other") if not self.use_qdq else "",
        )


@@ -227,7 +227,7 @@ def extra_repr(self) -> str:
        return extra_representation(
            self.extra_repr_org(),
            self.class_name_org,
-            get_current_repr(self, "scale_input", "scale_weight"),
+            get_current_repr(self, "scale_input", "scale_weight") if not self.use_qdq else "",
        )


@@ -1135,7 +1135,7 @@ def extra_repr(self) -> str:
        return extra_representation(
            self.extra_repr_org(),
            self.class_name_org,
-            get_current_repr(self, "scale_input", "scale_weight"),
+            get_current_repr(self, "scale_input", "scale_weight") if not self.use_qdq else "",
        )


@@ -1171,7 +1171,7 @@ def extra_repr(self) -> str:
        return extra_representation(
            self.extra_repr_org(),
            self.class_name_org,
-            get_current_repr(self, "scale_input", "scale_output"),
+            get_current_repr(self, "scale_input", "scale_output") if not self.use_qdq else "",
        )


@@ -1252,7 +1252,7 @@ def extra_repr(self) -> str:
        return extra_representation(
            self.extra_repr_org(),
            self.class_name_org,
-            get_current_repr(self, "scale_input", "scale_weight"),
+            get_current_repr(self, "scale_input", "scale_weight") if not self.use_qdq else "",
        )