
Commit eb1569e

Yantom1 authored and xinhe3 committed
[SW-230641] Remove smoothquant related scale methods (#258)
1 parent 4bd3385 commit eb1569e

File tree

10 files changed: +7 -119 lines changed

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_config.py

Lines changed: 0 additions & 21 deletions
@@ -24,13 +24,10 @@ class ScaleMethodString(Enum):
     HW_ALIGNED_SINGLE_SCALE = auto()
     MAXABS_HW = auto()
     MAXABS_POW2 = auto()
-    SMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2 = auto()
-    WEAKSMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2 = auto()
     ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2 = auto()
     ACT_MAXABS_HW_WEIGHTS_PCS_OPT_POW2 = auto()
     ACT_MAXABS_POW2_WEIGHTS_PCS_MAXABS_POW2 = auto()
     ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2 = auto()
-    SMOOTHQUANT_OPT = auto()
     MAXABS_HW_OPT_WEIGHT = auto()
     MAXABS_POW2_OPT_WEIGHT = auto()
     MAXABS_ARBITRARY = auto()
@@ -44,9 +41,6 @@ class ScaleValueType(Enum):
     MAXABS = auto()
     FIXED_VALUE = auto()
     OPT = auto()
-    SMOOTHQUANT_MAXABS = auto()
-    SMOOTHQUANT_OPT = auto()
-    SMOOTHQUANT_WEAK = auto()
     DUMMY_SCALES = auto()
 
 class ScaleRoundMethod(Enum):
@@ -150,16 +144,6 @@ def __eq__(self, other):
         CfgStr.WEIGHT: ScaleMethodConfig(granularity= ScaleGranularity.PCS, rounding_method= ScaleRoundMethod.POW2, backoff= 0.5),
         CfgStr.ACTIVATION: ScaleMethodConfig(rounding_method= ScaleRoundMethod.HW_ALIGNED, backoff= 0.25)
     },
-    ScaleMethodString.SMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2:
-    {
-        CfgStr.WEIGHT: ScaleMethodConfig(scale_value_type = ScaleValueType.SMOOTHQUANT_MAXABS, granularity= ScaleGranularity.PCS, rounding_method= ScaleRoundMethod.POW2, backoff= 0.5),
-        CfgStr.ACTIVATION: ScaleMethodConfig(scale_value_type = ScaleValueType.SMOOTHQUANT_MAXABS, granularity= ScaleGranularity.PCS, rounding_method= ScaleRoundMethod.POW2, backoff= 0.25, params={"alpha": 0.5})
-    },
-    ScaleMethodString.WEAKSMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2:
-    {
-        CfgStr.WEIGHT: ScaleMethodConfig(scale_value_type = ScaleValueType.SMOOTHQUANT_WEAK, granularity= ScaleGranularity.PCS, rounding_method= ScaleRoundMethod.POW2, backoff= 0.5),
-        CfgStr.ACTIVATION: ScaleMethodConfig(scale_value_type = ScaleValueType.SMOOTHQUANT_WEAK, granularity= ScaleGranularity.PCS, rounding_method= ScaleRoundMethod.POW2, backoff= 0.25, params={"alpha": 0.5})
-    },
     ScaleMethodString.ACT_MAXABS_HW_WEIGHTS_PCS_OPT_POW2:
     {
         CfgStr.WEIGHT: ScaleMethodConfig(scale_value_type = ScaleValueType.OPT, granularity= ScaleGranularity.PCS, rounding_method= ScaleRoundMethod.POW2, backoff= 0.5, params={"weight_scales": [2.0**s for s in range(-3, 5)]}),
@@ -175,11 +159,6 @@ def __eq__(self, other):
         CfgStr.WEIGHT: ScaleMethodConfig(scale_value_type = ScaleValueType.OPT, granularity= ScaleGranularity.PCS, rounding_method= ScaleRoundMethod.POW2, backoff= 0.5, params={"weight_scales": [2.0**s for s in range(-3, 5)]}),
         CfgStr.ACTIVATION: ScaleMethodConfig(rounding_method= ScaleRoundMethod.POW2, backoff= 0.25)
     },
-    ScaleMethodString.SMOOTHQUANT_OPT:
-    {
-        CfgStr.WEIGHT: ScaleMethodConfig(scale_value_type = ScaleValueType.SMOOTHQUANT_OPT, granularity= ScaleGranularity.PCS, rounding_method= ScaleRoundMethod.POW2, backoff= 0.5, params={"transformed_weight_scales": [2.0**s for s in range(-3, 5)]}),
-        CfgStr.ACTIVATION: ScaleMethodConfig(scale_value_type = ScaleValueType.SMOOTHQUANT_OPT, granularity= ScaleGranularity.PCS, rounding_method= ScaleRoundMethod.POW2, backoff= 0.25, params={"alpha": 0.5})
-    },
 }
 
 reverse_scale_method_mapping = {
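
Because the three SMOOTHQUANT members are deleted from ScaleMethodString outright, a config that still names one now fails at the enum lookup itself. A toy sketch of that failure mode, using a hypothetical mini-enum rather than the real class:

from enum import Enum, auto

class MiniScaleMethod(Enum):
    # Surviving members, mirroring the trimmed ScaleMethodString.
    MAXABS_HW = auto()
    MAXABS_POW2 = auto()

print(MiniScaleMethod["MAXABS_HW"])      # MiniScaleMethod.MAXABS_HW
try:
    MiniScaleMethod["SMOOTHQUANT_OPT"]   # removed name -> KeyError
except KeyError as err:
    print("unknown scale method:", err)

This is also why the SCALE_METHODS_KEY_ERROR xfail list disappears from the tests further down: the names it guarded can no longer be selected.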

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py

Lines changed: 1 addition & 24 deletions
@@ -87,8 +87,7 @@ def get_scale_method(self, tensor_type, is_dynamic=False):
                 if scale_value_type in {ScaleValueType.MAXABS, ScaleValueType.OPT}:
                     return MulAdditionalScales(scale_round_method, self.params, self.device_for_scales)
             ## maxabs/opt in channel PTS
-            case (_, ScaleGranularity.PTS, QuantTensorName.WEIGHT_IN_CH, _) \
-                if scale_value_type not in {ScaleValueType.SMOOTHQUANT_OPT, ScaleValueType.SMOOTHQUANT_MAXABS}:
+            case (_, ScaleGranularity.PTS, QuantTensorName.WEIGHT_IN_CH, _):
                 return None
             case (ScaleValueType.MAXABS, ScaleGranularity.PTS, _, _):
                 if is_dynamic:
@@ -111,28 +110,6 @@ def get_scale_method(self, tensor_type, is_dynamic=False):
             case (ScaleValueType.OPT, ScaleGranularity.PCS, _, _):
                 opt_list_of_scales = self.scale_method_config_map[tensor_type].params["weight_scales"]
                 return OptScalesPcs(scale_round_method, opt_list_of_scales, self.params, self.device_for_scales, backoff)
-            ## smooth quant
-            case (_, ScaleGranularity.PCS, QuantTensorName.WEIGHT_IN_CH, _) \
-                if scale_value_type in {ScaleValueType.SMOOTHQUANT_OPT, ScaleValueType.SMOOTHQUANT_MAXABS}:
-                return WeightIchSmoothQuant(scale_round_method, self.params, self.device_for_scales)
-            case (_, ScaleGranularity.PCS, QuantTensorName.OUTPUT, _) \
-                if scale_value_type in {ScaleValueType.SMOOTHQUANT_OPT, ScaleValueType.SMOOTHQUANT_MAXABS} \
-                and self.op_type in {"linear", "matmul"}:
-                return UseFirstAdditionalScales(scale_round_method, self.params, self.device_for_scales)
-            ## SMOOTHQUANT_MAXABS input and weight out channel
-            case (ScaleValueType.SMOOTHQUANT_MAXABS, ScaleGranularity.PCS, QuantTensorName.WEIGHT_OUT_CH, _):
-                return MaxAbsPcs(scale_round_method, self.params, self.device_for_scales, backoff)
-            case (ScaleValueType.SMOOTHQUANT_MAXABS, ScaleGranularity.PCS, QuantTensorName.INPUT, _):
-                alpha = self.scale_method_config_map[QuantTensorName.INPUT].params["alpha"]
-                return InputSmoothQuantMaxAbs(scale_round_method, self.mod.weight, self.params, self.device_for_scales, backoff, alpha)
-            ## SMOOTHQUANT_OPT input and weight out channel
-            case (ScaleValueType.SMOOTHQUANT_OPT, _, QuantTensorName.WEIGHT_OUT_CH, _):
-                opt_list_of_scales = self.scale_method_config_map[tensor_type].params["transformed_weight_scales"]
-                return OptScalesPcs(scale_round_method, opt_list_of_scales, self.params, self.device_for_scales, backoff)
-            case (ScaleValueType.SMOOTHQUANT_OPT, _, QuantTensorName.INPUT, _):
-                backoff_weight = self.scale_method_config_map[QuantTensorName.WEIGHT_OUT_CH].backoff
-                alpha = self.scale_method_config_map[QuantTensorName.INPUT].params["alpha"]
-                return InputSmoothQuantOpt(scale_round_method, self.mod.weight, self.params, self.device_for_scales, backoff, backoff_weight, alpha)
             case _:
                 raise NotImplementedError("the config: scale_round_method: " + \
                     str(scale_round_method) +
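
The one-line replacement above is safe because the guard it drops had become vacuous: with SMOOTHQUANT_OPT and SMOOTHQUANT_MAXABS gone from ScaleValueType, the condition scale_value_type not in {...} is always true, so the PTS weight-in-channel case can match unconditionally. A minimal, self-contained illustration of collapsing a guarded case (hypothetical names, Python 3.10+ match semantics):

from enum import Enum, auto

class Granularity(Enum):
    PTS = auto()
    PCS = auto()

def route(value_type: str, granularity: Granularity):
    match (value_type, granularity):
        # Before: case (_, Granularity.PTS) if value_type != "smoothquant":
        # Once "smoothquant" can no longer occur, the guard excludes nothing,
        # so it can be dropped and the pattern matches unconditionally.
        case (_, Granularity.PTS):
            return None
        case _:
            raise NotImplementedError(f"no scale method for {value_type}")

assert route("maxabs", Granularity.PTS) is None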

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scales_method.py

Lines changed: 0 additions & 54 deletions
@@ -225,60 +225,6 @@ def calc_scales(self, tensor, tensor_type, **additional_kwargs):
         return self.scale
 
 
-class InputSmoothQuantMaxAbs(ScalesMethod):
-    def __init__(self, round_scale_method, weight, params, device_for_scales, backoff, alpha):
-        super().__init__(round_scale_method, params, device_for_scales)
-        self.round_scale_method = round_scale_method
-        self.weight = weight
-        self.alpha = alpha
-        self.backoff = backoff
-        self.device_for_scales = device_for_scales
-
-    def calc_scales(self, tensor, tensor_type, **additional_kwargs):
-        weight_scale_in_ch = MaxAbsPcs(ScaleIdentity(), self.params, self.device_for_scales, 1.0, 1.0, dim=0).calc_scales(
-            self.weight, QuantTensorType.CONST)
-        input_range = torch.tensor(tensor, dtype=self.hp_dtype, device=self.device)
-        input_scale = MaxAbsPts(ScaleIdentity(), self.params, self.device_for_scales, 1.0, 1.0).calc_scales(tensor,
-            QuantTensorType.MEASUREMENTS)
-        input_scale = (input_scale ** self.alpha) / (weight_scale_in_ch ** (1 - self.alpha))
-        input_scale = self.round_scale_method.calc(input_scale)
-        input_range_post = input_range / input_scale
-        input_scale_post = calc_maxabs_scale(input_range_post.max(), self.fullscale, self.backoff)
-        input_scale_post = self.round_scale_method.calc(input_scale_post)
-        input_scale = input_scale * input_scale_post
-        self.scale = input_scale
-        return self.scale
-
-class InputSmoothQuantOpt(ScalesMethod):
-    def __init__(self, round_scale_method, weight, params, device_for_scales, backoff, backoff_weight, alpha):
-        super().__init__(round_scale_method, params, device_for_scales)
-        self.round_scale_method = round_scale_method
-        self.weight = weight
-        self.alpha = alpha
-        self.backoff = backoff
-        self.backoff_weight = backoff_weight
-        self.device_for_scales = device_for_scales
-
-    def calc_scales(self, tensor, tensor_type, **additional_kwargs):
-        weight_scale_in_ch = MaxAbsPcs(ScaleIdentity(), self.params, self.device_for_scales, self.backoff_weight,
-            self.fullscale, dim=0).calc_scales(self.weight, QuantTensorType.CONST)
-        input_scale = MaxAbsPts(ScaleIdentity(), self.params, self.device_for_scales, self.backoff,
-            self.fullscale).calc_scales(tensor, QuantTensorType.MEASUREMENTS)
-        input_scale = (input_scale ** self.alpha) / (weight_scale_in_ch ** (1 - self.alpha))
-        input_scale = self.round_scale_method.calc(input_scale)
-        self.scale = input_scale
-        return self.scale
-
-
-class WeightIchSmoothQuant(ScalesMethod):
-    def __init__(self, round_scale_method, params, device_for_scales):
-        super().__init__(round_scale_method, params, device_for_scales)
-
-    def calc_scales(self, tensor, tensor_type, **additional_kwargs):
-        self.scale = 1 / tensor
-        return self.scale
-
-
 class MaxAbsDynamicPcs(MaxAbsPcs):
 
     def __init__(self, round_scale_method, params, device_for_scales, backoff, fullscale=None):
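
For the record, the deleted classes implemented SmoothQuant's range-balancing rule: the input scale blends the measured activation max-abs against the per-input-channel weight max-abs through an alpha exponent (alpha was 0.5 in the removed configs), i.e. s = max|x|**alpha / max|w|**(1 - alpha). A standalone sketch of just that formula, with made-up ranges and none of the MaxAbsPcs/MaxAbsPts/rounding plumbing from the removed code:

import torch

def smoothquant_input_scale(x_absmax: torch.Tensor,
                            w_absmax_in_ch: torch.Tensor,
                            alpha: float = 0.5) -> torch.Tensor:
    # Per input channel: s = |x|_max**alpha / |w|_max**(1 - alpha).
    # Channels with large activation outliers get scaled down, shifting
    # quantization difficulty from activations into the weights.
    return (x_absmax ** alpha) / (w_absmax_in_ch ** (1 - alpha))

x_absmax = torch.tensor([8.0, 0.5, 2.0, 16.0])     # hypothetical measured activation ranges
w_absmax = torch.tensor([0.25, 1.0, 0.5, 0.125])   # hypothetical weight ranges
print(smoothquant_input_scale(x_absmax, w_absmax))
# tensor([ 5.6569,  0.7071,  2.0000, 11.3137])

The removed WeightIchSmoothQuant then returned the reciprocal (scale = 1 / tensor) on the weight input channels, presumably so the combined input/weight scaling is output-preserving, as in the SmoothQuant formulation.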

test/3x/torch/algorithms/fp8_quant/tester.py

Lines changed: 1 addition & 5 deletions
@@ -48,11 +48,7 @@
     ScaleMethodString.MAXABS_HW_OPT_WEIGHT,
     ScaleMethodString.MAXABS_POW2_OPT_WEIGHT,
 ]
-SCALE_METHODS_KEY_ERROR = [
-    ScaleMethodString.SMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2,
-    ScaleMethodString.WEAKSMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2,
-    ScaleMethodString.SMOOTHQUANT_OPT,
-]
+
 SCALE_METHODS_COMPILATION_ERROR = [
     ScaleMethodString.ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2,
     ScaleMethodString.ACT_MAXABS_POW2_WEIGHTS_PCS_MAXABS_POW2,
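
With the SmoothQuant methods gone from the enum, an xfail list for them has nothing left to guard, so it is deleted rather than left empty. The gating pattern the remaining lists drive in the layer tests below looks roughly like this (a sketch; the list contents here are illustrative, not the full set):

import pytest

SCALE_METHODS_COMPILATION_ERROR = ["ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2"]  # illustrative

def check_tests_to_skip(scale_method):
    # Same shape as the guards in test_conv2d.py / test_linear.py below:
    # known-bad configurations are marked xfail instead of hard-failing.
    if scale_method in SCALE_METHODS_COMPILATION_ERROR:
        pytest.xfail("Graph compile error")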

test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_config_json.py

Lines changed: 3 additions & 4 deletions
@@ -9,7 +9,7 @@
 from neural_compressor.torch.algorithms.fp8_quant._quant_common.helper_modules import Matmul
 from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import QuantMode
 from neural_compressor.torch.algorithms.fp8_quant._core.scale_methods.scale_method_config import ScaleMethodString
-from ...tester import run_with_raised_exception, get_internal_config, SCALE_METHODS_QUANT_ONLY, SCALE_METHODS_KEY_ERROR
+from ...tester import run_with_raised_exception, get_internal_config, SCALE_METHODS_QUANT_ONLY
 from ...test_hpu_utils import *
 
 class Model(torch.nn.Module):
@@ -49,9 +49,8 @@ def run_predefined_config():
         prepare_model._prep_model_with_predefined_config(model, config=config)
         fp8_quant.finish_measurements(model)
 
-    if scale_method in SCALE_METHODS_KEY_ERROR and quant_mode == QuantMode.QUANTIZE:
-        pytest.xfail("KeyError")
-    elif scale_method == ScaleMethodString.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW:
+
+    if scale_method == ScaleMethodString.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW:
         return run_with_raised_exception(run_predefined_config, ValueError, "Unsupported config: scale_method")
     # This is an expected exception, as test is not measuring before
     elif scale_method not in SCALE_METHODS_QUANT_ONLY:

test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_conv2d.py

Lines changed: 0 additions & 2 deletions
@@ -26,8 +26,6 @@ def test_conv2d_accuracy(hp_dtype: torch.dtype, lp_dtype: torch.dtype, scale_met
     # TODO [SW-196641]: fix the following issues:
     if scale_method in SCALE_METHODS_SEGFAULT:
         pytest.skip("Not supported")
-    if scale_method in SCALE_METHODS_KEY_ERROR:
-        pytest.xfail("KeyError")
     if scale_method in SCALE_METHODS_COMPILATION_ERROR:
         pytest.xfail("Graph compile error")
     quant_modes = QUANT_MODES_DEFAULT

test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_linear.py

Lines changed: 0 additions & 2 deletions
@@ -39,8 +39,6 @@ def get_test_vectors(*, dtype: torch.dtype, N: int, D_in: int, atol: float = 0.0
     )
 
 def check_tests_to_skip(scale_method, scale_format, dynamic_quantization, device_type = None):
-    if scale_method in SCALE_METHODS_KEY_ERROR:
-        pytest.xfail("KeyError")
     # TODO [SW-215692]: Fix segfault
     if scale_format == ScaleFormat.CONST or dynamic_quantization:
         if scale_method in [ScaleMethodString.MAXABS_HW_OPT_WEIGHT, ScaleMethodString.MAXABS_POW2_OPT_WEIGHT]:

test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_matmul.py

Lines changed: 0 additions & 3 deletions
@@ -55,9 +55,6 @@ def forward(self, x, y):
 @pytest.mark.parametrize("device_type", device_type)
 @pytest.mark.parametrize("dynamic_quantization", [True, False], ids=["dynamic_quantization", "static_quantization"])
 def test_matmul_accuracy(hp_dtype: torch.dtype, lp_dtype: torch.dtype, scale_method: ScaleMethodString, device_type: str, dynamic_quantization: bool):
-    # TODO [SW-196641]: fix the following issues:
-    if scale_method in SCALE_METHODS_KEY_ERROR:
-        pytest.xfail("KeyError")
     quant_modes = QUANT_MODES_DEFAULT
     atol = 0.2
     if scale_method in SCALE_METHODS_QUANT_ONLY or dynamic_quantization:

test/3x/torch/algorithms/fp8_quant/unit_tests/test_runtime_scale_patching.py

Lines changed: 1 addition & 3 deletions
@@ -6,7 +6,7 @@
 import habana_frameworks.torch.core as htcore
 import habana_frameworks.torch.utils.experimental as htexp
 
-from ..tester import RUNTIME_SCALE_PATCHING_SUPPORTED_METHODS_LIST, SCALE_METHODS_KEY_ERROR, run_with_raised_exception
+from ..tester import RUNTIME_SCALE_PATCHING_SUPPORTED_METHODS_LIST, run_with_raised_exception
 from neural_compressor.torch.algorithms.fp8_quant._core.common import is_runtime_scale_patching
 from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethodString
 from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration
@@ -52,8 +52,6 @@ def temp_directory():
 @pytest.mark.parametrize("scale_format", ["SCALAR", "CONST"])
 @pytest.mark.parametrize("dynamic_scale_patching", [True, False])
 def test_no_assert(scale_method, scale_format, dynamic_scale_patching, temp_directory):
-    if scale_method in SCALE_METHODS_KEY_ERROR:
-        pytest.xfail("KeyError")
     model = TinyModel()
     model.eval()
     model = model.to("hpu").to(torch.bfloat16)

test/3x/torch/algorithms/fp8_quant/unit_tests/test_scale_method_config.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def forward(self, x):
 def check_tests_to_skip(scale_method, scale_value_type_weight = None, scale_value_type_activation = None):
     if scale_value_type_weight == ScaleValueType.DUMMY_SCALES or scale_value_type_activation == ScaleValueType.DUMMY_SCALES:
         pytest.xfail("Dummy scales is not a scale method")
-    if scale_method in SCALE_METHODS_KEY_ERROR or scale_method in SUPPORTED_DYNAMIC_SCALES:
+    if scale_method in SUPPORTED_DYNAMIC_SCALES:
         pytest.xfail("Key error")
 
 @pytest.mark.parametrize("scale_granularity_weight", ScaleGranularity)
