 import pytest
 import torch
 
-from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethod, ScaleFormat, _hw_aligned_scale_methods
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethod, ScaleFormat, _hw_aligned_scale_methods, _quant_only_scale_methods
 from neural_compressor.torch.algorithms.fp8_quant._core.scale_handler import scale_to_scalar
+from neural_compressor.torch.algorithms.fp8_quant._core.quant_dequant import QuantDynamicInput
 
 from ...test_hpu_utils import *
 from ...tester import *
 
 
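+# Scale methods that this test treats as supported only when dynamic quantization is enabled.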
+SUPPORTED_DYNAMIC_SCALES = [ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW]
+# Test class that exposes the scale calculated at runtime during dynamic quantization so its correctness can be tested.
+# This is a workaround to avoid saving the scale in the original QuantDynamicInput class, since saving the scale may cause unwanted graph breaks in torch.compile or issues with hpu_graph.
+class TestQuantDynamicInput(QuantDynamicInput):
+    def __init__(self, input_scales_creator, lp_dtype, hp_dtype, *args, **kwargs):
+        super(TestQuantDynamicInput, self).__init__(input_scales_creator, lp_dtype, hp_dtype, *args, **kwargs)
+        self.input_scale = None
+    def forward(self, x):
+        ret, scale = super().forward(x)
+        # Save the scale calculated during this forward pass so its correctness can be tested.
+        self.input_scale = scale
+        return ret, scale
+
 def get_test_vectors(*, dtype: torch.dtype, N: int, D_in: int, atol: float = 0.02, rtol: float = 0.01) -> typing.Iterable[TestVector]:
     yield TestVector(
         inputs=[torch.ones(N, D_in, dtype=dtype, device="hpu", requires_grad=False)],
@@ -24,14 +38,16 @@ def get_test_vectors(*, dtype: torch.dtype, N: int, D_in: int, atol: float = 0.0
         rtol=rtol,
     )
 
-def check_tests_to_skip(scale_method, scale_format, dynamic_quantization):
+def check_tests_to_skip(scale_method, scale_format, dynamic_quantization, device_type=None):
     if scale_method in SCALE_METHODS_KEY_ERROR:
         pytest.xfail("KeyError")
     # TODO [SW-215692]: Fix segfault
     if scale_format == ScaleFormat.CONST or dynamic_quantization:
         if scale_method in [ScaleMethod.MAXABS_HW_OPT_WEIGHT, ScaleMethod.MAXABS_POW2_OPT_WEIGHT]:
             pytest.xfail("Segfault")
-
+    # TODO [SW-225900]: HW_ALIGNED_SINGLE_SCALE on Gaudi3 fails in the test_linear unit test
+    if scale_method == ScaleMethod.HW_ALIGNED_SINGLE_SCALE and device_type == GAUDI3:
+        pytest.xfail("NoAccuracy")
 
 @pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
 @pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn], ids=["fp8_e4m3fn"])
@@ -49,12 +65,13 @@ def test_linear_accuracy(
     use_hpu_graphs: bool,
     dynamic_quantization: bool
 ):
-    check_tests_to_skip(scale_method, scale_format, dynamic_quantization)
+    check_tests_to_skip(scale_method, scale_format, dynamic_quantization, device_type)
     quant_modes = QUANT_MODES_DEFAULT
     atol = 0.022
     rtol = 0.175
     if scale_method == ScaleMethod.MAXABS_ARBITRARY:
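+        # MAXABS_ARBITRARY needs looser tolerances than the defaults above.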
         atol = 0.078
+        rtol = 0.3
     if scale_method in SCALE_METHODS_QUANT_ONLY or dynamic_quantization:
         quant_modes = QUANT_MODES_QUANT_ONLY
     if scale_method == ScaleMethod.HW_ALIGNED_SINGLE_SCALE:
@@ -84,21 +101,27 @@ def run():
         use_hpu_graphs=use_hpu_graphs,
         dynamic_quantization=dynamic_quantization
     )
-    if get_device_type() != device_type_id[device_type] and scale_method != ScaleMethod.MAXABS_HW:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method: ")
-    elif device_type_id[device_type] != get_device_type():
-        if not (device_type_id[device_type] == get_gaudi2_type() and is_gaudi3()):
+
+    if scale_method == ScaleMethod.MAXABS_HW:
+        if device_type_id[device_type] == get_gaudi3_type() and is_gaudi2():
+            # Gaudi3 scales are not supported on Gaudi2, so "device_for_scales: Gaudi3" is not supported on a Gaudi2 run
             return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
-    elif scale_method == ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW and not dynamic_quantization:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale method ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW")
-    # TODO [SW-222725]: support HW aligned rounding in dynamic quantization
-    elif dynamic_quantization and scale_method in _hw_aligned_scale_methods:
-        return run_with_raised_exception(run, ValueError, "is not supported in dynamic quantization")
+    else:
+        if get_device_type() != device_type_id[device_type]:
+            # For scale methods other than MAXABS_HW, device_for_scales is not supported, so this scale_method config fails
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
+
+    if dynamic_quantization:
+        if scale_method in _hw_aligned_scale_methods or scale_method in _quant_only_scale_methods:
+            # Dynamic quantization does not support HW-aligned scale methods or quant-only (unit-scale) methods
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
+    else:
+        if scale_method in SUPPORTED_DYNAMIC_SCALES:
+            # Static quantization does not support dynamic scale methods
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
     return run()
 
 
-#TODO [SW-225078]: Reeanable test, find a way to test scales in dynamic quantization
-@pytest.mark.skip("[SW-225078] Find a way to test scales in dynamic quantization")
 @pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
 @pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn], ids=["fp8_e4m3fn"])
 @pytest.mark.parametrize("scale_method", ScaleMethod)
@@ -126,8 +149,6 @@ def test_linear_dynamic_quantization(
     }
     def run():
         test_vectors = get_test_vectors(dtype=hp_dtype, N=N, D_in=D_in)
-        import neural_compressor.torch.algorithms.fp8_quant.prepare_quant.prepare_model as prepare_model
-
         dynamic_quantized_model = WrapModel(module_class, None, **module_kwargs)
         dynamic_quantized_model = setup_quantization(
            dynamic_quantized_model,
@@ -141,25 +162,26 @@ def run():
             **module_kwargs,
         )
         previous_input_dynamic_scale = 0
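+        # Swap in the test wrapper so the input scale computed at runtime is exposed for verification.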
+        test_quant_dynamic_input = TestQuantDynamicInput(dynamic_quantized_model.inner.quant_input.input_scales_creator,
+                                                         dynamic_quantized_model.inner.quant_input.lp_dtype,
+                                                         dynamic_quantized_model.inner.quant_input.hp_dtype)
+        dynamic_quantized_model.inner.quant_input = test_quant_dynamic_input
 
         for vector in test_vectors:
             dynamic_quantized_output = dynamic_quantized_model(*(input.clone() for input in vector.inputs)).to(float)
+            # Read back the scale that dynamic_quantized_model calculated for the current input.
+            # On the next iteration the test wrapper will hold a newly calculated scale.
+            current_input_dynamic_scale = dynamic_quantized_model.inner.quant_input.input_scale
 
-            current_input_dynamic_scale = dynamic_quantized_model.inner.scale_input
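+            # The scale may be a tensor; convert it to a scalar before comparing consecutive scales.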
             if isinstance(current_input_dynamic_scale, torch.Tensor):
                 current_input_dynamic_scale = scale_to_scalar(current_input_dynamic_scale)
             if scale_method not in SCALE_METHODS_QUANT_ONLY:
                 assert previous_input_dynamic_scale != current_input_dynamic_scale, f"input scales in dynamic quantization should differ in different tensors {previous_input_dynamic_scale=} {current_input_dynamic_scale=}"
             previous_input_dynamic_scale = current_input_dynamic_scale
 
-        prepare_model.finish_measurements(dynamic_quantized_model)
+    if (device_type_id[device_type] == get_gaudi3_type() and is_gaudi2() and scale_method == ScaleMethod.MAXABS_HW):
+        return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
+    if (get_device_type() != device_type_id[device_type]) or scale_method in _hw_aligned_scale_methods or scale_method in _quant_only_scale_methods:
+        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
 
-    if get_device_type() != device_type_id[device_type] and scale_method != ScaleMethod.MAXABS_HW:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method: ")
-    elif device_type_id[device_type] != get_device_type():
-        if not (device_type_id[device_type] == get_gaudi2_type() and is_gaudi3()):
-            return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
-    # TODO [SW-222725]: support HW aligned rounding in dynamic quantization
-    elif scale_method in _hw_aligned_scale_methods:
-        return run_with_raised_exception(run, ValueError, "is not supported in dynamic quantization")
-    return run()
+    return run()