
Commit 2b02536

nirda7 authored and XuehaoSun committed

[SW-214855] - Set scale attributes in INC to reduce graph recompilations (#162)
* [SW-219831] - Set scale attributes in INC to reduce graph recompilation
* add scaling methods ids
* fix scaling method ids check and set
* enable feature also for Load QuantMode
* move scale tensors to cpu when feature is enabled
* fix scaling methods ids to start at 1
* fix cr comments
* remove unnecessary imports
* fix cr comments
* fix more cr comments
* fix cr comments
* move scale to float on cpu in scale handler for dynamic scaling
* fix cr comments
* Add unit test
* fix sending scale tensor to bridge and unit-test bug
1 parent b959f1a commit 2b02536
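
In short, the commit gates a "runtime scale patching" mode behind the RUNTIME_SCALE_PATCHING environment variable; when the mode is active, INC keeps scales as patchable float32 CPU tensors and reports the scaling method's identity to the Habana bridge, so that scale changes cause fewer graph recompilations. A minimal sketch of enabling the mode around the public flow, mirroring the unit test added at the end of this commit (MyHpuModel is a placeholder; MAXABS_HW is one of the supported scale methods):

import os
import copy
import torch
from neural_compressor.torch.quantization import FP8Config, prepare, convert, finalize_calibration

os.environ["RUNTIME_SCALE_PATCHING"] = "1"   # feature flag introduced by this commit

model = MyHpuModel().eval().to("hpu").to(torch.bfloat16)   # placeholder model
inference_model = copy.deepcopy(model)

# Calibration pass with the maxabs observer.
measure_cfg = FP8Config.from_dict({"mode": "MEASURE", "observer": "maxabs",
                                   "dump_stats_path": "./inc_output"})
model = prepare(model, measure_cfg)
model(torch.ones(1, 2, dtype=torch.bfloat16, device="hpu"))   # placeholder calibration input
finalize_calibration(model)

# FP8 conversion with a scale method that supports runtime scale patching.
quant_cfg = FP8Config.from_dict({"mode": "QUANTIZE", "scale_method": "MAXABS_HW",
                                 "dump_stats_path": "./inc_output"})
inference_model = convert(inference_model, quant_cfg)   # set_runtime_scale_patching_mode() runs during conversion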

10 files changed, +147 −6 lines changed


neural_compressor/torch/algorithms/fp8_quant/_core/common.py

Lines changed: 6 additions & 0 deletions

@@ -19,6 +19,7 @@
 import numpy as np
 import torch
 from enum import Enum, auto
+from functools import lru_cache
 
 from .._quant_common.quant_config import get_hqt_config
 from ..utils.logger import logger
@@ -288,3 +289,8 @@ def create_mod_info_recursion(parent):
 def get_device_type_for_scales(mod):
     config = get_hqt_config(mod).cfg
     return config["device_for_scales"]
+
+
+@lru_cache
+def is_runtime_scale_patching():
+    return os.getenv("RUNTIME_SCALE_PATCHING", "False").lower() in ["true", "1"]
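
Because the new helper is wrapped in lru_cache, the RUNTIME_SCALE_PATCHING environment variable is effectively read once per process; code that flips the variable afterwards (as the unit test added in this commit does) has to clear the cache first. A small illustrative sketch, assuming the helper is imported from its new location:

import os
from neural_compressor.torch.algorithms.fp8_quant._core.common import is_runtime_scale_patching

os.environ["RUNTIME_SCALE_PATCHING"] = "1"
is_runtime_scale_patching.cache_clear()  # drop the cached value so the new setting is picked up
assert is_runtime_scale_patching()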

neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py

Lines changed: 1 addition & 1 deletion

@@ -204,7 +204,7 @@ def prepare_model_with_dummy_measurement(model, mod_list, scaling_method_name, s
         mode_type = config.cfg["mod_dict"][mod_type_str]
         mod_info = mod_types[mode_type]
 
-        op_obj = ops_quantizer.get_op_quantizer(mode_type, "dummy", mod, None, scale_config)
+        op_obj = ops_quantizer.get_op_quantizer("dummy", mod, None, scale_config, mode_type)
         dummy_mod_scales = op_obj.get_scales_module_config()
         dummy_mod_config = op_obj.scales_module_config_to_q_and_dq(dummy_mod_scales)
         dummy_mod_extra_config = ModuleExtraConfig(

neural_compressor/torch/algorithms/fp8_quant/_core/quantized_func_wrappers/hpu/hpu_quantized_func_wrapper.py

Lines changed: 3 additions & 0 deletions

@@ -18,6 +18,7 @@
 import torch
 
 from ..quantized_func_wrapper import QuantizedFuncWrapperBase, OP_TYPE, QuantizedFuncWrapperFactory
+from ...common import is_runtime_scale_patching
 from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleFormat
 try: # backwards compatibility for 1.16
     from habana_frameworks.torch.hpex.kernels import fp8_fused_sdpa
@@ -40,6 +41,8 @@ def get_default_quantized_func(self):
         raise NotImplementedError()
 
     def get_scalar_quantized_func(self):
+        if is_runtime_scale_patching():
+            return self.get_default_quantized_func()
         return self.get_default_quantized_func().scalar
 
     def get_quantized_func(self, scale_format):

neural_compressor/torch/algorithms/fp8_quant/_core/scale.py

Lines changed: 3 additions & 3 deletions

@@ -30,8 +30,8 @@ def load_layer_scales(mod, mod_name, config, mod_type_str, measurement, scales,
     )
     mod_extra_config = None
     if mod_name in scales or not config.cfg["use_stats_files"] or mod_name in measurement:
-        op_for_scale_obj = ops_quantizer.get_op_quantizer(module_type, scaling_method_name, mod,
-                                                          measurement.get(mod_name, None), scale_config)
+        op_for_scale_obj = ops_quantizer.get_op_quantizer(scaling_method_name, mod, measurement.get(mod_name, None),
+                                                          scale_config, module_type)
         if mod_name not in scales:
             logger.debug("Calculating scales for module %s", mod_name)
             # calculates scales for current module according to scalling_methods
@@ -61,7 +61,7 @@ def prepare_layer_scales(mod, mod_name, config, mod_type_str, measurement, scale
         module_type,
     )
     mod_extra_config = None
-    op_obj = ops_quantizer.get_op_quantizer(module_type, scaling_method_name, mod, None, scale_config)
+    op_obj = ops_quantizer.get_op_quantizer(scaling_method_name, mod, None, scale_config, module_type)
     logger.debug("Preparing dynamic scales for module %s", mod_name)
     # calculates scales for current module according to scaling_methods
     scales[mod_name] = op_obj.get_scales_module_config()  # ModuleConfig of scales

neural_compressor/torch/algorithms/fp8_quant/_core/scale_handler.py

Lines changed: 3 additions & 0 deletions

@@ -15,6 +15,7 @@
 import torch
 import types
 from .._quant_common.quant_config import ScaleFormat
+from .common import is_runtime_scale_patching
 
 
 def add_scale_registry(patched_mod):
@@ -34,6 +35,8 @@ def register_scale(patched_mod, name, scale, scale_format):
 
 
 def create_scale_tensor(orig_tensor, scale_format):
+    if is_runtime_scale_patching() and scale_format in ScaleFormat.__members__.values():
+        return orig_tensor.to("cpu").to(torch.float)
     if scale_format == ScaleFormat.CONST:
         if isinstance(orig_tensor, torch.Tensor):
             return torch.nn.Parameter(orig_tensor)
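
With the flag enabled, create_scale_tensor short-circuits for any ScaleFormat member and returns the scale as a plain float32 CPU tensor, which is what allows the value to be handed to the bridge and patched at runtime rather than captured as a graph constant. A small usage sketch, assuming an HPU device is available and RUNTIME_SCALE_PATCHING has already been enabled (with the cached check cleared):

import torch
from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleFormat
from neural_compressor.torch.algorithms.fp8_quant._core.scale_handler import create_scale_tensor

scale = torch.tensor(0.125, device="hpu")
patched = create_scale_tensor(scale, ScaleFormat.CONST)
print(patched.device, patched.dtype)  # expected: cpu torch.float32, instead of an HPU nn.Parameter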

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py

Lines changed: 1 addition & 1 deletion

@@ -380,5 +380,5 @@ def scales_module_config_to_q_and_dq(self, module):
     "row_parallel_linear": RowParallelLinearOpQuantizer
 }
 
-def get_op_quantizer(module_type, config, mod, measurement, params):
+def get_op_quantizer(config, mod, measurement, params, module_type):
     return ops_quantizer_map[module_type](config, mod, measurement, params, module_type)
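
The only change here is the position of module_type, which moves from the first to the last parameter of get_op_quantizer so that the signature matches the ops_quantizer_map constructor call on the next line; the call sites in quantize.py and scale.py above are updated accordingly. Any external caller would need the same reordering, roughly (a hypothetical call site for illustration):

# Before this commit:
op_obj = ops_quantizer.get_op_quantizer(module_type, scaling_method_name, mod, measurement, scale_config)

# After this commit, module_type moves to the end:
op_obj = ops_quantizer.get_op_quantizer(scaling_method_name, mod, measurement, scale_config, module_type)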

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scales_method.py

Lines changed: 1 addition & 1 deletion

@@ -161,7 +161,7 @@ def __init__(self, round_scale_method, optional_scales_list, params, device_for_
 
     def calc_scales(self, tensor, tensor_type, **additional_kwargs):
         self.scale = self.round_scale_method.calc(mmse_scale(tensor, self.optional_scales_list, self.lp_dtype, self.hp_dtype))
-        return self.scale
+        return self.scale
 
 class OptScalesPcs(ScalesMethod):
     def __init__(self, round_scale_method, optional_scales_list, params, device_for_scales, backoff):

neural_compressor/torch/algorithms/fp8_quant/_core/utils.py

Lines changed: 21 additions & 0 deletions

@@ -20,6 +20,10 @@
 from .measure import prepare_model as prepare_model_for_measure
 from .quantize import quantize
 from .scale import scale_method_mapping, scaling_params
+from .common import is_runtime_scale_patching
+
+import os
+import habana_frameworks.torch.utils.experimental as htexp
 
 
 def update_mod_dict(config):
@@ -75,6 +79,22 @@ def quantize_dynamic_op(config, mod_type):
     logger.trace(f"should_quantize {name=} {mod_type=} returning {ret}")
     return ret
 
+
+scaling_methods_list = list(scale_method_mapping.values())
+# exclude substrings of scaling methods that are not supported in runtime scale patching mode (used to reduce graph recompilations).
+exclude_substrings = ["pcs", "smoothquant"]
+runtime_scale_patching_supported_methods_list = [method for method in scaling_methods_list if not any(substr in method for substr in exclude_substrings)]
+
+
+def set_runtime_scale_patching_mode(scaling_method_name):
+    if is_runtime_scale_patching():
+        assert (
+            scaling_method_name in runtime_scale_patching_supported_methods_list
+        ), f"Scaling method \"{scaling_method_name}\" is not supported for runtime scale patching (graph recompile reduction). Cannot set scaling attributes."
+        htexp._set_scale_attributes("hw" in scaling_method_name or scaling_method_name == "unit_scale",
+                                    scaling_methods_list.index(scaling_method_name) + 1)
+
+
 def prepare_model(model):
     """Receives the parent module to quantize.
     Replaces its submodules with patched submodules that perform calibration and quantization.
@@ -101,4 +121,5 @@ def prepare_model(model):
     scaling_method_name = scale_method_mapping[(config.cfg["scale_method"], config.cfg["observer"])]
     scaling_params[scaling_method_name].update(config.cfg["scale_params"])
     config.cfg["scale_params"] = scaling_params[scaling_method_name]
+    set_runtime_scale_patching_mode(scaling_method_name)
     return quantize(model, mod_list)
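
set_runtime_scale_patching_mode() forwards two attributes to the Habana bridge via htexp._set_scale_attributes: whether the chosen method is hardware-aligned ("hw" in its name, or unit_scale) and a 1-based id of the method within the global scaling-methods list (the commit message's "fix scaling methods ids to start at 1"). A sketch of how those two values are derived, using a stand-in list since the real ordering comes from scale_method_mapping.values():

# Stand-in list for illustration; the real list is scale_method_mapping.values().
scaling_methods_list = ["unit_scale", "maxabs_hw", "maxabs_pow2", "maxabs_pcs_pow2"]
exclude_substrings = ["pcs", "smoothquant"]
supported = [m for m in scaling_methods_list if not any(s in m for s in exclude_substrings)]

scaling_method_name = "maxabs_hw"
assert scaling_method_name in supported  # unsupported methods trigger the AssertionError above
is_hw_aligned = "hw" in scaling_method_name or scaling_method_name == "unit_scale"
method_id = scaling_methods_list.index(scaling_method_name) + 1  # ids are 1-based in this commit
print(is_hw_aligned, method_id)  # True 2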

test/3x/torch/algorithms/fp8_quant/tester.py

Lines changed: 10 additions & 0 deletions

@@ -54,6 +54,16 @@
     torch.float32: "FP32",
 }
 
+RUNTIME_SCALE_PATCHING_SUPPORTED_METHODS_LIST = [
+    ScaleMethod.UNIT_SCALE,
+    ScaleMethod.HW_ALIGNED_SINGLE_SCALE,
+    ScaleMethod.MAXABS_HW,
+    ScaleMethod.MAXABS_POW2,
+    ScaleMethod.MAXABS_HW_OPT_WEIGHT,
+    ScaleMethod.MAXABS_POW2_OPT_WEIGHT,
+    ScaleMethod.MAXABS_ARBITRARY
+]
+
 # Expects to get an exception. If there's no exception, the test will fail
 def run_with_raised_exception(test_to_run, error, error_str):
     with pytest_raises(Exception) as exc:

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+import os
+import pytest
+import torch
+import shutil
+import copy
+import habana_frameworks.torch.core as htcore
+
+from ..tester import RUNTIME_SCALE_PATCHING_SUPPORTED_METHODS_LIST, SCALE_METHODS_KEY_ERROR, run_with_raised_exception
+from neural_compressor.torch.algorithms.fp8_quant._core.common import is_runtime_scale_patching
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethod
+from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration
+
+os.environ["PT_HPU_WEIGHT_SHARING"] = "0"
+htcore.hpu_inference_set_env()
+
+
+class TinyBlock(torch.nn.Module):
+
+    def __init__(self):
+        super(TinyBlock, self).__init__()
+        self.pre_linear = torch.nn.Linear(2, 1, bias=False)
+        self.pre_linear.weight = torch.nn.Parameter(torch.ones([1, 2]))
+
+    def forward(self, x):
+        x = self.pre_linear(x)
+        return x
+
+
+class TinyModel(torch.nn.Module):
+
+    def __init__(self):
+        super(TinyModel, self).__init__()
+        self.block = TinyBlock()
+
+    def forward(self, x):
+        x = self.block(x)
+        return x
+
+
+@pytest.fixture
+def temp_directory():
+    # Create a temporary directory
+    temp_dir = "./test_runtime_scale_patching_outputs"
+    os.makedirs(temp_dir)
+    # Yield the temporary directory path to the test
+    yield temp_dir
+    # Cleanup: Remove the temporary directory after the test ends
+    shutil.rmtree(temp_dir)
+
+
+@pytest.mark.parametrize("scale_method", ScaleMethod)
+@pytest.mark.parametrize("scale_format", ["SCALAR", "CONST"])
+@pytest.mark.parametrize("dynamic_scale_patching", [True, False])
+def test_no_assert(scale_method, scale_format, dynamic_scale_patching, temp_directory):
+    if scale_method in SCALE_METHODS_KEY_ERROR:
+        pytest.xfail("KeyError")
+    model = TinyModel()
+    model.eval()
+    model = model.to("hpu").to(torch.bfloat16)
+    inference_model = copy.deepcopy(model)
+    htcore.hpu_inference_initialize()
+
+    measure_config_dict = {
+        "mode": "MEASURE",
+        "observer": "maxabs",
+        "allowlist": {"types": [], "names": []},
+        "blocklist": {"types": [], "names": []},
+        "dump_stats_path": f"{temp_directory}/inc_output"
+    }
+    quant_config_dict = {
+        "mode": "QUANTIZE",
+        "scale_format": scale_format,
+        "scale_method": scale_method.name,
+        "allowlist": {"types": [], "names": []},
+        "blocklist": {"types": [], "names": []},
+        "dump_stats_path": f"{temp_directory}/inc_output"
+    }
+    measure_config = FP8Config.from_dict(measure_config_dict)
+    quant_config = FP8Config.from_dict(quant_config_dict)
+
+    def run_convert():
+        convert(inference_model, quant_config)
+
+    is_runtime_scale_patching.cache_clear()
+    os.environ["RUNTIME_SCALE_PATCHING"] = "0"
+
+    model = prepare(model, measure_config)
+    input = torch.tensor([1.2, 2.1]).to(torch.bfloat16).to("hpu")
+    model(input)
+    finalize_calibration(model)
+
+    if dynamic_scale_patching:
+        os.environ["RUNTIME_SCALE_PATCHING"] = "1"
+        if not scale_method in RUNTIME_SCALE_PATCHING_SUPPORTED_METHODS_LIST:
+            run_with_raised_exception(run_convert, AssertionError, "Cannot set scaling attributes.")
+            return
+    # The following convert should run successfully without any asserts
+    inference_model = convert(inference_model, quant_config)
