
Commit 9e11d45

HolyFalafel authored and XuehaoSun committed
[SW-199696] Implementing dynamic quantization design for linear ops (#188)
* Implementing dynamic quantization design for linear ops
* Using copy_ to store scale as a member, added qdq, removed dyn
* Added PatchedLinearBase to support all linear modules
* Testing dynamic quantization with scale compare
* CR comments - calling cguid
* Added PatchedLinearBase
* Fixed PatchedLinear forward_qdq
* Changed quant strategy - scale to fix ci
* Renamed QuantStrategy to QuantWrapper
* Removed instance member from QuantWrapper
* [SW-224403] Added ticket and throwing error when using row_parallel_linear_allreduce_quantization
* Changed QuantWrapper to a simple method that stores scale
* [SW-224538] Added ticket to TODO comment for init_linear
* Pushed requires_grad to the tensor creation
* Fixed merge
* Fixed load() flow - handling meta tensors with dummy scale
* [SW-224609] removed non tested dynamic qdq
* Update neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
* Update neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
* Moved copy_scale functions inside PatchedLinearBase
* Added and fixed test cases
* Increased tolerance for new test cases
* Update helper_modules.py
* Update helper_modules.py
* Some tests/ci fixes
* Update neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
* Update helper_modules.py
* cr comments + cguid check change
* Update helper_modules.py
* Update helper_modules.py copy scale
* Update neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
* Maxabs design and some structure changes
* Merged MaxAbsDynamicPts To base + cguid comments
* changed cguid calls to functions
* Log changes
* Update neural_compressor/torch/algorithms/fp8_quant/model_configs.py
* Update neural_compressor/torch/algorithms/fp8_quant/model_configs.py
* Re-set self.scale_input as before, value is none in dynamic
* Changing back dynamic scale_input to intermediate and not member
* Disabling test_linear_dynamic_quantization: not storing scale as member
* Reintroduce MaxAbsDynamicPts: in dynamic we don't save scale as a member
* weight to hpu comment
1 parent 689575b commit 9e11d45

File tree

19 files changed (+419 −113 lines)


neural_compressor/torch/algorithms/fp8_quant/_core/common.py

Lines changed: 5 additions & 0 deletions

@@ -197,3 +197,8 @@ def get_device_type_for_scales(mod):
 @lru_cache
 def is_runtime_scale_patching():
     return os.getenv("RUNTIME_SCALE_PATCHING", "False").lower() in ["true", "1"]
+
+
+#TODO [SW-224612]: Use cguid to calc scales and remove the check
+@lru_cache
+def is_calc_scale_with_cguid():
+    return os.getenv("CALC_SCALE_WITH_CGUID", "False").lower() in ["true", "1"]

neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py

Lines changed: 22 additions & 0 deletions

@@ -13,6 +13,7 @@
 # limitations under the License.

 import torch
+from enum import Enum
 from .common import ModuleConfig
 from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator, INCAcceleratorType
 cur_accelerator = auto_detect_accelerator()
@@ -21,6 +22,27 @@
 scale_fcn = lambda x, scale: torch.div(x, scale)
 cast_fcn = lambda x, dtype: x.to(dtype=dtype)
 cast_to_fp8_fcn = lambda x, dtype, scale_inv=None: torch.ops.hpu.cast_to_fp8_v2(x, scale_inv, False, False, dtype)[0]
+def calculate_scale_maxabs(x, maxMode, **kwargs):
+    return torch.ops.hpu.calculate_scale_for_cast(
+        x, maxMode.value, ScaleCalculationRoundingMode.NO_SCALE_ROUNDING.value, **kwargs
+    )
+
+
+def calculate_scale_rounding(x, scaleMode, **kwargs):
+    return torch.ops.hpu.calculate_scale_for_cast(
+        x, ScaleCalculationMaxMode.NO_MAX_CALCULATION.value, scaleMode.value, **kwargs
+    )
+
+
+class ScaleCalculationMaxMode(Enum):
+    NO_MAX_CALCULATION = 0
+    MAX_ABS_PTS_CALCULATION = 1
+    MAX_ABS_PCS_CALCULATION = 2
+
+
+class ScaleCalculationRoundingMode(Enum):
+    NO_SCALE_ROUNDING = 0
+    SCALE_TO_POW2_ROUNDING = 1

 GAUDI2 = INCAcceleratorType.GAUDI2
 GAUDI3 = INCAcceleratorType.GAUDI3
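A hedged usage sketch of the new helpers (our illustration, not from the commit; it requires the Habana HPU stack, and the tensor shape/placement are invented). Both wrappers funnel into the same CGUID op, torch.ops.hpu.calculate_scale_for_cast: calculate_scale_maxabs computes a max-abs scale with rounding disabled, while calculate_scale_rounding only rounds an already-computed scale:

import torch
from neural_compressor.torch.algorithms.fp8_quant._core.fp_utils import (
    ScaleCalculationMaxMode,
    ScaleCalculationRoundingMode,
    calculate_scale_maxabs,
    calculate_scale_rounding,
)

x = torch.randn(128, 128, device="hpu")
# Per-tensor max-abs scale; rounding mode is pinned to NO_SCALE_ROUNDING inside the wrapper.
scale = calculate_scale_maxabs(x, ScaleCalculationMaxMode.MAX_ABS_PTS_CALCULATION)
# Round an existing scale to a power of two; max mode is pinned to NO_MAX_CALCULATION.
scale_pow2 = calculate_scale_rounding(scale, ScaleCalculationRoundingMode.SCALE_TO_POW2_ROUNDING)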

neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py

Lines changed: 9 additions & 2 deletions

@@ -30,6 +30,7 @@
     dequantize_per_tensor_from_fp8,
     quantize_per_channel_to_fp8,
     dequantize_per_channel_from_fp8,
+    invert_scale,
 )
 from .scale_handler import create_scale_tensor

@@ -126,14 +127,20 @@ def __init__(self, input_scales_creator, lp_dtype, hp_dtype, *args, **kwargs):

         self.cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)

-    def forward(self, x):
+    def calculate_scales(self, x):
         scale = self.input_scales_creator.calc_scales(x, QuantTensorType.DYNAMIC)
-        scale_inv = self.input_scales_creator.calc_invert_scales()
+        scale_inv = self.input_scales_creator.invert_scales(scale)
+        return scale, scale_inv
+
+    def forward(self, x):
+        scale, scale_inv = self.calculate_scales(x)

         ret = self.cast_to_op(x, scale_inv, False, False, self.lp_dtype)

         return ret, scale

+    #TODO [SW-224609]: implement forward qdq
+
     def extra_repr(self) -> str:
         repr = super(QuantDynamicInput, self).extra_repr()
         return f"{repr} input_scales_creator={self.input_scales_creator}"

neural_compressor/torch/algorithms/fp8_quant/_core/scale.py

Lines changed: 2 additions & 2 deletions

@@ -86,7 +86,7 @@ def prepare_layer_scales(mod, mod_name, config, mod_type_str, measurement, scale
     (ScaleMethod.MAXABS_HW, "maxabs"): "act_maxabs_pts_pow2_hw_weight_maxabs_pts_pow2_hw",
     (ScaleMethod.MAXABS_POW2, "maxabs"): "act_maxabs_pts_pow2_weight_maxabs_pts_pow2",
     (ScaleMethod.MAXABS_ARBITRARY, "maxabs"): "act_maxabs_pts_weight_maxabs_pts_arbitrary",
-    (ScaleMethod.MAXABS_POW2_DYNAMIC, "maxabs"): "act_maxabs_pcs_dyn_pow2_weight_maxabs_pts_pow2_hw",  # TODO: remove when changing config parsing
+    (ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW, "maxabs"): "act_maxabs_pcs_pow2_weight_maxabs_pts_pow2_hw",  # TODO: remove when changing config parsing
     (ScaleMethod.MAXABS_HW_OPT_WEIGHT, "maxabs"): "act_maxabs_pts_hw_weight_opt_pts_hw",
     (
         ScaleMethod.MAXABS_POW2_OPT_WEIGHT,
@@ -138,7 +138,7 @@ def prepare_layer_scales(mod, mod_name, config, mod_type_str, measurement, scale
         "input_backoff": 0.25,
         "weight_backoff": 0.5,
     },
-    "act_maxabs_pcs_dyn_pow2_weight_maxabs_pts_pow2_hw": {
+    "act_maxabs_pcs_pow2_weight_maxabs_pts_pow2_hw": {
         "input_backoff": 1.0,
         "weight_backoff": 0.5,
    },

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py

Lines changed: 29 additions & 12 deletions

@@ -14,10 +14,11 @@
 from abc import abstractmethod

 import torch
-from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import get_hqt_config
-from .scale_method_factory import ScaleMethodFactory, QuantTensorName
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import get_hqt_config, is_supported_dynamic_op
+from .scale_method_factory import ScaleMethodFactory, QuantTensorName, ScaleValueType
 from ..common import ModuleConfig, QuantTensorType
 from ..quant_dequant import DequantOutput, QuantDequant, QuantDequantNone, QuantInput, QuantDynamicInput
+from ...utils.logger import logger
 from neural_compressor.torch.algorithms.fp8_quant._core.common import dequant_original_fp8_weight_if_needed


@@ -31,6 +32,8 @@ def __init__(self, config, mod, measurement, params, op_type):
         self.inputs_scales_creators = []
         self.output_scales_creators = []
         self.params_scales_creators = []
+        self.is_dynamic = get_hqt_config(self.mod).cfg["dynamic_quantization"] and is_supported_dynamic_op(op_type)
+        logger.debug("%s %s", self.__class__.__name__, self.__dict__)

     def get_module_configuration(self):
         scale_format = get_hqt_config(self.mod).cfg["scale_format"]
@@ -60,14 +63,19 @@ def calc_input_scales(self, num_of_inputs):
         input_scales = []
         for i in range(num_of_inputs):
             input_measurement = self.measurement.inputs[i] if self.measurement is not None else []
-            input_scales.append(
-                self.inputs_scales_creators[i].calc_scales(input_measurement, QuantTensorType.MEASUREMENTS)
-            )
+            input_scale = None
+            if not self.is_dynamic:
+                input_scale = self.inputs_scales_creators[i].calc_scales(
+                    input_measurement, QuantTensorType.MEASUREMENTS
+                )
+            input_scales.append(input_scale)
         return input_scales

     def calc_output_scales(self):
         output_measurement = self.measurement.outputs[0] if self.measurement is not None else []
-        output_scales = self.output_scales_creators[0].calc_scales(output_measurement, QuantTensorType.MEASUREMENTS)
+        output_scales = None
+        if not self.is_dynamic:
+            output_scales = self.output_scales_creators[0].calc_scales(output_measurement, QuantTensorType.MEASUREMENTS)
         return (output_scales,)

     def init_input_config(self, scales_inv, lp_dtype, hp_dtype, scale_format, use_qdq, fake_quant):
@@ -79,7 +87,7 @@ def init_input_config(self, scales_inv, lp_dtype, hp_dtype, scale_format, use_qd
         else:
             input_config = []
             for input_scales_creator, s_inv in zip(self.inputs_scales_creators, scales_inv):
-                if input_scales_creator.is_dynamic:
+                if self.is_dynamic:
                     input_config.append(
                         QuantDynamicInput(input_scales_creator, lp_dtype, hp_dtype, scale_format=scale_format)
                     )
@@ -92,29 +100,38 @@ class LinearOpQuantizer(BaseOpQuantizer):

     def __init__(self, config, mod, measurement, params, module_type):
         super().__init__(config, mod, measurement, params, module_type)
-        self.inputs_scales_creators.append(self.scales_method_factory.get_scale_method(QuantTensorName.INPUT))
+        self.inputs_scales_creators.append(self.scales_method_factory.get_scale_method(QuantTensorName.INPUT, self.is_dynamic))
         self.weight_och_scale_calc = self.scales_method_factory.get_scale_method(QuantTensorName.WEIGHT_OUT_CH)
         self.weight_ich_scale_calc = self.scales_method_factory.get_scale_method(QuantTensorName.WEIGHT_IN_CH)
-        self.output_scales_creators.append(self.scales_method_factory.get_scale_method(QuantTensorName.OUTPUT))
+        self.output_scales_creators.append(self.scales_method_factory.get_scale_method(QuantTensorName.OUTPUT, self.is_dynamic))

     def get_scales_module_config(self):
         input_scales = self.calc_input_scales(num_of_inputs=1)
         output_measurement = self.measurement.outputs[0] if self.measurement is not None else []
         rescaled_weight = self.mod.weight if hasattr(self.mod, 'weight') else None
+        if (
+            self.scales_method_factory.scale_value_type_map[QuantTensorName.WEIGHT_IN_CH]
+            is not ScaleValueType.DUMMY_SCALES
+        ):
+            # Calculating weight in hpu to support scale calculation CGUID torch.ops.hpu.calculate_scale_for_cast
+            rescaled_weight = rescaled_weight.to("hpu")
         if rescaled_weight is not None:
             rescaled_weight = dequant_original_fp8_weight_if_needed(self.mod, rescaled_weight)
         if self.weight_ich_scale_calc is not None:
             weight_scales_in_ch = self.weight_ich_scale_calc.calc_scales(input_scales[0], QuantTensorType.CONST)
             rescaled_weight = torch.div(rescaled_weight, weight_scales_in_ch.reshape([1, -1]))
         weights_scales_out_ch = self.weight_och_scale_calc.calc_scales(rescaled_weight, QuantTensorType.CONST)
+
         params_config = (
             {"weight": weights_scales_out_ch}
             if (self.weight_ich_scale_calc is None)
             else {"weight": {0: weights_scales_out_ch, 1: weight_scales_in_ch}}
         )
-        output_scales = self.output_scales_creators[0].calc_scales(
-            output_measurement, QuantTensorType.MEASUREMENTS, input0=weights_scales_out_ch, input1=input_scales[0]
-        )
+        output_scales = None
+        if not self.is_dynamic:
+            output_scales = self.output_scales_creators[0].calc_scales(
+                output_measurement, QuantTensorType.MEASUREMENTS, input0=weights_scales_out_ch, input1=input_scales[0]
+            )
         return ModuleConfig(
             input_scales,
             (output_scales,),
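To make the new gating concrete: an op is treated as dynamic only when the global config flag is on and the op type is supported, in which case the measured input/output scales are left as None and recomputed per call by QuantDynamicInput. A toy stand-in (the helper name and the exact supported set below are hypothetical; the diff delegates the check to is_supported_dynamic_op, and this commit targets linear ops):

SUPPORTED_DYNAMIC_OPS = {"linear"}  # per the commit title, only linear ops so far

def _is_dynamic(cfg, op_type):
    # Mirrors self.is_dynamic above: the global flag AND per-op support must both hold.
    return cfg.get("dynamic_quantization", False) and op_type in SUPPORTED_DYNAMIC_OPS

print(_is_dynamic({"dynamic_quantization": True}, "linear"))   # True
print(_is_dynamic({"dynamic_quantization": True}, "matmul"))   # False: falls back to static scales
print(_is_dynamic({"dynamic_quantization": False}, "linear"))  # False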

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/round_scales_function.py

Lines changed: 13 additions & 1 deletion

@@ -13,10 +13,22 @@
 # limitations under the License.
 import torch

-from neural_compressor.torch.algorithms.fp8_quant._core.fp_utils import FP8_143_SCALES, FP8_143_SCALES_TRAITS
+from neural_compressor.torch.algorithms.fp8_quant._core.fp_utils import FP8_143_SCALES, FP8_143_SCALES_TRAITS, calculate_scale_rounding, ScaleCalculationRoundingMode
+#TODO [SW-224612]: Use cguid to calc scales and remove check
+from ..common import is_calc_scale_with_cguid


 class ScaleToPow2:
+    def __init__(self):
+        #TODO [SW-224612]: Use cguid to calc scales and remove check
+        if is_calc_scale_with_cguid():
+            self.calc = self.calc_with_cguid
+
+    #TODO [SW-224612]: Use cguid to calc scales and remove special function
+    def calc_with_cguid(self, scale):
+        scale_pow2 = calculate_scale_rounding(scale, ScaleCalculationRoundingMode.SCALE_TO_POW2_ROUNDING)
+        return scale_pow2
+
     def calc(self, scale):
         scale_pow2 = 2.0 ** torch.ceil(torch.log2(scale))
         return scale_pow2
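A worked example of the eager (non-CGUID) calc path above: scales are rounded up to the next power of two, and since scales divide the tensor in this codebase, rounding up only adds headroom. A runnable check (our illustration):

import torch

def scale_to_pow2(scale):
    # Same formula as ScaleToPow2.calc above.
    return 2.0 ** torch.ceil(torch.log2(scale))

print(scale_to_pow2(torch.tensor([3.0, 0.7, 8.0])))
# tensor([4., 1., 8.])  -- 3.0 rounds up to 4, 0.7 up to 1, exact powers of two stay put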

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py

Lines changed: 17 additions & 11 deletions

@@ -14,8 +14,6 @@
 from enum import Enum, auto

 from .round_scales_function import *
-# TODO [SW-217813]: support dynamic quantization in all ops and remove supported_dynamic_ops
-from ..._quant_common.quant_config import is_supported_dynamic_op
 from ..common import get_device_type_for_scales
 from .scales_method import *
 from ...utils.logger import logger
@@ -61,6 +59,7 @@ def parse_tensor_granularity(config):
     scale_granularity = ScaleGranularity.PTS
     if "pcs" in config or "smoothquant" in config:
         scale_granularity = ScaleGranularity.PCS
+    logger.trace("parse_tensor_granularity %s %s", config, scale_granularity)
     return scale_granularity

 # TODO [SW-217813]: support dynamic quantization in all ops and remove op_type
@@ -78,8 +77,6 @@ def parse_tensor_scale_value_type(config, op_type):
         scale_value_type = ScaleValueType.OPT
     elif "dummy" in config:
         scale_value_type = ScaleValueType.DUMMY_SCALES
-    elif "dyn" in config and is_supported_dynamic_op(op_type):
-        scale_value_type = ScaleValueType.DYNAMIC
     logger.trace(f"parse_tensor_scale_value_type {config=} {scale_value_type=}")
     return scale_value_type

@@ -121,17 +118,26 @@ def __init__(self, config, params, mod, op_type):
             QuantTensorName.WEIGHT_IN_CH: self.params.get("weight_backoff", 1.0),
             QuantTensorName.WEIGHT_OUT_CH: self.params.get("weight_backoff", 1.0),
             QuantTensorName.OUTPUT: self.params.get("output_backoff", self.params.get("input_backoff", 1.0)),}  # get output_backoff; if it doesn't exist, use input_backoff; if that doesn't exist, use 1
-        logger.debug("%s %s".format(self.__class__.__name__, self.__dict__))
+        logger.trace("%s %s", self.__class__.__name__, self.__dict__)

     ## TODO remove after SW-217369
     ## config string example: "act_maxabs_pts_weight_opt_pts_hw", round_method = pow2_hw, scale_value_type = maxabs, granularity = pts
     # all config strings in scale.py: scale_method_mapping
     # returns MaxAbsPts obj with pow2_hw as scale_round_method
-    def get_scale_method(self, tensor_name):
-        backoff = self.scale_backoff_map[tensor_name]
+    def get_scale_method(self, tensor_name, is_dynamic=False):
+        backoff = 1.0 if is_dynamic else self.scale_backoff_map[tensor_name]
         scale_round_method = self.scale_round_method_map[tensor_name]
         scale_value_type = self.scale_value_type_map[tensor_name]
         scale_granularity = self.scale_granularity_map[tensor_name]
+        logger.trace(
+            "get_scale_method backoff=%s scale_round_method=%s scale_value_type=%s scale_granularity=%s op_type=%s is_dynamic=%s",
+            backoff,
+            scale_round_method,
+            scale_value_type,
+            scale_granularity,
+            self.op_type,
+            is_dynamic,
+        )

         match (scale_value_type, scale_granularity, tensor_name, self.op_type):
             ## dummy
@@ -145,13 +151,13 @@ def get_scale_method(self, tensor_name):
                 if self.op_type in {"linear", "matmul"}:
                     if scale_value_type in {ScaleValueType.MAXABS, ScaleValueType.OPT}:
                         return MulAdditionalScales(scale_round_method, self.params, self.device_for_scales)
-                    if scale_value_type == ScaleValueType.DYNAMIC:
-                        return MulAdditionalDynamicScales(scale_round_method, self.params, self.device_for_scales)
             ## maxabs/opt in channel PTS
             case (_, ScaleGranularity.PTS, QuantTensorName.WEIGHT_IN_CH, _) \
                 if scale_value_type not in {ScaleValueType.SMOOTHQUANT_OPT, ScaleValueType.SMOOTHQUANT_MAXABS}:
                 return None
             case (ScaleValueType.MAXABS, ScaleGranularity.PTS, _, _):
+                if is_dynamic:
+                    return MaxAbsDynamicPts(scale_round_method, self.params, self.device_for_scales, backoff)
                 return MaxAbsPts(scale_round_method, self.params, self.device_for_scales, backoff)
             ## maxabs/opt in channel PCS
             case (_, ScaleGranularity.PCS, QuantTensorName.WEIGHT_IN_CH, _)\
@@ -160,6 +166,8 @@ def get_scale_method(self, tensor_name):
                 return InputChannelScale(scale_round_method, self.params, self.device_for_scales, in_channel_size)
             ## maxabs PCS
             case (ScaleValueType.MAXABS, ScaleGranularity.PCS, _, _):
+                if is_dynamic:
+                    return MaxAbsDynamicPcs(scale_round_method, self.params, self.device_for_scales, backoff)
                 return MaxAbsPcs(scale_round_method, self.params, self.device_for_scales, backoff)
             ## opt PTS
             case (ScaleValueType.OPT, ScaleGranularity.PTS, _, _):
@@ -188,8 +196,6 @@ def get_scale_method(self, tensor_name):
             case (ScaleValueType.SMOOTHQUANT_OPT, _, QuantTensorName.INPUT, _):
                 backoff_weight = self.params.get("weight_backoff", 1)
                 return InputSmoothQuantOpt(scale_round_method, self.mod.weight, self.params, self.device_for_scales, backoff, backoff_weight)
-            case (ScaleValueType.DYNAMIC, ScaleGranularity.PCS, QuantTensorName.INPUT, _):
-                return MaxAbsDynamicPcs(scale_round_method, self.params, self.device_for_scales, backoff)
             case _:
                 raise NotImplementedError("the config: scale_round_method: " + \
                     str(scale_round_method) +
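The structural change in this file: dynamic is no longer a separate ScaleValueType matched by dedicated case arms; it is a flag threaded into get_scale_method that selects the dynamic variant inside the existing MAXABS arms. A condensed, runnable sketch of that dispatch shape (toy enums and string results standing in for the real scale-method classes; requires Python 3.10+ for match):

from enum import Enum, auto

class ScaleValueType(Enum):
    MAXABS = auto()

class ScaleGranularity(Enum):
    PTS = auto()  # per-tensor scale
    PCS = auto()  # per-channel scale

def get_scale_method(value_type, granularity, is_dynamic=False):
    match (value_type, granularity):
        case (ScaleValueType.MAXABS, ScaleGranularity.PTS):
            return "MaxAbsDynamicPts" if is_dynamic else "MaxAbsPts"
        case (ScaleValueType.MAXABS, ScaleGranularity.PCS):
            return "MaxAbsDynamicPcs" if is_dynamic else "MaxAbsPcs"
        case _:
            raise NotImplementedError

print(get_scale_method(ScaleValueType.MAXABS, ScaleGranularity.PTS, is_dynamic=True))
# MaxAbsDynamicPts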
