
Commit ab265ef

Asaf Karnieli and tgafni authored

[ALGO-808] add support for int4 weights + fp8 activations - phase 1 (#43)

* [ALGO-808] add support for int4 weights + fp8 activations - phase 1
* Add code for quantizing only single input to PatchedMatmul
* w4a8 new kernel

Co-authored-by: Tomer Gafni <[email protected]>
1 parent 68faf23 commit ab265ef

File tree: 15 files changed (+306, -3 lines)

neural_compressor/common/utils/constants.py

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@
 TEQ = "teq" # pragma: no cover
 AUTOROUND = "autoround"
 FP8_QUANT = "fp8_quant"
+HYBRID_GPTQ = "hybrid_gptq"
 MX_QUANT = "mx_quant"
 MIXED_PRECISION = "mixed_precision"

neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py

Lines changed: 5 additions & 0 deletions

@@ -197,6 +197,11 @@ def scales_module_config_to_q_and_dq(self, module):
             use_qdq,
             fake_quant,
         )
+
+        # 4bit->8bit inputs, no need to quant
+        if hasattr(self.mod, "no_input_quant"):
+            input_config[1] = QuantDequantNone(lp_dtype, hp_dtype, scale_format=scale_format)
+
         # outputs as bf16, and descaled in gemm under PatchedLinear, so no need to work here
         output_config = [QuantDequantNone(lp_dtype, hp_dtype, scale_format=scale_format)]
         return ModuleConfig(input_config, output_config)
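The new branch is the activation-only half of the W4A8 scheme: when a matmul module carries the no_input_quant flag (set by HPUMixedPrecisionLinear.convert_from_weight_only later in this commit), its second input gets a pass-through QuantDequantNone instead of an fp8 quant op. A minimal illustrative sketch of the idea; apart from no_input_quant and the QuantDequantNone role, all names below are hypothetical:

# Illustrative sketch only, not the library code.
class PassThrough:
    """Stand-in for QuantDequantNone: hands the tensor back unchanged."""
    def __call__(self, x):
        return x

def build_matmul_input_config(matmul_mod, fp8_quant_op):
    # Default: both matmul inputs are quantized to fp8.
    input_config = [fp8_quant_op, fp8_quant_op]
    # W4A8 case: the weight input already arrives as fp8 (dequantized from uint4), so skip it.
    if hasattr(matmul_mod, "no_input_quant"):
        input_config[1] = PassThrough()
    return input_config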

neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py

Lines changed: 3 additions & 0 deletions

@@ -99,6 +99,7 @@ class DeviceForScalesType(Enum):
     "ignore_modules_wo_measures": TrueFalse,
     "use_qdq": TrueFalse,
     "fake_quant": TrueFalse,
+    "int4_weights": TrueFalse,
     "scale_format": ScaleFormat,
     "device_for_scales": DeviceForScalesType,
     "measure_on_hpu": TrueFalse,

@@ -111,6 +112,7 @@ class DeviceForScalesType(Enum):
     "ignore_modules_wo_measures",
     "recalc_scales",
     "fake_quant",
+    "int4_weights",
     "use_qdq",
     "device_for_scales",
     "measure_on_hpu",

@@ -189,6 +191,7 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
         }, # types and names to be quantized. Allowlist by names is not yet implemented
         "mode": QuantMode.QUANTIZE, # Quantize or Measure
         "fake_quant": False, # Fake or Real Quant, fake_quant only works for linear(PatchedLinear) and matmul(PatchedMatmul), usually used for training.
+        "int4_weights": False,
         "use_qdq": False, # QDQ or Real Quant, QDQ works for operators in helper_modules.py, usually used for inference.
         "scale_method": ScaleMethod.MAXABS_HW, # Method to quantize with
         "scale_params": {}, # scaling parameters that are different then the default ones

neural_compressor/torch/algorithms/mixed_low_precision/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from neural_compressor.torch.algorithms.mixed_low_precision.quantizer import HybridGPTQQuantizer
New file (quantization config JSON; path not shown in this view)

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+{
+    "method": "HOOKS",
+    "mode": "QUANTIZE",
+    "observer": "maxabs",
+    "scale_method": "maxabs_hw",
+    "dump_stats_path": "./calib_output/measure",
+    "int4_weights": "True"
+}
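For orientation, a hedged usage sketch: assuming HybridGPTQConfig follows the FP8Config convention of being loaded from such a JSON file and handed to the standard convert API (from_json_file, the file name, and the surrounding call are assumptions, not shown in this diff):

# Hedged sketch; loader and entry names are assumptions based on the FP8Config convention.
from neural_compressor.torch.quantization import HybridGPTQConfig, convert

config = HybridGPTQConfig.from_json_file("quant_config_w4a8.json")  # assumed loader and path
model = convert(model, config)  # model: an HPU model whose GPTQ int4 weights are already loaded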
neural_compressor/torch/algorithms/mixed_low_precision/modules.py (new file)

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+import math
+from abc import abstractmethod
+
+import numpy as np
+import torch
+from torch.autograd import Function
+from torch.nn import functional as F
+
+from ..weight_only.modules import HPUWeightOnlyLinear
+from neural_compressor.torch.utils import accelerator, logger
+
+
+class HPUMixedPrecisionLinear(HPUWeightOnlyLinear):
+    """Weight and Activations quant (W4A8 gptq) Linear for HPU device."""
+
+    def __init__(
+        self, in_features, out_features,
+        **kwargs,
+    ):
+        """Init the HPUMixedPrecisionLinear object.
+        """
+        super(HPUMixedPrecisionLinear, self).__init__(in_features, out_features)
+
+    def forward(self, input):
+        """The forward function of HPUMixedPrecisionLinear."""
+        input_dtype = input.dtype
+        output_shape = input.shape[:-1] + (self.out_features,)
+        scales = self.scales
+        qweight = self.qweight
+        zeros = self.qzeros
+        weight = torch.ops.hpu.convert_from_uint4(qweight, scales/self.matmul_internal.scale_other, zeros, torch.float8_e4m3fn) # todo: div scales in init
+        output = self.matmul_internal(input, weight)
+        output = output.to(dtype=input_dtype).reshape(
+            output_shape
+        ) # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
+        output = output + self.bias if self.bias is not None else output
+        return output
+
+    @staticmethod
+    def convert_from_weight_only(obj):
+        new_self = HPUMixedPrecisionLinear(obj.in_features, obj.out_features)
+        for attr, value in vars(obj).items():
+            setattr(new_self, attr, value)
+        new_self.matmul_internal.no_input_quant = True # flag for 8bit input, which shouldn't be quantized in matmul
+        return new_self
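A short sketch of how convert_from_weight_only is intended to be used; woq_linear stands for an already-initialized int4 HPUWeightOnlyLinear and is a hypothetical name here:

# Sketch: promote an existing int4 weight-only linear to the W4A8 mixed-precision module.
from neural_compressor.torch.algorithms.mixed_low_precision.modules import HPUMixedPrecisionLinear

w4a8_linear = HPUMixedPrecisionLinear.convert_from_weight_only(woq_linear)
# qweight, scales, qzeros, bias and matmul_internal are copied over; matmul_internal now
# carries no_input_quant, so the later FP8 pass quantizes only the activation input while
# the weight is produced directly as float8_e4m3fn by convert_from_uint4.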
neural_compressor/torch/algorithms/mixed_low_precision/quantizer.py (new file)

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from neural_compressor.torch.algorithms import Quantizer
+from neural_compressor.torch.algorithms.mixed_low_precision.modules import HPUMixedPrecisionLinear
+from neural_compressor.torch.algorithms.weight_only.modules import HPUWeightOnlyLinear
+
+class HybridGPTQQuantizer(Quantizer):
+    def __init__(self, quant_config):
+        super().__init__(quant_config)
+        if isinstance(quant_config, dict):
+            json_file = [cfg.json_file for cfg in quant_config.values()]
+            assert len(json_file) > 0, "Cannot get json file from config."
+            self.quant_config = json_file[0]
+
+    def prepare(self, model):
+        return model
+
+    def convert(self, model):
+        _convert(model)
+        return model
+
+def set_module(model, op_name, new_module):
+    """Set module with a given op name.
+
+    Args:
+        model (object): the input model.
+        op_name (str): name of op.
+        new_module (object): the input model.
+
+    Returns:
+        module (object).
+    """
+    module = model
+    name_list = op_name.split(".")
+    for name in name_list[:-1]:
+        if hasattr(module, name):
+            module = getattr(module, name)
+        else:
+            module = module
+    setattr(module, name_list[-1], new_module)
+
+def _convert(model):
+    for name, module in model.named_modules():
+        # replace `HPUWeightOnlyLinear`s forward func
+        if isinstance(module, HPUWeightOnlyLinear):
+            module = HPUMixedPrecisionLinear.convert_from_weight_only(module)
+            set_module(model, name, module)
+
+    return model
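In practice this runs through the HYBRID_GPTQ algorithm entry added below rather than by hand, but a direct sketch of what convert does to a model:

# Sketch: every HPUWeightOnlyLinear submodule is replaced in place by an
# HPUMixedPrecisionLinear that reuses the same int4 tensors.
from neural_compressor.torch.algorithms.mixed_low_precision import HybridGPTQQuantizer

quantizer = HybridGPTQQuantizer(quant_config)  # quant_config: the configs mapping built by the framework
model = quantizer.convert(model)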

neural_compressor/torch/algorithms/weight_only/modules.py

Lines changed: 10 additions & 1 deletion

@@ -29,6 +29,14 @@
 
 from .utility import quant_tensor
 
+class Matmul(torch.nn.Module):
+
+    def __init__(self, ) -> None:
+        super().__init__()
+
+    def forward(self, X, Y):
+        """Forward function."""
+        return torch.matmul(X, Y)
 
 class QDQLayer(torch.nn.Module):
     """Quantized and dequantized layer."""

@@ -672,6 +680,7 @@ def __init__(
         self.half_indim = self.in_features // 2
 
         self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
+        self.matmul_internal = Matmul()
 
     def forward(self, input):
         """The forward function of HPUWeighOnlyLinear."""

@@ -681,7 +690,7 @@
         qweight = self.qweight
         zeros = self.qzeros
         weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, input_dtype)
-        output = torch.matmul(input, weight)
+        output = self.matmul_internal(input, weight)
         output = output.to(dtype=input_dtype).reshape(
             output_shape
         ) # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
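The wrapper is functionally identical to torch.matmul; exposing the product as a named submodule (matmul_internal) is what lets the FP8 pass later patch it as a PatchedMatmul and attach per-op scales and flags such as no_input_quant to it. A quick illustrative check:

# Illustrative only: Matmul is a thin nn.Module wrapper around torch.matmul.
import torch
from neural_compressor.torch.algorithms.weight_only.modules import Matmul

m = Matmul()
x, w = torch.randn(2, 4), torch.randn(4, 8)
assert torch.equal(m(x, w), torch.matmul(x, w))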

neural_compressor/torch/quantization/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@
     get_woq_tuning_config,
     DynamicQuantConfig,
     get_default_dynamic_config,
+    HybridGPTQConfig
 )
 
 from neural_compressor.torch.quantization.autotune import (

neural_compressor/torch/quantization/algorithm_entry.py

Lines changed: 22 additions & 0 deletions

@@ -23,6 +23,7 @@
     AUTOROUND,
     AWQ,
     FP8_QUANT,
+    HYBRID_GPTQ,
     GPTQ,
     HQQ,
     MIXED_PRECISION,

@@ -45,6 +46,7 @@
     SmoothQuantConfig,
     StaticQuantConfig,
     TEQConfig,
+    HybridGPTQConfig
 )
 from neural_compressor.torch.utils import (
     dump_model_op_stats,

@@ -721,6 +723,26 @@ def fp8_entry(
     postprocess_model(model, mode, quantizer)
     return model
 
+###################### Habana MixedPrecision Algo Entry ##################################
+@register_algo(HYBRID_GPTQ)
+@torch.no_grad()
+def hybrid_gptq_entry(
+    model: torch.nn.Module,
+    configs_mapping: Dict[Tuple[str], FP8Config],
+    mode: Mode = Mode.QUANTIZE,
+    *args,
+    **kwargs,
+) -> torch.nn.Module:
+    """The main entry to apply w4a8 gptq quantization."""
+
+    from neural_compressor.torch.algorithms.mixed_low_precision import HybridGPTQQuantizer
+
+    quantizer = get_quantizer(model, quantizer_cls=HybridGPTQQuantizer, quant_config=configs_mapping)
+    model = quantizer.execute(model, mode=mode)
+
+    fp8_entry(model, configs_mapping, mode, *args, **kwargs)
+    return model
+
 
 ###################### MX Quant Algo Entry ##################################
 @register_algo(name=MX_QUANT)
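To summarize the control flow the new entry adds, a minimal sketch (not the library code) of the two stages it chains; fp8_entry here refers to the existing FP8 entry defined in this same file:

# Sketch of the W4A8 flow behind hybrid_gptq_entry.
from neural_compressor.torch.algorithms.mixed_low_precision import HybridGPTQQuantizer

def w4a8_flow_sketch(model, configs_mapping, mode):
    # Stage 1: swap each int4 HPUWeightOnlyLinear for an HPUMixedPrecisionLinear
    # (weights stay int4, the internal matmul is flagged no_input_quant).
    quantizer = HybridGPTQQuantizer(configs_mapping)
    model = quantizer.execute(model, mode=mode)
    # Stage 2: the regular FP8 pass patches the internal matmul and quantizes only the
    # activation input to fp8, yielding int4 weights + fp8 activations.
    return fp8_entry(model, configs_mapping, mode)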
