
Commit 050dc44

Authored by tgafni and Asaf Karnieli
fp8 aware gptq (hybrid gptq) (#154)
* fp8 aware gptq (hybrid gptq)
* review1
* loading bias to mixed low precision
* fixing tests for fp8 aware quantization and hybrid re-ordering
* Addressed second review round comments
* Addressed review 3 comments

---------

Co-authored-by: Asaf Karnieli <[email protected]>
1 parent 8219b5a commit 050dc44

File tree

10 files changed: +375 -60 lines changed


examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py

Lines changed: 8 additions & 0 deletions
@@ -101,6 +101,12 @@
 parser.add_argument("--gptq_blockwise", action="store_true",
                     help="Whether to quantize blockwise.")
 parser.add_argument("--blockwise_load_folder", default=None, type=str, help="Directory to load blockwise checkpoints from.")
+parser.add_argument("--fp8_aware", action="store_true", help="Enable an FP8-aware GPTQ quantization flow, "
+                    "where an intermediate FP8 quantization step is applied.")
+parser.add_argument("--hybrid_act_order", action="store_true", help="Enable constrained activation reordering: "
+                    "elements can be reordered within each group "
+                    "and the groups themselves can also be reordered, "
+                    "but elements cannot move between groups.")

 # =============AWQ configs====================
 parser.add_argument("--use_auto_scale", action="store_true",
@@ -458,6 +464,8 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
         use_mse_search=args.woq_use_mse_search,
         percdamp=args.gptq_percdamp,
         act_order=args.gptq_actorder,
+        hybrid_order=args.hybrid_act_order,
+        fp8_aware=args.fp8_aware,
         block_size=args.gptq_block_size,
         static_groups=args.gptq_static_groups,
         use_double_quant=False,
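For orientation, here is a minimal, self-contained sketch of how the two new flags are meant to flow from the command line into the GPTQ keyword arguments shown in the second hunk. Only the flag names and the hybrid_order/fp8_aware kwargs come from the diff; the surrounding plumbing is illustrative.

import argparse

parser = argparse.ArgumentParser()
# Flags added by this commit (names taken from the diff above).
parser.add_argument("--fp8_aware", action="store_true")
parser.add_argument("--hybrid_act_order", action="store_true")
args = parser.parse_args(["--fp8_aware", "--hybrid_act_order"])

# The example script forwards the parsed flags to the GPTQ config as
# hybrid_order / fp8_aware, alongside the existing act_order option.
gptq_kwargs = dict(
    act_order=False,
    hybrid_order=args.hybrid_act_order,
    fp8_aware=args.fp8_aware,
)
print(gptq_kwargs)  # {'act_order': False, 'hybrid_order': True, 'fp8_aware': True}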

neural_compressor/torch/algorithms/mixed_low_precision/modules.py

Lines changed: 9 additions & 4 deletions
@@ -9,26 +9,30 @@
 from ..weight_only.modules import HPUWeightOnlyLinear
 from neural_compressor.torch.utils import accelerator, logger

+cast_to_fp8_fcn = lambda x, dtype, scale_inv=None: torch.ops.hpu.cast_to_fp8_v2(x, scale_inv, False, False, dtype)[0]

 class HPUMixedPrecisionLinear(HPUWeightOnlyLinear):
     """Weight and Activations quant (W4A8 gptq) Linear for HPU device."""

     def __init__(
-        self, in_features, out_features,
+        self, in_features, out_features, bias,
         **kwargs,
     ):
         """Init the HPUMixedPrecisionLinear object.
         """
-        super(HPUMixedPrecisionLinear, self).__init__(in_features, out_features)
+        super(HPUMixedPrecisionLinear, self).__init__(in_features, out_features, bias=bias)

     def forward(self, input):
         """The forward function of HPUMixedPrecisionLinear."""
         input_dtype = input.dtype
         output_shape = input.shape[:-1] + (self.out_features,)
         scales = self.scales
+        scale_bf16_to_fp8 = self.scale_bf16_to_fp8
         qweight = self.qweight
         zeros = self.qzeros
-        weight = torch.ops.hpu.convert_from_uint4(qweight, scales/self.matmul_internal.scale_other, zeros, torch.float8_e4m3fn)  # todo: div scales in init
+        self.matmul_internal.scale_other = torch.nn.Parameter(scale_bf16_to_fp8)
+        weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, torch.bfloat16)  # the uint4->fp8 is currently slower and with bugs. Jira ticket: https://jira.habana-labs.com/browse/SW-218009
+        weight = cast_to_fp8_fcn(weight, torch.float8_e4m3fn)
         output = self.matmul_internal(input, weight)
         output = output.to(dtype=input_dtype).reshape(
             output_shape
@@ -38,7 +42,8 @@ def forward(self, input):

     @staticmethod
     def convert_from_weight_only(obj):
-        new_self = HPUMixedPrecisionLinear(obj.in_features, obj.out_features)
+        bias = obj.bias is not None
+        new_self = HPUMixedPrecisionLinear(obj.in_features, obj.out_features, bias)
         for attr, value in vars(obj).items():
             setattr(new_self, attr, value)
         new_self.matmul_internal.no_input_quant = True  # flag for 8bit input, which shouldn't be quantized in matmul
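A rough, device-agnostic emulation of the W4A8 weight path above may help read the change: the uint4 weights are first decoded to bf16 with the per-group scales, then cast straight to FP8, while the per-tensor scale_bf16_to_fp8 is handed to the FP8 matmul as scale_other. This is only a numeric sketch; the real module depends on the HPU ops torch.ops.hpu.convert_from_uint4 and cast_to_fp8_v2.

import torch

def mixed_precision_weight_path(qweight_unpacked, scales, zeros, scale_bf16_to_fp8):
    # Sketch of the forward weight handling in HPUMixedPrecisionLinear.
    # 1) uint4 -> bf16 using per-group scales/zero-points. Because GPTQ ran
    #    FP8-aware, the decoded values already fit the float8_e4m3fn range.
    weight_bf16 = ((qweight_unpacked.float() - zeros) * scales).to(torch.bfloat16)
    # 2) bf16 -> fp8 without rescaling; the per-tensor factor is applied later
    #    inside the FP8 matmul (scale_other), not here.
    weight_fp8 = weight_bf16.to(torch.float8_e4m3fn)
    return weight_fp8, scale_bf16_to_fp8  # the matmul rescales its output by this factor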

neural_compressor/torch/algorithms/weight_only/gptq.py

Lines changed: 190 additions & 13 deletions (large diff not rendered by default)
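Since the gptq.py diff is not rendered, the following is only a sketch of the ordering constraint that --hybrid_act_order describes: columns may be permuted inside their quantization group, and whole groups may be permuted, but no column crosses a group boundary, which is why the original order can be restored without a g_idx lookup at inference. Using the Hessian diagonal as the sort key mirrors regular act_order and is an assumption about the actual gptq.py implementation.

import torch

def hybrid_act_order_perm(hessian_diag, group_size):
    # Build a permutation that sorts columns only inside their own group and
    # then sorts the groups themselves, so group membership never changes.
    perm = torch.arange(hessian_diag.numel())
    groups = list(perm.split(group_size))
    # Sort within each group by descending Hessian diagonal.
    sorted_groups = [g[torch.argsort(hessian_diag[g], descending=True)] for g in groups]
    # Sort the groups by their largest diagonal value.
    group_keys = torch.stack([hessian_diag[g].max() for g in sorted_groups])
    group_order = torch.argsort(group_keys, descending=True)
    return torch.cat([sorted_groups[i] for i in group_order])

# Example: 8 columns, group_size=4 -> indices stay within {0..3} and {4..7}.
diag = torch.tensor([0.1, 0.9, 0.5, 0.3, 2.0, 0.2, 0.7, 0.4])
print(hybrid_act_order_perm(diag, 4))  # tensor([4, 6, 7, 5, 1, 2, 3, 0])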

neural_compressor/torch/algorithms/weight_only/modules.py

Lines changed: 38 additions & 7 deletions
@@ -62,9 +62,9 @@ def forward(self, X):
 class UnpackedWeightOnlyLinearParams(dict):
     """Contains all unpacked weight values."""

-    def __init__(self, unpack_weight, scales, unpack_zp, **kwargs):
+    def __init__(self, unpack_weight, scales, scale_bf16_to_fp8, unpack_zp, **kwargs):
         """Create dict."""
-        super().__init__(int_weight=unpack_weight, scales=scales, zp=unpack_zp, **kwargs)
+        super().__init__(int_weight=unpack_weight, scales=scales, scale_bf16_to_fp8=scale_bf16_to_fp8, zp=unpack_zp, **kwargs)

     def to(self, device):
         """Change device for all values."""
@@ -209,6 +209,14 @@ def __init__(
                 dtype=self.float_type,
             ).to(device),
         )
+        self.register_buffer(
+            "scale_bf16_to_fp8",
+            torch.zeros(
+                1,
+                dtype=self.float_type,
+            ).to(device),
+        )
+        # scale_bf16_to_fp8 is only used in w4a8 measurement mode and currently supports only per-tensor scaling
         self.register_buffer(
             "qweight",
             torch.zeros(
@@ -234,6 +242,13 @@
                 dtype=self.float_type,
             ).to(device),
         )
+        self.register_buffer(
+            "scale_bf16_to_fp8",
+            torch.zeros(
+                1,
+                dtype=self.float_type,
+            ).to(device),
+        )
         if compression_dim == 1:
             self.register_buffer(
                 "qweight",
@@ -275,7 +290,7 @@
         else:
             self.g_idx = None

-    def pack(self, int_weight, scales, zp, bias=None, g_idx=None, **kwargs):
+    def pack(self, int_weight, scales, zp, scale_bf16_to_fp8=None, bias=None, g_idx=None, **kwargs):
         """Pack int weight."""
         if self.use_optimum_format:
             self.scales = self.scales.T.contiguous()
@@ -301,6 +316,8 @@ def pack(self, int_weight, scales, zp, bias=None, g_idx=None, **kwargs):
             self.g_idx = self.g_idx.type(torch.int32).to(self.device)
         assert scales.shape == self.scales.shape, f"{scales.shape} != {self.scales.shape} Scale shape is mismatched."
         self.scales = scales.type(self.float_type).to(self.device)
+        if scale_bf16_to_fp8 is not None:
+            self.scale_bf16_to_fp8 = scale_bf16_to_fp8.type(self.float_type).to(self.device)
         if not self.use_optimum_format and self.compression_dim == 0:
             int_weight = int_weight.T.contiguous()
             self.qweight = self.qweight.T.contiguous()
@@ -332,6 +349,7 @@ def pack(self, int_weight, scales, zp, bias=None, g_idx=None, **kwargs):
     def unpack(self):
         """Unpack weight and zero point."""
         scales = self.scales.T.contiguous() if self.use_optimum_format else self.scales
+        scale_bf16_to_fp8 = self.scale_bf16_to_fp8
         qweight = self.qweight.T.contiguous() if self.use_optimum_format else self.qweight

         device = scales.device
@@ -367,14 +385,15 @@
         # zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1
         zp += 1
         zp = torch.where(zp > (2**self.bits - 1), 0, zp)
-        return UnpackedWeightOnlyLinearParams(weight, scales, zp, g_idx=self.g_idx, bias=self.bias)
+        return UnpackedWeightOnlyLinearParams(weight, scales, scale_bf16_to_fp8, zp, g_idx=self.g_idx, bias=self.bias)

     def recover(self):
         """Recover fp32 weight from packed weight."""
         logger.debug(f"Recovering {self} weight")
         unpack_params_dict = self.unpack()
         weight = unpack_params_dict.get("int_weight")
         scales = unpack_params_dict.get("scales")
+        scale_bf16_to_fp8 = unpack_params_dict.get("scale_bf16_to_fp8")
         zp = unpack_params_dict.get("zp")

         device = scales.device
@@ -668,7 +687,13 @@
                 dtype=self.float_type,
             ),
         )
-
+        self.register_buffer(
+            "scale_bf16_to_fp8",
+            torch.zeros(
+                1,
+                dtype=self.float_type,
+            ),
+        )
         if g_idx:
             self.register_buffer(
                 "g_idx",
@@ -687,17 +712,22 @@ def forward(self, input):
         input_dtype = input.dtype
         output_shape = input.shape[:-1] + (self.out_features,)
         scales = self.scales
+        scale_bf16_to_fp8 = self.scale_bf16_to_fp8  # Added by Tomer, per tensor scale.
         qweight = self.qweight
         zeros = self.qzeros
-        weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, input_dtype)
+        if scale_bf16_to_fp8 > 0:  # this means we are at w4a8 mode.
+            weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, torch.float8_e4m3fn)
+            weight = weight.to(input_dtype) * scale_bf16_to_fp8
+        else:
+            weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, input_dtype)
         output = self.matmul_internal(input, weight)
         output = output.to(dtype=input_dtype).reshape(
             output_shape
         )  # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
         output = output + self.bias if self.bias is not None else output
         return output

-    def pack(self, int_weight, scales, zp, bias=None, g_idx=None):
+    def pack(self, int_weight, scales, zp, scale_bf16_to_fp8=None, bias=None, g_idx=None):
         """Pack weight and zero point."""
         logger.debug("Packing for HPU")

@@ -706,6 +736,7 @@ def pack(self, int_weight, scales, zp, bias=None, g_idx=None):
         qweight = int_weight.T.contiguous()

         self.scales = scales.to(dtype=torch.bfloat16)
+        self.scale_bf16_to_fp8 = scale_bf16_to_fp8.to(dtype=torch.bfloat16)

         # weights and zp are on device from unpack, need to load to cpu for packing
         self.qweight = qweight.cpu()
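Read together, the new buffer acts as a mode flag in HPUWeightOnlyLinear.forward: it is zero-initialized, so a positive value (set by pack) selects the W4A8 decode path. Below is a minimal emulation of that branch, with torch.ops.hpu.convert_from_uint4 replaced by plain dequantization for illustration only.

import torch

def hpu_woq_decode_weight(qweight_unpacked, scales, zeros, scale_bf16_to_fp8,
                          input_dtype=torch.bfloat16):
    # Emulated dequantization standing in for torch.ops.hpu.convert_from_uint4.
    dequant = (qweight_unpacked.float() - zeros) * scales
    if scale_bf16_to_fp8 > 0:
        # w4a8 mode: decode through fp8, then undo the per-tensor scale in the
        # higher-precision input dtype.
        return dequant.to(torch.float8_e4m3fn).to(input_dtype) * scale_bf16_to_fp8
    # regular w4a16 path: decode directly to the input dtype.
    return dequant.to(input_dtype)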

neural_compressor/torch/algorithms/weight_only/save_load.py

Lines changed: 1 addition & 1 deletion
@@ -450,7 +450,7 @@ def _replace_woqlinear_modules(self, name, linear_module, module_quantization_co

     def _load_data_to_new_module(self, new_module, module_name):
         new_module_state_dict = {}
-        for key in [".qweight", ".scales", ".qzeros", ".bias", ".g_idx"]:
+        for key in [".qweight", ".scales", ".scale_bf16_to_fp8", ".qzeros", ".bias", ".g_idx"]:
             full_name = module_name + key
             if full_name in self.loaded_state_dict:
                 new_module_state_dict[key[1:]] = self.loaded_state_dict.pop(full_name)

neural_compressor/torch/algorithms/weight_only/utility.py

Lines changed: 6 additions & 1 deletion
@@ -479,7 +479,7 @@ def search_clip(m, bits=4, group_size=32, scheme="asym", dtype="int", enable_ful
     return best_clip_ratio


-def quant_weight_w_scale(weight, scale, zp=None, group_size=-1, dtype="int"):
+def quant_weight_w_scale(weight, scale, scale_bf16_to_fp8, zp=None, group_size=-1, dtype="int", fp8_aware=False):
     """Quant and dequant tensor with group size. It's an in-place function.

     Args:
@@ -494,6 +494,11 @@ def quant_weight_w_scale(weight, scale, zp=None, group_size=-1, dtype="int"):
     """
     device = weight.device
     scale = scale.to(device)
+    if fp8_aware:
+        weight.mul_(1 / scale_bf16_to_fp8)
+        weight = torch.clamp(weight, min=-torch.finfo(torch.float8_e4m3fnuz).max, max=torch.finfo(torch.float8_e4m3fnuz).max)
+        weight = weight.to(torch.float8_e4m3fn)
+        weight = weight.to(torch.float32)
     if zp is not None:
         zp = zp.to(device)
     # group_size = -1
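The fp8_aware branch pre-rounds the weight through the FP8 grid before the int4 quantization, so the GPTQ error compensation accounts for the later FP8 cast. Below is a standalone restatement of that branch as a sketch; note that the diff clamps to the float8_e4m3fnuz range while casting to float8_e4m3fn, which is reproduced here as-is.

import torch

def fp8_aware_prequant(weight, scale_bf16_to_fp8):
    # Scale into FP8 range, round through float8_e4m3fn, and return to float32
    # so the subsequent int4 quantization sees values that survive an FP8 cast.
    w = weight / scale_bf16_to_fp8
    fnuz_max = torch.finfo(torch.float8_e4m3fnuz).max  # clamp bound used in the diff
    w = torch.clamp(w, min=-fnuz_max, max=fnuz_max)
    return w.to(torch.float8_e4m3fn).to(torch.float32)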

neural_compressor/torch/quantization/algorithm_entry.py

Lines changed: 2 additions & 0 deletions
@@ -158,6 +158,8 @@ def gptq_entry(
         "double_quant_sym": quant_config.double_quant_use_sym,
         "double_quant_group_size": quant_config.double_quant_group_size,
         "act_order": quant_config.act_order,
+        "hybrid_order": quant_config.hybrid_order,
+        "fp8_aware": quant_config.fp8_aware,
         "percdamp": quant_config.percdamp,
         "block_size": quant_config.block_size,
         "static_groups": quant_config.static_groups,

neural_compressor/torch/quantization/config.py

Lines changed: 10 additions & 0 deletions
@@ -352,6 +352,8 @@ class GPTQConfig(TorchBaseConfig):
         "quant_lm_head",
         # gptq params
         "act_order",
+        "hybrid_order",
+        "fp8_aware",
         "percdamp",
         "block_size",
         "static_groups",
@@ -379,6 +381,8 @@ def __init__(
         quant_lm_head: bool = False,
         # gptq params
         act_order: bool = False,
+        hybrid_order: bool = False,
+        fp8_aware: bool = False,
         percdamp: float = 0.01,
         block_size: int = 2048,
         static_groups: bool = False,
@@ -406,6 +410,10 @@ def __init__(
             quant_lm_head (bool): Indicates whether quantize the lm_head layer in transformers. Default is False.
             act_order (bool): Whether to sort Hessian's diagonal values to rearrange channel-wise
                               quantization order. Default is False.
+            hybrid_order (bool): Enables activation re-ordering with no inference overhead.
+                                 Weights are re-ordered within their groups without cross-group mixing.
+            fp8_aware (bool): Whether to include an FP8 quantization step in the GPTQ process.
+                              This improves accuracy when using the W4A8 quantization scheme.
             percdamp (float): Percentage of Hessian's diagonal values' average, which will be added to
                               Hessian's diagonal to increase numerical stability. Default is 0.01.
             block_size (int): Execute GPTQ quantization per block, block shape = [C_out, block_size].
@@ -438,6 +446,8 @@ def __init__(
         self.double_quant_group_size = double_quant_group_size
         # gptq
         self.act_order = act_order
+        self.hybrid_order = hybrid_order
+        self.fp8_aware = fp8_aware
         self.percdamp = percdamp
         self.block_size = block_size
         self.static_groups = static_groups
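For reference, the new options can be set directly on GPTQConfig. The sketch below uses only the act_order, hybrid_order, and fp8_aware parameters taken from this diff; the import path assumes GPTQConfig is re-exported from neural_compressor.torch.quantization as in the 3.x API.

from neural_compressor.torch.quantization import GPTQConfig

quant_config = GPTQConfig(
    act_order=False,     # classic Hessian-based reordering, unchanged
    hybrid_order=True,   # new: reorder within groups and between groups only
    fp8_aware=True,      # new: add the intermediate FP8 step used by W4A8
)
print(quant_config.hybrid_order, quant_config.fp8_aware)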
Lines changed: 22 additions & 0 deletions (new file; path not shown in this view)
@@ -0,0 +1,22 @@
+{
+    "mode": "QUANTIZE",
+    "observer": "maxabs",
+    "scale_method": "maxabs_pow2",
+    "blacklist": {
+        "types": [],
+        "names": [
+            "matmul_qk",
+            "matmul_av",
+            "k_cache",
+            "v_cache",
+            "fused_scaled_dot_product_attention",
+            "lm_head"
+        ]
+    },
+    "scale_params": {
+        "input_backoff": 1,
+        "weight_backoff": 1
+    },
+    "dump_stats_path": "./test_outputs/unit_test",
+    "int4_weights": "True"
+}

0 commit comments