
Commit ba9475d

Revert "fp8 aware gptq (hybrid gptq) (#154)" (#184)

This reverts commit 050dc44.

1 parent b591068 · commit ba9475d

File tree: 10 files changed, +60 −375 lines

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py

Lines changed: 0 additions & 8 deletions
@@ -101,12 +101,6 @@
 parser.add_argument("--gptq_blockwise", action="store_true",
                     help="Whether to quantize blockwise.")
 parser.add_argument("--blockwise_load_folder", default=None, type=str, help="Directory to load blockwise checkpoints from.")
-parser.add_argument("--fp8_aware", action="store_true", help="Enable an FP8-aware GPTQ quantization flow, "
-                    "where an intermediate FP8 quantization step is applied.")
-parser.add_argument("--hybrid_act_order", action="store_true", help="Enable constrained activation reordering: "
-                    "elements can be reordered within each group "
-                    "and the groups themselves can also be reordered, "
-                    "but elements cannot move between groups.")
 
 # =============AWQ configs====================
 parser.add_argument("--use_auto_scale", action="store_true",
@@ -464,8 +458,6 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
             use_mse_search=args.woq_use_mse_search,
             percdamp=args.gptq_percdamp,
             act_order=args.gptq_actorder,
-            hybrid_order = args.hybrid_act_order,
-            fp8_aware = args.fp8_aware,
             block_size=args.gptq_block_size,
             static_groups=args.gptq_static_groups,
             use_double_quant=False,

neural_compressor/torch/algorithms/mixed_low_precision/modules.py

Lines changed: 4 additions & 9 deletions
@@ -9,30 +9,26 @@
 from ..weight_only.modules import HPUWeightOnlyLinear
 from neural_compressor.torch.utils import accelerator, logger
 
-cast_to_fp8_fcn = lambda x, dtype, scale_inv=None: torch.ops.hpu.cast_to_fp8_v2(x, scale_inv, False, False, dtype)[0]
 
 class HPUMixedPrecisionLinear(HPUWeightOnlyLinear):
     """Weight and Activations quant (W4A8 gptq) Linear for HPU device."""
 
     def __init__(
-        self, in_features, out_features, bias,
+        self, in_features, out_features,
         **kwargs,
     ):
         """Init the HPUMixedPrecisionLinear object.
         """
-        super(HPUMixedPrecisionLinear, self).__init__(in_features, out_features, bias=bias)
+        super(HPUMixedPrecisionLinear, self).__init__(in_features, out_features)
 
     def forward(self, input):
         """The forward function of HPUMixedPrecisionLinear."""
         input_dtype = input.dtype
         output_shape = input.shape[:-1] + (self.out_features,)
         scales = self.scales
-        scale_bf16_to_fp8 = self.scale_bf16_to_fp8
         qweight = self.qweight
         zeros = self.qzeros
-        self.matmul_internal.scale_other = torch.nn.Parameter(scale_bf16_to_fp8)
-        weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, torch.bfloat16)  # the uint4->fp8 is currently slower and with bugs. Jira ticket: https://jira.habana-labs.com/browse/SW-218009
-        weight = cast_to_fp8_fcn(weight, torch.float8_e4m3fn)
+        weight = torch.ops.hpu.convert_from_uint4(qweight, scales/self.matmul_internal.scale_other, zeros, torch.float8_e4m3fn)  # todo: div scales in init
         output = self.matmul_internal(input, weight)
         output = output.to(dtype=input_dtype).reshape(
             output_shape
@@ -42,8 +38,7 @@ def forward(self, input):
 
     @staticmethod
     def convert_from_weight_only(obj):
-        bias = obj.bias is not None
-        new_self = HPUMixedPrecisionLinear(obj.in_features, obj.out_features, bias)
+        new_self = HPUMixedPrecisionLinear(obj.in_features, obj.out_features)
         for attr, value in vars(obj).items():
             setattr(new_self, attr, value)
         new_self.matmul_internal.no_input_quant = True  # flag for 8bit input, which shouldn't be quantized in matmul
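
The restored forward() divides the int4 scales by the matmul's fp8 scale before dequantization instead of going through a bf16 intermediate and a separate fp8 cast. The short sketch below is plain PyTorch, not the HPU kernel: dequant_int4 is a hypothetical stand-in for torch.ops.hpu.convert_from_uint4 and the shapes are illustrative only. It shows why the folding is valid: scaling the per-group scales is algebraically the same as scaling the dequantized weight.

import torch

def dequant_int4(q, scales, zeros, dtype):
    # per-group affine dequant: (q - zp) * scale, emitted in the requested dtype
    return ((q.to(torch.float32) - zeros.to(torch.float32)) * scales).to(dtype)

q = torch.randint(0, 16, (4, 8), dtype=torch.int32)   # unpacked uint4 values (illustrative layout)
zeros = torch.full((4, 1), 8)                          # zero points
scales = torch.rand(4, 1) * 0.1                        # per-group int4 scales
scale_other = torch.tensor(0.05)                       # per-tensor fp8 scale used by the matmul

folded = dequant_int4(q, scales / scale_other, zeros, torch.float32)
unfolded = dequant_int4(q, scales, zeros, torch.float32) / scale_other
assert torch.allclose(folded, unfolded)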

neural_compressor/torch/algorithms/weight_only/gptq.py

Lines changed: 13 additions & 190 deletions
Large diffs are not rendered by default.

neural_compressor/torch/algorithms/weight_only/modules.py

Lines changed: 7 additions & 38 deletions
@@ -62,9 +62,9 @@ def forward(self, X):
 class UnpackedWeightOnlyLinearParams(dict):
     """Contains all unpacked weight values."""
 
-    def __init__(self, unpack_weight, scales, scale_bf16_to_fp8, unpack_zp, **kwargs):
+    def __init__(self, unpack_weight, scales, unpack_zp, **kwargs):
         """Create dict."""
-        super().__init__(int_weight=unpack_weight, scales=scales, scale_bf16_to_fp8 = scale_bf16_to_fp8, zp=unpack_zp, **kwargs)
+        super().__init__(int_weight=unpack_weight, scales=scales, zp=unpack_zp, **kwargs)
 
     def to(self, device):
         """Change device for all values."""
@@ -209,14 +209,6 @@ def __init__(
                 dtype=self.float_type,
             ).to(device),
         )
-        self.register_buffer(
-            "scale_bf16_to_fp8",
-            torch.zeros(
-                1,
-                dtype=self.float_type,
-            ).to(device),
-        )
-        # scale_bf16_to_fp8 is only used in w4a8 measurement mode and currently supports only per-tensor scaling
         self.register_buffer(
             "qweight",
             torch.zeros(
@@ -242,13 +234,6 @@ def __init__(
                 dtype=self.float_type,
             ).to(device),
         )
-        self.register_buffer(
-            "scale_bf16_to_fp8",
-            torch.zeros(
-                1,
-                dtype=self.float_type,
-            ).to(device),
-        )
         if compression_dim == 1:
             self.register_buffer(
                 "qweight",
@@ -290,7 +275,7 @@ def __init__(
         else:
             self.g_idx = None
 
-    def pack(self, int_weight, scales, zp, scale_bf16_to_fp8=None, bias=None, g_idx=None, **kwargs):
+    def pack(self, int_weight, scales, zp, bias=None, g_idx=None, **kwargs):
         """Pack int weight."""
         if self.use_optimum_format:
             self.scales = self.scales.T.contiguous()
@@ -316,8 +301,6 @@ def pack(self, int_weight, scales, zp, scale_bf16_to_fp8=None, bias=None, g_idx=
             self.g_idx = self.g_idx.type(torch.int32).to(self.device)
         assert scales.shape == self.scales.shape, f"{scales.shape} != {self.scales.shape} Scale shape is mismatched."
         self.scales = scales.type(self.float_type).to(self.device)
-        if scale_bf16_to_fp8 is not None:
-            self.scale_bf16_to_fp8 = scale_bf16_to_fp8.type(self.float_type).to(self.device)
         if not self.use_optimum_format and self.compression_dim == 0:
             int_weight = int_weight.T.contiguous()
             self.qweight = self.qweight.T.contiguous()
@@ -349,7 +332,6 @@ def pack(self, int_weight, scales, zp, scale_bf16_to_fp8=None, bias=None, g_idx=
     def unpack(self):
         """Unpack weight and zero point."""
         scales = self.scales.T.contiguous() if self.use_optimum_format else self.scales
-        scale_bf16_to_fp8 = self.scale_bf16_to_fp8
         qweight = self.qweight.T.contiguous() if self.use_optimum_format else self.qweight
 
         device = scales.device
@@ -385,15 +367,14 @@ def unpack(self):
             # zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1
             zp += 1
             zp = torch.where(zp > (2**self.bits - 1), 0, zp)
-        return UnpackedWeightOnlyLinearParams(weight, scales, scale_bf16_to_fp8, zp, g_idx=self.g_idx, bias=self.bias)
+        return UnpackedWeightOnlyLinearParams(weight, scales, zp, g_idx=self.g_idx, bias=self.bias)
 
     def recover(self):
         """Recover fp32 weight from packed weight."""
         logger.debug(f"Recovering {self} weight")
         unpack_params_dict = self.unpack()
         weight = unpack_params_dict.get("int_weight")
         scales = unpack_params_dict.get("scales")
-        scale_bf16_to_fp8 = unpack_params_dict.get("scale_bf16_to_fp8")
         zp = unpack_params_dict.get("zp")
 
         device = scales.device
@@ -687,13 +668,7 @@ def __init__(
                 dtype=self.float_type,
             ),
         )
-        self.register_buffer(
-            "scale_bf16_to_fp8",
-            torch.zeros(
-                1,
-                dtype=self.float_type,
-            ),
-        )
+
         if g_idx:
             self.register_buffer(
                 "g_idx",
@@ -712,22 +687,17 @@ def forward(self, input):
         input_dtype = input.dtype
         output_shape = input.shape[:-1] + (self.out_features,)
         scales = self.scales
-        scale_bf16_to_fp8 = self.scale_bf16_to_fp8  # Added by Tomer, per tensor scale.
         qweight = self.qweight
         zeros = self.qzeros
-        if scale_bf16_to_fp8 > 0:  # this means we are at w4a8 mode.
-            weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, torch.float8_e4m3fn)
-            weight = weight.to(input_dtype) * scale_bf16_to_fp8
-        else:
-            weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, input_dtype)
+        weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, input_dtype)
         output = self.matmul_internal(input, weight)
         output = output.to(dtype=input_dtype).reshape(
             output_shape
         )  # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
         output = output + self.bias if self.bias is not None else output
         return output
 
-    def pack(self, int_weight, scales, zp, scale_bf16_to_fp8=None, bias=None, g_idx=None):
+    def pack(self, int_weight, scales, zp, bias=None, g_idx=None):
         """Pack weight and zero point."""
         logger.debug("Packing for HPU")
 
@@ -736,7 +706,6 @@ def pack(self, int_weight, scales, zp, scale_bf16_to_fp8=None, bias=None, g_idx=
         qweight = int_weight.T.contiguous()
 
         self.scales = scales.to(dtype=torch.bfloat16)
-        self.scale_bf16_to_fp8 = scale_bf16_to_fp8.to(dtype=torch.bfloat16)
 
         # weights and zp are on device from unpack, need to load to cpu for packing
         self.qweight = qweight.cpu()
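
A non-obvious context line in unpack() above is the zero-point repair: the optimum format stores zp - 1, so an original zero point of 0 underflows to -1 and is packed as 2**bits - 1; the `zp += 1` followed by torch.where() folds the resulting overflow back to 0. A tiny self-contained illustration, with made-up values, of that round trip:

import torch

bits = 4
original_zp = torch.tensor([0, 1, 8, 15])
stored = (original_zp - 1) & (2**bits - 1)    # what ends up packed in qzeros (unsigned wrap)
recovered = stored + 1                        # zp += 1
recovered = torch.where(recovered > (2**bits - 1), torch.zeros_like(recovered), recovered)
assert torch.equal(recovered, original_zp)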

neural_compressor/torch/algorithms/weight_only/save_load.py

Lines changed: 1 addition & 1 deletion
@@ -450,7 +450,7 @@ def _replace_woqlinear_modules(self, name, linear_module, module_quantization_co
 
     def _load_data_to_new_module(self, new_module, module_name):
         new_module_state_dict = {}
-        for key in [".qweight", ".scales", ".scale_bf16_to_fp8", ".qzeros", ".bias", ".g_idx"]:
+        for key in [".qweight", ".scales", ".qzeros", ".bias", ".g_idx"]:
             full_name = module_name + key
             if full_name in self.loaded_state_dict:
                 new_module_state_dict[key[1:]] = self.loaded_state_dict.pop(full_name)

neural_compressor/torch/algorithms/weight_only/utility.py

Lines changed: 1 addition & 6 deletions
@@ -479,7 +479,7 @@ def search_clip(m, bits=4, group_size=32, scheme="asym", dtype="int", enable_ful
     return best_clip_ratio
 
 
-def quant_weight_w_scale(weight, scale, scale_bf16_to_fp8, zp=None, group_size=-1, dtype="int", fp8_aware=False):
+def quant_weight_w_scale(weight, scale, zp=None, group_size=-1, dtype="int"):
     """Quant and dequant tensor with group size. It's an in-place function.
 
     Args:
@@ -494,11 +494,6 @@ def quant_weight_w_scale(weight, scale, scale_bf16_to_fp8, zp=None, group_size=-
     """
     device = weight.device
     scale = scale.to(device)
-    if fp8_aware:
-        weight.mul_(1 / scale_bf16_to_fp8)
-        weight = torch.clamp(weight, min=-torch.finfo(torch.float8_e4m3fnuz).max, max=torch.finfo(torch.float8_e4m3fnuz).max)
-        weight = weight.to(torch.float8_e4m3fn)
-        weight = weight.to(torch.float32)
     if zp is not None:
         zp = zp.to(device)
         # group_size = -1
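
For reference, the fp8_aware branch removed from quant_weight_w_scale() above amounted to a pre-step before the usual grouped int quantization: scale the weight into the fp8 range, clamp it, and round-trip it through float8_e4m3fn so the int quantizer sees fp8-rounded values. A standalone sketch of just that step, in plain PyTorch (requires a build with the float8 dtypes; tensor values are illustrative, and the sketch is out-of-place rather than in-place):

import torch

def fp8_pre_step(weight: torch.Tensor, scale_bf16_to_fp8: torch.Tensor) -> torch.Tensor:
    # scale into the fp8 range, clamp to the e4m3fnuz max, then round-trip through fp8
    w = weight * (1.0 / scale_bf16_to_fp8)
    fp8_max = torch.finfo(torch.float8_e4m3fnuz).max
    w = torch.clamp(w, min=-fp8_max, max=fp8_max)
    return w.to(torch.float8_e4m3fn).to(torch.float32)

w = torch.randn(8, 16)
w_fp8 = fp8_pre_step(w, torch.tensor(0.1))
print((w_fp8 - w / 0.1).abs().max())   # the fp8 rounding error introduced by the pre-step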

neural_compressor/torch/quantization/algorithm_entry.py

Lines changed: 0 additions & 2 deletions
@@ -158,8 +158,6 @@ def gptq_entry(
         "double_quant_sym": quant_config.double_quant_use_sym,
         "double_quant_group_size": quant_config.double_quant_group_size,
         "act_order": quant_config.act_order,
-        "hybrid_order": quant_config.hybrid_order,
-        "fp8_aware": quant_config.fp8_aware,
         "percdamp": quant_config.percdamp,
         "block_size": quant_config.block_size,
         "static_groups": quant_config.static_groups,

neural_compressor/torch/quantization/config.py

Lines changed: 0 additions & 10 deletions
@@ -352,8 +352,6 @@ class GPTQConfig(TorchBaseConfig):
         "quant_lm_head",
         # gptq params
         "act_order",
-        "hybrid_order",
-        "fp8_aware",
         "percdamp",
         "block_size",
         "static_groups",
@@ -381,8 +379,6 @@ def __init__(
         quant_lm_head: bool = False,
         # gptq params
         act_order: bool = False,
-        hybrid_order: bool = False,
-        fp8_aware: bool = False,
         percdamp: float = 0.01,
         block_size: int = 2048,
         static_groups: bool = False,
@@ -410,10 +406,6 @@ def __init__(
             quant_lm_head (bool): Indicates whether quantize the lm_head layer in transformers. Default is False.
             act_order (bool): Whether to sort Hessian's diagonal values to rearrange channel-wise
                               quantization order. Default is False.
-            hybrid_order (bool): Enables activation re-ordering with no inference overhead.
-                                 Weights are re-ordered within their groups without cross-group mixing.
-            fp8_aware (bool): Whether to include an FP8 quantization step in the GPTQ process.
-                              This improves accuracy when using the W4A8 quantization scheme.
             percdamp (float): Percentage of Hessian's diagonal values' average, which will be added to
                               Hessian's diagonal to increase numerical stability. Default is 0.01.
             block_size (int): Execute GPTQ quantization per block, block shape = [C_out, block_size].
@@ -446,8 +438,6 @@ def __init__(
         self.double_quant_group_size = double_quant_group_size
         # gptq
         self.act_order = act_order
-        self.hybrid_order = hybrid_order
-        self.fp8_aware = fp8_aware
         self.percdamp = percdamp
         self.block_size = block_size
         self.static_groups = static_groups
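
After the revert, GPTQConfig no longer accepts hybrid_order or fp8_aware. A minimal, hedged construction example using only the GPTQ knobs visible in the diff above (the import path assumes the 3.x torch API; all other parameters keep their defaults):

from neural_compressor.torch.quantization import GPTQConfig

config = GPTQConfig(
    act_order=True,       # reorder channels by the Hessian's diagonal
    percdamp=0.01,        # damping added to the Hessian diagonal
    block_size=2048,      # run GPTQ per [C_out, block_size] block
    static_groups=False,
)
# hybrid_order=... and fp8_aware=... are no longer part of the signature.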

test/3x/torch/quantization/fp8_quant/test_fp8_jsons/test_pow2_w4a8_quant.json

Lines changed: 0 additions & 22 deletions
This file was deleted.
