From 9e106b8c6cc81efee1851a0c58485061034bb871 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 7 Nov 2023 21:58:47 +0800 Subject: [PATCH 01/23] add use_HF_format for export_compressed_model Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 109 +++++++++++++----- .../adaptor/torch_utils/weight_only.py | 2 + neural_compressor/model/torch_model.py | 8 ++ .../test_weight_only_adaptor.py | 21 +++- 4 files changed, 104 insertions(+), 36 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index d30182a7b9e..4138e8b1df1 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -217,8 +217,10 @@ def __init__( compression_dim=1, gptq_perm=False, device="cpu", + use_HF_format=False, ): super().__init__() + self.use_HF_format = use_HF_format self.dtype = dtype if "int" not in self.dtype: # for nf4, fp4 from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING @@ -249,53 +251,85 @@ def __init__( assert compression_dim in [0, 1], ( "Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel." ) - self.register_buffer( - "scale", - torch.zeros( - (out_features, math.ceil(in_features / self.groupsize)), - dtype=self.float_type, - ).to(device), - ) - if compression_dim == 1: + if self.use_HF_format: + self.register_buffer( + "scales", + torch.zeros( + (out_features, math.ceil(in_features / self.groupsize)), + dtype=self.float_type, + ).to(device), + ) + self.scale = self.scales self.register_buffer( - "packed_weight", + "qweight", torch.zeros( - (out_features, math.ceil(in_features / self.n_pack)), + (math.ceil(in_features / self.n_pack), out_features), dtype=self.compressed_dtype, ).to(device), ) + self.packed_weight = self.qweight.T if zp: self.register_buffer( - "packed_zp", + "qzeros", torch.zeros( - (self.out_features, math.ceil(self.in_features / self.groupsize / self.n_pack)), + (math.ceil(self.in_features / self.groupsize), math.ceil(self.out_features / self.n_pack)), dtype=self.compressed_dtype, ).to(device), ) + self.packed_zp = self.qzeros.T + if gptq_perm: + self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) + else: + self.g_idx = None + self.gptq_perm = self.g_idx else: self.register_buffer( - "packed_weight", + "scale", torch.zeros( - (math.ceil(out_features / self.n_pack), in_features), - dtype=self.compressed_dtype, + (out_features, math.ceil(in_features / self.groupsize)), + dtype=self.float_type, ).to(device), ) - if zp: + if compression_dim == 1: self.register_buffer( - "packed_zp", + "packed_weight", torch.zeros( - (math.ceil(self.out_features / self.n_pack), math.ceil(self.in_features / self.groupsize)), + (out_features, math.ceil(in_features / self.n_pack)), dtype=self.compressed_dtype, ).to(device), ) + if zp: + self.register_buffer( + "packed_zp", + torch.zeros( + (self.out_features, math.ceil(self.in_features / self.groupsize / self.n_pack)), + dtype=self.compressed_dtype, + ).to(device), + ) + else: + self.register_buffer( + "packed_weight", + torch.zeros( + (math.ceil(out_features / self.n_pack), in_features), + dtype=self.compressed_dtype, + ).to(device), + ) + if zp: + self.register_buffer( + "packed_zp", + torch.zeros( + (math.ceil(self.out_features / self.n_pack), math.ceil(self.in_features / self.groupsize)), + dtype=self.compressed_dtype, + ).to(device), + ) + if gptq_perm: + 
self.register_buffer("gptq_perm", torch.zeros(in_features, dtype=torch.int32).to(device)) + else: + self.gptq_perm = None if bias: self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) else: self.bias = None - if gptq_perm: - self.register_buffer("gptq_perm", torch.zeros(in_features, dtype=torch.int32).to(device)) - else: - self.gptq_perm = None def pack(self, int_weight, scale, zp, bias, gptq_perm=None): int_weight = int_weight.to(self.device) @@ -307,7 +341,7 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) - if self.compression_dim == 0: + if not self.use_HF_format and self.compression_dim == 0: int_weight = int_weight.T self.packed_weight = self.packed_weight.T origin_shape = int_weight.shape @@ -324,12 +358,14 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) self.packed_weight[:, j] |= tmp[:, e] - if self.compression_dim == 0: + if not self.use_HF_format and self.compression_dim == 0: self.packed_weight = self.packed_weight.T if zp is not None: zp = zp.to(self.device) - if self.compression_dim == 0: + if self.use_HF_format: + zp -= 1 + if self.use_HF_format or self.compression_dim == 0: zp = zp.T self.packed_zp = self.packed_zp.T assert hasattr(self, "packed_zp"), "zp is not set when initializing." @@ -342,7 +378,7 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) self.packed_zp[:, j] |= tmp[:, e] - if self.compression_dim == 0: + if self.use_HF_format or self.compression_dim == 0: self.packed_zp = self.packed_zp.T def recover(self): @@ -356,7 +392,7 @@ def recover(self): # unpack weight weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device) packed_weight = self.packed_weight - if self.compression_dim == 0: + if not self.use_HF_format and self.compression_dim == 0: weight = weight.T packed_weight = packed_weight.T origin_shape = weight.shape @@ -372,7 +408,7 @@ def recover(self): if weight_dtype == torch.uint8: tmp &= mask # remove sign bit weight[:, index] = tmp.type(weight_dtype) - if self.compression_dim == 0: + if not self.use_HF_format and self.compression_dim == 0: weight = weight.T if "int" not in self.dtype: new_weight = torch.zeros(self.out_features, self.in_features).to(device) @@ -384,7 +420,7 @@ def recover(self): zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp zp = torch.zeros(self.scale.shape, dtype=zp_dtype).to(device) packed_zp = self.packed_zp - if self.compression_dim == 0: + if self.use_HF_format or self.compression_dim == 0: zp = zp.T packed_zp = packed_zp.T origin_shape = zp.shape @@ -399,8 +435,10 @@ def recover(self): tmp = tmp >> self.compress_bits - self.bits tmp &= mask zp[:, index] = tmp.type(zp_dtype) - if self.compression_dim == 0: + if self.use_HF_format or self.compression_dim == 0: zp = zp.T + if self.use_HF_format: + zp += 1 # recover fp32 weight with int_weight, scale, and zero_point left_element = self.in_features % self.groupsize if left_element != 0: @@ -453,9 +491,16 @@ def forward(self, input): return F.linear(input, weight, self.bias) def extra_repr(self) -> str: - return "in_features={}, out_features={}, bits={}, group_size={}, bias={}".format( - self.in_features, self.out_features, self.bits, self.groupsize, 
self.bias is not None + tmp_str = "in_features={}, out_features={}, bits={}, group_size={}, bias={}".format( + self.in_features, + self.out_features, + self.bits, + self.groupsize, + self.bias is not None, ) + if self.use_HF_format: + tmp_str += ", use_HF_format=True" + return tmp_str class FakeAffineTensorQuantFunction(Function): diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index 7ba86eaa344..37a810a0428 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -396,6 +396,7 @@ def rtn_quantize( compression_dim = kwargs.get("compression_dim", 1) scale_dtype = kwargs.get("scale_dtype", torch.float32) device = kwargs.get("device", "cpu") + use_HF_format = kwargs.get("use_HF_format", False) for name, m in model.named_modules(): if m.__class__.__name__ not in supported_layers: continue @@ -448,6 +449,7 @@ def rtn_quantize( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, + use_HF_format=use_HF_format, ) new_module.pack(int_weight, scale, zp, m.bias) if name == "": diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index eeada402f35..e546893e323 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -459,6 +459,7 @@ def export_compressed_model( scale_dtype=torch.float32, gptq_config_path=None, device="cpu", + use_HF_format=False, ): """Convert Linear to WeightOnlyLinear for low memory inference. @@ -474,6 +475,10 @@ def export_compressed_model( Defaults to torch.float32. gptq_config_path (str, optional): Path of gptq_config.json. Defaults to None. device (str, optional): choose device for compression. Defaults to cpu. + use_HF_format (bool, optional): use the popular huggingface compression format. + 1: compression_dim: weight = 1, zeros = 0 and both are transposed. + 2: zeros -= 1 before compression. Why we need it? + 3: g_idx: use same number for one group instead of recording the channel order. 
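        A minimal usage sketch of this flag, following the unit test added in this change; `Model()` and the `saved` directory are placeholders for the caller's fp32 architecture and the folder written by `q_model.save("saved")`.

```python
import torch

from neural_compressor.model import Model as INCModel
from neural_compressor.utils.pytorch import load

fp32_model = Model()  # placeholder: the original fp32 architecture
woq_model = load("saved", fp32_model, weight_only=True)  # reload the fake-quantized weights

inc_model = INCModel(woq_model)
inc_model.export_compressed_model(
    qweight_config_path="saved/qconfig.json",
    use_HF_format=True,  # store qweight/qzeros/scales buffers instead of packed_weight/packed_zp/scale
)
torch.save(inc_model.model.state_dict(), "saved/hf_format_model.pt")
```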
""" from ..adaptor.torch_utils.model_wrapper import WeightOnlyLinear from ..adaptor.torch_utils.util import collect_weight_info, fetch_module, set_module @@ -513,6 +518,7 @@ def export_compressed_model( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, + use_HF_format=use_HF_format, ) set_module(self.model, k, new_module) continue @@ -539,6 +545,7 @@ def export_compressed_model( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, + use_HF_format=use_HF_format, ) new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm) set_module(self.model, k, new_module) @@ -565,6 +572,7 @@ def export_compressed_model( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, + use_HF_format=use_HF_format, ) set_module(self.model, k, mod) return self.model diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 2e6e5b85ee0..85b0bcfafa1 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -8,6 +8,8 @@ from neural_compressor import PostTrainingQuantConfig, quantization from neural_compressor.adaptor.torch_utils.model_wrapper import MulLinear, WeightOnlyLinear +from neural_compressor.model import Model as INCModel +from neural_compressor.utils.pytorch import load class Model(torch.nn.Module): @@ -81,13 +83,27 @@ def test_RTN_int_quant(self): approach="weight_only", ) q_model = quantization.fit(model, conf) + q_model.save("saved") out2 = q_model(input) self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1))) self.assertFalse(torch.all(out1 == out2)) compressed_model = q_model.export_compressed_model() out3 = compressed_model(input) + self.assertTrue("fc1.packed_weight" in compressed_model.state_dict().keys()) + q_weight1 = compressed_model.state_dict()["fc1.packed_weight"] self.assertTrue(torch.all(out3 == out2)) + # test huggingface popular int4 format + model = Model() + new_model = load("saved", model, weight_only=True) + inc_model = INCModel(new_model) + inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_HF_format=True) + out4 = inc_model.model(input) + self.assertTrue("fc1.qweight" in inc_model.model.state_dict().keys()) + q_weight2 = inc_model.model.state_dict()["fc1.qweight"] + self.assertTrue(torch.all(q_weight1.T == q_weight2)) + self.assertTrue(torch.all(out3 == out4)) + model = Model() out1 = model(input) conf = PostTrainingQuantConfig( @@ -218,7 +234,6 @@ def test_RTN_int_quant(self): self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1))) self.assertFalse(torch.all(out1 == out2)) q_model.save("saved") - from neural_compressor.utils.pytorch import load new_model = load("saved", model, weight_only=True) out1 = new_model(input) @@ -226,8 +241,6 @@ def test_RTN_int_quant(self): model_size1 = os.path.getsize("saved/best_model.pt") / 1024 print("FP32 Model size:{:.3f}M".format(model_size1)) - from neural_compressor.model import Model as INCModel - inc_model = INCModel(new_model) inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json") torch.save(inc_model.state_dict(), "saved/tmp.pt") @@ -528,7 +541,7 @@ def __iter__(self): ) q_model.save("saved") out1 = q_model.model(input) - compressed_model = q_model.export_compressed_model() + compressed_model = q_model.export_compressed_model(use_HF_format=True) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") 
self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) From 2326e159e0fa8fed363ff735884b6c89ea61c2a8 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 7 Nov 2023 22:11:29 +0800 Subject: [PATCH 02/23] fix g_idx Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 4138e8b1df1..adac2a6f4fc 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -339,6 +339,8 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if gptq_perm is not None: assert hasattr(self, "gptq_perm"), "gptq_perm is not set when initializing." self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) + if self.use_HF_format: + self.gptq_perm = self.gptq_perm // self.groupsize assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: From b6b98e3f8dcb4c5527f392fe00e46508e44bc79b Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 13:06:58 +0800 Subject: [PATCH 03/23] Prevent broken id links Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index adac2a6f4fc..9eb2f6446b7 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -255,11 +255,11 @@ def __init__( self.register_buffer( "scales", torch.zeros( - (out_features, math.ceil(in_features / self.groupsize)), + (math.ceil(in_features / self.groupsize), out_features), dtype=self.float_type, ).to(device), ) - self.scale = self.scales + self.scale = self.scales.T self.register_buffer( "qweight", torch.zeros( @@ -382,9 +382,22 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): self.packed_zp[:, j] |= tmp[:, e] if self.use_HF_format or self.compression_dim == 0: self.packed_zp = self.packed_zp.T + if self.use_HF_format: + self.scales = self.scale.T + self.qweight = self.packed_weight.T + self.g_idx = self.gptq_perm + if zp is not None: + self.qzeros = self.packed_zp.T def recover(self): logger.debug(f"Recovering {self} weight") + if self.use_HF_format: + # Prevent broken id links of self.scale and self.scales + self.scale = self.scales.T + self.packed_weight = self.qweight.T + self.gptq_perm = self.g_idx + if hasattr(self, "qzeros"): + self.packed_zp = self.qzeros.T device = self.scale.device mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) if hasattr(self, "packed_zp"): From fec9c19e3fd5974ea776100babca836f37c0038b Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 14:29:11 +0800 Subject: [PATCH 04/23] add sym qzero Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 9eb2f6446b7..40f152f6d33 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -268,15 +268,14 @@ def __init__( ).to(device), ) self.packed_weight = self.qweight.T - if 
zp: - self.register_buffer( - "qzeros", - torch.zeros( - (math.ceil(self.in_features / self.groupsize), math.ceil(self.out_features / self.n_pack)), - dtype=self.compressed_dtype, - ).to(device), - ) - self.packed_zp = self.qzeros.T + self.register_buffer( + "qzeros", + torch.zeros( + (math.ceil(self.in_features / self.groupsize), math.ceil(self.out_features / self.n_pack)), + dtype=self.compressed_dtype, + ).to(device), + ) + self.packed_zp = self.qzeros.T if gptq_perm: self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) else: @@ -333,6 +332,10 @@ def __init__( def pack(self, int_weight, scale, zp, bias, gptq_perm=None): int_weight = int_weight.to(self.device) + if self.use_HF_format and zp is None: + shift_bias = 2 ** (self.bits - 1) - 1 + int_weight += shift_bias + zp = torch.zeros_like(scale, dtype=torch.uint8) + shift_bias if bias is not None: assert hasattr(self, "bias"), "bias is not set when initializing." self.bias = bias.type(self.float_type).to(self.device) From 22e97de8c635b44b34e33a86901e9df6e91ef0c4 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 17:48:04 +0800 Subject: [PATCH 05/23] invert perm before compression Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 48 ++++--------------- neural_compressor/model/torch_model.py | 3 ++ 2 files changed, 13 insertions(+), 38 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 40f152f6d33..b94ec56627c 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -389,8 +389,7 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): self.scales = self.scale.T self.qweight = self.packed_weight.T self.g_idx = self.gptq_perm - if zp is not None: - self.qzeros = self.packed_zp.T + self.qzeros = self.packed_zp.T def recover(self): logger.debug(f"Recovering {self} weight") @@ -399,9 +398,12 @@ def recover(self): self.scale = self.scales.T self.packed_weight = self.qweight.T self.gptq_perm = self.g_idx - if hasattr(self, "qzeros"): - self.packed_zp = self.qzeros.T + self.packed_zp = self.qzeros.T device = self.scale.device + fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device) + if self.gptq_perm is None: + # used for recovering fp32_weight + self.gptq_perm = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) if hasattr(self, "packed_zp"): weight_dtype = torch.uint8 @@ -458,42 +460,12 @@ def recover(self): if self.use_HF_format: zp += 1 # recover fp32 weight with int_weight, scale, and zero_point - left_element = self.in_features % self.groupsize - if left_element != 0: - split_index = self.in_features // self.groupsize * self.groupsize - weight1 = weight[:, :-split_index].reshape(-1, self.groupsize) - scale1 = self.scale[:, :-1].reshape(-1, 1) - zp1 = zp[:, :-1].reshape(-1, 1) - weight1 = ((weight1 - zp1) * scale1).reshape(self.out_features, -1) - weight2 = weight[:, -split_index:] - scale2 = self.scale[:, -1:] - zp2 = zp[:, -1].reshape(-1, 1) - weight2 = (weight2 - zp2) * scale2 - fp32_weight = torch.cat((weight1, weight2), dim=1) - else: - weight = weight.reshape(-1, self.groupsize) - scale = self.scale.reshape(-1, 1) - zp = zp.reshape(-1, 1) - fp32_weight = ((weight - zp) * scale).reshape(self.out_features, -1) + for idx in range(self.in_features): + 
fp32_weight[:, idx] = weight[:, idx] * self.scale[:, self.gptq_perm[idx]] - zp[:, self.gptq_perm[idx]] else: # recover fp32 weight with int_weight, scale - left_element = self.in_features % self.groupsize - if left_element != 0: - split_index = self.in_features // self.groupsize * self.groupsize - weight1 = weight[:, :split_index].reshape(-1, self.groupsize) - scale1 = self.scale[:, :-1].reshape(-1, 1) - weight1 = (weight1 * scale1).reshape(self.out_features, -1) - weight2 = weight[:, split_index:] - scale2 = self.scale[:, -1:] - weight2 = weight2 * scale2 - fp32_weight = torch.cat((weight1, weight2), dim=1) - else: - weight = weight.reshape(-1, self.groupsize) - scale = self.scale.reshape(-1, 1) - fp32_weight = (weight * scale).reshape(self.out_features, -1) - if self.gptq_perm is not None: - invperm = torch.argsort(self.gptq_perm) - fp32_weight = fp32_weight[:, invperm] + for idx in range(self.in_features): + fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.gptq_perm[idx]] return fp32_weight def forward(self, input): diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index e546893e323..4e468815847 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -532,6 +532,9 @@ def export_compressed_model( gptq_scale = torch.tensor(gptq_conf["scale"]) gptq_zp = None if scheme == "sym" else torch.tensor(gptq_conf["zero"]) int_weight = quant_weight_w_scale(fp32_weight, gptq_scale, gptq_zp, group_size) + if "perm" in gptq_conf: + invperm = torch.argsort(gptq_perm) + int_weight = int_weight[:, invperm] new_module = WeightOnlyLinear( m.in_features, m.out_features, From db6782b7181380621b2943bebc276e0defc8a3a3 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 17:49:16 +0800 Subject: [PATCH 06/23] fix typo Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index b94ec56627c..9150f1a06a9 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -465,7 +465,7 @@ def recover(self): else: # recover fp32 weight with int_weight, scale for idx in range(self.in_features): - fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.gptq_perm[idx]] + fp32_weight[:, idx] = weight[:, idx] * self.scale[:, self.gptq_perm[idx]] return fp32_weight def forward(self, input): From 526fce95ecc98971ef8c805970605bf43686b215 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 19:14:38 +0800 Subject: [PATCH 07/23] fix bug in perm setting Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 9150f1a06a9..45e2ae1616f 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -342,8 +342,6 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if gptq_perm is not None: assert hasattr(self, "gptq_perm"), "gptq_perm is not set when initializing." 
self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) - if self.use_HF_format: - self.gptq_perm = self.gptq_perm // self.groupsize assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: @@ -404,6 +402,9 @@ def recover(self): if self.gptq_perm is None: # used for recovering fp32_weight self.gptq_perm = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) + else: + invperm = torch.argsort(self.gptq_perm) + self.gptq_perm = invperm // self.groupsize mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) if hasattr(self, "packed_zp"): weight_dtype = torch.uint8 @@ -461,7 +462,7 @@ def recover(self): zp += 1 # recover fp32 weight with int_weight, scale, and zero_point for idx in range(self.in_features): - fp32_weight[:, idx] = weight[:, idx] * self.scale[:, self.gptq_perm[idx]] - zp[:, self.gptq_perm[idx]] + fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.gptq_perm[idx]]) * self.scale[:, self.gptq_perm[idx]] else: # recover fp32 weight with int_weight, scale for idx in range(self.in_features): From 7429fa5b9dd452321c447da0f73f042be353822d Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 23:12:40 +0800 Subject: [PATCH 08/23] fix zero shift error Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 2 ++ neural_compressor/model/torch_model.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 45e2ae1616f..372c365dc1d 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -459,7 +459,9 @@ def recover(self): if self.use_HF_format or self.compression_dim == 0: zp = zp.T if self.use_HF_format: + # zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1 zp += 1 + zp = torch.where(zp > (2**self.bits - 1), 0, zp) # recover fp32 weight with int_weight, scale, and zero_point for idx in range(self.in_features): fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.gptq_perm[idx]]) * self.scale[:, self.gptq_perm[idx]] diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index 4e468815847..edf8dca1da9 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -496,6 +496,7 @@ def export_compressed_model( gptq_config = self.gptq_config if hasattr(self, "gptq_config") else {} if gptq_config: for k, v in weight_config.items(): + print(k) logger.debug(f"Compressing {k} on device {device}") if v["dtype"] == "fp32": continue @@ -529,8 +530,8 @@ def export_compressed_model( else: fp32_weight = m.weight.data gptq_perm = None - gptq_scale = torch.tensor(gptq_conf["scale"]) - gptq_zp = None if scheme == "sym" else torch.tensor(gptq_conf["zero"]) + gptq_scale = torch.tensor(gptq_conf["scale"], dtype=torch.float32) + gptq_zp = None if scheme == "sym" else torch.tensor(gptq_conf["zero"], dtype=torch.int32) int_weight = quant_weight_w_scale(fp32_weight, gptq_scale, gptq_zp, group_size) if "perm" in gptq_conf: invperm = torch.argsort(gptq_perm) From 22499e11e8b60a4de7d06580b59d5fc2bb53530c Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 23:33:31 +0800 Subject: [PATCH 09/23] fix reload state_dict bug Signed-off-by: Xin He --- 
neural_compressor/adaptor/torch_utils/model_wrapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 372c365dc1d..d10bdba9e74 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -342,6 +342,9 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if gptq_perm is not None: assert hasattr(self, "gptq_perm"), "gptq_perm is not set when initializing." self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) + if self.use_HF_format: + invperm = torch.argsort(self.gptq_perm) + self.gptq_perm = invperm // self.groupsize assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: @@ -402,9 +405,6 @@ def recover(self): if self.gptq_perm is None: # used for recovering fp32_weight self.gptq_perm = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) - else: - invperm = torch.argsort(self.gptq_perm) - self.gptq_perm = invperm // self.groupsize mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) if hasattr(self, "packed_zp"): weight_dtype = torch.uint8 From 550a4f9450ff93a48f89245093ce765802161895 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 23:44:11 +0800 Subject: [PATCH 10/23] enhance ut Signed-off-by: Xin He --- .../pytorch_adaptor/test_weight_only_adaptor.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 85b0bcfafa1..3c59d756846 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -534,6 +534,16 @@ def __iter__(self): # # case 2: list or tuple model_2 = copy.deepcopy(self.gptj) input = torch.ones([1, 512], dtype=torch.long) + conf.op_type_dict = { + ".*": { # re.match + "weight": { + "bits": 4, # 1-8 bits + "group_size": 8, # -1 (per-channel) + "scheme": "asym", + "algorithm": "GPTQ", + }, + }, + } q_model = quantization.fit( model_2, conf, @@ -556,7 +566,7 @@ def __iter__(self): ) q_model.save("saved") out1 = q_model.model(input) - compressed_model = q_model.export_compressed_model() + compressed_model = q_model.export_compressed_model(use_HF_format=True) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) From c59177f2da5be7d3c50959909679853bb04f4b1f Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 10:20:14 +0800 Subject: [PATCH 11/23] fix sym zeropoint Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 4 +++- neural_compressor/model/torch_model.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index d10bdba9e74..cb4fec7b1fe 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -333,7 +333,9 @@ def __init__( def pack(self, int_weight, scale, zp, bias, gptq_perm=None): int_weight = int_weight.to(self.device) if self.use_HF_format and zp is None: - shift_bias = 2 ** (self.bits - 
1) - 1 + # to avoid overflow + int_weight = int_weight.type(torch.int32) + shift_bias = 2 ** (self.bits - 1) int_weight += shift_bias zp = torch.zeros_like(scale, dtype=torch.uint8) + shift_bias if bias is not None: diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index edf8dca1da9..3dc0a3be11a 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -533,6 +533,7 @@ def export_compressed_model( gptq_scale = torch.tensor(gptq_conf["scale"], dtype=torch.float32) gptq_zp = None if scheme == "sym" else torch.tensor(gptq_conf["zero"], dtype=torch.int32) int_weight = quant_weight_w_scale(fp32_weight, gptq_scale, gptq_zp, group_size) + int_weight = int_weight.type(torch.int32) if "perm" in gptq_conf: invperm = torch.argsort(gptq_perm) int_weight = int_weight[:, invperm] From f65eb1e34cade7065145eb20131e94d0cf651a04 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 10:33:47 +0800 Subject: [PATCH 12/23] add dtype to g_idx Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index cb4fec7b1fe..78f8aff85b2 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -347,6 +347,7 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if self.use_HF_format: invperm = torch.argsort(self.gptq_perm) self.gptq_perm = invperm // self.groupsize + self.gptq_perm = self.gptq_perm.type(torch.int32).to(self.device) assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: From 9c7454a6a361d331ab8a19af3c89f80b7cf139ad Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 11:31:38 +0800 Subject: [PATCH 13/23] add export_compressed_model func for saved_dir Signed-off-by: Xin He --- neural_compressor/model/torch_model.py | 3 +- neural_compressor/utils/load_huggingface.py | 29 +++++++++++++++++++ .../test_weight_only_adaptor.py | 6 ++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index 3dc0a3be11a..ef7e891aed2 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -479,6 +479,8 @@ def export_compressed_model( 1: compression_dim: weight = 1, zeros = 0 and both are transposed. 2: zeros -= 1 before compression. Why we need it? 3: g_idx: use same number for one group instead of recording the channel order. + 4. parameter name changed, such as 'packed_weight' -> 'qweight'. + 5. zeros is always needed even for sym. 
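        A quick way to see these differences is to inspect the exported state_dict; the sketch below assumes `compressed_model` was produced by `export_compressed_model(..., use_HF_format=True)` and contains a weight-only quantized Linear registered as `fc1`.

```python
# Assumption: compressed_model came from export_compressed_model(..., use_HF_format=True)
# and "fc1" is one of its weight-only quantized Linear layers.
state_dict = compressed_model.state_dict()
assert "fc1.qweight" in state_dict  # transposed packed weight (was packed_weight)
assert "fc1.qzeros" in state_dict   # always present, even for sym schemes
assert "fc1.scales" in state_dict   # per-group scales (was scale)
# "fc1.g_idx" appears only when a GPTQ permutation (act-order) was recorded.
```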
""" from ..adaptor.torch_utils.model_wrapper import WeightOnlyLinear from ..adaptor.torch_utils.util import collect_weight_info, fetch_module, set_module @@ -496,7 +498,6 @@ def export_compressed_model( gptq_config = self.gptq_config if hasattr(self, "gptq_config") else {} if gptq_config: for k, v in weight_config.items(): - print(k) logger.debug(f"Compressing {k} on device {device}") if v["dtype"] == "fp32": continue diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index c68259a4abc..783a5dd2f5a 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -230,3 +230,32 @@ def save_for_huggingface_upstream(model, tokenizer, output_dir): model.model.config.architectures = [model.model.__class__.__name__] model.model.config.torch_dtype = "int8" model.model.config.save_pretrained(output_dir) + + +def export_compressed_model(model, saved_dir=None, use_HF_format=False): + """Support get compressed model from saved_dir. + + Args: + model (torch.nn.Module): origin fp32 model. + saved_dir (_type_, optional): the dir path of compression info. Defaults to None. + use_HF_format (bool, optional): whether use HuggingFace format. Defaults to False. + """ + stat_dict = os.path.join(saved_dir, "best_model.pt") + qweight_config_path = os.path.join(saved_dir, "qconfig.json") + gptq_config_path = os.path.join(saved_dir, "gptq_config.json") + model.load_state_dict(torch.load(stat_dict)) + + from neural_compressor.model import Model as INCModel + + inc_model = INCModel(model) + inc_model.export_compressed_model( + qweight_config_path=qweight_config_path, + enable_full_range=False, + compression_dtype=torch.int32, + compression_dim=1, + scale_dtype=torch.float32, + gptq_config_path=gptq_config_path, + device="cpu", + use_HF_format=use_HF_format, + ) + return inc_model.model diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 3c59d756846..95dea1e7647 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -9,6 +9,7 @@ from neural_compressor import PostTrainingQuantConfig, quantization from neural_compressor.adaptor.torch_utils.model_wrapper import MulLinear, WeightOnlyLinear from neural_compressor.model import Model as INCModel +from neural_compressor.utils.load_huggingface import export_compressed_model from neural_compressor.utils.pytorch import load @@ -102,6 +103,11 @@ def test_RTN_int_quant(self): self.assertTrue("fc1.qweight" in inc_model.model.state_dict().keys()) q_weight2 = inc_model.model.state_dict()["fc1.qweight"] self.assertTrue(torch.all(q_weight1.T == q_weight2)) + model = Model() + compressed_model = export_compressed_model(model, saved_dir="saved", use_HF_format=True) + self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys()) + q_weight2 = compressed_model.state_dict()["fc1.qweight"] + self.assertTrue(torch.all(q_weight1.T == q_weight2)) self.assertTrue(torch.all(out3 == out4)) model = Model() From 8836f251cf98a4c99c2267ad5a149522310061b5 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 13:25:16 +0800 Subject: [PATCH 14/23] fix UT Signed-off-by: Xin He --- test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 95dea1e7647..692ce82533b 
100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -91,7 +91,6 @@ def test_RTN_int_quant(self): compressed_model = q_model.export_compressed_model() out3 = compressed_model(input) self.assertTrue("fc1.packed_weight" in compressed_model.state_dict().keys()) - q_weight1 = compressed_model.state_dict()["fc1.packed_weight"] self.assertTrue(torch.all(out3 == out2)) # test huggingface popular int4 format @@ -101,13 +100,9 @@ def test_RTN_int_quant(self): inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_HF_format=True) out4 = inc_model.model(input) self.assertTrue("fc1.qweight" in inc_model.model.state_dict().keys()) - q_weight2 = inc_model.model.state_dict()["fc1.qweight"] - self.assertTrue(torch.all(q_weight1.T == q_weight2)) model = Model() compressed_model = export_compressed_model(model, saved_dir="saved", use_HF_format=True) self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys()) - q_weight2 = compressed_model.state_dict()["fc1.qweight"] - self.assertTrue(torch.all(q_weight1.T == q_weight2)) self.assertTrue(torch.all(out3 == out4)) model = Model() From cbdce1527e94c7005e0bd4b68ef6fd7772e771e0 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 13:34:08 +0800 Subject: [PATCH 15/23] ignore pylint Signed-off-by: Xin He --- neural_compressor/utils/load_huggingface.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index 783a5dd2f5a..9d153d9d006 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -247,6 +247,7 @@ def export_compressed_model(model, saved_dir=None, use_HF_format=False): from neural_compressor.model import Model as INCModel + # pylint: disable=E1101 inc_model = INCModel(model) inc_model.export_compressed_model( qweight_config_path=qweight_config_path, From f2de9c6ec6447501c9fb2dd0f876af147e5213b3 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 13:57:43 +0800 Subject: [PATCH 16/23] fix bug Signed-off-by: Xin He --- neural_compressor/utils/load_huggingface.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index 9d153d9d006..48ce6bc6a97 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -243,6 +243,8 @@ def export_compressed_model(model, saved_dir=None, use_HF_format=False): stat_dict = os.path.join(saved_dir, "best_model.pt") qweight_config_path = os.path.join(saved_dir, "qconfig.json") gptq_config_path = os.path.join(saved_dir, "gptq_config.json") + if not os.path.exists(gptq_config_path): + gptq_config_path = None model.load_state_dict(torch.load(stat_dict)) from neural_compressor.model import Model as INCModel From c0652e32ef030ecae354d54be9e750a8f0f5676f Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 14:22:05 +0800 Subject: [PATCH 17/23] add document Signed-off-by: Xin He --- docs/source/quantization_weight_only.md | 21 +++++++++++---- neural_compressor/utils/load_huggingface.py | 30 ++++++++++++++++----- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md index 2786dd9d1ec..ce48c0a6df3 100644 --- a/docs/source/quantization_weight_only.md +++ b/docs/source/quantization_weight_only.md @@ -96,6 +96,15 @@ To support low memory 
inference, Neural Compressor implemented WeightOnlyLinear, | compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64] | | compression_dim | 1 | 0 means output channel while 1 means input channel | | scale_dtype | torch.float32 | Data type for scale and bias | +| use_HF_format | False | Whether to use the popular format present on HuggingFace hub | + +**Note:** HuggingFace format is quite special, the main differences are as follows: + +> 1: Compression Dimension: weight = 1, zero = 0 and both are transposed. +> 2: Zero Point: zero_point-= 1 before compression. zero_point is always required even for sym. +> 3: Group Index: Use the same number for a group instead of recording channel order. +> 4. Parameter Name: `packed_weight` -> `qweight`; `packed_zp` -> `qzeros`; `gptq_perm` -> `g_idx`; `scale` -> `scales`. + ### **User Code Example** ```python @@ -119,12 +128,14 @@ conf = PostTrainingQuantConfig( ) q_model = quantization.fit(model, conf, eval_func=eval_func) q_model.save("saved_results") -compressed_model = q_model.export_compressed_model( - compression_dtype=torch.int32, - compression_dim=1, - scale_dtype=torch.float16, -) +compressed_model = q_model.export_compressed_model() torch.save(compressed_model.state_dict(), "compressed_model.pt") +# or +model = Model() +compressed_model = export_compressed_model( + model, + saved_dir="saved_results", +) ``` The saved_results folder contains two files: `best_model.pt` and `qconfig.json`, and the generated q_model is a fake quantized model. diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index 48ce6bc6a97..f7378d90998 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -232,13 +232,31 @@ def save_for_huggingface_upstream(model, tokenizer, output_dir): model.model.config.save_pretrained(output_dir) -def export_compressed_model(model, saved_dir=None, use_HF_format=False): +def export_compressed_model( + model, + saved_dir=None, + use_HF_format=False, + enable_full_range=False, + compression_dtype=torch.int32, + compression_dim=1, + scale_dtype=torch.float32, + device="cpu", +): """Support get compressed model from saved_dir. Args: model (torch.nn.Module): origin fp32 model. saved_dir (_type_, optional): the dir path of compression info. Defaults to None. use_HF_format (bool, optional): whether use HuggingFace format. Defaults to False. + enable_full_range (bool, optional): Whether to leverage the full compression range + under symmetric quantization. Defaults to False. + compression_dtype (torch.Tensor, optional): The target dtype after comoression. + Defaults to torch.int32. + compression_dim (int, optional): Select from [0, 1], 0 is output channel, + 1 is input channel. Defaults to 1. + scale_dtype (torch.Tensor, optional): Use float32 or float16. + Defaults to torch.float32. + device (str, optional): choose device for compression. Defaults to cpu. 
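    A usage sketch of this helper, in line with the test and the updated documentation example; `Model()` stands in for the original fp32 architecture and `saved_results` for the directory written by `q_model.save()`.

```python
import torch

from neural_compressor.utils.load_huggingface import export_compressed_model

fp32_model = Model()  # placeholder: the original fp32 architecture
compressed_model = export_compressed_model(
    fp32_model,
    saved_dir="saved_results",  # expects best_model.pt and qconfig.json; gptq_config.json is optional
    use_HF_format=True,
)
torch.save(compressed_model.state_dict(), "saved_results/compressed_model.pt")
```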
""" stat_dict = os.path.join(saved_dir, "best_model.pt") qweight_config_path = os.path.join(saved_dir, "qconfig.json") @@ -253,12 +271,12 @@ def export_compressed_model(model, saved_dir=None, use_HF_format=False): inc_model = INCModel(model) inc_model.export_compressed_model( qweight_config_path=qweight_config_path, - enable_full_range=False, - compression_dtype=torch.int32, - compression_dim=1, - scale_dtype=torch.float32, + enable_full_range=enable_full_range, + compression_dtype=compression_dtype, + compression_dim=compression_dim, + scale_dtype=scale_dtype, gptq_config_path=gptq_config_path, - device="cpu", + device=device, use_HF_format=use_HF_format, ) return inc_model.model From ca785c8ae7939abfe9eb9f6557de5c88c7a3efe3 Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 10 Nov 2023 15:48:31 +0800 Subject: [PATCH 18/23] abandon old param names Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 117 +++++++++--------- neural_compressor/model/torch_model.py | 2 +- .../test_weight_only_adaptor.py | 10 +- 3 files changed, 63 insertions(+), 66 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 78f8aff85b2..04baa05590d 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -215,7 +215,7 @@ def __init__( scale_dtype=torch.float32, compression_dtype=torch.int32, compression_dim=1, - gptq_perm=False, + g_idx=False, device="cpu", use_HF_format=False, ): @@ -259,7 +259,7 @@ def __init__( dtype=self.float_type, ).to(device), ) - self.scale = self.scales.T + self.scales = self.scales.T self.register_buffer( "qweight", torch.zeros( @@ -267,7 +267,7 @@ def __init__( dtype=self.compressed_dtype, ).to(device), ) - self.packed_weight = self.qweight.T + self.qweight = self.qweight.T self.register_buffer( "qzeros", torch.zeros( @@ -275,15 +275,10 @@ def __init__( dtype=self.compressed_dtype, ).to(device), ) - self.packed_zp = self.qzeros.T - if gptq_perm: - self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) - else: - self.g_idx = None - self.gptq_perm = self.g_idx + self.qzeros = self.qzeros.T else: self.register_buffer( - "scale", + "scales", torch.zeros( (out_features, math.ceil(in_features / self.groupsize)), dtype=self.float_type, @@ -291,7 +286,7 @@ def __init__( ) if compression_dim == 1: self.register_buffer( - "packed_weight", + "qweight", torch.zeros( (out_features, math.ceil(in_features / self.n_pack)), dtype=self.compressed_dtype, @@ -299,7 +294,7 @@ def __init__( ) if zp: self.register_buffer( - "packed_zp", + "qzeros", torch.zeros( (self.out_features, math.ceil(self.in_features / self.groupsize / self.n_pack)), dtype=self.compressed_dtype, @@ -307,7 +302,7 @@ def __init__( ) else: self.register_buffer( - "packed_weight", + "qweight", torch.zeros( (math.ceil(out_features / self.n_pack), in_features), dtype=self.compressed_dtype, @@ -315,22 +310,22 @@ def __init__( ) if zp: self.register_buffer( - "packed_zp", + "qzeros", torch.zeros( (math.ceil(self.out_features / self.n_pack), math.ceil(self.in_features / self.groupsize)), dtype=self.compressed_dtype, ).to(device), ) - if gptq_perm: - self.register_buffer("gptq_perm", torch.zeros(in_features, dtype=torch.int32).to(device)) - else: - self.gptq_perm = None + if g_idx: + self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) + else: + self.g_idx = None if bias: self.register_buffer("bias", 
torch.zeros(self.out_features, dtype=self.float_type).to(device)) else: self.bias = None - def pack(self, int_weight, scale, zp, bias, gptq_perm=None): + def pack(self, int_weight, scale, zp, bias, g_idx=None): int_weight = int_weight.to(self.device) if self.use_HF_format and zp is None: # to avoid overflow @@ -341,20 +336,20 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if bias is not None: assert hasattr(self, "bias"), "bias is not set when initializing." self.bias = bias.type(self.float_type).to(self.device) - if gptq_perm is not None: - assert hasattr(self, "gptq_perm"), "gptq_perm is not set when initializing." - self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) + if g_idx is not None: + assert hasattr(self, "g_idx"), "g_idx is not set when initializing." + self.g_idx = g_idx.type(torch.int32).to(self.device) if self.use_HF_format: - invperm = torch.argsort(self.gptq_perm) - self.gptq_perm = invperm // self.groupsize - self.gptq_perm = self.gptq_perm.type(torch.int32).to(self.device) - assert scale.shape == self.scale.shape, "Scale shape is mismatched." - self.scale = scale.type(self.float_type).to(self.device) + invperm = torch.argsort(self.g_idx) + self.g_idx = invperm // self.groupsize + self.g_idx = self.g_idx.type(torch.int32).to(self.device) + assert scale.shape == self.scales.shape, "Scale shape is mismatched." + self.scales = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: int_weight = int_weight.T - self.packed_weight = self.packed_weight.T + self.qweight = self.qweight.T origin_shape = int_weight.shape - target_shape = self.packed_weight.shape + target_shape = self.qweight.shape assert origin_shape[0] == target_shape[0], "output channels mismatch, please check." mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(self.device) @@ -366,9 +361,9 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): for e in range(tmp.shape[1]): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) - self.packed_weight[:, j] |= tmp[:, e] + self.qweight[:, j] |= tmp[:, e] if not self.use_HF_format and self.compression_dim == 0: - self.packed_weight = self.packed_weight.T + self.qweight = self.qweight.T if zp is not None: zp = zp.to(self.device) @@ -376,9 +371,9 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): zp -= 1 if self.use_HF_format or self.compression_dim == 0: zp = zp.T - self.packed_zp = self.packed_zp.T - assert hasattr(self, "packed_zp"), "zp is not set when initializing." - target_shape = self.packed_zp.shape + self.qzeros = self.qzeros.T + assert hasattr(self, "qzeros"), "zp is not set when initializing." 
+ target_shape = self.qzeros.shape for j in range(target_shape[1]): start = self.n_pack * j end = self.n_pack * (j + 1) @@ -386,47 +381,47 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): for e in range(tmp.shape[1]): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) - self.packed_zp[:, j] |= tmp[:, e] + self.qzeros[:, j] |= tmp[:, e] if self.use_HF_format or self.compression_dim == 0: - self.packed_zp = self.packed_zp.T + self.qzeros = self.qzeros.T if self.use_HF_format: - self.scales = self.scale.T - self.qweight = self.packed_weight.T - self.g_idx = self.gptq_perm - self.qzeros = self.packed_zp.T + self.scales = self.scales.T + self.qweight = self.qweight.T + self.g_idx = self.g_idx + self.qzeros = self.qzeros.T def recover(self): logger.debug(f"Recovering {self} weight") if self.use_HF_format: - # Prevent broken id links of self.scale and self.scales - self.scale = self.scales.T - self.packed_weight = self.qweight.T - self.gptq_perm = self.g_idx - self.packed_zp = self.qzeros.T - device = self.scale.device + # Prevent broken id links of self.scales and self.scales + self.scales = self.scales.T + self.qweight = self.qweight.T + self.g_idx = self.g_idx + self.qzeros = self.qzeros.T + device = self.scales.device fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device) - if self.gptq_perm is None: + if self.g_idx is None: # used for recovering fp32_weight - self.gptq_perm = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) + self.g_idx = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) - if hasattr(self, "packed_zp"): + if hasattr(self, "qzeros"): weight_dtype = torch.uint8 else: weight_dtype = torch.int8 # unpack weight weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device) - packed_weight = self.packed_weight + qweight = self.qweight if not self.use_HF_format and self.compression_dim == 0: weight = weight.T - packed_weight = packed_weight.T + qweight = qweight.T origin_shape = weight.shape - target_shape = packed_weight.shape + target_shape = qweight.shape for j in range(target_shape[1]): for e in range(self.n_pack): index = j * self.n_pack + e if index >= origin_shape[1]: continue - tmp = packed_weight[:, j] + tmp = qweight[:, j] tmp = tmp << (self.compress_bits - self.bits * (e + 1)) tmp = tmp >> self.compress_bits - self.bits if weight_dtype == torch.uint8: @@ -440,21 +435,21 @@ def recover(self): new_weight += torch.where(weight == k, v, 0) weight = new_weight # unpack zero_point - if hasattr(self, "packed_zp"): + if hasattr(self, "qzeros"): zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp - zp = torch.zeros(self.scale.shape, dtype=zp_dtype).to(device) - packed_zp = self.packed_zp + zp = torch.zeros(self.scales.shape, dtype=zp_dtype).to(device) + qzeros = self.qzeros if self.use_HF_format or self.compression_dim == 0: zp = zp.T - packed_zp = packed_zp.T + qzeros = qzeros.T origin_shape = zp.shape - target_shape = packed_zp.shape + target_shape = qzeros.shape for j in range(target_shape[1]): for e in range(self.n_pack): index = j * self.n_pack + e if index >= origin_shape[1]: continue - tmp = packed_zp[:, j] + tmp = qzeros[:, j] tmp = tmp << (self.compress_bits - self.bits * (e + 1)) tmp = tmp >> self.compress_bits - self.bits tmp &= mask @@ -467,11 +462,11 @@ def recover(self): zp = torch.where(zp > (2**self.bits - 
1), 0, zp)
             # recover fp32 weight with int_weight, scale, and zero_point
             for idx in range(self.in_features):
-                fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.gptq_perm[idx]]) * self.scale[:, self.gptq_perm[idx]]
+                fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * self.scales[:, self.g_idx[idx]]
         else:
             # recover fp32 weight with int_weight, scale
             for idx in range(self.in_features):
-                fp32_weight[:, idx] = weight[:, idx] * self.scale[:, self.gptq_perm[idx]]
+                fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.g_idx[idx]]
         return fp32_weight
 
     def forward(self, input):
diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py
index ef7e891aed2..907646fa30d 100644
--- a/neural_compressor/model/torch_model.py
+++ b/neural_compressor/model/torch_model.py
@@ -546,7 +546,7 @@ def export_compressed_model(
                     dtype=dtype,
                     zp=gptq_zp is not None,
                     bias=m.bias is not None,
-                    gptq_perm=gptq_perm is not None,
+                    g_idx=gptq_perm is not None,
                     compression_dtype=compression_dtype,
                     compression_dim=compression_dim,
                     scale_dtype=scale_dtype,
diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
index 692ce82533b..57e5ebb368b 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -16,7 +16,7 @@ class Model(torch.nn.Module):
     def __init__(self):
         super(Model, self).__init__()
-        self.fc1 = torch.nn.Linear(30, 50)
+        self.fc1 = torch.nn.Linear(30, 50, bias=True)
         self.fc2 = torch.nn.Linear(50, 30)
         self.fc3 = torch.nn.Linear(30, 5)
@@ -90,7 +90,9 @@ def test_RTN_int_quant(self):
         self.assertFalse(torch.all(out1 == out2))
         compressed_model = q_model.export_compressed_model()
         out3 = compressed_model(input)
-        self.assertTrue("fc1.packed_weight" in compressed_model.state_dict().keys())
+        self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys())
+        self.assertTrue("fc1.qzeros" not in compressed_model.state_dict().keys())
+        shape2 = compressed_model.state_dict()["fc1.scales"]
         self.assertTrue(torch.all(out3 == out2))
 
         # test huggingface popular int4 format
@@ -99,10 +101,10 @@ def test_RTN_int_quant(self):
         inc_model = INCModel(new_model)
         inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_HF_format=True)
         out4 = inc_model.model(input)
-        self.assertTrue("fc1.qweight" in inc_model.model.state_dict().keys())
+        self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys())
         model = Model()
         compressed_model = export_compressed_model(model, saved_dir="saved", use_HF_format=True)
-        self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys())
+        self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys())
         self.assertTrue(torch.all(out3 == out4))
 
         model = Model()

From ed07108ec6bdb9ef1d4a1989f36d7ffa5e7b8cc2 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Fri, 10 Nov 2023 15:51:53 +0800
Subject: [PATCH 19/23] remove useless code

Signed-off-by: Xin He
---
 test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
index 57e5ebb368b..01a63352f00 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -729,7 +729,6 @@ def __iter__(self):
             calib_dataloader=dataloader,
         )
         out2 = q_model.model(input)
-        print(out1[0] - out2[0])
         self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01))

From 468902ab7c375893738b7942166fff347889b124 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Mon, 13 Nov 2023 09:39:46 +0800
Subject: [PATCH 20/23] remove useless doc

Signed-off-by: Xin He
---
 docs/source/quantization_weight_only.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md
index ce48c0a6df3..6d208d69037 100644
--- a/docs/source/quantization_weight_only.md
+++ b/docs/source/quantization_weight_only.md
@@ -103,7 +103,6 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear,
 > 1: Compression Dimension: weight = 1, zero = 0 and both are transposed.
 > 2: Zero Point: zero_point-= 1 before compression. zero_point is always required even for sym.
 > 3: Group Index: Use the same number for a group instead of recording channel order.
-> 4. Parameter Name: `packed_weight` -> `qweight`; `packed_zp` -> `qzeros`; `gptq_perm` -> `g_idx`; `scale` -> `scales`.
 
 ### **User Code Example**

From dc6f51c846b11f328b32908575dc99bb33c5369f Mon Sep 17 00:00:00 2001
From: Xin He
Date: Tue, 14 Nov 2023 21:19:18 +0800
Subject: [PATCH 21/23] fix ut

Signed-off-by: Xin He
---
 test/model/test_model_pytorch.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/model/test_model_pytorch.py b/test/model/test_model_pytorch.py
index 7b42ef63729..0dd2fdead84 100644
--- a/test/model/test_model_pytorch.py
+++ b/test/model/test_model_pytorch.py
@@ -123,7 +123,7 @@ def test_WeightOnlyLinear(self):
         model_size2 = os.path.getsize("saved/tmp.pt") / 1024
         print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2))
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
-        self.assertTrue(inc_model.model.fc1.packed_weight.dtype == dtype)
+        self.assertTrue(inc_model.model.fc1.qweight.dtype == dtype)
         self.assertTrue(inc_model.model.fc1.scale.dtype == torch.float32)
         self.assertTrue(model_size1 / model_size2 > 2)
         self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))
@@ -143,9 +143,9 @@ def test_WeightOnlyLinear(self):
         print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2))
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
         if dim == 1:
-            self.assertTrue(inc_model.model.fc1.packed_weight.shape[0] == inc_model.model.fc1.out_features)
+            self.assertTrue(inc_model.model.fc1.qweight.shape[0] == inc_model.model.fc1.out_features)
         else:
-            self.assertTrue(inc_model.model.fc1.packed_weight.shape[1] == inc_model.model.fc1.in_features)
+            self.assertTrue(inc_model.model.fc1.qweight.shape[1] == inc_model.model.fc1.in_features)
         self.assertTrue(model_size1 / model_size2 > 2)
         self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))

From 340cf597cb2bfc983e82f0bc541cfa5765a700b1 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Wed, 15 Nov 2023 09:51:20 +0800
Subject: [PATCH 22/23] fix ut

Signed-off-by: Xin He
---
 test/model/test_model_pytorch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/model/test_model_pytorch.py b/test/model/test_model_pytorch.py
index 0dd2fdead84..05edfd9c6fb 100644
--- a/test/model/test_model_pytorch.py
+++ b/test/model/test_model_pytorch.py
@@ -124,7 +124,7 @@ def test_WeightOnlyLinear(self):
         print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2))
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
         self.assertTrue(inc_model.model.fc1.qweight.dtype == dtype)
-        self.assertTrue(inc_model.model.fc1.scale.dtype == torch.float32)
+        self.assertTrue(inc_model.model.fc1.scales.dtype == torch.float32)
         self.assertTrue(model_size1 / model_size2 > 2)
         self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))
@@ -161,7 +161,7 @@ def test_WeightOnlyLinear(self):
         model_size2 = os.path.getsize("saved/tmp.pt") / 1024
         print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2))
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
-        self.assertTrue(inc_model.model.fc1.scale.dtype == torch.float16)
+        self.assertTrue(inc_model.model.fc1.scales.dtype == torch.float16)
         self.assertTrue(model_size1 / model_size2 > 2)
         self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))

From 2e4e6b42b07c619806db9a9737d32fb99af72164 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Thu, 16 Nov 2023 10:44:09 +0800
Subject: [PATCH 23/23] rename use_HF_format to use_hf_format

Signed-off-by: Xin He
---
 docs/source/quantization_weight_only.md     |  2 +-
 .../adaptor/torch_utils/model_wrapper.py    | 38 +++++++++----------
 .../adaptor/torch_utils/weight_only.py      |  4 +-
 neural_compressor/model/torch_model.py      | 10 ++---
 neural_compressor/utils/load_huggingface.py |  6 +--
 .../test_weight_only_adaptor.py             |  8 ++--
 6 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md
index 6d208d69037..b1ab86d1fd5 100644
--- a/docs/source/quantization_weight_only.md
+++ b/docs/source/quantization_weight_only.md
@@ -96,7 +96,7 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear,
 | compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64] |
 | compression_dim | 1 | 0 means output channel while 1 means input channel |
 | scale_dtype | torch.float32 | Data type for scale and bias |
-| use_HF_format | False | Whether to use the popular format present on HuggingFace hub |
+| use_hf_format | False | Whether to use the popular format present on HuggingFace hub |
 
 **Note:** HuggingFace format is quite special, the main differences are as follows:
diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py
index 04baa05590d..57103566d9d 100644
--- a/neural_compressor/adaptor/torch_utils/model_wrapper.py
+++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py
@@ -217,10 +217,10 @@ def __init__(
         compression_dim=1,
         g_idx=False,
         device="cpu",
-        use_HF_format=False,
+        use_hf_format=False,
     ):
         super().__init__()
-        self.use_HF_format = use_HF_format
+        self.use_hf_format = use_hf_format
         self.dtype = dtype
         if "int" not in self.dtype:  # for nf4, fp4
             from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING
@@ -251,7 +251,7 @@ def __init__(
         assert compression_dim in [0, 1], (
             "Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel."
         )
-        if self.use_HF_format:
+        if self.use_hf_format:
             self.register_buffer(
                 "scales",
                 torch.zeros(
@@ -327,7 +327,7 @@ def __init__(
 
     def pack(self, int_weight, scale, zp, bias, g_idx=None):
         int_weight = int_weight.to(self.device)
-        if self.use_HF_format and zp is None:
+        if self.use_hf_format and zp is None:
             # to avoid overflow
             int_weight = int_weight.type(torch.int32)
             shift_bias = 2 ** (self.bits - 1)
@@ -339,13 +339,13 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
         if g_idx is not None:
             assert hasattr(self, "g_idx"), "g_idx is not set when initializing."
             self.g_idx = g_idx.type(torch.int32).to(self.device)
-            if self.use_HF_format:
+            if self.use_hf_format:
                 invperm = torch.argsort(self.g_idx)
                 self.g_idx = invperm // self.groupsize
                 self.g_idx = self.g_idx.type(torch.int32).to(self.device)
         assert scale.shape == self.scales.shape, "Scale shape is mismatched."
         self.scales = scale.type(self.float_type).to(self.device)
-        if not self.use_HF_format and self.compression_dim == 0:
+        if not self.use_hf_format and self.compression_dim == 0:
             int_weight = int_weight.T
             self.qweight = self.qweight.T
         origin_shape = int_weight.shape
@@ -362,14 +362,14 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
                 tmp[:, e] &= mask
                 tmp[:, e] = tmp[:, e] << (self.bits * e)
                 self.qweight[:, j] |= tmp[:, e]
-        if not self.use_HF_format and self.compression_dim == 0:
+        if not self.use_hf_format and self.compression_dim == 0:
             self.qweight = self.qweight.T
 
         if zp is not None:
             zp = zp.to(self.device)
-            if self.use_HF_format:
+            if self.use_hf_format:
                 zp -= 1
-            if self.use_HF_format or self.compression_dim == 0:
+            if self.use_hf_format or self.compression_dim == 0:
                 zp = zp.T
                 self.qzeros = self.qzeros.T
             assert hasattr(self, "qzeros"), "zp is not set when initializing."
@@ -382,9 +382,9 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
                     tmp[:, e] &= mask
                     tmp[:, e] = tmp[:, e] << (self.bits * e)
                     self.qzeros[:, j] |= tmp[:, e]
-            if self.use_HF_format or self.compression_dim == 0:
+            if self.use_hf_format or self.compression_dim == 0:
                 self.qzeros = self.qzeros.T
-        if self.use_HF_format:
+        if self.use_hf_format:
             self.scales = self.scales.T
             self.qweight = self.qweight.T
             self.g_idx = self.g_idx
@@ -392,7 +392,7 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
 
     def recover(self):
         logger.debug(f"Recovering {self} weight")
-        if self.use_HF_format:
+        if self.use_hf_format:
             # Prevent broken id links of self.scales and self.scales
             self.scales = self.scales.T
             self.qweight = self.qweight.T
@@ -411,7 +411,7 @@ def recover(self):
         # unpack weight
         weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device)
         qweight = self.qweight
-        if not self.use_HF_format and self.compression_dim == 0:
+        if not self.use_hf_format and self.compression_dim == 0:
             weight = weight.T
             qweight = qweight.T
         origin_shape = weight.shape
@@ -427,7 +427,7 @@ def recover(self):
                 if weight_dtype == torch.uint8:
                     tmp &= mask  # remove sign bit
                 weight[:, index] = tmp.type(weight_dtype)
-        if not self.use_HF_format and self.compression_dim == 0:
+        if not self.use_hf_format and self.compression_dim == 0:
             weight = weight.T
         if "int" not in self.dtype:
             new_weight = torch.zeros(self.out_features, self.in_features).to(device)
@@ -439,7 +439,7 @@ def recover(self):
             zp_dtype = self.compressed_dtype  # to avoid overflow when weight-zp
             zp = torch.zeros(self.scales.shape, dtype=zp_dtype).to(device)
             qzeros = self.qzeros
-            if self.use_HF_format or self.compression_dim == 0:
+            if self.use_hf_format or self.compression_dim == 0:
                 zp = zp.T
                 qzeros = qzeros.T
             origin_shape = zp.shape
@@ -454,9 +454,9 @@ def recover(self):
                     tmp = tmp >> self.compress_bits - self.bits
                     tmp &= mask
                     zp[:, index] = tmp.type(zp_dtype)
-            if self.use_HF_format or self.compression_dim == 0:
+            if self.use_hf_format or self.compression_dim == 0:
                 zp = zp.T
-            if self.use_HF_format:
+            if self.use_hf_format:
                 # zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1
                 zp += 1
                 zp = torch.where(zp > (2**self.bits - 1), 0, zp)
@@ -489,8 +489,8 @@ def extra_repr(self) -> str:
             self.groupsize,
             self.bias is not None,
         )
-        if self.use_HF_format:
-            tmp_str += ", use_HF_format=True"
+        if self.use_hf_format:
+            tmp_str += ", use_hf_format=True"
         return tmp_str
 
diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py
index 37a810a0428..f866ac12410 100644
--- a/neural_compressor/adaptor/torch_utils/weight_only.py
+++ b/neural_compressor/adaptor/torch_utils/weight_only.py
@@ -396,7 +396,7 @@ def rtn_quantize(
     compression_dim = kwargs.get("compression_dim", 1)
    scale_dtype = kwargs.get("scale_dtype", torch.float32)
    device = kwargs.get("device", "cpu")
-    use_HF_format = kwargs.get("use_HF_format", False)
+    use_hf_format = kwargs.get("use_hf_format", False)
    for name, m in model.named_modules():
        if m.__class__.__name__ not in supported_layers:
            continue
@@ -449,7 +449,7 @@ def rtn_quantize(
                compression_dim=compression_dim,
                scale_dtype=scale_dtype,
                device=device,
-                use_HF_format=use_HF_format,
+                use_hf_format=use_hf_format,
            )
            new_module.pack(int_weight, scale, zp, m.bias)
            if name == "":
diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py
index 907646fa30d..fb7046a1607 100644
--- a/neural_compressor/model/torch_model.py
+++ b/neural_compressor/model/torch_model.py
@@ -459,7 +459,7 @@ def export_compressed_model(
         scale_dtype=torch.float32,
         gptq_config_path=None,
         device="cpu",
-        use_HF_format=False,
+        use_hf_format=False,
     ):
         """Convert Linear to WeightOnlyLinear for low memory inference.
 
@@ -475,7 +475,7 @@ def export_compressed_model(
                 Defaults to torch.float32.
             gptq_config_path (str, optional): Path of gptq_config.json. Defaults to None.
             device (str, optional): choose device for compression. Defaults to cpu.
-            use_HF_format (bool, optional): use the popular huggingface compression format.
+            use_hf_format (bool, optional): use the popular huggingface compression format.
                 1: compression_dim: weight = 1, zeros = 0 and both are transposed.
                 2: zeros -= 1 before compression. Why we need it?
                 3: g_idx: use same number for one group instead of recording the channel order.
@@ -520,7 +520,7 @@ def export_compressed_model(
                     compression_dim=compression_dim,
                     scale_dtype=scale_dtype,
                     device=device,
-                    use_HF_format=use_HF_format,
+                    use_hf_format=use_hf_format,
                 )
                 set_module(self.model, k, new_module)
                 continue
@@ -551,7 +551,7 @@ def export_compressed_model(
                     compression_dim=compression_dim,
                     scale_dtype=scale_dtype,
                     device=device,
-                    use_HF_format=use_HF_format,
+                    use_hf_format=use_hf_format,
                 )
                 new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
                 set_module(self.model, k, new_module)
@@ -578,7 +578,7 @@ def export_compressed_model(
                     compression_dim=compression_dim,
                     scale_dtype=scale_dtype,
                     device=device,
-                    use_HF_format=use_HF_format,
+                    use_hf_format=use_hf_format,
                 )
                 set_module(self.model, k, mod)
         return self.model
diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py
index f7378d90998..fff4c050603 100644
--- a/neural_compressor/utils/load_huggingface.py
+++ b/neural_compressor/utils/load_huggingface.py
@@ -235,7 +235,7 @@ def save_for_huggingface_upstream(model, tokenizer, output_dir):
 def export_compressed_model(
     model,
     saved_dir=None,
-    use_HF_format=False,
+    use_hf_format=False,
     enable_full_range=False,
     compression_dtype=torch.int32,
     compression_dim=1,
@@ -247,7 +247,7 @@ def export_compressed_model(
     Args:
         model (torch.nn.Module): origin fp32 model.
         saved_dir (_type_, optional): the dir path of compression info. Defaults to None.
-        use_HF_format (bool, optional): whether use HuggingFace format. Defaults to False.
+        use_hf_format (bool, optional): whether use HuggingFace format. Defaults to False.
         enable_full_range (bool, optional): Whether to leverage the full compression range
                                             under symmetric quantization. Defaults to False.
         compression_dtype (torch.Tensor, optional): The target dtype after comoression.
@@ -277,6 +277,6 @@ def export_compressed_model(
         scale_dtype=scale_dtype,
         gptq_config_path=gptq_config_path,
         device=device,
-        use_HF_format=use_HF_format,
+        use_hf_format=use_hf_format,
     )
     return inc_model.model
diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
index 01a63352f00..47202b86b52 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -99,11 +99,11 @@ def test_RTN_int_quant(self):
         model = Model()
         new_model = load("saved", model, weight_only=True)
         inc_model = INCModel(new_model)
-        inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_HF_format=True)
+        inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_hf_format=True)
         out4 = inc_model.model(input)
         self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys())
         model = Model()
-        compressed_model = export_compressed_model(model, saved_dir="saved", use_HF_format=True)
+        compressed_model = export_compressed_model(model, saved_dir="saved", use_hf_format=True)
         self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys())
         self.assertTrue(torch.all(out3 == out4))
 
@@ -554,7 +554,7 @@ def __iter__(self):
         )
         q_model.save("saved")
         out1 = q_model.model(input)
-        compressed_model = q_model.export_compressed_model(use_HF_format=True)
+        compressed_model = q_model.export_compressed_model(use_hf_format=True)
         out2 = compressed_model(input)
         torch.save(compressed_model.state_dict(), "saved/compressed_model.pt")
         self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05))
@@ -569,7 +569,7 @@ def __iter__(self):
         )
         q_model.save("saved")
         out1 = q_model.model(input)
-        compressed_model = q_model.export_compressed_model(use_HF_format=True)
+        compressed_model = q_model.export_compressed_model(use_hf_format=True)
         out2 = compressed_model(input)
         torch.save(compressed_model.state_dict(), "saved/compressed_model.pt")
         self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05))
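For reference, below is a minimal usage sketch of the renamed use_hf_format flag, mirroring the RTN flow exercised in test_weight_only_adaptor.py above; the toy model and the expected key names are illustrative assumptions, not part of the patch series.

# Illustrative sketch only: the APIs used (PostTrainingQuantConfig, quantization.fit,
# export_compressed_model) are the ones touched by the patches above; the toy model
# and the key names in the comments are assumptions for demonstration.
import torch
from neural_compressor import PostTrainingQuantConfig, quantization


class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(30, 50)
        self.fc2 = torch.nn.Linear(50, 5)

    def forward(self, x):
        return self.fc2(self.fc1(x))


conf = PostTrainingQuantConfig(approach="weight_only")  # RTN weight-only quantization
q_model = quantization.fit(ToyModel(), conf)

# Re-pack the quantized Linear layers into the HuggingFace-style layout:
# qweight/qzeros/scales buffers, transposed compression dims, zero_point -= 1.
compressed = q_model.export_compressed_model(use_hf_format=True)
print(list(compressed.state_dict().keys()))  # e.g. fc1.qweight, fc1.qzeros, fc1.scales, ...

The standalone helper neural_compressor.utils.load_huggingface.export_compressed_model(model, saved_dir="saved", use_hf_format=True) produces the same layout from a saved qconfig.json, as the updated unit tests show.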