From 9e106b8c6cc81efee1851a0c58485061034bb871 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 7 Nov 2023 21:58:47 +0800 Subject: [PATCH 01/23] add use_HF_format for export_compressed_model Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 109 +++++++++++++----- .../adaptor/torch_utils/weight_only.py | 2 + neural_compressor/model/torch_model.py | 8 ++ .../test_weight_only_adaptor.py | 21 +++- 4 files changed, 104 insertions(+), 36 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index d30182a7b9e..4138e8b1df1 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -217,8 +217,10 @@ def __init__( compression_dim=1, gptq_perm=False, device="cpu", + use_HF_format=False, ): super().__init__() + self.use_HF_format = use_HF_format self.dtype = dtype if "int" not in self.dtype: # for nf4, fp4 from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING @@ -249,53 +251,85 @@ def __init__( assert compression_dim in [0, 1], ( "Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel." ) - self.register_buffer( - "scale", - torch.zeros( - (out_features, math.ceil(in_features / self.groupsize)), - dtype=self.float_type, - ).to(device), - ) - if compression_dim == 1: + if self.use_HF_format: + self.register_buffer( + "scales", + torch.zeros( + (out_features, math.ceil(in_features / self.groupsize)), + dtype=self.float_type, + ).to(device), + ) + self.scale = self.scales self.register_buffer( - "packed_weight", + "qweight", torch.zeros( - (out_features, math.ceil(in_features / self.n_pack)), + (math.ceil(in_features / self.n_pack), out_features), dtype=self.compressed_dtype, ).to(device), ) + self.packed_weight = self.qweight.T if zp: self.register_buffer( - "packed_zp", + "qzeros", torch.zeros( - (self.out_features, math.ceil(self.in_features / self.groupsize / self.n_pack)), + (math.ceil(self.in_features / self.groupsize), math.ceil(self.out_features / self.n_pack)), dtype=self.compressed_dtype, ).to(device), ) + self.packed_zp = self.qzeros.T + if gptq_perm: + self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) + else: + self.g_idx = None + self.gptq_perm = self.g_idx else: self.register_buffer( - "packed_weight", + "scale", torch.zeros( - (math.ceil(out_features / self.n_pack), in_features), - dtype=self.compressed_dtype, + (out_features, math.ceil(in_features / self.groupsize)), + dtype=self.float_type, ).to(device), ) - if zp: + if compression_dim == 1: self.register_buffer( - "packed_zp", + "packed_weight", torch.zeros( - (math.ceil(self.out_features / self.n_pack), math.ceil(self.in_features / self.groupsize)), + (out_features, math.ceil(in_features / self.n_pack)), dtype=self.compressed_dtype, ).to(device), ) + if zp: + self.register_buffer( + "packed_zp", + torch.zeros( + (self.out_features, math.ceil(self.in_features / self.groupsize / self.n_pack)), + dtype=self.compressed_dtype, + ).to(device), + ) + else: + self.register_buffer( + "packed_weight", + torch.zeros( + (math.ceil(out_features / self.n_pack), in_features), + dtype=self.compressed_dtype, + ).to(device), + ) + if zp: + self.register_buffer( + "packed_zp", + torch.zeros( + (math.ceil(self.out_features / self.n_pack), math.ceil(self.in_features / self.groupsize)), + dtype=self.compressed_dtype, + ).to(device), + ) + if gptq_perm: + 
self.register_buffer("gptq_perm", torch.zeros(in_features, dtype=torch.int32).to(device)) + else: + self.gptq_perm = None if bias: self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) else: self.bias = None - if gptq_perm: - self.register_buffer("gptq_perm", torch.zeros(in_features, dtype=torch.int32).to(device)) - else: - self.gptq_perm = None def pack(self, int_weight, scale, zp, bias, gptq_perm=None): int_weight = int_weight.to(self.device) @@ -307,7 +341,7 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) - if self.compression_dim == 0: + if not self.use_HF_format and self.compression_dim == 0: int_weight = int_weight.T self.packed_weight = self.packed_weight.T origin_shape = int_weight.shape @@ -324,12 +358,14 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) self.packed_weight[:, j] |= tmp[:, e] - if self.compression_dim == 0: + if not self.use_HF_format and self.compression_dim == 0: self.packed_weight = self.packed_weight.T if zp is not None: zp = zp.to(self.device) - if self.compression_dim == 0: + if self.use_HF_format: + zp -= 1 + if self.use_HF_format or self.compression_dim == 0: zp = zp.T self.packed_zp = self.packed_zp.T assert hasattr(self, "packed_zp"), "zp is not set when initializing." @@ -342,7 +378,7 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) self.packed_zp[:, j] |= tmp[:, e] - if self.compression_dim == 0: + if self.use_HF_format or self.compression_dim == 0: self.packed_zp = self.packed_zp.T def recover(self): @@ -356,7 +392,7 @@ def recover(self): # unpack weight weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device) packed_weight = self.packed_weight - if self.compression_dim == 0: + if not self.use_HF_format and self.compression_dim == 0: weight = weight.T packed_weight = packed_weight.T origin_shape = weight.shape @@ -372,7 +408,7 @@ def recover(self): if weight_dtype == torch.uint8: tmp &= mask # remove sign bit weight[:, index] = tmp.type(weight_dtype) - if self.compression_dim == 0: + if not self.use_HF_format and self.compression_dim == 0: weight = weight.T if "int" not in self.dtype: new_weight = torch.zeros(self.out_features, self.in_features).to(device) @@ -384,7 +420,7 @@ def recover(self): zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp zp = torch.zeros(self.scale.shape, dtype=zp_dtype).to(device) packed_zp = self.packed_zp - if self.compression_dim == 0: + if self.use_HF_format or self.compression_dim == 0: zp = zp.T packed_zp = packed_zp.T origin_shape = zp.shape @@ -399,8 +435,10 @@ def recover(self): tmp = tmp >> self.compress_bits - self.bits tmp &= mask zp[:, index] = tmp.type(zp_dtype) - if self.compression_dim == 0: + if self.use_HF_format or self.compression_dim == 0: zp = zp.T + if self.use_HF_format: + zp += 1 # recover fp32 weight with int_weight, scale, and zero_point left_element = self.in_features % self.groupsize if left_element != 0: @@ -453,9 +491,16 @@ def forward(self, input): return F.linear(input, weight, self.bias) def extra_repr(self) -> str: - return "in_features={}, out_features={}, bits={}, group_size={}, bias={}".format( - self.in_features, self.out_features, self.bits, self.groupsize, 
self.bias is not None + tmp_str = "in_features={}, out_features={}, bits={}, group_size={}, bias={}".format( + self.in_features, + self.out_features, + self.bits, + self.groupsize, + self.bias is not None, ) + if self.use_HF_format: + tmp_str += ", use_HF_format=True" + return tmp_str class FakeAffineTensorQuantFunction(Function): diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index 7ba86eaa344..37a810a0428 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -396,6 +396,7 @@ def rtn_quantize( compression_dim = kwargs.get("compression_dim", 1) scale_dtype = kwargs.get("scale_dtype", torch.float32) device = kwargs.get("device", "cpu") + use_HF_format = kwargs.get("use_HF_format", False) for name, m in model.named_modules(): if m.__class__.__name__ not in supported_layers: continue @@ -448,6 +449,7 @@ def rtn_quantize( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, + use_HF_format=use_HF_format, ) new_module.pack(int_weight, scale, zp, m.bias) if name == "": diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index eeada402f35..e546893e323 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -459,6 +459,7 @@ def export_compressed_model( scale_dtype=torch.float32, gptq_config_path=None, device="cpu", + use_HF_format=False, ): """Convert Linear to WeightOnlyLinear for low memory inference. @@ -474,6 +475,10 @@ def export_compressed_model( Defaults to torch.float32. gptq_config_path (str, optional): Path of gptq_config.json. Defaults to None. device (str, optional): choose device for compression. Defaults to cpu. + use_HF_format (bool, optional): use the popular huggingface compression format. + 1: compression_dim: weight = 1, zeros = 0 and both are transposed. + 2: zeros -= 1 before compression. Why we need it? + 3: g_idx: use same number for one group instead of recording the channel order. 
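        A minimal usage sketch of this flag, following the unit test added in this change; `Model()` and the `saved` directory are placeholders for the caller's fp32 architecture and the folder written by `q_model.save("saved")`.

```python
import torch

from neural_compressor.model import Model as INCModel
from neural_compressor.utils.pytorch import load

fp32_model = Model()  # placeholder: the original fp32 architecture
woq_model = load("saved", fp32_model, weight_only=True)  # reload the fake-quantized weights

inc_model = INCModel(woq_model)
inc_model.export_compressed_model(
    qweight_config_path="saved/qconfig.json",
    use_HF_format=True,  # store qweight/qzeros/scales buffers instead of packed_weight/packed_zp/scale
)
torch.save(inc_model.model.state_dict(), "saved/hf_format_model.pt")
```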
""" from ..adaptor.torch_utils.model_wrapper import WeightOnlyLinear from ..adaptor.torch_utils.util import collect_weight_info, fetch_module, set_module @@ -513,6 +518,7 @@ def export_compressed_model( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, + use_HF_format=use_HF_format, ) set_module(self.model, k, new_module) continue @@ -539,6 +545,7 @@ def export_compressed_model( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, + use_HF_format=use_HF_format, ) new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm) set_module(self.model, k, new_module) @@ -565,6 +572,7 @@ def export_compressed_model( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, + use_HF_format=use_HF_format, ) set_module(self.model, k, mod) return self.model diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 2e6e5b85ee0..85b0bcfafa1 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -8,6 +8,8 @@ from neural_compressor import PostTrainingQuantConfig, quantization from neural_compressor.adaptor.torch_utils.model_wrapper import MulLinear, WeightOnlyLinear +from neural_compressor.model import Model as INCModel +from neural_compressor.utils.pytorch import load class Model(torch.nn.Module): @@ -81,13 +83,27 @@ def test_RTN_int_quant(self): approach="weight_only", ) q_model = quantization.fit(model, conf) + q_model.save("saved") out2 = q_model(input) self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1))) self.assertFalse(torch.all(out1 == out2)) compressed_model = q_model.export_compressed_model() out3 = compressed_model(input) + self.assertTrue("fc1.packed_weight" in compressed_model.state_dict().keys()) + q_weight1 = compressed_model.state_dict()["fc1.packed_weight"] self.assertTrue(torch.all(out3 == out2)) + # test huggingface popular int4 format + model = Model() + new_model = load("saved", model, weight_only=True) + inc_model = INCModel(new_model) + inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_HF_format=True) + out4 = inc_model.model(input) + self.assertTrue("fc1.qweight" in inc_model.model.state_dict().keys()) + q_weight2 = inc_model.model.state_dict()["fc1.qweight"] + self.assertTrue(torch.all(q_weight1.T == q_weight2)) + self.assertTrue(torch.all(out3 == out4)) + model = Model() out1 = model(input) conf = PostTrainingQuantConfig( @@ -218,7 +234,6 @@ def test_RTN_int_quant(self): self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1))) self.assertFalse(torch.all(out1 == out2)) q_model.save("saved") - from neural_compressor.utils.pytorch import load new_model = load("saved", model, weight_only=True) out1 = new_model(input) @@ -226,8 +241,6 @@ def test_RTN_int_quant(self): model_size1 = os.path.getsize("saved/best_model.pt") / 1024 print("FP32 Model size:{:.3f}M".format(model_size1)) - from neural_compressor.model import Model as INCModel - inc_model = INCModel(new_model) inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json") torch.save(inc_model.state_dict(), "saved/tmp.pt") @@ -528,7 +541,7 @@ def __iter__(self): ) q_model.save("saved") out1 = q_model.model(input) - compressed_model = q_model.export_compressed_model() + compressed_model = q_model.export_compressed_model(use_HF_format=True) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") 
self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) From 2326e159e0fa8fed363ff735884b6c89ea61c2a8 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 7 Nov 2023 22:11:29 +0800 Subject: [PATCH 02/23] fix g_idx Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 4138e8b1df1..adac2a6f4fc 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -339,6 +339,8 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if gptq_perm is not None: assert hasattr(self, "gptq_perm"), "gptq_perm is not set when initializing." self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) + if self.use_HF_format: + self.gptq_perm = self.gptq_perm // self.groupsize assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: From b6b98e3f8dcb4c5527f392fe00e46508e44bc79b Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 13:06:58 +0800 Subject: [PATCH 03/23] Prevent broken id links Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index adac2a6f4fc..9eb2f6446b7 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -255,11 +255,11 @@ def __init__( self.register_buffer( "scales", torch.zeros( - (out_features, math.ceil(in_features / self.groupsize)), + (math.ceil(in_features / self.groupsize), out_features), dtype=self.float_type, ).to(device), ) - self.scale = self.scales + self.scale = self.scales.T self.register_buffer( "qweight", torch.zeros( @@ -382,9 +382,22 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): self.packed_zp[:, j] |= tmp[:, e] if self.use_HF_format or self.compression_dim == 0: self.packed_zp = self.packed_zp.T + if self.use_HF_format: + self.scales = self.scale.T + self.qweight = self.packed_weight.T + self.g_idx = self.gptq_perm + if zp is not None: + self.qzeros = self.packed_zp.T def recover(self): logger.debug(f"Recovering {self} weight") + if self.use_HF_format: + # Prevent broken id links of self.scale and self.scales + self.scale = self.scales.T + self.packed_weight = self.qweight.T + self.gptq_perm = self.g_idx + if hasattr(self, "qzeros"): + self.packed_zp = self.qzeros.T device = self.scale.device mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) if hasattr(self, "packed_zp"): From fec9c19e3fd5974ea776100babca836f37c0038b Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 14:29:11 +0800 Subject: [PATCH 04/23] add sym qzero Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 9eb2f6446b7..40f152f6d33 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -268,15 +268,14 @@ def __init__( ).to(device), ) self.packed_weight = self.qweight.T - if 
zp: - self.register_buffer( - "qzeros", - torch.zeros( - (math.ceil(self.in_features / self.groupsize), math.ceil(self.out_features / self.n_pack)), - dtype=self.compressed_dtype, - ).to(device), - ) - self.packed_zp = self.qzeros.T + self.register_buffer( + "qzeros", + torch.zeros( + (math.ceil(self.in_features / self.groupsize), math.ceil(self.out_features / self.n_pack)), + dtype=self.compressed_dtype, + ).to(device), + ) + self.packed_zp = self.qzeros.T if gptq_perm: self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) else: @@ -333,6 +332,10 @@ def __init__( def pack(self, int_weight, scale, zp, bias, gptq_perm=None): int_weight = int_weight.to(self.device) + if self.use_HF_format and zp is None: + shift_bias = 2 ** (self.bits - 1) - 1 + int_weight += shift_bias + zp = torch.zeros_like(scale, dtype=torch.uint8) + shift_bias if bias is not None: assert hasattr(self, "bias"), "bias is not set when initializing." self.bias = bias.type(self.float_type).to(self.device) From 22e97de8c635b44b34e33a86901e9df6e91ef0c4 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 17:48:04 +0800 Subject: [PATCH 05/23] invert perm before compression Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 48 ++++--------------- neural_compressor/model/torch_model.py | 3 ++ 2 files changed, 13 insertions(+), 38 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 40f152f6d33..b94ec56627c 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -389,8 +389,7 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): self.scales = self.scale.T self.qweight = self.packed_weight.T self.g_idx = self.gptq_perm - if zp is not None: - self.qzeros = self.packed_zp.T + self.qzeros = self.packed_zp.T def recover(self): logger.debug(f"Recovering {self} weight") @@ -399,9 +398,12 @@ def recover(self): self.scale = self.scales.T self.packed_weight = self.qweight.T self.gptq_perm = self.g_idx - if hasattr(self, "qzeros"): - self.packed_zp = self.qzeros.T + self.packed_zp = self.qzeros.T device = self.scale.device + fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device) + if self.gptq_perm is None: + # used for recovering fp32_weight + self.gptq_perm = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) if hasattr(self, "packed_zp"): weight_dtype = torch.uint8 @@ -458,42 +460,12 @@ def recover(self): if self.use_HF_format: zp += 1 # recover fp32 weight with int_weight, scale, and zero_point - left_element = self.in_features % self.groupsize - if left_element != 0: - split_index = self.in_features // self.groupsize * self.groupsize - weight1 = weight[:, :-split_index].reshape(-1, self.groupsize) - scale1 = self.scale[:, :-1].reshape(-1, 1) - zp1 = zp[:, :-1].reshape(-1, 1) - weight1 = ((weight1 - zp1) * scale1).reshape(self.out_features, -1) - weight2 = weight[:, -split_index:] - scale2 = self.scale[:, -1:] - zp2 = zp[:, -1].reshape(-1, 1) - weight2 = (weight2 - zp2) * scale2 - fp32_weight = torch.cat((weight1, weight2), dim=1) - else: - weight = weight.reshape(-1, self.groupsize) - scale = self.scale.reshape(-1, 1) - zp = zp.reshape(-1, 1) - fp32_weight = ((weight - zp) * scale).reshape(self.out_features, -1) + for idx in range(self.in_features): + 
fp32_weight[:, idx] = weight[:, idx] * self.scale[:, self.gptq_perm[idx]] - zp[:, self.gptq_perm[idx]] else: # recover fp32 weight with int_weight, scale - left_element = self.in_features % self.groupsize - if left_element != 0: - split_index = self.in_features // self.groupsize * self.groupsize - weight1 = weight[:, :split_index].reshape(-1, self.groupsize) - scale1 = self.scale[:, :-1].reshape(-1, 1) - weight1 = (weight1 * scale1).reshape(self.out_features, -1) - weight2 = weight[:, split_index:] - scale2 = self.scale[:, -1:] - weight2 = weight2 * scale2 - fp32_weight = torch.cat((weight1, weight2), dim=1) - else: - weight = weight.reshape(-1, self.groupsize) - scale = self.scale.reshape(-1, 1) - fp32_weight = (weight * scale).reshape(self.out_features, -1) - if self.gptq_perm is not None: - invperm = torch.argsort(self.gptq_perm) - fp32_weight = fp32_weight[:, invperm] + for idx in range(self.in_features): + fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.gptq_perm[idx]] return fp32_weight def forward(self, input): diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index e546893e323..4e468815847 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -532,6 +532,9 @@ def export_compressed_model( gptq_scale = torch.tensor(gptq_conf["scale"]) gptq_zp = None if scheme == "sym" else torch.tensor(gptq_conf["zero"]) int_weight = quant_weight_w_scale(fp32_weight, gptq_scale, gptq_zp, group_size) + if "perm" in gptq_conf: + invperm = torch.argsort(gptq_perm) + int_weight = int_weight[:, invperm] new_module = WeightOnlyLinear( m.in_features, m.out_features, From db6782b7181380621b2943bebc276e0defc8a3a3 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 17:49:16 +0800 Subject: [PATCH 06/23] fix typo Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index b94ec56627c..9150f1a06a9 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -465,7 +465,7 @@ def recover(self): else: # recover fp32 weight with int_weight, scale for idx in range(self.in_features): - fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.gptq_perm[idx]] + fp32_weight[:, idx] = weight[:, idx] * self.scale[:, self.gptq_perm[idx]] return fp32_weight def forward(self, input): From 526fce95ecc98971ef8c805970605bf43686b215 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 19:14:38 +0800 Subject: [PATCH 07/23] fix bug in perm setting Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 9150f1a06a9..45e2ae1616f 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -342,8 +342,6 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if gptq_perm is not None: assert hasattr(self, "gptq_perm"), "gptq_perm is not set when initializing." 
self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) - if self.use_HF_format: - self.gptq_perm = self.gptq_perm // self.groupsize assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: @@ -404,6 +402,9 @@ def recover(self): if self.gptq_perm is None: # used for recovering fp32_weight self.gptq_perm = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) + else: + invperm = torch.argsort(self.gptq_perm) + self.gptq_perm = invperm // self.groupsize mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) if hasattr(self, "packed_zp"): weight_dtype = torch.uint8 @@ -461,7 +462,7 @@ def recover(self): zp += 1 # recover fp32 weight with int_weight, scale, and zero_point for idx in range(self.in_features): - fp32_weight[:, idx] = weight[:, idx] * self.scale[:, self.gptq_perm[idx]] - zp[:, self.gptq_perm[idx]] + fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.gptq_perm[idx]]) * self.scale[:, self.gptq_perm[idx]] else: # recover fp32 weight with int_weight, scale for idx in range(self.in_features): From 7429fa5b9dd452321c447da0f73f042be353822d Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 23:12:40 +0800 Subject: [PATCH 08/23] fix zero shift error Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 2 ++ neural_compressor/model/torch_model.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 45e2ae1616f..372c365dc1d 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -459,7 +459,9 @@ def recover(self): if self.use_HF_format or self.compression_dim == 0: zp = zp.T if self.use_HF_format: + # zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1 zp += 1 + zp = torch.where(zp > (2**self.bits - 1), 0, zp) # recover fp32 weight with int_weight, scale, and zero_point for idx in range(self.in_features): fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.gptq_perm[idx]]) * self.scale[:, self.gptq_perm[idx]] diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index 4e468815847..edf8dca1da9 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -496,6 +496,7 @@ def export_compressed_model( gptq_config = self.gptq_config if hasattr(self, "gptq_config") else {} if gptq_config: for k, v in weight_config.items(): + print(k) logger.debug(f"Compressing {k} on device {device}") if v["dtype"] == "fp32": continue @@ -529,8 +530,8 @@ def export_compressed_model( else: fp32_weight = m.weight.data gptq_perm = None - gptq_scale = torch.tensor(gptq_conf["scale"]) - gptq_zp = None if scheme == "sym" else torch.tensor(gptq_conf["zero"]) + gptq_scale = torch.tensor(gptq_conf["scale"], dtype=torch.float32) + gptq_zp = None if scheme == "sym" else torch.tensor(gptq_conf["zero"], dtype=torch.int32) int_weight = quant_weight_w_scale(fp32_weight, gptq_scale, gptq_zp, group_size) if "perm" in gptq_conf: invperm = torch.argsort(gptq_perm) From 22499e11e8b60a4de7d06580b59d5fc2bb53530c Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 23:33:31 +0800 Subject: [PATCH 09/23] fix reload state_dict bug Signed-off-by: Xin He --- 
neural_compressor/adaptor/torch_utils/model_wrapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 372c365dc1d..d10bdba9e74 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -342,6 +342,9 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if gptq_perm is not None: assert hasattr(self, "gptq_perm"), "gptq_perm is not set when initializing." self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) + if self.use_HF_format: + invperm = torch.argsort(self.gptq_perm) + self.gptq_perm = invperm // self.groupsize assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: @@ -402,9 +405,6 @@ def recover(self): if self.gptq_perm is None: # used for recovering fp32_weight self.gptq_perm = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) - else: - invperm = torch.argsort(self.gptq_perm) - self.gptq_perm = invperm // self.groupsize mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) if hasattr(self, "packed_zp"): weight_dtype = torch.uint8 From 550a4f9450ff93a48f89245093ce765802161895 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Nov 2023 23:44:11 +0800 Subject: [PATCH 10/23] enhance ut Signed-off-by: Xin He --- .../pytorch_adaptor/test_weight_only_adaptor.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 85b0bcfafa1..3c59d756846 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -534,6 +534,16 @@ def __iter__(self): # # case 2: list or tuple model_2 = copy.deepcopy(self.gptj) input = torch.ones([1, 512], dtype=torch.long) + conf.op_type_dict = { + ".*": { # re.match + "weight": { + "bits": 4, # 1-8 bits + "group_size": 8, # -1 (per-channel) + "scheme": "asym", + "algorithm": "GPTQ", + }, + }, + } q_model = quantization.fit( model_2, conf, @@ -556,7 +566,7 @@ def __iter__(self): ) q_model.save("saved") out1 = q_model.model(input) - compressed_model = q_model.export_compressed_model() + compressed_model = q_model.export_compressed_model(use_HF_format=True) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) From c59177f2da5be7d3c50959909679853bb04f4b1f Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 10:20:14 +0800 Subject: [PATCH 11/23] fix sym zeropoint Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 4 +++- neural_compressor/model/torch_model.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index d10bdba9e74..cb4fec7b1fe 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -333,7 +333,9 @@ def __init__( def pack(self, int_weight, scale, zp, bias, gptq_perm=None): int_weight = int_weight.to(self.device) if self.use_HF_format and zp is None: - shift_bias = 2 ** (self.bits - 
1) - 1 + # to avoid overflow + int_weight = int_weight.type(torch.int32) + shift_bias = 2 ** (self.bits - 1) int_weight += shift_bias zp = torch.zeros_like(scale, dtype=torch.uint8) + shift_bias if bias is not None: diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index edf8dca1da9..3dc0a3be11a 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -533,6 +533,7 @@ def export_compressed_model( gptq_scale = torch.tensor(gptq_conf["scale"], dtype=torch.float32) gptq_zp = None if scheme == "sym" else torch.tensor(gptq_conf["zero"], dtype=torch.int32) int_weight = quant_weight_w_scale(fp32_weight, gptq_scale, gptq_zp, group_size) + int_weight = int_weight.type(torch.int32) if "perm" in gptq_conf: invperm = torch.argsort(gptq_perm) int_weight = int_weight[:, invperm] From f65eb1e34cade7065145eb20131e94d0cf651a04 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 10:33:47 +0800 Subject: [PATCH 12/23] add dtype to g_idx Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index cb4fec7b1fe..78f8aff85b2 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -347,6 +347,7 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if self.use_HF_format: invperm = torch.argsort(self.gptq_perm) self.gptq_perm = invperm // self.groupsize + self.gptq_perm = self.gptq_perm.type(torch.int32).to(self.device) assert scale.shape == self.scale.shape, "Scale shape is mismatched." self.scale = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: From 9c7454a6a361d331ab8a19af3c89f80b7cf139ad Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 11:31:38 +0800 Subject: [PATCH 13/23] add export_compressed_model func for saved_dir Signed-off-by: Xin He --- neural_compressor/model/torch_model.py | 3 +- neural_compressor/utils/load_huggingface.py | 29 +++++++++++++++++++ .../test_weight_only_adaptor.py | 6 ++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index 3dc0a3be11a..ef7e891aed2 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -479,6 +479,8 @@ def export_compressed_model( 1: compression_dim: weight = 1, zeros = 0 and both are transposed. 2: zeros -= 1 before compression. Why we need it? 3: g_idx: use same number for one group instead of recording the channel order. + 4. parameter name changed, such as 'packed_weight' -> 'qweight'. + 5. zeros is always needed even for sym. 
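        A quick way to see these differences is to inspect the exported state_dict; the sketch below assumes `compressed_model` was produced by `export_compressed_model(..., use_HF_format=True)` and contains a weight-only quantized Linear registered as `fc1`.

```python
# Assumption: compressed_model came from export_compressed_model(..., use_HF_format=True)
# and "fc1" is one of its weight-only quantized Linear layers.
state_dict = compressed_model.state_dict()
assert "fc1.qweight" in state_dict  # transposed packed weight (was packed_weight)
assert "fc1.qzeros" in state_dict   # always present, even for sym schemes
assert "fc1.scales" in state_dict   # per-group scales (was scale)
# "fc1.g_idx" appears only when a GPTQ permutation (act-order) was recorded.
```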
""" from ..adaptor.torch_utils.model_wrapper import WeightOnlyLinear from ..adaptor.torch_utils.util import collect_weight_info, fetch_module, set_module @@ -496,7 +498,6 @@ def export_compressed_model( gptq_config = self.gptq_config if hasattr(self, "gptq_config") else {} if gptq_config: for k, v in weight_config.items(): - print(k) logger.debug(f"Compressing {k} on device {device}") if v["dtype"] == "fp32": continue diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index c68259a4abc..783a5dd2f5a 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -230,3 +230,32 @@ def save_for_huggingface_upstream(model, tokenizer, output_dir): model.model.config.architectures = [model.model.__class__.__name__] model.model.config.torch_dtype = "int8" model.model.config.save_pretrained(output_dir) + + +def export_compressed_model(model, saved_dir=None, use_HF_format=False): + """Support get compressed model from saved_dir. + + Args: + model (torch.nn.Module): origin fp32 model. + saved_dir (_type_, optional): the dir path of compression info. Defaults to None. + use_HF_format (bool, optional): whether use HuggingFace format. Defaults to False. + """ + stat_dict = os.path.join(saved_dir, "best_model.pt") + qweight_config_path = os.path.join(saved_dir, "qconfig.json") + gptq_config_path = os.path.join(saved_dir, "gptq_config.json") + model.load_state_dict(torch.load(stat_dict)) + + from neural_compressor.model import Model as INCModel + + inc_model = INCModel(model) + inc_model.export_compressed_model( + qweight_config_path=qweight_config_path, + enable_full_range=False, + compression_dtype=torch.int32, + compression_dim=1, + scale_dtype=torch.float32, + gptq_config_path=gptq_config_path, + device="cpu", + use_HF_format=use_HF_format, + ) + return inc_model.model diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 3c59d756846..95dea1e7647 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -9,6 +9,7 @@ from neural_compressor import PostTrainingQuantConfig, quantization from neural_compressor.adaptor.torch_utils.model_wrapper import MulLinear, WeightOnlyLinear from neural_compressor.model import Model as INCModel +from neural_compressor.utils.load_huggingface import export_compressed_model from neural_compressor.utils.pytorch import load @@ -102,6 +103,11 @@ def test_RTN_int_quant(self): self.assertTrue("fc1.qweight" in inc_model.model.state_dict().keys()) q_weight2 = inc_model.model.state_dict()["fc1.qweight"] self.assertTrue(torch.all(q_weight1.T == q_weight2)) + model = Model() + compressed_model = export_compressed_model(model, saved_dir="saved", use_HF_format=True) + self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys()) + q_weight2 = compressed_model.state_dict()["fc1.qweight"] + self.assertTrue(torch.all(q_weight1.T == q_weight2)) self.assertTrue(torch.all(out3 == out4)) model = Model() From 8836f251cf98a4c99c2267ad5a149522310061b5 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 13:25:16 +0800 Subject: [PATCH 14/23] fix UT Signed-off-by: Xin He --- test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 95dea1e7647..692ce82533b 
100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -91,7 +91,6 @@ def test_RTN_int_quant(self): compressed_model = q_model.export_compressed_model() out3 = compressed_model(input) self.assertTrue("fc1.packed_weight" in compressed_model.state_dict().keys()) - q_weight1 = compressed_model.state_dict()["fc1.packed_weight"] self.assertTrue(torch.all(out3 == out2)) # test huggingface popular int4 format @@ -101,13 +100,9 @@ def test_RTN_int_quant(self): inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_HF_format=True) out4 = inc_model.model(input) self.assertTrue("fc1.qweight" in inc_model.model.state_dict().keys()) - q_weight2 = inc_model.model.state_dict()["fc1.qweight"] - self.assertTrue(torch.all(q_weight1.T == q_weight2)) model = Model() compressed_model = export_compressed_model(model, saved_dir="saved", use_HF_format=True) self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys()) - q_weight2 = compressed_model.state_dict()["fc1.qweight"] - self.assertTrue(torch.all(q_weight1.T == q_weight2)) self.assertTrue(torch.all(out3 == out4)) model = Model() From cbdce1527e94c7005e0bd4b68ef6fd7772e771e0 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 13:34:08 +0800 Subject: [PATCH 15/23] ignore pylint Signed-off-by: Xin He --- neural_compressor/utils/load_huggingface.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index 783a5dd2f5a..9d153d9d006 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -247,6 +247,7 @@ def export_compressed_model(model, saved_dir=None, use_HF_format=False): from neural_compressor.model import Model as INCModel + # pylint: disable=E1101 inc_model = INCModel(model) inc_model.export_compressed_model( qweight_config_path=qweight_config_path, From f2de9c6ec6447501c9fb2dd0f876af147e5213b3 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 13:57:43 +0800 Subject: [PATCH 16/23] fix bug Signed-off-by: Xin He --- neural_compressor/utils/load_huggingface.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index 9d153d9d006..48ce6bc6a97 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -243,6 +243,8 @@ def export_compressed_model(model, saved_dir=None, use_HF_format=False): stat_dict = os.path.join(saved_dir, "best_model.pt") qweight_config_path = os.path.join(saved_dir, "qconfig.json") gptq_config_path = os.path.join(saved_dir, "gptq_config.json") + if not os.path.exists(gptq_config_path): + gptq_config_path = None model.load_state_dict(torch.load(stat_dict)) from neural_compressor.model import Model as INCModel From c0652e32ef030ecae354d54be9e750a8f0f5676f Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 9 Nov 2023 14:22:05 +0800 Subject: [PATCH 17/23] add document Signed-off-by: Xin He --- docs/source/quantization_weight_only.md | 21 +++++++++++---- neural_compressor/utils/load_huggingface.py | 30 ++++++++++++++++----- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md index 2786dd9d1ec..ce48c0a6df3 100644 --- a/docs/source/quantization_weight_only.md +++ b/docs/source/quantization_weight_only.md @@ -96,6 +96,15 @@ To support low memory 
inference, Neural Compressor implemented WeightOnlyLinear, | compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64] | | compression_dim | 1 | 0 means output channel while 1 means input channel | | scale_dtype | torch.float32 | Data type for scale and bias | +| use_HF_format | False | Whether to use the popular format present on HuggingFace hub | + +**Note:** HuggingFace format is quite special, the main differences are as follows: + +> 1: Compression Dimension: weight = 1, zero = 0 and both are transposed. +> 2: Zero Point: zero_point-= 1 before compression. zero_point is always required even for sym. +> 3: Group Index: Use the same number for a group instead of recording channel order. +> 4. Parameter Name: `packed_weight` -> `qweight`; `packed_zp` -> `qzeros`; `gptq_perm` -> `g_idx`; `scale` -> `scales`. + ### **User Code Example** ```python @@ -119,12 +128,14 @@ conf = PostTrainingQuantConfig( ) q_model = quantization.fit(model, conf, eval_func=eval_func) q_model.save("saved_results") -compressed_model = q_model.export_compressed_model( - compression_dtype=torch.int32, - compression_dim=1, - scale_dtype=torch.float16, -) +compressed_model = q_model.export_compressed_model() torch.save(compressed_model.state_dict(), "compressed_model.pt") +# or +model = Model() +compressed_model = export_compressed_model( + model, + saved_dir="saved_results", +) ``` The saved_results folder contains two files: `best_model.pt` and `qconfig.json`, and the generated q_model is a fake quantized model. diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index 48ce6bc6a97..f7378d90998 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -232,13 +232,31 @@ def save_for_huggingface_upstream(model, tokenizer, output_dir): model.model.config.save_pretrained(output_dir) -def export_compressed_model(model, saved_dir=None, use_HF_format=False): +def export_compressed_model( + model, + saved_dir=None, + use_HF_format=False, + enable_full_range=False, + compression_dtype=torch.int32, + compression_dim=1, + scale_dtype=torch.float32, + device="cpu", +): """Support get compressed model from saved_dir. Args: model (torch.nn.Module): origin fp32 model. saved_dir (_type_, optional): the dir path of compression info. Defaults to None. use_HF_format (bool, optional): whether use HuggingFace format. Defaults to False. + enable_full_range (bool, optional): Whether to leverage the full compression range + under symmetric quantization. Defaults to False. + compression_dtype (torch.Tensor, optional): The target dtype after comoression. + Defaults to torch.int32. + compression_dim (int, optional): Select from [0, 1], 0 is output channel, + 1 is input channel. Defaults to 1. + scale_dtype (torch.Tensor, optional): Use float32 or float16. + Defaults to torch.float32. + device (str, optional): choose device for compression. Defaults to cpu. 
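    A usage sketch of this helper, in line with the test and the updated documentation example; `Model()` stands in for the original fp32 architecture and `saved_results` for the directory written by `q_model.save()`.

```python
import torch

from neural_compressor.utils.load_huggingface import export_compressed_model

fp32_model = Model()  # placeholder: the original fp32 architecture
compressed_model = export_compressed_model(
    fp32_model,
    saved_dir="saved_results",  # expects best_model.pt and qconfig.json; gptq_config.json is optional
    use_HF_format=True,
)
torch.save(compressed_model.state_dict(), "saved_results/compressed_model.pt")
```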
""" stat_dict = os.path.join(saved_dir, "best_model.pt") qweight_config_path = os.path.join(saved_dir, "qconfig.json") @@ -253,12 +271,12 @@ def export_compressed_model(model, saved_dir=None, use_HF_format=False): inc_model = INCModel(model) inc_model.export_compressed_model( qweight_config_path=qweight_config_path, - enable_full_range=False, - compression_dtype=torch.int32, - compression_dim=1, - scale_dtype=torch.float32, + enable_full_range=enable_full_range, + compression_dtype=compression_dtype, + compression_dim=compression_dim, + scale_dtype=scale_dtype, gptq_config_path=gptq_config_path, - device="cpu", + device=device, use_HF_format=use_HF_format, ) return inc_model.model From ca785c8ae7939abfe9eb9f6557de5c88c7a3efe3 Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 10 Nov 2023 15:48:31 +0800 Subject: [PATCH 18/23] abandon old param names Signed-off-by: Xin He --- .../adaptor/torch_utils/model_wrapper.py | 117 +++++++++--------- neural_compressor/model/torch_model.py | 2 +- .../test_weight_only_adaptor.py | 10 +- 3 files changed, 63 insertions(+), 66 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 78f8aff85b2..04baa05590d 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -215,7 +215,7 @@ def __init__( scale_dtype=torch.float32, compression_dtype=torch.int32, compression_dim=1, - gptq_perm=False, + g_idx=False, device="cpu", use_HF_format=False, ): @@ -259,7 +259,7 @@ def __init__( dtype=self.float_type, ).to(device), ) - self.scale = self.scales.T + self.scales = self.scales.T self.register_buffer( "qweight", torch.zeros( @@ -267,7 +267,7 @@ def __init__( dtype=self.compressed_dtype, ).to(device), ) - self.packed_weight = self.qweight.T + self.qweight = self.qweight.T self.register_buffer( "qzeros", torch.zeros( @@ -275,15 +275,10 @@ def __init__( dtype=self.compressed_dtype, ).to(device), ) - self.packed_zp = self.qzeros.T - if gptq_perm: - self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) - else: - self.g_idx = None - self.gptq_perm = self.g_idx + self.qzeros = self.qzeros.T else: self.register_buffer( - "scale", + "scales", torch.zeros( (out_features, math.ceil(in_features / self.groupsize)), dtype=self.float_type, @@ -291,7 +286,7 @@ def __init__( ) if compression_dim == 1: self.register_buffer( - "packed_weight", + "qweight", torch.zeros( (out_features, math.ceil(in_features / self.n_pack)), dtype=self.compressed_dtype, @@ -299,7 +294,7 @@ def __init__( ) if zp: self.register_buffer( - "packed_zp", + "qzeros", torch.zeros( (self.out_features, math.ceil(self.in_features / self.groupsize / self.n_pack)), dtype=self.compressed_dtype, @@ -307,7 +302,7 @@ def __init__( ) else: self.register_buffer( - "packed_weight", + "qweight", torch.zeros( (math.ceil(out_features / self.n_pack), in_features), dtype=self.compressed_dtype, @@ -315,22 +310,22 @@ def __init__( ) if zp: self.register_buffer( - "packed_zp", + "qzeros", torch.zeros( (math.ceil(self.out_features / self.n_pack), math.ceil(self.in_features / self.groupsize)), dtype=self.compressed_dtype, ).to(device), ) - if gptq_perm: - self.register_buffer("gptq_perm", torch.zeros(in_features, dtype=torch.int32).to(device)) - else: - self.gptq_perm = None + if g_idx: + self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) + else: + self.g_idx = None if bias: self.register_buffer("bias", 
torch.zeros(self.out_features, dtype=self.float_type).to(device)) else: self.bias = None - def pack(self, int_weight, scale, zp, bias, gptq_perm=None): + def pack(self, int_weight, scale, zp, bias, g_idx=None): int_weight = int_weight.to(self.device) if self.use_HF_format and zp is None: # to avoid overflow @@ -341,20 +336,20 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): if bias is not None: assert hasattr(self, "bias"), "bias is not set when initializing." self.bias = bias.type(self.float_type).to(self.device) - if gptq_perm is not None: - assert hasattr(self, "gptq_perm"), "gptq_perm is not set when initializing." - self.gptq_perm = gptq_perm.type(torch.int32).to(self.device) + if g_idx is not None: + assert hasattr(self, "g_idx"), "g_idx is not set when initializing." + self.g_idx = g_idx.type(torch.int32).to(self.device) if self.use_HF_format: - invperm = torch.argsort(self.gptq_perm) - self.gptq_perm = invperm // self.groupsize - self.gptq_perm = self.gptq_perm.type(torch.int32).to(self.device) - assert scale.shape == self.scale.shape, "Scale shape is mismatched." - self.scale = scale.type(self.float_type).to(self.device) + invperm = torch.argsort(self.g_idx) + self.g_idx = invperm // self.groupsize + self.g_idx = self.g_idx.type(torch.int32).to(self.device) + assert scale.shape == self.scales.shape, "Scale shape is mismatched." + self.scales = scale.type(self.float_type).to(self.device) if not self.use_HF_format and self.compression_dim == 0: int_weight = int_weight.T - self.packed_weight = self.packed_weight.T + self.qweight = self.qweight.T origin_shape = int_weight.shape - target_shape = self.packed_weight.shape + target_shape = self.qweight.shape assert origin_shape[0] == target_shape[0], "output channels mismatch, please check." mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(self.device) @@ -366,9 +361,9 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): for e in range(tmp.shape[1]): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) - self.packed_weight[:, j] |= tmp[:, e] + self.qweight[:, j] |= tmp[:, e] if not self.use_HF_format and self.compression_dim == 0: - self.packed_weight = self.packed_weight.T + self.qweight = self.qweight.T if zp is not None: zp = zp.to(self.device) @@ -376,9 +371,9 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): zp -= 1 if self.use_HF_format or self.compression_dim == 0: zp = zp.T - self.packed_zp = self.packed_zp.T - assert hasattr(self, "packed_zp"), "zp is not set when initializing." - target_shape = self.packed_zp.shape + self.qzeros = self.qzeros.T + assert hasattr(self, "qzeros"), "zp is not set when initializing." 
+ target_shape = self.qzeros.shape for j in range(target_shape[1]): start = self.n_pack * j end = self.n_pack * (j + 1) @@ -386,47 +381,47 @@ def pack(self, int_weight, scale, zp, bias, gptq_perm=None): for e in range(tmp.shape[1]): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) - self.packed_zp[:, j] |= tmp[:, e] + self.qzeros[:, j] |= tmp[:, e] if self.use_HF_format or self.compression_dim == 0: - self.packed_zp = self.packed_zp.T + self.qzeros = self.qzeros.T if self.use_HF_format: - self.scales = self.scale.T - self.qweight = self.packed_weight.T - self.g_idx = self.gptq_perm - self.qzeros = self.packed_zp.T + self.scales = self.scales.T + self.qweight = self.qweight.T + self.g_idx = self.g_idx + self.qzeros = self.qzeros.T def recover(self): logger.debug(f"Recovering {self} weight") if self.use_HF_format: - # Prevent broken id links of self.scale and self.scales - self.scale = self.scales.T - self.packed_weight = self.qweight.T - self.gptq_perm = self.g_idx - self.packed_zp = self.qzeros.T - device = self.scale.device + # Prevent broken id links of self.scales and self.scales + self.scales = self.scales.T + self.qweight = self.qweight.T + self.g_idx = self.g_idx + self.qzeros = self.qzeros.T + device = self.scales.device fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device) - if self.gptq_perm is None: + if self.g_idx is None: # used for recovering fp32_weight - self.gptq_perm = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) + self.g_idx = torch.tensor([i // self.groupsize for i in range(self.in_features)], dtype=torch.int32) mask = torch.tensor(2**self.bits - 1, dtype=self.compressed_dtype).to(device) - if hasattr(self, "packed_zp"): + if hasattr(self, "qzeros"): weight_dtype = torch.uint8 else: weight_dtype = torch.int8 # unpack weight weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device) - packed_weight = self.packed_weight + qweight = self.qweight if not self.use_HF_format and self.compression_dim == 0: weight = weight.T - packed_weight = packed_weight.T + qweight = qweight.T origin_shape = weight.shape - target_shape = packed_weight.shape + target_shape = qweight.shape for j in range(target_shape[1]): for e in range(self.n_pack): index = j * self.n_pack + e if index >= origin_shape[1]: continue - tmp = packed_weight[:, j] + tmp = qweight[:, j] tmp = tmp << (self.compress_bits - self.bits * (e + 1)) tmp = tmp >> self.compress_bits - self.bits if weight_dtype == torch.uint8: @@ -440,21 +435,21 @@ def recover(self): new_weight += torch.where(weight == k, v, 0) weight = new_weight # unpack zero_point - if hasattr(self, "packed_zp"): + if hasattr(self, "qzeros"): zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp - zp = torch.zeros(self.scale.shape, dtype=zp_dtype).to(device) - packed_zp = self.packed_zp + zp = torch.zeros(self.scales.shape, dtype=zp_dtype).to(device) + qzeros = self.qzeros if self.use_HF_format or self.compression_dim == 0: zp = zp.T - packed_zp = packed_zp.T + qzeros = qzeros.T origin_shape = zp.shape - target_shape = packed_zp.shape + target_shape = qzeros.shape for j in range(target_shape[1]): for e in range(self.n_pack): index = j * self.n_pack + e if index >= origin_shape[1]: continue - tmp = packed_zp[:, j] + tmp = qzeros[:, j] tmp = tmp << (self.compress_bits - self.bits * (e + 1)) tmp = tmp >> self.compress_bits - self.bits tmp &= mask @@ -467,11 +462,11 @@ def recover(self): zp = torch.where(zp > (2**self.bits - 
1), 0, zp)
             # recover fp32 weight with int_weight, scale, and zero_point
             for idx in range(self.in_features):
-                fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.gptq_perm[idx]]) * self.scale[:, self.gptq_perm[idx]]
+                fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * self.scales[:, self.g_idx[idx]]
         else:
             # recover fp32 weight with int_weight, scale
             for idx in range(self.in_features):
-                fp32_weight[:, idx] = weight[:, idx] * self.scale[:, self.gptq_perm[idx]]
+                fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.g_idx[idx]]
         return fp32_weight
 
     def forward(self, input):
diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py
index ef7e891aed2..907646fa30d 100644
--- a/neural_compressor/model/torch_model.py
+++ b/neural_compressor/model/torch_model.py
@@ -546,7 +546,7 @@ def export_compressed_model(
                     dtype=dtype,
                     zp=gptq_zp is not None,
                     bias=m.bias is not None,
-                    gptq_perm=gptq_perm is not None,
+                    g_idx=gptq_perm is not None,
                     compression_dtype=compression_dtype,
                     compression_dim=compression_dim,
                     scale_dtype=scale_dtype,
diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
index 692ce82533b..57e5ebb368b 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -16,7 +16,7 @@ class Model(torch.nn.Module):
     def __init__(self):
         super(Model, self).__init__()
-        self.fc1 = torch.nn.Linear(30, 50)
+        self.fc1 = torch.nn.Linear(30, 50, bias=True)
         self.fc2 = torch.nn.Linear(50, 30)
         self.fc3 = torch.nn.Linear(30, 5)
@@ -90,7 +90,9 @@ def test_RTN_int_quant(self):
         self.assertFalse(torch.all(out1 == out2))
         compressed_model = q_model.export_compressed_model()
         out3 = compressed_model(input)
-        self.assertTrue("fc1.packed_weight" in compressed_model.state_dict().keys())
+        self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys())
+        self.assertTrue("fc1.qzeros" not in compressed_model.state_dict().keys())
+        shape2 = compressed_model.state_dict()["fc1.scales"]
         self.assertTrue(torch.all(out3 == out2))
 
         # test huggingface popular int4 format
@@ -99,10 +101,10 @@ def test_RTN_int_quant(self):
         inc_model = INCModel(new_model)
         inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_HF_format=True)
         out4 = inc_model.model(input)
-        self.assertTrue("fc1.qweight" in inc_model.model.state_dict().keys())
+        self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys())
         model = Model()
         compressed_model = export_compressed_model(model, saved_dir="saved", use_HF_format=True)
-        self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys())
+        self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys())
         self.assertTrue(torch.all(out3 == out4))
 
         model = Model()

From ed07108ec6bdb9ef1d4a1989f36d7ffa5e7b8cc2 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Fri, 10 Nov 2023 15:51:53 +0800
Subject: [PATCH 19/23] remove useless code

Signed-off-by: Xin He
---
 test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
index 57e5ebb368b..01a63352f00 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -729,7 +729,6 @@ def __iter__(self):
             calib_dataloader=dataloader,
         )
         out2 = q_model.model(input)
-        print(out1[0] - out2[0])
         self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01))

From 468902ab7c375893738b7942166fff347889b124 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Mon, 13 Nov 2023 09:39:46 +0800
Subject: [PATCH 20/23] remove useless doc

Signed-off-by: Xin He
---
 docs/source/quantization_weight_only.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md
index ce48c0a6df3..6d208d69037 100644
--- a/docs/source/quantization_weight_only.md
+++ b/docs/source/quantization_weight_only.md
@@ -103,7 +103,6 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear,
 > 1: Compression Dimension: weight = 1, zero = 0 and both are transposed.
 > 2: Zero Point: zero_point-= 1 before compression. zero_point is always required even for sym.
 > 3: Group Index: Use the same number for a group instead of recording channel order.
-> 4. Parameter Name: `packed_weight` -> `qweight`; `packed_zp` -> `qzeros`; `gptq_perm` -> `g_idx`; `scale` -> `scales`.
 
 ### **User Code Example**

From dc6f51c846b11f328b32908575dc99bb33c5369f Mon Sep 17 00:00:00 2001
From: Xin He
Date: Tue, 14 Nov 2023 21:19:18 +0800
Subject: [PATCH 21/23] fix ut

Signed-off-by: Xin He
---
 test/model/test_model_pytorch.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/model/test_model_pytorch.py b/test/model/test_model_pytorch.py
index 7b42ef63729..0dd2fdead84 100644
--- a/test/model/test_model_pytorch.py
+++ b/test/model/test_model_pytorch.py
@@ -123,7 +123,7 @@ def test_WeightOnlyLinear(self):
         model_size2 = os.path.getsize("saved/tmp.pt") / 1024
         print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2))
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
-        self.assertTrue(inc_model.model.fc1.packed_weight.dtype == dtype)
+        self.assertTrue(inc_model.model.fc1.qweight.dtype == dtype)
         self.assertTrue(inc_model.model.fc1.scale.dtype == torch.float32)
         self.assertTrue(model_size1 / model_size2 > 2)
         self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))
@@ -143,9 +143,9 @@ def test_WeightOnlyLinear(self):
         print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2))
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
         if dim == 1:
-            self.assertTrue(inc_model.model.fc1.packed_weight.shape[0] == inc_model.model.fc1.out_features)
+            self.assertTrue(inc_model.model.fc1.qweight.shape[0] == inc_model.model.fc1.out_features)
         else:
-            self.assertTrue(inc_model.model.fc1.packed_weight.shape[1] == inc_model.model.fc1.in_features)
+            self.assertTrue(inc_model.model.fc1.qweight.shape[1] == inc_model.model.fc1.in_features)
         self.assertTrue(model_size1 / model_size2 > 2)
         self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))

From 340cf597cb2bfc983e82f0bc541cfa5765a700b1 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Wed, 15 Nov 2023 09:51:20 +0800
Subject: [PATCH 22/23] fix ut

Signed-off-by: Xin He
---
 test/model/test_model_pytorch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/model/test_model_pytorch.py b/test/model/test_model_pytorch.py
index 0dd2fdead84..05edfd9c6fb 100644
--- a/test/model/test_model_pytorch.py
+++ b/test/model/test_model_pytorch.py
@@ -124,7 +124,7 @@ def test_WeightOnlyLinear(self):
         print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2))
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
         self.assertTrue(inc_model.model.fc1.qweight.dtype == dtype)
-        self.assertTrue(inc_model.model.fc1.scale.dtype == torch.float32)
+        self.assertTrue(inc_model.model.fc1.scales.dtype == torch.float32)
         self.assertTrue(model_size1 / model_size2 > 2)
         self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))
@@ -161,7 +161,7 @@ def test_WeightOnlyLinear(self):
         model_size2 = os.path.getsize("saved/tmp.pt") / 1024
         print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2))
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
-        self.assertTrue(inc_model.model.fc1.scale.dtype == torch.float16)
+        self.assertTrue(inc_model.model.fc1.scales.dtype == torch.float16)
         self.assertTrue(model_size1 / model_size2 > 2)
         self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))

From 2e4e6b42b07c619806db9a9737d32fb99af72164 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Thu, 16 Nov 2023 10:44:09 +0800
Subject: [PATCH 23/23] rename use_HF_format to use_hf_format

Signed-off-by: Xin He
---
 docs/source/quantization_weight_only.md     |  2 +-
 .../adaptor/torch_utils/model_wrapper.py    | 38 +++++++++----------
 .../adaptor/torch_utils/weight_only.py      |  4 +-
 neural_compressor/model/torch_model.py      | 10 ++---
 neural_compressor/utils/load_huggingface.py |  6 +--
 .../test_weight_only_adaptor.py             |  8 ++--
 6 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md
index 6d208d69037..b1ab86d1fd5 100644
--- a/docs/source/quantization_weight_only.md
+++ b/docs/source/quantization_weight_only.md
@@ -96,7 +96,7 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear,
 | compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64] |
 | compression_dim | 1 | 0 means output channel while 1 means input channel |
 | scale_dtype | torch.float32 | Data type for scale and bias |
-| use_HF_format | False | Whether to use the popular format present on HuggingFace hub |
+| use_hf_format | False | Whether to use the popular format present on HuggingFace hub |
 
 **Note:** HuggingFace format is quite special, the main differences are as follows:
diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py
index 04baa05590d..57103566d9d 100644
--- a/neural_compressor/adaptor/torch_utils/model_wrapper.py
+++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py
@@ -217,10 +217,10 @@ def __init__(
         compression_dim=1,
         g_idx=False,
         device="cpu",
-        use_HF_format=False,
+        use_hf_format=False,
     ):
         super().__init__()
-        self.use_HF_format = use_HF_format
+        self.use_hf_format = use_hf_format
         self.dtype = dtype
         if "int" not in self.dtype:  # for nf4, fp4
             from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING
@@ -251,7 +251,7 @@ def __init__(
         assert compression_dim in [0, 1], (
             "Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel."
         )
-        if self.use_HF_format:
+        if self.use_hf_format:
             self.register_buffer(
                 "scales",
                 torch.zeros(
@@ -327,7 +327,7 @@ def __init__(
 
     def pack(self, int_weight, scale, zp, bias, g_idx=None):
         int_weight = int_weight.to(self.device)
-        if self.use_HF_format and zp is None:
+        if self.use_hf_format and zp is None:
             # to avoid overflow
             int_weight = int_weight.type(torch.int32)
             shift_bias = 2 ** (self.bits - 1)
@@ -339,13 +339,13 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
         if g_idx is not None:
             assert hasattr(self, "g_idx"), "g_idx is not set when initializing."
             self.g_idx = g_idx.type(torch.int32).to(self.device)
-            if self.use_HF_format:
+            if self.use_hf_format:
                 invperm = torch.argsort(self.g_idx)
                 self.g_idx = invperm // self.groupsize
                 self.g_idx = self.g_idx.type(torch.int32).to(self.device)
         assert scale.shape == self.scales.shape, "Scale shape is mismatched."
         self.scales = scale.type(self.float_type).to(self.device)
-        if not self.use_HF_format and self.compression_dim == 0:
+        if not self.use_hf_format and self.compression_dim == 0:
             int_weight = int_weight.T
             self.qweight = self.qweight.T
         origin_shape = int_weight.shape
@@ -362,14 +362,14 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
                 tmp[:, e] &= mask
                 tmp[:, e] = tmp[:, e] << (self.bits * e)
                 self.qweight[:, j] |= tmp[:, e]
-        if not self.use_HF_format and self.compression_dim == 0:
+        if not self.use_hf_format and self.compression_dim == 0:
             self.qweight = self.qweight.T
 
         if zp is not None:
             zp = zp.to(self.device)
-            if self.use_HF_format:
+            if self.use_hf_format:
                 zp -= 1
-            if self.use_HF_format or self.compression_dim == 0:
+            if self.use_hf_format or self.compression_dim == 0:
                 zp = zp.T
                 self.qzeros = self.qzeros.T
             assert hasattr(self, "qzeros"), "zp is not set when initializing."
@@ -382,9 +382,9 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
                     tmp[:, e] &= mask
                     tmp[:, e] = tmp[:, e] << (self.bits * e)
                     self.qzeros[:, j] |= tmp[:, e]
-            if self.use_HF_format or self.compression_dim == 0:
+            if self.use_hf_format or self.compression_dim == 0:
                 self.qzeros = self.qzeros.T
-        if self.use_HF_format:
+        if self.use_hf_format:
             self.scales = self.scales.T
             self.qweight = self.qweight.T
             self.g_idx = self.g_idx
@@ -392,7 +392,7 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
 
     def recover(self):
         logger.debug(f"Recovering {self} weight")
-        if self.use_HF_format:
+        if self.use_hf_format:
             # Prevent broken id links of self.scales and self.scales
             self.scales = self.scales.T
             self.qweight = self.qweight.T
@@ -411,7 +411,7 @@ def recover(self):
         # unpack weight
         weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device)
         qweight = self.qweight
-        if not self.use_HF_format and self.compression_dim == 0:
+        if not self.use_hf_format and self.compression_dim == 0:
             weight = weight.T
             qweight = qweight.T
         origin_shape = weight.shape
@@ -427,7 +427,7 @@ def recover(self):
                 if weight_dtype == torch.uint8:
                     tmp &= mask  # remove sign bit
                 weight[:, index] = tmp.type(weight_dtype)
-        if not self.use_HF_format and self.compression_dim == 0:
+        if not self.use_hf_format and self.compression_dim == 0:
             weight = weight.T
         if "int" not in self.dtype:
             new_weight = torch.zeros(self.out_features, self.in_features).to(device)
@@ -439,7 +439,7 @@ def recover(self):
             zp_dtype = self.compressed_dtype  # to avoid overflow when weight-zp
             zp = torch.zeros(self.scales.shape, dtype=zp_dtype).to(device)
             qzeros = self.qzeros
-            if self.use_HF_format or self.compression_dim == 0:
+            if self.use_hf_format or self.compression_dim == 0:
                 zp = zp.T
                 qzeros = qzeros.T
             origin_shape = zp.shape
@@ -454,9 +454,9 @@ def recover(self):
                     tmp = tmp >> self.compress_bits - self.bits
                     tmp &= mask
                     zp[:, index] = tmp.type(zp_dtype)
-            if self.use_HF_format or self.compression_dim == 0:
+            if self.use_hf_format or self.compression_dim == 0:
                 zp = zp.T
-            if self.use_HF_format:
+            if self.use_hf_format:
                 # zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1
                 zp += 1
                 zp = torch.where(zp > (2**self.bits - 1), 0, zp)
@@ -489,8 +489,8 @@ def extra_repr(self) -> str:
             self.groupsize,
             self.bias is not None,
         )
-        if self.use_HF_format:
-            tmp_str += ", use_HF_format=True"
+        if self.use_hf_format:
+            tmp_str += ", use_hf_format=True"
         return tmp_str
 
diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py
index 37a810a0428..f866ac12410 100644
--- a/neural_compressor/adaptor/torch_utils/weight_only.py
+++ b/neural_compressor/adaptor/torch_utils/weight_only.py
@@ -396,7 +396,7 @@ def rtn_quantize(
     compression_dim = kwargs.get("compression_dim", 1)
    scale_dtype = kwargs.get("scale_dtype", torch.float32)
    device = kwargs.get("device", "cpu")
-    use_HF_format = kwargs.get("use_HF_format", False)
+    use_hf_format = kwargs.get("use_hf_format", False)
    for name, m in model.named_modules():
        if m.__class__.__name__ not in supported_layers:
            continue
@@ -449,7 +449,7 @@ def rtn_quantize(
                compression_dim=compression_dim,
                scale_dtype=scale_dtype,
                device=device,
-                use_HF_format=use_HF_format,
+                use_hf_format=use_hf_format,
            )
            new_module.pack(int_weight, scale, zp, m.bias)
            if name == "":
diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py
index 907646fa30d..fb7046a1607 100644
--- a/neural_compressor/model/torch_model.py
+++ b/neural_compressor/model/torch_model.py
@@ -459,7 +459,7 @@ def export_compressed_model(
         scale_dtype=torch.float32,
         gptq_config_path=None,
         device="cpu",
-        use_HF_format=False,
+        use_hf_format=False,
     ):
         """Convert Linear to WeightOnlyLinear for low memory inference.
 
@@ -475,7 +475,7 @@ def export_compressed_model(
                 Defaults to torch.float32.
             gptq_config_path (str, optional): Path of gptq_config.json. Defaults to None.
             device (str, optional): choose device for compression. Defaults to cpu.
-            use_HF_format (bool, optional): use the popular huggingface compression format.
+            use_hf_format (bool, optional): use the popular huggingface compression format.
                 1: compression_dim: weight = 1, zeros = 0 and both are transposed.
                 2: zeros -= 1 before compression. Why we need it?
                 3: g_idx: use same number for one group instead of recording the channel order.
@@ -520,7 +520,7 @@ def export_compressed_model(
                     compression_dim=compression_dim,
                     scale_dtype=scale_dtype,
                     device=device,
-                    use_HF_format=use_HF_format,
+                    use_hf_format=use_hf_format,
                 )
                 set_module(self.model, k, new_module)
                 continue
@@ -551,7 +551,7 @@ def export_compressed_model(
                     compression_dim=compression_dim,
                     scale_dtype=scale_dtype,
                     device=device,
-                    use_HF_format=use_HF_format,
+                    use_hf_format=use_hf_format,
                 )
                 new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
                 set_module(self.model, k, new_module)
@@ -578,7 +578,7 @@ def export_compressed_model(
                     compression_dim=compression_dim,
                     scale_dtype=scale_dtype,
                     device=device,
-                    use_HF_format=use_HF_format,
+                    use_hf_format=use_hf_format,
                 )
                 set_module(self.model, k, mod)
         return self.model
diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py
index f7378d90998..fff4c050603 100644
--- a/neural_compressor/utils/load_huggingface.py
+++ b/neural_compressor/utils/load_huggingface.py
@@ -235,7 +235,7 @@ def save_for_huggingface_upstream(model, tokenizer, output_dir):
 def export_compressed_model(
     model,
     saved_dir=None,
-    use_HF_format=False,
+    use_hf_format=False,
     enable_full_range=False,
     compression_dtype=torch.int32,
     compression_dim=1,
@@ -247,7 +247,7 @@ def export_compressed_model(
     Args:
         model (torch.nn.Module): origin fp32 model.
         saved_dir (_type_, optional): the dir path of compression info. Defaults to None.
-        use_HF_format (bool, optional): whether use HuggingFace format. Defaults to False.
+        use_hf_format (bool, optional): whether use HuggingFace format. Defaults to False.
         enable_full_range (bool, optional): Whether to leverage the full compression range
                                             under symmetric quantization. Defaults to False.
         compression_dtype (torch.Tensor, optional): The target dtype after comoression.
@@ -277,6 +277,6 @@ def export_compressed_model(
         scale_dtype=scale_dtype,
         gptq_config_path=gptq_config_path,
         device=device,
-        use_HF_format=use_HF_format,
+        use_hf_format=use_hf_format,
     )
     return inc_model.model
diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
index 01a63352f00..47202b86b52 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -99,11 +99,11 @@ def test_RTN_int_quant(self):
         model = Model()
         new_model = load("saved", model, weight_only=True)
         inc_model = INCModel(new_model)
-        inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_HF_format=True)
+        inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_hf_format=True)
         out4 = inc_model.model(input)
         self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys())
         model = Model()
-        compressed_model = export_compressed_model(model, saved_dir="saved", use_HF_format=True)
+        compressed_model = export_compressed_model(model, saved_dir="saved", use_hf_format=True)
         self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys())
         self.assertTrue(torch.all(out3 == out4))
 
@@ -554,7 +554,7 @@ def __iter__(self):
         )
         q_model.save("saved")
         out1 = q_model.model(input)
-        compressed_model = q_model.export_compressed_model(use_HF_format=True)
+        compressed_model = q_model.export_compressed_model(use_hf_format=True)
         out2 = compressed_model(input)
         torch.save(compressed_model.state_dict(), "saved/compressed_model.pt")
         self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05))
@@ -569,7 +569,7 @@ def __iter__(self):
         )
         q_model.save("saved")
         out1 = q_model.model(input)
-        compressed_model = q_model.export_compressed_model(use_HF_format=True)
+        compressed_model = q_model.export_compressed_model(use_hf_format=True)
         out2 = compressed_model(input)
         torch.save(compressed_model.state_dict(), "saved/compressed_model.pt")
         self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05))
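For reference, below is a minimal usage sketch of the renamed use_hf_format flag, mirroring the RTN flow exercised in test_weight_only_adaptor.py above; the toy model and the expected key names are illustrative assumptions, not part of the patch series.

# Illustrative sketch only: the APIs used (PostTrainingQuantConfig, quantization.fit,
# export_compressed_model) are the ones touched by the patches above; the toy model
# and the key names in the comments are assumptions for demonstration.
import torch
from neural_compressor import PostTrainingQuantConfig, quantization


class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(30, 50)
        self.fc2 = torch.nn.Linear(50, 5)

    def forward(self, x):
        return self.fc2(self.fc1(x))


conf = PostTrainingQuantConfig(approach="weight_only")  # RTN weight-only quantization
q_model = quantization.fit(ToyModel(), conf)

# Re-pack the quantized Linear layers into the HuggingFace-style layout:
# qweight/qzeros/scales buffers, transposed compression dims, zero_point -= 1.
compressed = q_model.export_compressed_model(use_hf_format=True)
print(list(compressed.state_dict().keys()))  # e.g. fc1.qweight, fc1.qzeros, fc1.scales, ...

The standalone helper neural_compressor.utils.load_huggingface.export_compressed_model(model, saved_dir="saved", use_hf_format=True) produces the same layout from a saved qconfig.json, as the updated unit tests show.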