From e8ebe1254e13023f4bcc9f9b1c7221011b628bec Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 30 Nov 2023 14:19:52 +0800 Subject: [PATCH 1/9] change use_hf_format=True and add bias Signed-off-by: Xin He --- docs/source/quantization_weight_only.md | 2 +- .../adaptor/torch_utils/model_wrapper.py | 17 ++++++++++------- .../adaptor/torch_utils/weight_only.py | 2 +- neural_compressor/model/torch_model.py | 2 +- neural_compressor/torch/quantization/modules.py | 2 +- neural_compressor/utils/load_huggingface.py | 4 ++-- .../pytorch_adaptor/test_weight_only_adaptor.py | 14 +++++++------- 7 files changed, 23 insertions(+), 20 deletions(-) diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md index de5d70c6d09..673f2de09b6 100644 --- a/docs/source/quantization_weight_only.md +++ b/docs/source/quantization_weight_only.md @@ -98,7 +98,7 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear, | compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64] | | compression_dim | 1 | 0 means output channel while 1 means input channel | | scale_dtype | torch.float32 | Data type for scale and bias | -| use_hf_format | False | Whether to use the popular format present on HuggingFace hub | +| use_hf_format | True | Whether to use the popular format present on HuggingFace hub | **Note:** HuggingFace format is quite special, the main differences are as follows: diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 57103566d9d..ea01c3b829a 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -217,7 +217,7 @@ def __init__( compression_dim=1, g_idx=False, device="cpu", - use_hf_format=False, + use_hf_format=True, ): super().__init__() self.use_hf_format = use_hf_format @@ -245,13 +245,13 @@ def __init__( dtype_bits_mapping = {torch.int8: 8, torch.int16: 16, torch.int32: 32, torch.int64: 64} self.compress_bits = dtype_bits_mapping[compression_dtype] self.n_pack = self.compress_bits // self.bits - self.compressed_dtype = compression_dtype - self.float_type = scale_dtype # K is input channel, N is output channel assert compression_dim in [0, 1], ( "Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel." 
) if self.use_hf_format: + self.float_type = torch.float16 + self.compressed_dtype = torch.int32 self.register_buffer( "scales", torch.zeros( @@ -276,7 +276,10 @@ def __init__( ).to(device), ) self.qzeros = self.qzeros.T + self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) else: + self.compressed_dtype = compression_dtype + self.float_type = scale_dtype self.register_buffer( "scales", torch.zeros( @@ -316,14 +319,14 @@ def __init__( dtype=self.compressed_dtype, ).to(device), ) + if bias: + self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) + else: + self.bias = None if g_idx: self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) else: self.g_idx = None - if bias: - self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) - else: - self.bias = None def pack(self, int_weight, scale, zp, bias, g_idx=None): int_weight = int_weight.to(self.device) diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index eb404d139f8..ad4f7e48226 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -396,7 +396,7 @@ def rtn_quantize( compression_dim = kwargs.get("compression_dim", 1) scale_dtype = kwargs.get("scale_dtype", torch.float32) device = kwargs.get("device", "cpu") - use_hf_format = kwargs.get("use_hf_format", False) + use_hf_format = kwargs.get("use_hf_format", True) for name, m in model.named_modules(): if m.__class__.__name__ not in supported_layers: continue diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index fb7046a1607..31fc8cb22c7 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -459,7 +459,7 @@ def export_compressed_model( scale_dtype=torch.float32, gptq_config_path=None, device="cpu", - use_hf_format=False, + use_hf_format=True, ): """Convert Linear to WeightOnlyLinear for low memory inference. diff --git a/neural_compressor/torch/quantization/modules.py b/neural_compressor/torch/quantization/modules.py index 6dd646fe6ae..f99783e784d 100644 --- a/neural_compressor/torch/quantization/modules.py +++ b/neural_compressor/torch/quantization/modules.py @@ -134,7 +134,7 @@ def __init__( compression_dim=1, g_idx=False, device="cpu", - use_hf_format=False, + use_hf_format=True, ): super().__init__() self.use_hf_format = use_hf_format diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index fff4c050603..c814f1f02c0 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -235,7 +235,7 @@ def save_for_huggingface_upstream(model, tokenizer, output_dir): def export_compressed_model( model, saved_dir=None, - use_hf_format=False, + use_hf_format=True, enable_full_range=False, compression_dtype=torch.int32, compression_dim=1, @@ -247,7 +247,7 @@ def export_compressed_model( Args: model (torch.nn.Module): origin fp32 model. saved_dir (_type_, optional): the dir path of compression info. Defaults to None. - use_hf_format (bool, optional): whether use HuggingFace format. Defaults to False. + use_hf_format (bool, optional): whether use HuggingFace format. Defaults to True. enable_full_range (bool, optional): Whether to leverage the full compression range under symmetric quantization. Defaults to False. 
compression_dtype (torch.Tensor, optional): The target dtype after comoression. diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 47202b86b52..bebe0050534 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -88,7 +88,7 @@ def test_RTN_int_quant(self): out2 = q_model(input) self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1))) self.assertFalse(torch.all(out1 == out2)) - compressed_model = q_model.export_compressed_model() + compressed_model = q_model.export_compressed_model(use_hf_format=False) out3 = compressed_model(input) self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys()) self.assertTrue("fc1.qzeros" not in compressed_model.state_dict().keys()) @@ -120,7 +120,7 @@ def test_RTN_int_quant(self): out2 = q_model(input) self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1))) self.assertFalse(torch.all(out1 == out2)) - compressed_model = q_model.export_compressed_model(enable_full_range=True) + compressed_model = q_model.export_compressed_model(use_hf_format=False, enable_full_range=True) out3 = compressed_model(input) self.assertTrue(torch.all(out3 == out2)) @@ -245,7 +245,7 @@ def test_RTN_int_quant(self): model_size1 = os.path.getsize("saved/best_model.pt") / 1024 print("FP32 Model size:{:.3f}M".format(model_size1)) inc_model = INCModel(new_model) - inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json") + inc_model.export_compressed_model(use_hf_format=False, qweight_config_path="saved/qconfig.json") torch.save(inc_model.state_dict(), "saved/tmp.pt") model_size2 = os.path.getsize("saved/tmp.pt") / 1024 print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2)) @@ -273,7 +273,7 @@ def test_RTN_4bit_quant(self): out2 = q_model(self.lm_input) self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) self.assertFalse(torch.all(out1[0] == out2[0])) - compressed_model = q_model.export_compressed_model() + compressed_model = q_model.export_compressed_model(use_hf_format=False) out3 = compressed_model(self.lm_input) self.assertTrue(torch.all(out3[0] == out2[0])) @@ -324,7 +324,7 @@ def test_AWQ_quant(self): fp32_model = copy.deepcopy(self.gptj) reload_model = load("saved", fp32_model, weight_only=True) out2 = reload_model(input) - q_model.export_compressed_model() + q_model.export_compressed_model(use_hf_format=False) out3 = q_model(input) # no idea about the gap at 1e-08, use allclose instead of out1==out2 self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) @@ -428,7 +428,7 @@ def test_AWQ_nf4_quant(self): ) out2 = q_model(input) self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01)) - compressed_model = q_model.export_compressed_model() + compressed_model = q_model.export_compressed_model(use_hf_format=False) out3 = compressed_model(input) self.assertTrue(torch.all(out3[0] == out2[0])) @@ -529,7 +529,7 @@ def __iter__(self): q_model.save("saved") out1 = q_model.model(input) self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) - compressed_model = q_model.export_compressed_model() + compressed_model = q_model.export_compressed_model(use_hf_format=False) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) From 958b54dbb653ab0d24ec47bd24b14641f2e3f2ae Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 30 Nov 2023 15:43:25 
+0800 Subject: [PATCH 2/9] auto fallback fp16 to fp32 if device is cpu Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index ea01c3b829a..d645e950bf4 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -473,14 +473,17 @@ def recover(self): return fp32_weight def forward(self, input): + weight = self.recover() + device = self.scales.device + if weight.dtype == torch.float16 and device.type == "cpu": + weight.float() if level == DEBUG: if not hasattr(self, "weight"): - self.weight = self.recover() + self.weight = weight input = input.type(self.weight.dtype) logger.debug(f"Calculating {self}") return F.linear(input, self.weight, self.bias) else: - weight = self.recover() input = input.type(weight.dtype) return F.linear(input, weight, self.bias) From 7e11d9f676a67f3a0df4c3c0bcdf2c7843787dab Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 30 Nov 2023 16:35:58 +0800 Subject: [PATCH 3/9] fix ut Signed-off-by: Xin He --- neural_compressor/adaptor/torch_utils/model_wrapper.py | 3 ++- test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index d645e950bf4..374ce611b5f 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -476,7 +476,8 @@ def forward(self, input): weight = self.recover() device = self.scales.device if weight.dtype == torch.float16 and device.type == "cpu": - weight.float() + weight = weight.float() + self.bias = self.bias.float() if self.bias is not None else None if level == DEBUG: if not hasattr(self, "weight"): self.weight = weight diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index bebe0050534..581de7a7c1c 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -105,7 +105,8 @@ def test_RTN_int_quant(self): model = Model() compressed_model = export_compressed_model(model, saved_dir="saved", use_hf_format=True) self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys()) - self.assertTrue(torch.all(out3 == out4)) + # output gap is because of torch.float16 is used in hf_format + self.assertTrue(torch.allclose(out3, out4, atol=1e-3)) model = Model() out1 = model(input) From 3d2c12dff2e0092529b866ab479d59671cd4bde6 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 4 Dec 2023 11:47:41 +0800 Subject: [PATCH 4/9] fix bug Signed-off-by: Xin He --- neural_compressor/adaptor/pytorch.py | 4 +- .../adaptor/torch_utils/model_wrapper.py | 21 ++++----- .../torch/quantization/modules.py | 46 ++++++++++--------- .../test_weight_only_adaptor.py | 8 +++- 4 files changed, 41 insertions(+), 38 deletions(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index def044148ca..ad152900a19 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -4573,10 +4573,12 @@ def rtn_quantize(self, model, tune_cfg): enable_full_range = self.recipes["rtn_args"].get("enable_full_range", False) enable_mse_search = 
self.recipes["rtn_args"].get("enable_mse_search", False) group_dim = self.recipes["rtn_args"].get("group_dim", 1) + return_int = self.recipes["rtn_args"].get("return_int", False) else: # pragma: no cover enable_full_range = False enable_mse_search = False group_dim = 1 + return_int = False from .torch_utils.util import fetch_module, set_module from .torch_utils.weight_only import rtn_quantize @@ -4614,7 +4616,7 @@ def rtn_quantize(self, model, tune_cfg): num_bits, group_size, scheme, - return_int=False, + return_int=return_int, data_type=dtype, enable_full_range=enable_full_range, enable_mse_search=enable_mse_search, diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index 374ce611b5f..a930bfcc05a 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -390,18 +390,14 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): if self.use_hf_format: self.scales = self.scales.T self.qweight = self.qweight.T - self.g_idx = self.g_idx self.qzeros = self.qzeros.T def recover(self): logger.debug(f"Recovering {self} weight") - if self.use_hf_format: - # Prevent broken id links of self.scales and self.scales - self.scales = self.scales.T - self.qweight = self.qweight.T - self.g_idx = self.g_idx - self.qzeros = self.qzeros.T - device = self.scales.device + scales = self.scales.T if self.use_hf_format else self.scales + qweight = self.qweight.T if self.use_hf_format else self.qweight + + device = scales.device fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device) if self.g_idx is None: # used for recovering fp32_weight @@ -413,7 +409,6 @@ def recover(self): weight_dtype = torch.int8 # unpack weight weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device) - qweight = self.qweight if not self.use_hf_format and self.compression_dim == 0: weight = weight.T qweight = qweight.T @@ -440,8 +435,8 @@ def recover(self): # unpack zero_point if hasattr(self, "qzeros"): zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp - zp = torch.zeros(self.scales.shape, dtype=zp_dtype).to(device) - qzeros = self.qzeros + zp = torch.zeros(scales.shape, dtype=zp_dtype).to(device) + qzeros = self.qzeros.T if self.use_hf_format else self.qzeros if self.use_hf_format or self.compression_dim == 0: zp = zp.T qzeros = qzeros.T @@ -465,11 +460,11 @@ def recover(self): zp = torch.where(zp > (2**self.bits - 1), 0, zp) # recover fp32 weight with int_weight, scale, and zero_point for idx in range(self.in_features): - fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * self.scales[:, self.g_idx[idx]] + fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * scales[:, self.g_idx[idx]] else: # recover fp32 weight with int_weight, scale for idx in range(self.in_features): - fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.g_idx[idx]] + fp32_weight[:, idx] = weight[:, idx] * scales[:, self.g_idx[idx]] return fp32_weight def forward(self, input): diff --git a/neural_compressor/torch/quantization/modules.py b/neural_compressor/torch/quantization/modules.py index f99783e784d..36c058f29ff 100644 --- a/neural_compressor/torch/quantization/modules.py +++ b/neural_compressor/torch/quantization/modules.py @@ -140,7 +140,7 @@ def __init__( self.use_hf_format = use_hf_format self.dtype = dtype if "int" not in self.dtype: # for nf4, fp4 - from 
neural_compressor.torch.algorithms.weight_only.rtn import FLOAT_MAPPING, INT_MAPPING + from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING float_list = FLOAT_MAPPING[self.dtype] int_list = INT_MAPPING[self.dtype] @@ -162,13 +162,13 @@ def __init__( dtype_bits_mapping = {torch.int8: 8, torch.int16: 16, torch.int32: 32, torch.int64: 64} self.compress_bits = dtype_bits_mapping[compression_dtype] self.n_pack = self.compress_bits // self.bits - self.compressed_dtype = compression_dtype - self.float_type = scale_dtype # K is input channel, N is output channel assert compression_dim in [0, 1], ( "Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel." ) if self.use_hf_format: + self.float_type = torch.float16 + self.compressed_dtype = torch.int32 self.register_buffer( "scales", torch.zeros( @@ -193,7 +193,10 @@ def __init__( ).to(device), ) self.qzeros = self.qzeros.T + self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) else: + self.compressed_dtype = compression_dtype + self.float_type = scale_dtype self.register_buffer( "scales", torch.zeros( @@ -233,14 +236,14 @@ def __init__( dtype=self.compressed_dtype, ).to(device), ) + if bias: + self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) + else: + self.bias = None if g_idx: self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) else: self.g_idx = None - if bias: - self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) - else: - self.bias = None def pack(self, int_weight, scale, zp, bias, g_idx=None): int_weight = int_weight.to(self.device) @@ -304,18 +307,14 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): if self.use_hf_format: self.scales = self.scales.T self.qweight = self.qweight.T - self.g_idx = self.g_idx self.qzeros = self.qzeros.T def recover(self): logger.debug(f"Recovering {self} weight") - if self.use_hf_format: - # Prevent broken id links of self.scales and self.scales - self.scales = self.scales.T - self.qweight = self.qweight.T - self.g_idx = self.g_idx - self.qzeros = self.qzeros.T - device = self.scales.device + scales = self.scales.T if self.use_hf_format else self.scales + qweight = self.qweight.T if self.use_hf_format else self.qweight + + device = scales.device fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device) if self.g_idx is None: # used for recovering fp32_weight @@ -327,7 +326,6 @@ def recover(self): weight_dtype = torch.int8 # unpack weight weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device) - qweight = self.qweight if not self.use_hf_format and self.compression_dim == 0: weight = weight.T qweight = qweight.T @@ -354,8 +352,8 @@ def recover(self): # unpack zero_point if hasattr(self, "qzeros"): zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp - zp = torch.zeros(self.scales.shape, dtype=zp_dtype).to(device) - qzeros = self.qzeros + zp = torch.zeros(scales.shape, dtype=zp_dtype).to(device) + qzeros = self.qzeros.T if self.use_hf_format else self.qzeros if self.use_hf_format or self.compression_dim == 0: zp = zp.T qzeros = qzeros.T @@ -379,22 +377,26 @@ def recover(self): zp = torch.where(zp > (2**self.bits - 1), 0, zp) # recover fp32 weight with int_weight, scale, and zero_point for idx in range(self.in_features): - fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * 
self.scales[:, self.g_idx[idx]] + fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * scales[:, self.g_idx[idx]] else: # recover fp32 weight with int_weight, scale for idx in range(self.in_features): - fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.g_idx[idx]] + fp32_weight[:, idx] = weight[:, idx] * scales[:, self.g_idx[idx]] return fp32_weight def forward(self, input): + weight = self.recover() + device = self.scales.device + if weight.dtype == torch.float16 and device.type == "cpu": + weight = weight.float() + self.bias = self.bias.float() if self.bias is not None else None if level == DEBUG: if not hasattr(self, "weight"): - self.weight = self.recover() + self.weight = weight input = input.type(self.weight.dtype) logger.debug(f"Calculating {self}") return F.linear(input, self.weight, self.bias) else: - weight = self.recover() input = input.type(weight.dtype) return F.linear(input, weight, self.bias) diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 581de7a7c1c..5bf9e34b070 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -182,6 +182,7 @@ def test_RTN_int_quant(self): ) q_model = quantization.fit(model, conf, eval_func=eval_func) out2 = q_model(input) + self.assertTrue(isinstance(q_model.model.fc1, WeightOnlyLinear)) self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1))) self.assertFalse(torch.all(out1 == out2)) @@ -557,8 +558,11 @@ def __iter__(self): out1 = q_model.model(input) compressed_model = q_model.export_compressed_model(use_hf_format=True) out2 = compressed_model(input) + print(out1[0]) + print(out2[0]) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") - self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) + # hf_format uses fp16 for scale, so output atol is higher. 
+ self.assertTrue(torch.allclose(out1[0], out2[0], atol=2e-04)) # # case 2: list or tuple model_3 = copy.deepcopy(self.gptj) @@ -570,7 +574,7 @@ def __iter__(self): ) q_model.save("saved") out1 = q_model.model(input) - compressed_model = q_model.export_compressed_model(use_hf_format=True) + compressed_model = q_model.export_compressed_model(use_hf_format=False) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) From c7426535cdc889153149fd5c6f74e4fc1268364f Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 4 Dec 2023 12:57:19 +0800 Subject: [PATCH 5/9] fix bug Signed-off-by: Xin He --- neural_compressor/torch/quantization/modules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/quantization/modules.py b/neural_compressor/torch/quantization/modules.py index 36c058f29ff..f7c286d9382 100644 --- a/neural_compressor/torch/quantization/modules.py +++ b/neural_compressor/torch/quantization/modules.py @@ -140,7 +140,7 @@ def __init__( self.use_hf_format = use_hf_format self.dtype = dtype if "int" not in self.dtype: # for nf4, fp4 - from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING + from neural_compressor.torch.algorithms.weight_only.rtn import FLOAT_MAPPING, INT_MAPPING float_list = FLOAT_MAPPING[self.dtype] int_list = INT_MAPPING[self.dtype] From e3c4ff073bc0f6667b086f05a0041711eb2db961 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 4 Dec 2023 13:56:17 +0800 Subject: [PATCH 6/9] fix ut Signed-off-by: Xin He --- test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 5bf9e34b070..094c2b4d1e9 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -655,7 +655,8 @@ def __iter__(self): compressed_model = q_model.export_compressed_model() out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") - self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) + # hf_format uses fp16 for scale, so output atol is higher. + self.assertTrue(torch.allclose(out1[0], out2[0], atol=2e-04)) # # case 2: list or tuple model_2 = copy.deepcopy(self.gptj) @@ -667,7 +668,7 @@ def __iter__(self): ) q_model.save("saved") out1 = q_model.model(input) - compressed_model = q_model.export_compressed_model() + compressed_model = q_model.export_compressed_model(use_hf_format=False) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) @@ -685,7 +686,8 @@ def __iter__(self): compressed_model = q_model.export_compressed_model() out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") - self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) + # hf_format uses fp16 for scale, so output atol is higher. 
+ self.assertTrue(torch.allclose(out1[0], out2[0], atol=2e-04)) print("GPTQ with unfixed length Done") From 64b21af7387c3c25bc22960522bf7cf961a2414e Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 4 Dec 2023 15:50:59 +0800 Subject: [PATCH 7/9] update ut for default hf_format Signed-off-by: Xin He --- test/model/test_model_pytorch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/model/test_model_pytorch.py b/test/model/test_model_pytorch.py index 05edfd9c6fb..a96c35d036e 100644 --- a/test/model/test_model_pytorch.py +++ b/test/model/test_model_pytorch.py @@ -117,6 +117,8 @@ def test_WeightOnlyLinear(self): inc_model.export_compressed_model( qweight_config_path="saved/qconfig.json", compression_dtype=dtype, + scale_dtype=torch.float32, + use_hf_format=False, ) out2 = q_model(input) torch.save(inc_model.state_dict(), "saved/tmp.pt") @@ -136,6 +138,7 @@ def test_WeightOnlyLinear(self): inc_model.export_compressed_model( qweight_config_path="saved/qconfig.json", compression_dim=dim, + use_hf_format=False, ) out2 = q_model(input) torch.save(inc_model.state_dict(), "saved/tmp.pt") @@ -154,7 +157,6 @@ def test_WeightOnlyLinear(self): inc_model = INCModel(new_model) inc_model.export_compressed_model( qweight_config_path="saved/qconfig.json", - scale_dtype=torch.float16, ) out2 = q_model(input) torch.save(inc_model.state_dict(), "saved/tmp.pt") From 477d0080611976f13b1d06969a8dc1ce913f2c9a Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 5 Dec 2023 19:17:30 +0800 Subject: [PATCH 8/9] change flag name to optimum and add link Signed-off-by: Xin He --- docs/source/quantization_weight_only.md | 15 +++---- .../adaptor/torch_utils/model_wrapper.py | 42 +++++++++---------- .../adaptor/torch_utils/weight_only.py | 4 +- neural_compressor/model/torch_model.py | 10 ++--- .../torch/quantization/modules.py | 42 +++++++++---------- neural_compressor/utils/load_huggingface.py | 6 +-- .../test_weight_only_adaptor.py | 24 +++++------ test/model/test_model_pytorch.py | 4 +- 8 files changed, 74 insertions(+), 73 deletions(-) diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md index 673f2de09b6..f2e7828460e 100644 --- a/docs/source/quantization_weight_only.md +++ b/docs/source/quantization_weight_only.md @@ -93,18 +93,19 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear, **Export arguments** | export args | default value | comments | |:----------:|:-------------:|:-------------------------------------------------------------------:| -| qweight_config_path | None | If need to export model with fp32_model and json file, set the path of qconfig.json | +| use_optimum_format | True | Whether to use the popular format used in [Optimum](https://github.com/huggingface/optimum/blob/e0927976d06d163ed09fe5bd80d013e1cfa0c463/docs/source/llm_quantization/usage_guides/quantization.mdx#L5) | +| qweight_config_path | None | set the path of qconfig.json if you want to export model with json file | +| gptq_config_path | None | If need to export model with fp32_model and json file, set the path of gptq_config.json for GPTQ quantized model| | sym_full_range | False | Whether to leverage the full compression range under symmetric quantization | -| compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64] | -| compression_dim | 1 | 0 means output channel while 1 means input channel | -| scale_dtype | torch.float32 | Data type for scale and bias | -| use_hf_format | True | Whether to use the 
popular format present on HuggingFace hub | +| compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64]. It's torch.int32 when use_optimum_format=True | +| compression_dim | 1 | 0 means output channel while 1 means input channel. It's 1 for weight and 0 for zero-point when use_optimum_format=True | +| scale_dtype | torch.float32 | Data type for scale and bias. It's torch.float16 when use_optimum_format=True | -**Note:** HuggingFace format is quite special, the main differences are as follows: +**Note:** The format used in Optimum is acceptable for transformers, which makes it easy to use. However, this format is rather special, the main differences are as follows: > 1: Compression Dimension: weight = 1, zero = 0 and both are transposed. > 2: Zero Point: zero_point-= 1 before compression. zero_point is always required even for sym. -> 3: Group Index: Use the same number for a group instead of recording channel order. +> 3: Group Index: Use the same number for a group instead of recording channel order. ### **User Code Example** diff --git a/neural_compressor/adaptor/torch_utils/model_wrapper.py b/neural_compressor/adaptor/torch_utils/model_wrapper.py index a930bfcc05a..6e9df2d5392 100644 --- a/neural_compressor/adaptor/torch_utils/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/model_wrapper.py @@ -217,10 +217,10 @@ def __init__( compression_dim=1, g_idx=False, device="cpu", - use_hf_format=True, + use_optimum_format=True, ): super().__init__() - self.use_hf_format = use_hf_format + self.use_optimum_format = use_optimum_format self.dtype = dtype if "int" not in self.dtype: # for nf4, fp4 from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING @@ -249,7 +249,7 @@ def __init__( assert compression_dim in [0, 1], ( "Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel." ) - if self.use_hf_format: + if self.use_optimum_format: self.float_type = torch.float16 self.compressed_dtype = torch.int32 self.register_buffer( @@ -330,7 +330,7 @@ def __init__( def pack(self, int_weight, scale, zp, bias, g_idx=None): int_weight = int_weight.to(self.device) - if self.use_hf_format and zp is None: + if self.use_optimum_format and zp is None: # to avoid overflow int_weight = int_weight.type(torch.int32) shift_bias = 2 ** (self.bits - 1) @@ -342,13 +342,13 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): if g_idx is not None: assert hasattr(self, "g_idx"), "g_idx is not set when initializing." self.g_idx = g_idx.type(torch.int32).to(self.device) - if self.use_hf_format: + if self.use_optimum_format: invperm = torch.argsort(self.g_idx) self.g_idx = invperm // self.groupsize self.g_idx = self.g_idx.type(torch.int32).to(self.device) assert scale.shape == self.scales.shape, "Scale shape is mismatched." 
self.scales = scale.type(self.float_type).to(self.device) - if not self.use_hf_format and self.compression_dim == 0: + if not self.use_optimum_format and self.compression_dim == 0: int_weight = int_weight.T self.qweight = self.qweight.T origin_shape = int_weight.shape @@ -365,14 +365,14 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) self.qweight[:, j] |= tmp[:, e] - if not self.use_hf_format and self.compression_dim == 0: + if not self.use_optimum_format and self.compression_dim == 0: self.qweight = self.qweight.T if zp is not None: zp = zp.to(self.device) - if self.use_hf_format: + if self.use_optimum_format: zp -= 1 - if self.use_hf_format or self.compression_dim == 0: + if self.use_optimum_format or self.compression_dim == 0: zp = zp.T self.qzeros = self.qzeros.T assert hasattr(self, "qzeros"), "zp is not set when initializing." @@ -385,17 +385,17 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) self.qzeros[:, j] |= tmp[:, e] - if self.use_hf_format or self.compression_dim == 0: + if self.use_optimum_format or self.compression_dim == 0: self.qzeros = self.qzeros.T - if self.use_hf_format: + if self.use_optimum_format: self.scales = self.scales.T self.qweight = self.qweight.T self.qzeros = self.qzeros.T def recover(self): logger.debug(f"Recovering {self} weight") - scales = self.scales.T if self.use_hf_format else self.scales - qweight = self.qweight.T if self.use_hf_format else self.qweight + scales = self.scales.T if self.use_optimum_format else self.scales + qweight = self.qweight.T if self.use_optimum_format else self.qweight device = scales.device fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device) @@ -409,7 +409,7 @@ def recover(self): weight_dtype = torch.int8 # unpack weight weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device) - if not self.use_hf_format and self.compression_dim == 0: + if not self.use_optimum_format and self.compression_dim == 0: weight = weight.T qweight = qweight.T origin_shape = weight.shape @@ -425,7 +425,7 @@ def recover(self): if weight_dtype == torch.uint8: tmp &= mask # remove sign bit weight[:, index] = tmp.type(weight_dtype) - if not self.use_hf_format and self.compression_dim == 0: + if not self.use_optimum_format and self.compression_dim == 0: weight = weight.T if "int" not in self.dtype: new_weight = torch.zeros(self.out_features, self.in_features).to(device) @@ -436,8 +436,8 @@ def recover(self): if hasattr(self, "qzeros"): zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp zp = torch.zeros(scales.shape, dtype=zp_dtype).to(device) - qzeros = self.qzeros.T if self.use_hf_format else self.qzeros - if self.use_hf_format or self.compression_dim == 0: + qzeros = self.qzeros.T if self.use_optimum_format else self.qzeros + if self.use_optimum_format or self.compression_dim == 0: zp = zp.T qzeros = qzeros.T origin_shape = zp.shape @@ -452,9 +452,9 @@ def recover(self): tmp = tmp >> self.compress_bits - self.bits tmp &= mask zp[:, index] = tmp.type(zp_dtype) - if self.use_hf_format or self.compression_dim == 0: + if self.use_optimum_format or self.compression_dim == 0: zp = zp.T - if self.use_hf_format: + if self.use_optimum_format: # zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1 zp += 1 zp = torch.where(zp > (2**self.bits - 1), 0, zp) @@ -491,8 +491,8 @@ def extra_repr(self) -> str: 
self.groupsize, self.bias is not None, ) - if self.use_hf_format: - tmp_str += ", use_hf_format=True" + if self.use_optimum_format: + tmp_str += ", use_optimum_format=True" return tmp_str diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index ad4f7e48226..c29994f7755 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -396,7 +396,7 @@ def rtn_quantize( compression_dim = kwargs.get("compression_dim", 1) scale_dtype = kwargs.get("scale_dtype", torch.float32) device = kwargs.get("device", "cpu") - use_hf_format = kwargs.get("use_hf_format", True) + use_optimum_format = kwargs.get("use_optimum_format", True) for name, m in model.named_modules(): if m.__class__.__name__ not in supported_layers: continue @@ -452,7 +452,7 @@ def rtn_quantize( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, - use_hf_format=use_hf_format, + use_optimum_format=use_optimum_format, ) new_module.pack(int_weight, scale, zp, m.bias) if name == "": diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index 31fc8cb22c7..395b9c007fe 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -459,7 +459,7 @@ def export_compressed_model( scale_dtype=torch.float32, gptq_config_path=None, device="cpu", - use_hf_format=True, + use_optimum_format=True, ): """Convert Linear to WeightOnlyLinear for low memory inference. @@ -475,7 +475,7 @@ def export_compressed_model( Defaults to torch.float32. gptq_config_path (str, optional): Path of gptq_config.json. Defaults to None. device (str, optional): choose device for compression. Defaults to cpu. - use_hf_format (bool, optional): use the popular huggingface compression format. + use_optimum_format (bool, optional): use the popular huggingface compression format. 1: compression_dim: weight = 1, zeros = 0 and both are transposed. 2: zeros -= 1 before compression. Why we need it? 3: g_idx: use same number for one group instead of recording the channel order. 
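To make point 3 of that docstring concrete, here is a minimal sketch (toy sizes and a hypothetical permutation, not INC code) of how a GPTQ activation-order permutation becomes the per-channel group numbers that the Optimum layout stores; it mirrors the `invperm // groupsize` step visible in `pack()`:

```python
# Minimal sketch of point 3 above (toy sizes, hypothetical permutation, not INC code):
# with use_optimum_format=True, g_idx stores one group number per input channel in the
# original channel order instead of the GPTQ activation-order permutation itself.
import torch

in_features, group_size = 8, 4                     # placeholder sizes
perm = torch.randperm(in_features)                 # hypothetical GPTQ act-order permutation
invperm = torch.argsort(perm)                      # position of each original channel
g_idx = (invperm // group_size).to(torch.int32)    # same number for every channel of a group
assert g_idx.shape == (in_features,)
assert int(g_idx.max()) == in_features // group_size - 1
```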
@@ -520,7 +520,7 @@ def export_compressed_model( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, - use_hf_format=use_hf_format, + use_optimum_format=use_optimum_format, ) set_module(self.model, k, new_module) continue @@ -551,7 +551,7 @@ def export_compressed_model( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, - use_hf_format=use_hf_format, + use_optimum_format=use_optimum_format, ) new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm) set_module(self.model, k, new_module) @@ -578,7 +578,7 @@ def export_compressed_model( compression_dim=compression_dim, scale_dtype=scale_dtype, device=device, - use_hf_format=use_hf_format, + use_optimum_format=use_optimum_format, ) set_module(self.model, k, mod) return self.model diff --git a/neural_compressor/torch/quantization/modules.py b/neural_compressor/torch/quantization/modules.py index f7c286d9382..ccba214e0f8 100644 --- a/neural_compressor/torch/quantization/modules.py +++ b/neural_compressor/torch/quantization/modules.py @@ -134,10 +134,10 @@ def __init__( compression_dim=1, g_idx=False, device="cpu", - use_hf_format=True, + use_optimum_format=True, ): super().__init__() - self.use_hf_format = use_hf_format + self.use_optimum_format = use_optimum_format self.dtype = dtype if "int" not in self.dtype: # for nf4, fp4 from neural_compressor.torch.algorithms.weight_only.rtn import FLOAT_MAPPING, INT_MAPPING @@ -166,7 +166,7 @@ def __init__( assert compression_dim in [0, 1], ( "Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel." ) - if self.use_hf_format: + if self.use_optimum_format: self.float_type = torch.float16 self.compressed_dtype = torch.int32 self.register_buffer( @@ -247,7 +247,7 @@ def __init__( def pack(self, int_weight, scale, zp, bias, g_idx=None): int_weight = int_weight.to(self.device) - if self.use_hf_format and zp is None: + if self.use_optimum_format and zp is None: # to avoid overflow int_weight = int_weight.type(torch.int32) shift_bias = 2 ** (self.bits - 1) @@ -259,13 +259,13 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): if g_idx is not None: assert hasattr(self, "g_idx"), "g_idx is not set when initializing." self.g_idx = g_idx.type(torch.int32).to(self.device) - if self.use_hf_format: + if self.use_optimum_format: invperm = torch.argsort(self.g_idx) self.g_idx = invperm // self.groupsize self.g_idx = self.g_idx.type(torch.int32).to(self.device) assert scale.shape == self.scales.shape, "Scale shape is mismatched." self.scales = scale.type(self.float_type).to(self.device) - if not self.use_hf_format and self.compression_dim == 0: + if not self.use_optimum_format and self.compression_dim == 0: int_weight = int_weight.T self.qweight = self.qweight.T origin_shape = int_weight.shape @@ -282,14 +282,14 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) self.qweight[:, j] |= tmp[:, e] - if not self.use_hf_format and self.compression_dim == 0: + if not self.use_optimum_format and self.compression_dim == 0: self.qweight = self.qweight.T if zp is not None: zp = zp.to(self.device) - if self.use_hf_format: + if self.use_optimum_format: zp -= 1 - if self.use_hf_format or self.compression_dim == 0: + if self.use_optimum_format or self.compression_dim == 0: zp = zp.T self.qzeros = self.qzeros.T assert hasattr(self, "qzeros"), "zp is not set when initializing." 
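The `pack()` hunks above reduce to fixed-width bit packing plus the store-zero-point-minus-one convention; a standalone sketch of that arithmetic in plain Python (toy values, not INC code) is shown below:

```python
# Standalone sketch of the packing arithmetic behind pack()/recover() (plain Python,
# toy values, not INC code): eight 4-bit values share one int32, with value e living
# in bit slice [4*e, 4*e + 4).
bits = 4
n_pack = 32 // bits                                # 8 values per int32
mask = (1 << bits) - 1                             # 0b1111

values = [3, 7, 0, 15, 9, 1, 12, 5]                # toy 4-bit quantized values
packed = 0
for e, v in enumerate(values):
    packed |= (v & mask) << (bits * e)

unpacked = [(packed >> (bits * e)) & mask for e in range(n_pack)]
assert unpacked == values

# On top of this, the Optimum layout stores zero_point - 1; recover() adds the 1 back,
# and a stored -1 (which unpacks to 2**bits - 1) is mapped back to 0 by the torch.where
# in the hunk above.
```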
@@ -302,17 +302,17 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): tmp[:, e] &= mask tmp[:, e] = tmp[:, e] << (self.bits * e) self.qzeros[:, j] |= tmp[:, e] - if self.use_hf_format or self.compression_dim == 0: + if self.use_optimum_format or self.compression_dim == 0: self.qzeros = self.qzeros.T - if self.use_hf_format: + if self.use_optimum_format: self.scales = self.scales.T self.qweight = self.qweight.T self.qzeros = self.qzeros.T def recover(self): logger.debug(f"Recovering {self} weight") - scales = self.scales.T if self.use_hf_format else self.scales - qweight = self.qweight.T if self.use_hf_format else self.qweight + scales = self.scales.T if self.use_optimum_format else self.scales + qweight = self.qweight.T if self.use_optimum_format else self.qweight device = scales.device fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device) @@ -326,7 +326,7 @@ def recover(self): weight_dtype = torch.int8 # unpack weight weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device) - if not self.use_hf_format and self.compression_dim == 0: + if not self.use_optimum_format and self.compression_dim == 0: weight = weight.T qweight = qweight.T origin_shape = weight.shape @@ -342,7 +342,7 @@ def recover(self): if weight_dtype == torch.uint8: tmp &= mask # remove sign bit weight[:, index] = tmp.type(weight_dtype) - if not self.use_hf_format and self.compression_dim == 0: + if not self.use_optimum_format and self.compression_dim == 0: weight = weight.T if "int" not in self.dtype: new_weight = torch.zeros(self.out_features, self.in_features).to(device) @@ -353,8 +353,8 @@ def recover(self): if hasattr(self, "qzeros"): zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp zp = torch.zeros(scales.shape, dtype=zp_dtype).to(device) - qzeros = self.qzeros.T if self.use_hf_format else self.qzeros - if self.use_hf_format or self.compression_dim == 0: + qzeros = self.qzeros.T if self.use_optimum_format else self.qzeros + if self.use_optimum_format or self.compression_dim == 0: zp = zp.T qzeros = qzeros.T origin_shape = zp.shape @@ -369,9 +369,9 @@ def recover(self): tmp = tmp >> self.compress_bits - self.bits tmp &= mask zp[:, index] = tmp.type(zp_dtype) - if self.use_hf_format or self.compression_dim == 0: + if self.use_optimum_format or self.compression_dim == 0: zp = zp.T - if self.use_hf_format: + if self.use_optimum_format: # zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1 zp += 1 zp = torch.where(zp > (2**self.bits - 1), 0, zp) @@ -408,8 +408,8 @@ def extra_repr(self) -> str: self.groupsize, self.bias is not None, ) - if self.use_hf_format: - tmp_str += ", use_hf_format=True" + if self.use_optimum_format: + tmp_str += ", use_optimum_format=True" return tmp_str diff --git a/neural_compressor/utils/load_huggingface.py b/neural_compressor/utils/load_huggingface.py index c814f1f02c0..43b68ef4c47 100644 --- a/neural_compressor/utils/load_huggingface.py +++ b/neural_compressor/utils/load_huggingface.py @@ -235,7 +235,7 @@ def save_for_huggingface_upstream(model, tokenizer, output_dir): def export_compressed_model( model, saved_dir=None, - use_hf_format=True, + use_optimum_format=True, enable_full_range=False, compression_dtype=torch.int32, compression_dim=1, @@ -247,7 +247,7 @@ def export_compressed_model( Args: model (torch.nn.Module): origin fp32 model. saved_dir (_type_, optional): the dir path of compression info. Defaults to None. 
- use_hf_format (bool, optional): whether use HuggingFace format. Defaults to True. + use_optimum_format (bool, optional): whether use HuggingFace format. Defaults to True. enable_full_range (bool, optional): Whether to leverage the full compression range under symmetric quantization. Defaults to False. compression_dtype (torch.Tensor, optional): The target dtype after comoression. @@ -277,6 +277,6 @@ def export_compressed_model( scale_dtype=scale_dtype, gptq_config_path=gptq_config_path, device=device, - use_hf_format=use_hf_format, + use_optimum_format=use_optimum_format, ) return inc_model.model diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py index 094c2b4d1e9..a2da94ac822 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -88,7 +88,7 @@ def test_RTN_int_quant(self): out2 = q_model(input) self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1))) self.assertFalse(torch.all(out1 == out2)) - compressed_model = q_model.export_compressed_model(use_hf_format=False) + compressed_model = q_model.export_compressed_model(use_optimum_format=False) out3 = compressed_model(input) self.assertTrue("fc1.qweight" in compressed_model.state_dict().keys()) self.assertTrue("fc1.qzeros" not in compressed_model.state_dict().keys()) @@ -99,11 +99,11 @@ def test_RTN_int_quant(self): model = Model() new_model = load("saved", model, weight_only=True) inc_model = INCModel(new_model) - inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_hf_format=True) + inc_model.export_compressed_model(qweight_config_path="saved/qconfig.json", use_optimum_format=True) out4 = inc_model.model(input) self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys()) model = Model() - compressed_model = export_compressed_model(model, saved_dir="saved", use_hf_format=True) + compressed_model = export_compressed_model(model, saved_dir="saved", use_optimum_format=True) self.assertTrue("fc1.qzeros" in inc_model.model.state_dict().keys()) # output gap is because of torch.float16 is used in hf_format self.assertTrue(torch.allclose(out3, out4, atol=1e-3)) @@ -121,7 +121,7 @@ def test_RTN_int_quant(self): out2 = q_model(input) self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1))) self.assertFalse(torch.all(out1 == out2)) - compressed_model = q_model.export_compressed_model(use_hf_format=False, enable_full_range=True) + compressed_model = q_model.export_compressed_model(use_optimum_format=False, enable_full_range=True) out3 = compressed_model(input) self.assertTrue(torch.all(out3 == out2)) @@ -247,7 +247,7 @@ def test_RTN_int_quant(self): model_size1 = os.path.getsize("saved/best_model.pt") / 1024 print("FP32 Model size:{:.3f}M".format(model_size1)) inc_model = INCModel(new_model) - inc_model.export_compressed_model(use_hf_format=False, qweight_config_path="saved/qconfig.json") + inc_model.export_compressed_model(use_optimum_format=False, qweight_config_path="saved/qconfig.json") torch.save(inc_model.state_dict(), "saved/tmp.pt") model_size2 = os.path.getsize("saved/tmp.pt") / 1024 print("WeightOnlyLinear Model size:{:.3f}M".format(model_size2)) @@ -275,7 +275,7 @@ def test_RTN_4bit_quant(self): out2 = q_model(self.lm_input) self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) self.assertFalse(torch.all(out1[0] == out2[0])) - compressed_model = q_model.export_compressed_model(use_hf_format=False) + compressed_model 
= q_model.export_compressed_model(use_optimum_format=False) out3 = compressed_model(self.lm_input) self.assertTrue(torch.all(out3[0] == out2[0])) @@ -326,7 +326,7 @@ def test_AWQ_quant(self): fp32_model = copy.deepcopy(self.gptj) reload_model = load("saved", fp32_model, weight_only=True) out2 = reload_model(input) - q_model.export_compressed_model(use_hf_format=False) + q_model.export_compressed_model(use_optimum_format=False) out3 = q_model(input) # no idea about the gap at 1e-08, use allclose instead of out1==out2 self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) @@ -430,7 +430,7 @@ def test_AWQ_nf4_quant(self): ) out2 = q_model(input) self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01)) - compressed_model = q_model.export_compressed_model(use_hf_format=False) + compressed_model = q_model.export_compressed_model(use_optimum_format=False) out3 = compressed_model(input) self.assertTrue(torch.all(out3[0] == out2[0])) @@ -531,7 +531,7 @@ def __iter__(self): q_model.save("saved") out1 = q_model.model(input) self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) - compressed_model = q_model.export_compressed_model(use_hf_format=False) + compressed_model = q_model.export_compressed_model(use_optimum_format=False) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) @@ -556,7 +556,7 @@ def __iter__(self): ) q_model.save("saved") out1 = q_model.model(input) - compressed_model = q_model.export_compressed_model(use_hf_format=True) + compressed_model = q_model.export_compressed_model(use_optimum_format=True) out2 = compressed_model(input) print(out1[0]) print(out2[0]) @@ -574,7 +574,7 @@ def __iter__(self): ) q_model.save("saved") out1 = q_model.model(input) - compressed_model = q_model.export_compressed_model(use_hf_format=False) + compressed_model = q_model.export_compressed_model(use_optimum_format=False) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) @@ -668,7 +668,7 @@ def __iter__(self): ) q_model.save("saved") out1 = q_model.model(input) - compressed_model = q_model.export_compressed_model(use_hf_format=False) + compressed_model = q_model.export_compressed_model(use_optimum_format=False) out2 = compressed_model(input) torch.save(compressed_model.state_dict(), "saved/compressed_model.pt") self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-05)) diff --git a/test/model/test_model_pytorch.py b/test/model/test_model_pytorch.py index a96c35d036e..f0990b6558c 100644 --- a/test/model/test_model_pytorch.py +++ b/test/model/test_model_pytorch.py @@ -118,7 +118,7 @@ def test_WeightOnlyLinear(self): qweight_config_path="saved/qconfig.json", compression_dtype=dtype, scale_dtype=torch.float32, - use_hf_format=False, + use_optimum_format=False, ) out2 = q_model(input) torch.save(inc_model.state_dict(), "saved/tmp.pt") @@ -138,7 +138,7 @@ def test_WeightOnlyLinear(self): inc_model.export_compressed_model( qweight_config_path="saved/qconfig.json", compression_dim=dim, - use_hf_format=False, + use_optimum_format=False, ) out2 = q_model(input) torch.save(inc_model.state_dict(), "saved/tmp.pt") From b93a3a48d31a5cf3314dcbcbd4a94d21073e4fee Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 5 Dec 2023 19:18:08 +0800 Subject: [PATCH 9/9] refine doc Signed-off-by: Xin He --- docs/source/quantization_weight_only.md | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md index f2e7828460e..addc6490ed5 100644 --- a/docs/source/quantization_weight_only.md +++ b/docs/source/quantization_weight_only.md @@ -94,12 +94,12 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear, | export args | default value | comments | |:----------:|:-------------:|:-------------------------------------------------------------------:| | use_optimum_format | True | Whether to use the popular format used in [Optimum](https://github.com/huggingface/optimum/blob/e0927976d06d163ed09fe5bd80d013e1cfa0c463/docs/source/llm_quantization/usage_guides/quantization.mdx#L5) | -| qweight_config_path | None | set the path of qconfig.json if you want to export model with json file | -| gptq_config_path | None | If need to export model with fp32_model and json file, set the path of gptq_config.json for GPTQ quantized model| | sym_full_range | False | Whether to leverage the full compression range under symmetric quantization | | compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64]. It's torch.int32 when use_optimum_format=True | | compression_dim | 1 | 0 means output channel while 1 means input channel. It's 1 for weight and 0 for zero-point when use_optimum_format=True | | scale_dtype | torch.float32 | Data type for scale and bias. It's torch.float16 when use_optimum_format=True | +| qweight_config_path | None | set the path of qconfig.json if you want to export model with json file | +| gptq_config_path | None | If need to export model with fp32_model and json file, set the path of gptq_config.json for GPTQ quantized model| **Note:** The format used in Optimum is acceptable for transformers, which makes it easy to use. However, this format is rather special, the main differences are as follows:
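To see the new default end to end, a short usage sketch under stated assumptions: the ToyModel below stands in for the unit tests' Model, the default RTN weight-only settings are assumed, and quantization.fit is called without an evaluation function; nothing here is prescribed by the patches themselves.

```python
# End-to-end usage sketch (assumptions noted above, not part of the patches).
import torch
from neural_compressor import PostTrainingQuantConfig, quantization


class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(32, 64)

    def forward(self, x):
        return self.fc1(x)


conf = PostTrainingQuantConfig(approach="weight_only")
q_model = quantization.fit(ToyModel(), conf)

compressed = q_model.export_compressed_model()       # use_optimum_format=True is now the default
state = compressed.state_dict()
assert state["fc1.qweight"].dtype == torch.int32     # compression_dtype forced to int32
assert state["fc1.scales"].dtype == torch.float16    # scale_dtype forced to float16
assert "fc1.qzeros" in state                         # zero points always stored, even for sym
```

The three assertions correspond to the forced defaults called out in the export-argument table above.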