diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py
index 395b9c007fe..3685c5c208d 100644
--- a/neural_compressor/model/torch_model.py
+++ b/neural_compressor/model/torch_model.py
@@ -496,6 +496,9 @@ def export_compressed_model(
                 gptq_config = json.load(f)
         else:
             gptq_config = self.gptq_config if hasattr(self, "gptq_config") else {}
+
+        autoround_config = self.autoround_config if hasattr(self, "autoround_config") else {}
+
         if gptq_config:
             for k, v in weight_config.items():
                 logger.debug(f"Compressing {k} on device {device}")
@@ -555,6 +558,19 @@ def export_compressed_model(
                 )
                 new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
                 set_module(self.model, k, new_module)
+        elif autoround_config:
+            from auto_round.export.export_to_itrex import compress_model  # pylint: disable=E0401
+
+            self.model = compress_model(
+                self.model,
+                weight_config=autoround_config,
+                enable_full_range=enable_full_range,
+                compression_dtype=compression_dtype,
+                compression_dim=compression_dim,
+                device=device,
+                use_optimum_format=use_optimum_format,
+                inplace=True,
+            )
         else:
             for k, v in weight_config.items():
                 logger.debug(f"Compressing {k} on device {device}")
diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py
index ecfa34e56ff..8ca5c73d50c 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py
@@ -801,6 +801,14 @@ def test_AutoRound_quant(self):
         self.assertTrue("scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys())
         self.assertTrue(torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"])
 
+        export_model = q_model.export_compressed_model()
+        export_out = export_model(input)
+        self.assertTrue(torch.allclose(out2[0], export_out[0]))
+        from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear
+
+        self.assertTrue(isinstance(q_model.model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
+        self.assertTrue(isinstance(export_model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
+
         fp32_model = copy.deepcopy(self.gptj)
 
         conf = PostTrainingQuantConfig(
@@ -852,8 +860,6 @@ def test_AutoRound_quant(self):
         )
         out2 = export_model.model(input)
         self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01))
-        from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear
-
         self.assertTrue(isinstance(export_model.model.transformer.h[0].attn.k_proj, WeightOnlyLinear))