From 619dec5a2c359a88cb8140b183b3844dcbd783a7 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Sat, 14 Sep 2024 14:12:51 +0800 Subject: [PATCH 01/27] load unquant module Signed-off-by: Kaihui-intel --- .../torch/algorithms/weight_only/rtn.py | 3 ++- .../transformers/models/modeling_auto.py | 6 ++++- .../transformers/quantization/utils.py | 1 + .../weight_only/test_transfomers.py | 23 +++++++++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 6ce9b49fac8..5ad44688ad2 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -140,8 +140,9 @@ def convert( register_weight_hooks(model, model_path, device=device, clean_weight=True) for name, m in model.named_modules(): - if not isinstance(m, supported_layers): + if use_layer_wise and isinstance(m, torch.nn.Module): + load_module(model, name, model_path, device=device) continue if name in weight_config: # pragma: no cover # initialize op configuration diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index a4a91e27f03..b58be129974 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -128,7 +128,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig), ): logger.info("Applying Weight Only Quantization.") - if use_xpu: + breakpoint() + if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: + from neural_compressor.torch import load_empty_model + model = load_empty_model(pretrained_model_name_or_path) + elif use_xpu: # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device. kwargs["low_cpu_mem_usage"] = True kwargs["device_map"] = "cpu" diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index 8dba085a553..e5d060e4aa6 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -525,6 +525,7 @@ def convert_to_quantized_model(model, config, device="cpu"): model.eval() + breakpoint() q_model = replace_linear(model, None, None, config, device=device) if orig_dtype != torch.float32: diff --git a/test/3x/torch/quantization/weight_only/test_transfomers.py b/test/3x/torch/quantization/weight_only/test_transfomers.py index 95a89f86f68..6fefb7b628e 100644 --- a/test/3x/torch/quantization/weight_only/test_transfomers.py +++ b/test/3x/torch/quantization/weight_only/test_transfomers.py @@ -111,3 +111,26 @@ def test_save_load(self): loaded_model = AutoModelForCausalLM.from_pretrained(output_dir) loaded_output = loaded_model(dummy_input)[0] assert torch.equal(woq_output, loaded_output), "loaded output should be same. Please double check." 
+ + def test_use_layer_wise(self): + model_name_or_path = self.model_name_or_path + + fp32_model = AutoModelForCausalLM.from_pretrained(model_name_or_path) + dummy_input = fp32_model.dummy_inputs["input_ids"] + + # RTN + woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=True) + woq_model = AutoModelForCausalLM.from_pretrained( + model_name_or_path, + quantization_config=woq_config, + ) + woq_output = woq_model(dummy_input)[0] + + # save + output_dir = "./transformers_tmp" + woq_model.save_pretrained(output_dir) + + # load + loaded_model = AutoModelForCausalLM.from_pretrained(output_dir) + loaded_output = loaded_model(dummy_input)[0] + assert torch.equal(woq_output, loaded_output), "loaded output should be same. Please double check." From 52762e3dd5309812d0b8f450b2a8f94925541dd4 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Sat, 14 Sep 2024 14:14:14 +0800 Subject: [PATCH 02/27] skip empty module Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/layer_wise/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index daf0ad87060..b81aa819a25 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -257,6 +257,8 @@ def load_module(model, module_name, path, device="cpu"): path (str): path to load state_dict per layer. device (str, optional): module device. Defaults to "cpu". """ + if module_name == '': + return module = get_module(model, module_name) for n, p in module.named_parameters(): param_name = module_name + "." + n From d25527805288e3043c8b506952c17e44549e7763 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Sat, 14 Sep 2024 14:32:27 +0800 Subject: [PATCH 03/27] load ln_ Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/rtn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 5ad44688ad2..79f24ab9192 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -179,6 +179,8 @@ def convert( bits = int(dtype.lstrip("int")) dtype = "int" else: + if use_layer_wise and isinstance(m, torch.nn.Module): + load_module(model, name, model_path, device=device) continue log_msg = ( f"RTN quantization config: bits={bits}, group_size={group_size}, " From a515fd339a9978101d09cb34870da443c4b3f7b1 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 19 Sep 2024 09:19:07 +0800 Subject: [PATCH 04/27] load module Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/rtn.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 79f24ab9192..b9a5c6a454b 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -140,9 +140,10 @@ def convert( register_weight_hooks(model, model_path, device=device, clean_weight=True) for name, m in model.named_modules(): + + if use_layer_wise: + load_module(model, name, model_path, device=device) if not isinstance(m, supported_layers): - if use_layer_wise and isinstance(m, torch.nn.Module): - load_module(model, name, model_path, device=device) continue if name in weight_config: # pragma: no cover # initialize op 
configuration @@ -179,8 +180,6 @@ def convert( bits = int(dtype.lstrip("int")) dtype = "int" else: - if use_layer_wise and isinstance(m, torch.nn.Module): - load_module(model, name, model_path, device=device) continue log_msg = ( f"RTN quantization config: bits={bits}, group_size={group_size}, " @@ -195,9 +194,6 @@ def convert( logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) - if use_layer_wise: - load_module(model, name, model_path, device=device) - # for only group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. if is_transformers_imported(): transpose = (group_dim == 0) ^ (isinstance(m, transformers.Conv1D)) From cce5bf90e4edfb71aaed1624c89d82c7bca545e6 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 23 Sep 2024 11:37:40 +0800 Subject: [PATCH 05/27] remove rtn lw hook Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/rtn.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index b9a5c6a454b..23618afd4ad 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -130,15 +130,13 @@ def convert( if use_layer_wise: from neural_compressor.common.utils import DEFAULT_WORKSPACE - from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, register_weight_hooks + from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module if model_path == "": model_path = model.path assert model_path, "model_path should not be None." model_path = get_path(model_path) - register_weight_hooks(model, model_path, device=device, clean_weight=True) - for name, m in model.named_modules(): if use_layer_wise: From 20576c6da0e8bb4e13734d72a26aeb8437b000ef Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 23 Sep 2024 12:02:17 +0800 Subject: [PATCH 06/27] remove breakpoint Signed-off-by: Kaihui-intel --- neural_compressor/transformers/models/modeling_auto.py | 3 ++- neural_compressor/transformers/quantization/utils.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index b58be129974..6132f424672 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -128,10 +128,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig), ): logger.info("Applying Weight Only Quantization.") - breakpoint() if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: from neural_compressor.torch import load_empty_model model = load_empty_model(pretrained_model_name_or_path) + if use_cpu: + quantization_config.post_init_cpu() elif use_xpu: # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device. 
kwargs["low_cpu_mem_usage"] = True diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index e5d060e4aa6..4a7b1e54ace 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -152,7 +152,6 @@ def _replace_linear( "fp16": ipex.quantization.WoqLowpMode.FP16, "int8": ipex.quantization.WoqLowpMode.INT8, } - ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( weight_dtype=weight_dtype[quantization_config.bits], lowp_mode=compute_dtype[quantization_config.compute_dtype], @@ -525,7 +524,6 @@ def convert_to_quantized_model(model, config, device="cpu"): model.eval() - breakpoint() q_model = replace_linear(model, None, None, config, device=device) if orig_dtype != torch.float32: From db2f59f070796dafc805082f386ed420a366e0ba Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 23 Sep 2024 12:18:33 +0800 Subject: [PATCH 07/27] add ut for use/no-use layer wise Signed-off-by: Kaihui-intel --- .../torch/quantization/weight_only/test_transfomers.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/3x/torch/quantization/weight_only/test_transfomers.py b/test/3x/torch/quantization/weight_only/test_transfomers.py index 875d003eeb5..77607f681df 100644 --- a/test/3x/torch/quantization/weight_only/test_transfomers.py +++ b/test/3x/torch/quantization/weight_only/test_transfomers.py @@ -122,6 +122,7 @@ def test_use_layer_wise(self): dummy_input = fp32_model.dummy_inputs["input_ids"] # RTN + # use_layer_wise=True woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=True) woq_model = AutoModelForCausalLM.from_pretrained( model_name_or_path, @@ -137,6 +138,15 @@ def test_use_layer_wise(self): loaded_model = AutoModelForCausalLM.from_pretrained(output_dir) loaded_output = loaded_model(dummy_input)[0] assert torch.equal(woq_output, loaded_output), "loaded output should be same. Please double check." + + # use_layer_wise=False + woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=False) + woq_model = AutoModelForCausalLM.from_pretrained( + model_name_or_path, + quantization_config=woq_config, + ) + woq_output2 = woq_model(dummy_input)[0] + assert torch.equal(woq_output, woq_output2), "use_layer_wise output should be same. Please double check." def test_loading_autoawq_model(self): user_model = AutoModelForCausalLM.from_pretrained(self.autoawq_model) From e28bc9acce71bc29e57c3316500e8cf9efefcbde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 04:52:06 +0000 Subject: [PATCH 08/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/layer_wise/utils.py | 2 +- neural_compressor/torch/algorithms/weight_only/rtn.py | 2 +- neural_compressor/transformers/models/modeling_auto.py | 1 + test/3x/torch/quantization/weight_only/test_transfomers.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index b81aa819a25..b0acce27cd4 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -257,7 +257,7 @@ def load_module(model, module_name, path, device="cpu"): path (str): path to load state_dict per layer. device (str, optional): module device. Defaults to "cpu". 
""" - if module_name == '': + if module_name == "": return module = get_module(model, module_name) for n, p in module.named_parameters(): diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 23618afd4ad..49c33a5ffb2 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -138,7 +138,7 @@ def convert( model_path = get_path(model_path) for name, m in model.named_modules(): - + if use_layer_wise: load_module(model, name, model_path, device=device) if not isinstance(m, supported_layers): diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 56987fe28f8..da21bb44275 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -136,6 +136,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): logger.info("Applying Weight Only Quantization.") if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: from neural_compressor.torch import load_empty_model + model = load_empty_model(pretrained_model_name_or_path) if use_cpu: quantization_config.post_init_cpu() diff --git a/test/3x/torch/quantization/weight_only/test_transfomers.py b/test/3x/torch/quantization/weight_only/test_transfomers.py index 875d003eeb5..d20f45e6554 100644 --- a/test/3x/torch/quantization/weight_only/test_transfomers.py +++ b/test/3x/torch/quantization/weight_only/test_transfomers.py @@ -114,7 +114,7 @@ def test_save_load(self): loaded_model = AutoModelForCausalLM.from_pretrained(output_dir) loaded_output = loaded_model(dummy_input)[0] assert torch.equal(woq_output, loaded_output), "loaded output should be same. Please double check." - + def test_use_layer_wise(self): model_name_or_path = self.model_name_or_path From 429ef15fb0da45e7f379556568640e520ca0b18a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 05:06:41 +0000 Subject: [PATCH 09/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/3x/torch/quantization/weight_only/test_transfomers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/3x/torch/quantization/weight_only/test_transfomers.py b/test/3x/torch/quantization/weight_only/test_transfomers.py index 84b471ef388..64e9b3a4e9b 100644 --- a/test/3x/torch/quantization/weight_only/test_transfomers.py +++ b/test/3x/torch/quantization/weight_only/test_transfomers.py @@ -138,7 +138,7 @@ def test_use_layer_wise(self): loaded_model = AutoModelForCausalLM.from_pretrained(output_dir) loaded_output = loaded_model(dummy_input)[0] assert torch.equal(woq_output, loaded_output), "loaded output should be same. Please double check." 
- + # use_layer_wise=False woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=False) woq_model = AutoModelForCausalLM.from_pretrained( From e3320b26f31f7f087f69c1c936e0b652285535cc Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 23 Sep 2024 15:01:24 +0800 Subject: [PATCH 10/27] add lw check before convert Signed-off-by: Kaihui-intel --- .../transformers/models/modeling_auto.py | 19 +++++++++++++------ .../transformers/quantization/utils.py | 5 ----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index da21bb44275..366a19aeca3 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -134,12 +134,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig), ): logger.info("Applying Weight Only Quantization.") - if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: - from neural_compressor.torch import load_empty_model - - model = load_empty_model(pretrained_model_name_or_path) - if use_cpu: - quantization_config.post_init_cpu() + if hasattr(quantization_config, "use_layer_wise"): + import neural_compressor.torch.utils as torch_utils + + process_type = torch_utils.get_processor_type_from_user_config() + if process_type == torch_utils.ProcessorType.Client: + quantization_config.use_layer_wise = True + + if quantization_config.use_layer_wise: + from neural_compressor.torch import load_empty_model + + model = load_empty_model(pretrained_model_name_or_path) + if use_cpu: + quantization_config.post_init_cpu() elif use_xpu: # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device. 
kwargs["low_cpu_mem_usage"] = True diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index 91779a9ee3e..f29613d547c 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -367,11 +367,6 @@ def convert_to_quantized_model(model, config, device="cpu"): # mapping to INC config dtype = "int4" if config.weight_dtype == "int4_fullrange" else config.weight_dtype - import neural_compressor.torch.utils as torch_utils - - process_type = torch_utils.get_processor_type_from_user_config() - if process_type == torch_utils.ProcessorType.Client: - config.use_layer_wise = True if config.quant_method.value == "rtn": quant_config = RTNConfig( dtype=dtype, From 162b7e6238c0c9a2fef57a42cd982e49cda858b5 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 23 Sep 2024 15:47:46 +0800 Subject: [PATCH 11/27] fix lw check Signed-off-by: Kaihui-intel --- .../transformers/models/modeling_auto.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 366a19aeca3..25298e8e93b 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -134,6 +134,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig), ): logger.info("Applying Weight Only Quantization.") + # set use_layer_wise on client if hasattr(quantization_config, "use_layer_wise"): import neural_compressor.torch.utils as torch_utils @@ -141,12 +142,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if process_type == torch_utils.ProcessorType.Client: quantization_config.use_layer_wise = True - if quantization_config.use_layer_wise: - from neural_compressor.torch import load_empty_model + if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: + from neural_compressor.torch import load_empty_model - model = load_empty_model(pretrained_model_name_or_path) - if use_cpu: - quantization_config.post_init_cpu() + model = load_empty_model(pretrained_model_name_or_path) + if use_cpu: + quantization_config.post_init_cpu() elif use_xpu: # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device. 
kwargs["low_cpu_mem_usage"] = True From 9c3257d3a027c2b823ba85168340ef4cd5d44956 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 07:49:12 +0000 Subject: [PATCH 12/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/transformers/models/modeling_auto.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 366a19aeca3..236a0af8729 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -139,8 +139,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): process_type = torch_utils.get_processor_type_from_user_config() if process_type == torch_utils.ProcessorType.Client: - quantization_config.use_layer_wise = True - + quantization_config.use_layer_wise = True + if quantization_config.use_layer_wise: from neural_compressor.torch import load_empty_model From 275c6d54829a2b653087b43854a7d218c2eb4e61 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 08:37:48 +0000 Subject: [PATCH 13/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/transformers/models/modeling_auto.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 25298e8e93b..fcd8536b685 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -140,8 +140,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): process_type = torch_utils.get_processor_type_from_user_config() if process_type == torch_utils.ProcessorType.Client: - quantization_config.use_layer_wise = True - + quantization_config.use_layer_wise = True + if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: from neural_compressor.torch import load_empty_model From dc2c4b2a4b8bf6c95f0a762a914c842b8cbe62c5 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 25 Sep 2024 15:22:56 +0800 Subject: [PATCH 14/27] fix load oom Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/layer_wise/utils.py | 2 -- neural_compressor/torch/algorithms/weight_only/rtn.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index b0acce27cd4..daf0ad87060 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -257,8 +257,6 @@ def load_module(model, module_name, path, device="cpu"): path (str): path to load state_dict per layer. device (str, optional): module device. Defaults to "cpu". """ - if module_name == "": - return module = get_module(model, module_name) for n, p in module.named_parameters(): param_name = module_name + "." 
+ n diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 49c33a5ffb2..c8d0c4e293f 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -138,8 +138,7 @@ def convert( model_path = get_path(model_path) for name, m in model.named_modules(): - - if use_layer_wise: + if use_layer_wise and len(list(m.named_children()))==0: load_module(model, name, model_path, device=device) if not isinstance(m, supported_layers): continue From 0e823b2573830333671f6c5aeaafd2f56ada2ef9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:10:54 +0000 Subject: [PATCH 15/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/weight_only/rtn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index c8d0c4e293f..d1d6912e2fa 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -138,7 +138,7 @@ def convert( model_path = get_path(model_path) for name, m in model.named_modules(): - if use_layer_wise and len(list(m.named_children()))==0: + if use_layer_wise and len(list(m.named_children())) == 0: load_module(model, name, model_path, device=device) if not isinstance(m, supported_layers): continue From dea851291d8eb12b86c5cc7ad20f955b67c550fc Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 27 Sep 2024 16:48:45 +0800 Subject: [PATCH 16/27] update xpu model_type list Signed-off-by: Kaihui-intel --- .../weight_only/text-generation/run_generation_gpu_woq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py index b5dbe20126e..9245d53eb50 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py @@ -200,7 +200,7 @@ tokenizer.save_pretrained(args.output_dir) enable_optimize_transformers = False -opt_gpu_model_type_list = ["llama", "gptj", "mistral", "qwen"] +opt_gpu_model_type_list = ["llama", "gptj", "mistral", "qwen", "phi3"] if config.model_type in opt_gpu_model_type_list: enable_optimize_transformers = True From 44c312d4bf786c8b04fdb9404064fcb56ac3f541 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Sun, 29 Sep 2024 14:45:45 +0800 Subject: [PATCH 17/27] fix llama3 oom Signed-off-by: Kaihui-intel --- neural_compressor/transformers/quantization/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index f29613d547c..b9d171615d5 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -525,6 +525,11 @@ def convert_to_quantized_model(model, config, device="cpu"): if 
orig_dtype != torch.float32: q_model.to(dtype=orig_dtype) + if config.use_layer_wise and not (q_model.device == device or q_model.device.type == device): + logger.warning( + "Do not convert device to avoid out of memory. Recommend using saved quantized model to inference.") + return q_model + return q_model.to(device) From 9292ab0747b33dd11c96ce7a04df22fca9ef56b5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 Sep 2024 07:34:18 +0000 Subject: [PATCH 18/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/transformers/quantization/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index b9d171615d5..45b1489281c 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -527,7 +527,8 @@ def convert_to_quantized_model(model, config, device="cpu"): if config.use_layer_wise and not (q_model.device == device or q_model.device.type == device): logger.warning( - "Do not convert device to avoid out of memory. Recommend using saved quantized model to inference.") + "Do not convert device to avoid out of memory. Recommend using saved quantized model to inference." + ) return q_model return q_model.to(device) From fd9c387c50c37b519b858c80f75a3552bf899b16 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Sun, 29 Sep 2024 15:01:58 +0800 Subject: [PATCH 19/27] fix empty model trust_remote_code Signed-off-by: Kaihui-intel --- neural_compressor/transformers/models/modeling_auto.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index fcd8536b685..7d121b23529 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -144,8 +144,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: from neural_compressor.torch import load_empty_model - - model = load_empty_model(pretrained_model_name_or_path) + + kwargs["low_cpu_mem_usage"] = True + kwargs["device_map"] = "cpu" + model = load_empty_model(pretrained_model_name_or_path, **kwargs) if use_cpu: quantization_config.post_init_cpu() elif use_xpu: From 448fc0bc6e9a6e7e0adcf6da9135dc3865c1eb3e Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Sun, 29 Sep 2024 15:06:23 +0800 Subject: [PATCH 20/27] fix trust_remote_code Signed-off-by: Kaihui-intel --- neural_compressor/transformers/models/modeling_auto.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 7d121b23529..7e08503d4df 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -145,9 +145,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: from neural_compressor.torch import load_empty_model - kwargs["low_cpu_mem_usage"] = True - kwargs["device_map"] = "cpu" - model = 
load_empty_model(pretrained_model_name_or_path, **kwargs) + trust_remote_code = kwargs.pop("trust_remote_code", None) + model = load_empty_model(pretrained_model_name_or_path, trust_remote_code=trust_remote_code) if use_cpu: quantization_config.post_init_cpu() elif use_xpu: From 35e58e67386942c8e464bc20d90922386d7aa580 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 Sep 2024 07:50:26 +0000 Subject: [PATCH 21/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/transformers/models/modeling_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 7d121b23529..6d6c06e0eb4 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -144,7 +144,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: from neural_compressor.torch import load_empty_model - + kwargs["low_cpu_mem_usage"] = True kwargs["device_map"] = "cpu" model = load_empty_model(pretrained_model_name_or_path, **kwargs) From eb9dce37eaaa736bc0170d053ad7736b45cb2bd0 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Sun, 29 Sep 2024 15:20:15 +0800 Subject: [PATCH 22/27] resolve trust remote code Signed-off-by: Kaihui-intel --- .../transformers/models/modeling_auto.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 7e08503d4df..67da5dcad8e 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -144,8 +144,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: from neural_compressor.torch import load_empty_model + from transformers.dynamic_module_utils import resolve_trust_remote_code + + has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map + has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, + pretrained_model_name_or_path, + has_local_code, + has_remote_code, + ) - trust_remote_code = kwargs.pop("trust_remote_code", None) model = load_empty_model(pretrained_model_name_or_path, trust_remote_code=trust_remote_code) if use_cpu: quantization_config.post_init_cpu() From 8f4cee3014db7c5710f66f397b33430cb548e118 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 Sep 2024 07:55:29 +0000 Subject: [PATCH 23/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/transformers/models/modeling_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 7e08503d4df..fd60d03b72a 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -144,7 +144,7 @@ def from_pretrained(cls, 
pretrained_model_name_or_path, *model_args, **kwargs): if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: from neural_compressor.torch import load_empty_model - + trust_remote_code = kwargs.pop("trust_remote_code", None) model = load_empty_model(pretrained_model_name_or_path, trust_remote_code=trust_remote_code) if use_cpu: From 83332a6df57540e1638c988fe09d416eebeba4ac Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 Sep 2024 08:09:11 +0000 Subject: [PATCH 24/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/transformers/models/modeling_auto.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 67da5dcad8e..27b9d38cdfe 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -143,9 +143,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): quantization_config.use_layer_wise = True if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise: + from transformers.dynamic_module_utils import resolve_trust_remote_code + from neural_compressor.torch import load_empty_model - from transformers.dynamic_module_utils import resolve_trust_remote_code - + has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys() trust_remote_code = resolve_trust_remote_code( @@ -154,7 +155,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): has_local_code, has_remote_code, ) - + model = load_empty_model(pretrained_model_name_or_path, trust_remote_code=trust_remote_code) if use_cpu: quantization_config.post_init_cpu() From 4402d61f61f1c28ce720a7193c908b419ef5f502 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Sun, 29 Sep 2024 15:28:04 +0800 Subject: [PATCH 25/27] update params Signed-off-by: Kaihui-intel --- neural_compressor/transformers/models/modeling_auto.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 27b9d38cdfe..1e93207f5a8 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -147,6 +147,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): from neural_compressor.torch import load_empty_model + trust_remote_code = kwargs.pop("trust_remote_code", None) has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys() trust_remote_code = resolve_trust_remote_code( From ad22bb4938824cdaa4b7e6ec4716fcd84e6c3c85 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Sun, 29 Sep 2024 22:35:02 +0800 Subject: [PATCH 26/27] fix load empty model Signed-off-by: Kaihui-intel --- neural_compressor/torch/utils/utility.py | 4 ++-- neural_compressor/transformers/models/modeling_auto.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py index 2a6fe5aae64..e52b15a87e0 100644 --- 
a/neural_compressor/torch/utils/utility.py +++ b/neural_compressor/torch/utils/utility.py @@ -331,11 +331,11 @@ def load_empty_model(pretrained_model_name_or_path, cls=None, **kwargs): if cls.__base__ == _BaseAutoModelClass: config = AutoConfig.from_pretrained(path, **kwargs) with init_empty_weights(): - model = cls.from_config(config) + model = cls.from_config(config, **kwargs) else: # pragma: no cover config = cls.config_class.from_pretrained(path, **kwargs) with init_empty_weights(): - model = cls(config) + model = cls(config, **kwargs) model.tie_weights() model.eval() model.path = pretrained_model_name_or_path diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 1e93207f5a8..dcb765629b2 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -147,7 +147,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): from neural_compressor.torch import load_empty_model - trust_remote_code = kwargs.pop("trust_remote_code", None) + trust_remote_code = kwargs.get("trust_remote_code", None) has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys() trust_remote_code = resolve_trust_remote_code( From 43f095f9ce1f34ba83f51bb20e4f0e4099a9c7b7 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 30 Sep 2024 09:10:33 +0800 Subject: [PATCH 27/27] update readme Signed-off-by: Kaihui-intel --- .../transformers/weight_only/text-generation/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md index 4c3b38c0a48..1abe2633ea3 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md @@ -116,13 +116,18 @@ Pytorch and Intel-extension-for-pytorch version for intel GPU > 2.1 are required ```bash pip install -r requirements_GPU.txt pip install transformers==4.38.1 # llama use 4.38.1 -source /opt/intel/oneapi/setvars.sh git clone https://github.com/intel/intel-extension-for-pytorch.git ipex-gpu cd ipex-gpu git submodule update --init --recursive export USE_AOT_DEVLIST='pvc,ats-m150' export BUILD_WITH_CPU=OFF +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/:$LD_LIBRARY_PATH +export OCL_ICD_VENDORS=/etc/OpenCL/vendors +export CCL_ROOT=${CONDA_PREFIX} +source /opt/intel/oneapi/setvars.sh --force +export LLM_ACC_TEST=1 + python setup.py install ```
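
Taken together, the series wires layer-wise weight-only RTN quantization into the transformers-like loading path (an empty meta model is created, each module's weights are streamed in and quantized in turn, and the device move is skipped when it would exhaust memory). As an end-to-end illustration, a minimal usage sketch follows, mirroring the test_use_layer_wise unit test added in this series. It assumes the package's transformers-like API is importable as shown; the checkpoint name is a hypothetical placeholder, since the test's actual model_name_or_path is not visible in these hunks.

# Minimal sketch of the workflow this series enables (mirrors test_use_layer_wise).
# Assumptions: neural_compressor.transformers exposes AutoModelForCausalLM and RtnConfig,
# and "facebook/opt-125m" is a stand-in for any causal-LM checkpoint.
import torch
from neural_compressor.transformers import AutoModelForCausalLM, RtnConfig

model_name = "facebook/opt-125m"  # hypothetical placeholder checkpoint

# use_layer_wise=True loads an empty (meta) model and pulls weights in per layer,
# so peak host memory stays low while RTN quantizes each supported module.
woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=True)
woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config)

# Quick sanity check with a toy input.
dummy_input = torch.tensor([[1, 2, 3]], dtype=torch.long)
with torch.no_grad():
    woq_output = woq_model(dummy_input)[0]

# The quantized model can be saved and reloaded like any transformers model;
# the test asserts the reloaded outputs match woq_output exactly.
woq_model.save_pretrained("./woq_layer_wise")
reloaded = AutoModelForCausalLM.from_pretrained("./woq_layer_wise")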