From ac0ad862bfabf5588601ba4835b6bf6aafab722d Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 21 Jan 2025 08:52:53 +0800 Subject: [PATCH 01/19] add transformers class Signed-off-by: Kaihui-intel --- neural_compressor/transformers/__init__.py | 1 + .../transformers/models/__init__.py | 2 +- .../transformers/models/modeling_auto.py | 48 +++++++++------ .../transformers/quantization/utils.py | 61 ++++++++++++++++--- .../transformers/utils/quantization_config.py | 13 ++++ 5 files changed, 96 insertions(+), 29 deletions(-) diff --git a/neural_compressor/transformers/__init__.py b/neural_compressor/transformers/__init__.py index 4eb6a044664..54b0141e21d 100644 --- a/neural_compressor/transformers/__init__.py +++ b/neural_compressor/transformers/__init__.py @@ -23,4 +23,5 @@ AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM, + Qwen2VLForConditionalGeneration, ) diff --git a/neural_compressor/transformers/models/__init__.py b/neural_compressor/transformers/models/__init__.py index d951600ca48..9709d22af71 100644 --- a/neural_compressor/transformers/models/__init__.py +++ b/neural_compressor/transformers/models/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. from .modeling_auto import _BaseINCAutoModelClass -from .modeling_auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM +from .modeling_auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, Qwen2VLForConditionalGeneration diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 1226fd21d97..abd1e86635d 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -354,24 +354,27 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): else: commit_hash = getattr(config, "_commit_hash", None) - has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map - - has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys() - trust_remote_code = resolve_trust_remote_code( - trust_remote_code, - pretrained_model_name_or_path, - has_local_code, - has_remote_code, - ) - if has_remote_code and trust_remote_code: - class_ref = config.auto_map[cls.ORIG_MODEL.__name__] - model_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs_orig) - if os.path.isdir(pretrained_model_name_or_path): - model_class.register_for_auto_class(cls.ORIG_MODEL.__name__) - else: - cls.ORIG_MODEL.register(config.__class__, model_class, exist_ok=True) - elif type(config) in cls.ORIG_MODEL._model_mapping.keys(): - model_class = _get_model_class(config, cls.ORIG_MODEL._model_mapping) + if "AutoModel" in cls.ORIG_MODEL.__name__: + has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map + has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys() + + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, + pretrained_model_name_or_path, + has_local_code, + has_remote_code, + ) + if has_remote_code and trust_remote_code: + class_ref = config.auto_map[cls.ORIG_MODEL.__name__] + model_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs_orig) + if os.path.isdir(pretrained_model_name_or_path): + model_class.register_for_auto_class(cls.ORIG_MODEL.__name__) + else: + cls.ORIG_MODEL.register(config.__class__, model_class, exist_ok=True) + elif type(config) in cls.ORIG_MODEL._model_mapping.keys(): + 
model_class = _get_model_class(config, cls.ORIG_MODEL._model_mapping) + else: + model_class = cls.ORIG_MODEL # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the # index of the files. @@ -747,3 +750,12 @@ class AutoModel(_BaseINCAutoModelClass): class AutoModelForSeq2SeqLM(_BaseINCAutoModelClass): ORIG_MODEL = transformers.AutoModelForSeq2SeqLM + +class Qwen2VLForConditionalGeneration(_BaseINCAutoModelClass): + ORIG_MODEL = transformers.Qwen2VLForConditionalGeneration + +class MllamaForConditionalGeneration(_BaseINCAutoModelClass): + ORIG_MODEL = transformers.MllamaForConditionalGeneration + +class LlavaForConditionalGeneration(_BaseINCAutoModelClass): + ORIG_MODEL = transformers.LlavaForConditionalGeneration \ No newline at end of file diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index 84b49cfe24b..39edc633d8c 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -33,10 +33,13 @@ convert, prepare, ) -from neural_compressor.torch.utils import is_ipex_available +from neural_compressor.torch.utils import is_ipex_available, is_package_available if is_ipex_available(): import intel_extension_for_pytorch as ipex + +if is_package_available("auto_round"): + import auto_round from typing import Union @@ -123,9 +126,12 @@ def _replace_linear( current_key_name = [] current_key_name.append(name) is_removed = False + print(isinstance(module, auto_round.export.export_to_itrex.model_wrapper.WeightOnlyLinear)) if ( isinstance(module, torch.nn.Linear) or isinstance(module, INCWeightOnlyLinear) + or (is_package_available("auto_round") and \ + isinstance(module, auto_round.export.export_to_itrex.model_wrapper.WeightOnlyLinear)) or (is_ipex_available() and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear)) ) and (name not in modules_to_not_convert): # Check if the current key is not in the `modules_to_not_convert` @@ -475,6 +481,40 @@ def convert_to_quantized_model(model, config, device="cpu"): run_fn(model, *run_args) model = convert(model) elif config.quant_method.value == "autoround": + if config.is_vlm is True: + from neural_compressor.torch.algorithms.weight_only.autoround import get_mllm_dataloader as get_autoround_dataloader + from transformers import AutoTokenizer, AutoProcessor + tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path) + processor = AutoProcessor.from_pretrained(model.config._name_or_path, trust_remote_code=True) + dataloader, template, config.truncation, config.batch_size, \ + config.gradient_accumulate_steps, config.seq_len, config.n_samples = get_autoround_dataloader( + template=None, + model=model, + tokenizer=tokenizer, + image_processor=None, + dataset=config.dataset, + extra_data_dir=None, + seqlen=config.seq_len, + batch_size=config.batch_size, + split=None, + apply_template=None, + truncation=False, + nsamples=config.n_samples, + seed=42, + gradient_accumulate_steps=config.gradient_accumulate_steps, + quant_nontext_module=config.quant_nontext_module, + processor=processor, + ) + else: + from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader as get_autoround_dataloader + dataloader = get_autoround_dataloader( + tokenizer=config.tokenizer, + seqlen=config.seq_len, + dataset_name=config.dataset, + seed=42, + bs=config.batch_size, + nsamples=config.n_samples, + ) quant_config = AutoRoundConfig( dtype=dtype, bits=config.bits, 
@@ -486,24 +526,25 @@ def convert_to_quantized_model(model, config, device="cpu"): seqlen=config.seq_len, nsamples=config.n_samples, iters=config.iters, + batch_size=config.batch_size, scale_dtype=config.scale_dtype, use_layer_wise=config.use_layer_wise, + # vlm arguments + is_mllm=config.is_vlm, + quant_nontext_module=config.quant_nontext_module, + truncation=config.truncation, + gradient_accumulate_steps=config.gradient_accumulate_steps, + export_format=config.export_format, ) + if config.modules_to_not_convert != []: for module in config.modules_to_not_convert: module_name = ".*" + module quant_config.set_local(module_name, AutoRoundConfig(dtype="fp32")) logger.info(f"Do AutoRound algorithm with config {quant_config}") - from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader as get_autoround_dataloader + + - dataloader = get_autoround_dataloader( - tokenizer=config.tokenizer, - seqlen=config.seq_len, - dataset_name=config.dataset, - seed=42, - bs=config.batch_size, - nsamples=config.n_samples, - ) run_fn = run_fn_for_autoround run_args = (dataloader,) model = prepare(model=model, quant_config=quant_config) diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py index 3e72de3c330..6868d834029 100644 --- a/neural_compressor/transformers/utils/quantization_config.py +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -543,6 +543,12 @@ def __init__( iters: int = 200, use_layer_wise: bool = None, quant_lm_head: bool = False, + # vlm arguments + is_vlm: bool = False, + quant_nontext_module: Union[str, list] = None, + truncation: bool = False, + gradient_accumulate_steps: int = 1, + export_format="auto_round:gptq", **kwargs, ): @@ -593,6 +599,13 @@ def __init__( self.scale_dtype = scale_dtype self.use_layer_wise = use_layer_wise self.model_path = kwargs.get("model_path", "") + + # vlm arguments + self.is_vlm = is_vlm + self.quant_nontext_module = quant_nontext_module + self.truncation = truncation + self.gradient_accumulate_steps = gradient_accumulate_steps + self.export_format= export_format def to_diff_dict(self) -> Dict[str, Any]: """Removes all attributes from config which correspond to the default config attributes From 097128a14c6f40c94ac237daaed273ad62a954cc Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 06:01:57 +0800 Subject: [PATCH 02/19] add ut Signed-off-by: Kaihui-intel --- .../transformers/quantization/utils.py | 1 - .../weight_only/test_transfomers.py | 45 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index 39edc633d8c..f5474e33144 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -126,7 +126,6 @@ def _replace_linear( current_key_name = [] current_key_name.append(name) is_removed = False - print(isinstance(module, auto_round.export.export_to_itrex.model_wrapper.WeightOnlyLinear)) if ( isinstance(module, torch.nn.Linear) or isinstance(module, INCWeightOnlyLinear) diff --git a/test/3x/torch/quantization/weight_only/test_transfomers.py b/test/3x/torch/quantization/weight_only/test_transfomers.py index 83f6b664da0..f3dd4cae038 100644 --- a/test/3x/torch/quantization/weight_only/test_transfomers.py +++ b/test/3x/torch/quantization/weight_only/test_transfomers.py @@ -10,6 +10,7 @@ from neural_compressor.torch.utils import 
get_ipex_version from neural_compressor.transformers import ( AutoModelForCausalLM, + Qwen2VLForConditionalGeneration, AutoRoundConfig, AwqConfig, GPTQConfig, @@ -19,6 +20,12 @@ ipex_version = get_ipex_version() +try: + import auto_round + + auto_round_installed = True +except ImportError: + auto_round_installed = False class TestTansformersLikeAPI: def setup_class(self): @@ -30,6 +37,7 @@ def setup_class(self): def teardown_class(self): shutil.rmtree("nc_workspace", ignore_errors=True) shutil.rmtree("transformers_tmp", ignore_errors=True) + shutil.rmtree("transformers_vlm_tmp", ignore_errors=True) def test_quantization_for_llm(self): model_name_or_path = self.model_name_or_path @@ -208,3 +216,40 @@ def test_loading_autoawq_model(self): else: target_text = ["One day, the little girl in the back of my mind will say, “I’m so glad you’"] assert gen_text == target_text, "loading autoawq quantized model failed." + + @pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed") + def test_vlm(self): + model_name = "Qwen/Qwen2-VL-2B-Instruct" + from neural_compressor.transformers import Qwen2VLForConditionalGeneration + from neural_compressor.transformers import AutoModelForCausalLM + woq_config = AutoRoundConfig( + bits=4, + group_size=128, + is_vlm=True, + dataset="liuhaotian/llava_conv_58k", + iters=2, + n_samples=5, + seq_len=512, + batch_size=1, + export_format="itrex", + ) + + woq_model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, quantization_config=woq_config, attn_implementation='eager') + + from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear + assert isinstance(woq_model.model.layers[0].self_attn.k_proj, WeightOnlyQuantizedLinear), "replacing model failed." + + #save + woq_model.save_pretrained("transformers_vlm_tmp") + + #load + loaded_model = Qwen2VLForConditionalGeneration.from_pretrained("transformers_vlm_tmp") + assert isinstance(loaded_model.model.layers[0].self_attn.k_proj, WeightOnlyQuantizedLinear), "loaing model failed." + + # phi-3-vision-128k-instruct + model_name = "microsoft/Phi-3-vision-128k-instruct" + woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, attn_implementation='eager') + + from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear + breakpoint() + assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." 
\ No newline at end of file From f46e72e52b69fd30c749f0bbe9346d187cfbd423 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 01:44:56 +0000 Subject: [PATCH 03/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../transformers/models/modeling_auto.py | 11 ++-- .../transformers/quantization/utils.py | 66 +++++++++++-------- .../transformers/utils/quantization_config.py | 8 +-- .../weight_only/test_transfomers.py | 2 +- 4 files changed, 52 insertions(+), 35 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index abd1e86635d..6c4a0ceda98 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -357,7 +357,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): if "AutoModel" in cls.ORIG_MODEL.__name__: has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys() - + trust_remote_code = resolve_trust_remote_code( trust_remote_code, pretrained_model_name_or_path, @@ -750,12 +750,15 @@ class AutoModel(_BaseINCAutoModelClass): class AutoModelForSeq2SeqLM(_BaseINCAutoModelClass): ORIG_MODEL = transformers.AutoModelForSeq2SeqLM - + + class Qwen2VLForConditionalGeneration(_BaseINCAutoModelClass): ORIG_MODEL = transformers.Qwen2VLForConditionalGeneration - + + class MllamaForConditionalGeneration(_BaseINCAutoModelClass): ORIG_MODEL = transformers.MllamaForConditionalGeneration + class LlavaForConditionalGeneration(_BaseINCAutoModelClass): - ORIG_MODEL = transformers.LlavaForConditionalGeneration \ No newline at end of file + ORIG_MODEL = transformers.LlavaForConditionalGeneration diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index f5474e33144..c75ebae5c5a 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -37,7 +37,7 @@ if is_ipex_available(): import intel_extension_for_pytorch as ipex - + if is_package_available("auto_round"): import auto_round @@ -129,8 +129,10 @@ def _replace_linear( if ( isinstance(module, torch.nn.Linear) or isinstance(module, INCWeightOnlyLinear) - or (is_package_available("auto_round") and \ - isinstance(module, auto_round.export.export_to_itrex.model_wrapper.WeightOnlyLinear)) + or ( + is_package_available("auto_round") + and isinstance(module, auto_round.export.export_to_itrex.model_wrapper.WeightOnlyLinear) + ) or (is_ipex_available() and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear)) ) and (name not in modules_to_not_convert): # Check if the current key is not in the `modules_to_not_convert` @@ -481,31 +483,45 @@ def convert_to_quantized_model(model, config, device="cpu"): model = convert(model) elif config.quant_method.value == "autoround": if config.is_vlm is True: - from neural_compressor.torch.algorithms.weight_only.autoround import get_mllm_dataloader as get_autoround_dataloader - from transformers import AutoTokenizer, AutoProcessor + from transformers import AutoProcessor, AutoTokenizer + + from neural_compressor.torch.algorithms.weight_only.autoround import ( + get_mllm_dataloader as get_autoround_dataloader, + ) + tokenizer = 
AutoTokenizer.from_pretrained(model.config._name_or_path) processor = AutoProcessor.from_pretrained(model.config._name_or_path, trust_remote_code=True) - dataloader, template, config.truncation, config.batch_size, \ - config.gradient_accumulate_steps, config.seq_len, config.n_samples = get_autoround_dataloader( - template=None, - model=model, - tokenizer=tokenizer, - image_processor=None, - dataset=config.dataset, - extra_data_dir=None, - seqlen=config.seq_len, - batch_size=config.batch_size, - split=None, - apply_template=None, - truncation=False, - nsamples=config.n_samples, - seed=42, - gradient_accumulate_steps=config.gradient_accumulate_steps, - quant_nontext_module=config.quant_nontext_module, - processor=processor, + ( + dataloader, + template, + config.truncation, + config.batch_size, + config.gradient_accumulate_steps, + config.seq_len, + config.n_samples, + ) = get_autoround_dataloader( + template=None, + model=model, + tokenizer=tokenizer, + image_processor=None, + dataset=config.dataset, + extra_data_dir=None, + seqlen=config.seq_len, + batch_size=config.batch_size, + split=None, + apply_template=None, + truncation=False, + nsamples=config.n_samples, + seed=42, + gradient_accumulate_steps=config.gradient_accumulate_steps, + quant_nontext_module=config.quant_nontext_module, + processor=processor, ) else: - from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader as get_autoround_dataloader + from neural_compressor.torch.algorithms.weight_only.autoround import ( + get_dataloader as get_autoround_dataloader, + ) + dataloader = get_autoround_dataloader( tokenizer=config.tokenizer, seqlen=config.seq_len, @@ -541,8 +557,6 @@ def convert_to_quantized_model(model, config, device="cpu"): module_name = ".*" + module quant_config.set_local(module_name, AutoRoundConfig(dtype="fp32")) logger.info(f"Do AutoRound algorithm with config {quant_config}") - - run_fn = run_fn_for_autoround run_args = (dataloader,) diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py index 6868d834029..a5ca440226c 100644 --- a/neural_compressor/transformers/utils/quantization_config.py +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -544,7 +544,7 @@ def __init__( use_layer_wise: bool = None, quant_lm_head: bool = False, # vlm arguments - is_vlm: bool = False, + is_vlm: bool = False, quant_nontext_module: Union[str, list] = None, truncation: bool = False, gradient_accumulate_steps: int = 1, @@ -599,13 +599,13 @@ def __init__( self.scale_dtype = scale_dtype self.use_layer_wise = use_layer_wise self.model_path = kwargs.get("model_path", "") - + # vlm arguments - self.is_vlm = is_vlm + self.is_vlm = is_vlm self.quant_nontext_module = quant_nontext_module self.truncation = truncation self.gradient_accumulate_steps = gradient_accumulate_steps - self.export_format= export_format + self.export_format = export_format def to_diff_dict(self) -> Dict[str, Any]: """Removes all attributes from config which correspond to the default config attributes diff --git a/test/3x/torch/quantization/weight_only/test_transfomers.py b/test/3x/torch/quantization/weight_only/test_transfomers.py index f3dd4cae038..cc25d4fc4d3 100644 --- a/test/3x/torch/quantization/weight_only/test_transfomers.py +++ b/test/3x/torch/quantization/weight_only/test_transfomers.py @@ -252,4 +252,4 @@ def test_vlm(self): from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear breakpoint() - assert 
isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." \ No newline at end of file + assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." From 406f93b3740d2d5df0597178dd8ad46d601966e9 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 08:56:38 +0800 Subject: [PATCH 04/19] add Mllama/Llava class Signed-off-by: Kaihui-intel --- neural_compressor/transformers/models/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/neural_compressor/transformers/models/__init__.py b/neural_compressor/transformers/models/__init__.py index 9709d22af71..c9e77038d9f 100644 --- a/neural_compressor/transformers/models/__init__.py +++ b/neural_compressor/transformers/models/__init__.py @@ -13,4 +13,11 @@ # limitations under the License. from .modeling_auto import _BaseINCAutoModelClass -from .modeling_auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, Qwen2VLForConditionalGeneration +from .modeling_auto import ( + AutoModel, + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + Qwen2VLForConditionalGeneration, + MllamaForConditionalGeneration, + LlavaForConditionalGeneration, +) From 634b14f2e61d880b21f833226289b4814a78b856 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 09:33:28 +0800 Subject: [PATCH 05/19] rm breakpoint Signed-off-by: Kaihui-intel --- .../3x/torch/quantization/weight_only/test_transfomers.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/3x/torch/quantization/weight_only/test_transfomers.py b/test/3x/torch/quantization/weight_only/test_transfomers.py index cc25d4fc4d3..6e94186cbe2 100644 --- a/test/3x/torch/quantization/weight_only/test_transfomers.py +++ b/test/3x/torch/quantization/weight_only/test_transfomers.py @@ -235,8 +235,11 @@ def test_vlm(self): ) woq_model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, quantization_config=woq_config, attn_implementation='eager') - - from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear + + if hasattr(torch, "xpu") and torch.xpu.is_available(): + from intel_extension_for_pytorch.nn.utils._quantize_convert import WeightOnlyQuantizedLinear + else: + from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear assert isinstance(woq_model.model.layers[0].self_attn.k_proj, WeightOnlyQuantizedLinear), "replacing model failed." #save @@ -251,5 +254,4 @@ def test_vlm(self): woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, attn_implementation='eager') from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear - breakpoint() assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." 
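The model classes added in the commits above all follow the same thin-wrapper pattern: each class only binds ORIG_MODEL to the corresponding upstream transformers class, while _BaseINCAutoModelClass supplies the quantization-aware from_pretrained/load_low_bit logic. As a minimal sketch of extending the pattern to another multimodal architecture (illustrative only, assuming the matching class exists in the installed transformers version):

    # Illustrative sketch, not part of this patch series.
    import transformers

    from neural_compressor.transformers.models import _BaseINCAutoModelClass


    class LlavaNextForConditionalGeneration(_BaseINCAutoModelClass):
        # Only the upstream class differs; the quantization-aware loading logic is shared.
        ORIG_MODEL = transformers.LlavaNextForConditionalGeneration
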
From 4763dff7844be8c7bc741cae4cb634eb342e8745 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 01:51:39 +0000 Subject: [PATCH 06/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/transformers/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/transformers/models/__init__.py b/neural_compressor/transformers/models/__init__.py index c9e77038d9f..4dc24600544 100644 --- a/neural_compressor/transformers/models/__init__.py +++ b/neural_compressor/transformers/models/__init__.py @@ -18,6 +18,6 @@ AutoModelForCausalLM, AutoModelForSeq2SeqLM, Qwen2VLForConditionalGeneration, - MllamaForConditionalGeneration, + MllamaForConditionalGeneration, LlavaForConditionalGeneration, ) From d249af4746d82880f9c852bfd6afee773b8d0f7e Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 09:34:59 +0800 Subject: [PATCH 07/19] modify ut name Signed-off-by: Kaihui-intel --- .../weight_only/{test_transfomers.py => test_transformers.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/3x/torch/quantization/weight_only/{test_transfomers.py => test_transformers.py} (100%) diff --git a/test/3x/torch/quantization/weight_only/test_transfomers.py b/test/3x/torch/quantization/weight_only/test_transformers.py similarity index 100% rename from test/3x/torch/quantization/weight_only/test_transfomers.py rename to test/3x/torch/quantization/weight_only/test_transformers.py From cf9ad2f4d204949675f58d78142a3aef6146ee96 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 10:08:14 +0800 Subject: [PATCH 08/19] clean code Signed-off-by: Kaihui-intel --- test/3x/torch/quantization/weight_only/test_transformers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/3x/torch/quantization/weight_only/test_transformers.py b/test/3x/torch/quantization/weight_only/test_transformers.py index 6e94186cbe2..db3a7a8fe78 100644 --- a/test/3x/torch/quantization/weight_only/test_transformers.py +++ b/test/3x/torch/quantization/weight_only/test_transformers.py @@ -252,6 +252,4 @@ def test_vlm(self): # phi-3-vision-128k-instruct model_name = "microsoft/Phi-3-vision-128k-instruct" woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, attn_implementation='eager') - - from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." 
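For orientation, the end-user flow exercised by the test_vlm case above can be sketched as below; the argument values mirror the unit test at this point in the series (chosen for CI speed, not as recommended settings):

    # Minimal sketch of the VLM weight-only quantization flow added by this series;
    # values are copied from test_vlm, not production settings.
    from neural_compressor.transformers import AutoRoundConfig, Qwen2VLForConditionalGeneration

    woq_config = AutoRoundConfig(
        bits=4,
        group_size=128,
        is_vlm=True,  # route calibration through the multimodal (MLLM) dataloader
        dataset="liuhaotian/llava_conv_58k",
        iters=2,
        n_samples=5,
        seq_len=512,
        batch_size=1,
        export_format="itrex",
    )
    woq_model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct",
        quantization_config=woq_config,
        attn_implementation="eager",
    )
    woq_model.save_pretrained("transformers_vlm_tmp")
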
From 56e2caf5358ec5e0222b55e5790b9265020501f7 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 12:19:05 +0800 Subject: [PATCH 09/19] fix auto_round.export & trust_remote_code Signed-off-by: Kaihui-intel --- neural_compressor/transformers/quantization/utils.py | 7 +++---- .../transformers/utils/quantization_config.py | 2 +- .../3x/torch/quantization/weight_only/test_transformers.py | 5 ++--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index c75ebae5c5a..e5273b66df9 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -40,6 +40,8 @@ if is_package_available("auto_round"): import auto_round + from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woq_linear + from typing import Union @@ -129,10 +131,7 @@ def _replace_linear( if ( isinstance(module, torch.nn.Linear) or isinstance(module, INCWeightOnlyLinear) - or ( - is_package_available("auto_round") - and isinstance(module, auto_round.export.export_to_itrex.model_wrapper.WeightOnlyLinear) - ) + or (is_package_available("auto_round") and isinstance(module, auto_round_woq_linear)) or (is_ipex_available() and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear)) ) and (name not in modules_to_not_convert): # Check if the current key is not in the `modules_to_not_convert` diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py index a5ca440226c..779356de7c1 100644 --- a/neural_compressor/transformers/utils/quantization_config.py +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -548,7 +548,7 @@ def __init__( quant_nontext_module: Union[str, list] = None, truncation: bool = False, gradient_accumulate_steps: int = 1, - export_format="auto_round:gptq", + export_format="itrex", **kwargs, ): diff --git a/test/3x/torch/quantization/weight_only/test_transformers.py b/test/3x/torch/quantization/weight_only/test_transformers.py index db3a7a8fe78..694aacc673d 100644 --- a/test/3x/torch/quantization/weight_only/test_transformers.py +++ b/test/3x/torch/quantization/weight_only/test_transformers.py @@ -231,10 +231,9 @@ def test_vlm(self): n_samples=5, seq_len=512, batch_size=1, - export_format="itrex", ) - woq_model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, quantization_config=woq_config, attn_implementation='eager') + woq_model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True) if hasattr(torch, "xpu") and torch.xpu.is_available(): from intel_extension_for_pytorch.nn.utils._quantize_convert import WeightOnlyQuantizedLinear @@ -251,5 +250,5 @@ def test_vlm(self): # phi-3-vision-128k-instruct model_name = "microsoft/Phi-3-vision-128k-instruct" - woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, attn_implementation='eager') + woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True, attn_implementation='eager') assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." 
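Once save_pretrained("transformers_vlm_tmp") has been called on the quantized model, the checkpoint can be reloaded as sketched below, mirroring the load check in test_vlm; on CPU the quantized layers are IPEX WeightOnlyQuantizedLinear modules, while the XPU build exposes the class under intel_extension_for_pytorch.nn.utils._quantize_convert as the test shows:

    # Sketch of reloading a saved low-bit checkpoint (CPU path), mirroring test_vlm.
    from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear

    from neural_compressor.transformers import Qwen2VLForConditionalGeneration

    # "transformers_vlm_tmp" is the directory the unit test saves the quantized model to.
    loaded_model = Qwen2VLForConditionalGeneration.from_pretrained("transformers_vlm_tmp")
    assert isinstance(loaded_model.model.layers[0].self_attn.k_proj, WeightOnlyQuantizedLinear)
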
From d12a9f51cfbd83720892a7040105c7b6943912c2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 05:13:31 +0000 Subject: [PATCH 10/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/transformers/quantization/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index e5273b66df9..ae44fce9d13 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -41,7 +41,7 @@ if is_package_available("auto_round"): import auto_round from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woq_linear - + from typing import Union From ba46b2145b560f18f351f72a22dff26015c9fb14 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 16:27:07 +0800 Subject: [PATCH 11/19] add quant_nontext_module Signed-off-by: Kaihui-intel --- .../torch/algorithms/weight_only/autoround.py | 8 ++-- .../torch/quantization/config.py | 4 +- .../transformers/quantization/utils.py | 44 +++++++++++++++++-- .../transformers/utils/quantization_config.py | 2 +- .../weight_only/test_transformers.py | 11 +++++ 5 files changed, 58 insertions(+), 11 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 5d0829f5161..01e85120a78 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -84,7 +84,7 @@ def __init__( enable_torch_compile: bool = None, # mllm is_mllm: bool = False, - quant_nontext_module: Union[str, list] = None, + quant_nontext_module: bool = False, extra_data_dir: str = None, image_processor=None, processor=None, @@ -150,7 +150,7 @@ def __init__( act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning. enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer, torch>=2.6 True. - quant_nontext_module (Union[str, list]): Whether to quantize nontext module. + quant_nontext_module (bool): Whether to quantize nontext module. is_mllm (bool): Indicates whether the model to be quantized is a multi-modal model (MLLM). extra_data_dir (str): The path for extra data such as images, audio or videos. 
processor (transformers.AutoProcessor): Any multi-modal model will require an object to encode or @@ -383,7 +383,7 @@ def get_mllm_dataloader( template, model=model, tokenizer=tokenizer, processor=processor, image_processor=image_processor ) dataset = template.default_dataset if dataset is None else dataset - if quant_nontext_module or (dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer)): + if quant_nontext_module or (dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer, "cpu", template.model_type)): if quant_nontext_module: logger.warning( "Quantitative nontext module is not supported for plain text datasets," @@ -399,7 +399,7 @@ def get_mllm_dataloader( truncation = False gradient_accumulate_steps = batch_size * gradient_accumulate_steps batch_size = 1 - + seed = 42 # The seed is fixed to 42 in transformers seqlen = 2048 if seqlen is None else seqlen # set text only calibration default args truncation = True if truncation is None else truncation dataset = dataset.replace(" ", "") diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 534e848ff6c..705f66d509b 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -950,7 +950,7 @@ def __init__( enable_torch_compile: bool = None, # mllm is_mllm: bool = False, - quant_nontext_module: Union[str, list] = None, + quant_nontext_module: bool = False, extra_data_dir: str = None, processor=None, image_processor=None, @@ -994,7 +994,7 @@ def __init__( export_format (str, optional): The format used for exporting the quantized model. Defaults to "itrex". enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning. enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer, torch>=2.6 True. - quant_nontext_module (Union[str, list]): Whether to quantize nontext module. + quant_nontext_module (bool): Whether to quantize nontext module. extra_data_dir (str): The path for extra data such as images, audio or videos. is_mllm (bool): Indicates whether the model to be quantized is a multi-modal model (MLLM). 
processor (transformers.AutoProcessor): Any multi-modal model will require an object to encode or diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index e5273b66df9..285036104b8 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -18,6 +18,7 @@ import math import os import types +import re from datasets import load_dataset @@ -40,6 +41,7 @@ if is_package_available("auto_round"): import auto_round + import transformers from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woq_linear @@ -132,10 +134,10 @@ def _replace_linear( isinstance(module, torch.nn.Linear) or isinstance(module, INCWeightOnlyLinear) or (is_package_available("auto_round") and isinstance(module, auto_round_woq_linear)) - or (is_ipex_available() and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear)) ) and (name not in modules_to_not_convert): # Check if the current key is not in the `modules_to_not_convert` - if not any(key in ".".join(current_key_name) for key in modules_to_not_convert): + if not any(key in ".".join(current_key_name) for key in modules_to_not_convert) and \ + not any(re.match(pattern, ".".join(current_key_name)) for pattern in modules_to_not_convert): in_features = module.in_features out_features = module.out_features if device == "cpu" or device == torch.device("cpu") or device == "auto": @@ -143,7 +145,7 @@ def _replace_linear( from intel_extension_for_pytorch.utils.weight_only_quantization import ( _convert_optimum_format_to_desired, ) - + qweight = module.qweight scales = module.scales qzeros = module.qzeros @@ -550,7 +552,41 @@ def convert_to_quantized_model(model, config, device="cpu"): gradient_accumulate_steps=config.gradient_accumulate_steps, export_format=config.export_format, ) - + + # vlm set non-text module config + if config.is_vlm is True: + from neural_compressor.torch.utils.utility import ( + get_multimodal_block_names, + find_matching_blocks, + get_layer_names_in_block, + ) + def set_nontext_module_config(model, to_quant_block_names, config): + all_block_list = get_multimodal_block_names(model, quant_vision=True) + all_block_set = set(tuple(block) for block in all_block_list) + quant_block_set = set(tuple(block) for block in to_quant_block_names) + set_to_full_prec = list(all_block_set - quant_block_set) + set_to_full_prec = get_layer_names_in_block(model, to_quant_block_names=set_to_full_prec) + for name in set_to_full_prec: + config.modules_to_not_convert.append(name) + + # skip layers not in blocks + config.modules_to_not_convert.append("model.vision_embed_tokens.img_projection*") + config.modules_to_not_convert.append("transformer.visual.attn_pool.*_proj") + config.modules_to_not_convert.append("model.mm_projector*") + config.modules_to_not_convert.append("multi_modal_projector") + config.modules_to_not_convert.append("visual.merger") + + all_blocks = get_multimodal_block_names(model, quant_config.quant_nontext_module) + to_quant_block_names = find_matching_blocks(model, all_blocks, quant_config.to_quant_block_names) + set_nontext_module_config(model, to_quant_block_names, config) + + for n, m in model.named_modules(): + if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): + if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: + config.modules_to_not_convert.append(n) + print( + f"{n} will not be quantized due to its shape not being divisible by 
32," + " resulting in an exporting issue to autogptq") if config.modules_to_not_convert != []: for module in config.modules_to_not_convert: module_name = ".*" + module diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py index 779356de7c1..00fe1ec0fbf 100644 --- a/neural_compressor/transformers/utils/quantization_config.py +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -545,7 +545,7 @@ def __init__( quant_lm_head: bool = False, # vlm arguments is_vlm: bool = False, - quant_nontext_module: Union[str, list] = None, + quant_nontext_module: bool = False, truncation: bool = False, gradient_accumulate_steps: int = 1, export_format="itrex", diff --git a/test/3x/torch/quantization/weight_only/test_transformers.py b/test/3x/torch/quantization/weight_only/test_transformers.py index 694aacc673d..6aed7b3d3fd 100644 --- a/test/3x/torch/quantization/weight_only/test_transformers.py +++ b/test/3x/torch/quantization/weight_only/test_transformers.py @@ -249,6 +249,17 @@ def test_vlm(self): assert isinstance(loaded_model.model.layers[0].self_attn.k_proj, WeightOnlyQuantizedLinear), "loaing model failed." # phi-3-vision-128k-instruct + woq_config = AutoRoundConfig( + bits=4, + group_size=128, + is_vlm=True, + dataset="NeelNanda/pile-10k", + iters=2, + n_samples=5, + seq_len=64, + batch_size=1, + ) model_name = "microsoft/Phi-3-vision-128k-instruct" woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True, attn_implementation='eager') assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." + From 138316580e3e00714719e3356aa04e4cde55eece Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 09:23:05 +0000 Subject: [PATCH 12/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/weight_only/autoround.py | 4 +++- .../transformers/quantization/utils.py | 24 ++++++++++--------- .../weight_only/test_transformers.py | 1 - 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 01e85120a78..1a52f3bb7b7 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -383,7 +383,9 @@ def get_mllm_dataloader( template, model=model, tokenizer=tokenizer, processor=processor, image_processor=image_processor ) dataset = template.default_dataset if dataset is None else dataset - if quant_nontext_module or (dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer, "cpu", template.model_type)): + if quant_nontext_module or ( + dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer, "cpu", template.model_type) + ): if quant_nontext_module: logger.warning( "Quantitative nontext module is not supported for plain text datasets," diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index 4ef6d84bd66..0ab18b91b0c 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -17,8 +17,8 @@ import json import math import os -import types import re +import types from datasets import load_dataset 
@@ -44,7 +44,6 @@ import transformers from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woq_linear - from typing import Union torch = LazyImport("torch") @@ -136,8 +135,9 @@ def _replace_linear( or (is_package_available("auto_round") and isinstance(module, auto_round_woq_linear)) ) and (name not in modules_to_not_convert): # Check if the current key is not in the `modules_to_not_convert` - if not any(key in ".".join(current_key_name) for key in modules_to_not_convert) and \ - not any(re.match(pattern, ".".join(current_key_name)) for pattern in modules_to_not_convert): + if not any(key in ".".join(current_key_name) for key in modules_to_not_convert) and not any( + re.match(pattern, ".".join(current_key_name)) for pattern in modules_to_not_convert + ): in_features = module.in_features out_features = module.out_features if device == "cpu" or device == torch.device("cpu") or device == "auto": @@ -145,7 +145,7 @@ def _replace_linear( from intel_extension_for_pytorch.utils.weight_only_quantization import ( _convert_optimum_format_to_desired, ) - + qweight = module.qweight scales = module.scales qzeros = module.qzeros @@ -552,14 +552,15 @@ def convert_to_quantized_model(model, config, device="cpu"): gradient_accumulate_steps=config.gradient_accumulate_steps, export_format=config.export_format, ) - + # vlm set non-text module config if config.is_vlm is True: from neural_compressor.torch.utils.utility import ( - get_multimodal_block_names, find_matching_blocks, get_layer_names_in_block, + get_multimodal_block_names, ) + def set_nontext_module_config(model, to_quant_block_names, config): all_block_list = get_multimodal_block_names(model, quant_vision=True) all_block_set = set(tuple(block) for block in all_block_list) @@ -568,25 +569,26 @@ def set_nontext_module_config(model, to_quant_block_names, config): set_to_full_prec = get_layer_names_in_block(model, to_quant_block_names=set_to_full_prec) for name in set_to_full_prec: config.modules_to_not_convert.append(name) - + # skip layers not in blocks config.modules_to_not_convert.append("model.vision_embed_tokens.img_projection*") config.modules_to_not_convert.append("transformer.visual.attn_pool.*_proj") config.modules_to_not_convert.append("model.mm_projector*") config.modules_to_not_convert.append("multi_modal_projector") config.modules_to_not_convert.append("visual.merger") - + all_blocks = get_multimodal_block_names(model, quant_config.quant_nontext_module) to_quant_block_names = find_matching_blocks(model, all_blocks, quant_config.to_quant_block_names) set_nontext_module_config(model, to_quant_block_names, config) - + for n, m in model.named_modules(): if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: config.modules_to_not_convert.append(n) print( f"{n} will not be quantized due to its shape not being divisible by 32," - " resulting in an exporting issue to autogptq") + " resulting in an exporting issue to autogptq" + ) if config.modules_to_not_convert != []: for module in config.modules_to_not_convert: module_name = ".*" + module diff --git a/test/3x/torch/quantization/weight_only/test_transformers.py b/test/3x/torch/quantization/weight_only/test_transformers.py index 6aed7b3d3fd..5c34c7f6e09 100644 --- a/test/3x/torch/quantization/weight_only/test_transformers.py +++ b/test/3x/torch/quantization/weight_only/test_transformers.py @@ -262,4 +262,3 @@ def test_vlm(self): model_name = 
"microsoft/Phi-3-vision-128k-instruct" woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True, attn_implementation='eager') assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." - From b25f8f58ee5a0c8ef84742f5b4d3e2972892308a Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 17:21:52 +0800 Subject: [PATCH 13/19] add torchvision into ut req Signed-off-by: Kaihui-intel --- test/3x/torch/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index 5b97060f9f8..c4e561a48e2 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -8,3 +8,4 @@ prettytable psutil pytest transformers +torchvision From a57af91b7497041e377ef817e907824865dd7113 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 10:17:03 +0000 Subject: [PATCH 14/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/3x/torch/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index c4e561a48e2..d9697dcac5e 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -7,5 +7,5 @@ peft prettytable psutil pytest -transformers torchvision +transformers From b239521333624d87c07e81acf443d4620d72ac0f Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 19:59:01 +0800 Subject: [PATCH 15/19] disable phi3 ut Signed-off-by: Kaihui-intel --- .../weight_only/test_transformers.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/test/3x/torch/quantization/weight_only/test_transformers.py b/test/3x/torch/quantization/weight_only/test_transformers.py index 5c34c7f6e09..3e576ecc7bb 100644 --- a/test/3x/torch/quantization/weight_only/test_transformers.py +++ b/test/3x/torch/quantization/weight_only/test_transformers.py @@ -226,7 +226,7 @@ def test_vlm(self): bits=4, group_size=128, is_vlm=True, - dataset="liuhaotian/llava_conv_58k", + dataset="NeelNanda/pile-10k", iters=2, n_samples=5, seq_len=512, @@ -248,17 +248,17 @@ def test_vlm(self): loaded_model = Qwen2VLForConditionalGeneration.from_pretrained("transformers_vlm_tmp") assert isinstance(loaded_model.model.layers[0].self_attn.k_proj, WeightOnlyQuantizedLinear), "loaing model failed." - # phi-3-vision-128k-instruct - woq_config = AutoRoundConfig( - bits=4, - group_size=128, - is_vlm=True, - dataset="NeelNanda/pile-10k", - iters=2, - n_samples=5, - seq_len=64, - batch_size=1, - ) - model_name = "microsoft/Phi-3-vision-128k-instruct" - woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True, attn_implementation='eager') - assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." 
+ # phi-3-vision-128k-instruct, disable as CI consumes too much time + # woq_config = AutoRoundConfig( + # bits=4, + # group_size=128, + # is_vlm=True, + # dataset="liuhaotian/llava_conv_58k", + # iters=2, + # n_samples=5, + # seq_len=64, + # batch_size=1, + # ) + # model_name = "microsoft/Phi-3-vision-128k-instruct" + # woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True, attn_implementation='eager') + # assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed." From 7aeeeca2e32226ec547d8efe9b0a5f7cb6290383 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 10 Feb 2025 21:36:11 +0800 Subject: [PATCH 16/19] reduce vlm ut Signed-off-by: Kaihui-intel --- test/3x/torch/quantization/weight_only/test_autoround.py | 8 ++++---- .../torch/quantization/weight_only/test_transformers.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py index 46a71cc2cfd..e7664d9ad35 100644 --- a/test/3x/torch/quantization/weight_only/test_autoround.py +++ b/test/3x/torch/quantization/weight_only/test_autoround.py @@ -238,13 +238,13 @@ def test_mllm(self): image_processor=None, dataset="liuhaotian/llava_conv_58k", extra_data_dir=None, - seqlen=512, + seqlen=32, batch_size=1, split=None, apply_template=None, truncation=False, seed=42, - nsamples=5, + nsamples=1, gradient_accumulate_steps=1, quant_nontext_module=False, processor=processor, @@ -253,9 +253,9 @@ def test_mllm(self): bits=4, group_size=128, is_mllm=True, - nsamples=5, + nsamples=1, batch_size=batch_size, - iters=2, + iters=1, seqlen=seqlen, quant_nontext_module=False, truncation=truncation, diff --git a/test/3x/torch/quantization/weight_only/test_transformers.py b/test/3x/torch/quantization/weight_only/test_transformers.py index 3e576ecc7bb..684bb2c14e4 100644 --- a/test/3x/torch/quantization/weight_only/test_transformers.py +++ b/test/3x/torch/quantization/weight_only/test_transformers.py @@ -227,9 +227,9 @@ def test_vlm(self): group_size=128, is_vlm=True, dataset="NeelNanda/pile-10k", - iters=2, - n_samples=5, - seq_len=512, + iters=1, + n_samples=1, + seq_len=32, batch_size=1, ) From 5a33c16a0773981b6cb604ba37478789fb8f505a Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Tue, 11 Feb 2025 10:25:51 +0800 Subject: [PATCH 17/19] remove specific index-url for torch installation to avoid conflict Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/install_nc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/scripts/install_nc.sh b/.azure-pipelines/scripts/install_nc.sh index c6a1e2cf9c0..ef45af29f89 100644 --- a/.azure-pipelines/scripts/install_nc.sh +++ b/.azure-pipelines/scripts/install_nc.sh @@ -9,7 +9,7 @@ if [[ $1 = *"3x_pt"* ]]; then python setup.py pt bdist_wheel else echo -e "\n Install torch CPU ... 
" - pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.5.1 python -m pip install intel-extension-for-pytorch==2.5.0 oneccl_bind_pt==2.5.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ python -m pip install --no-cache-dir -r requirements.txt python setup.py bdist_wheel From 95bac7c8971731c65798f58ae0498bc8a3deec36 Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Tue, 11 Feb 2025 10:57:05 +0800 Subject: [PATCH 18/19] update torch installation Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/install_nc.sh | 2 +- .azure-pipelines/scripts/ut/3x/run_3x_pt.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/scripts/install_nc.sh b/.azure-pipelines/scripts/install_nc.sh index ef45af29f89..c6a1e2cf9c0 100644 --- a/.azure-pipelines/scripts/install_nc.sh +++ b/.azure-pipelines/scripts/install_nc.sh @@ -9,7 +9,7 @@ if [[ $1 = *"3x_pt"* ]]; then python setup.py pt bdist_wheel else echo -e "\n Install torch CPU ... " - pip install torch==2.5.1 + pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu python -m pip install intel-extension-for-pytorch==2.5.0 oneccl_bind_pt==2.5.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ python -m pip install --no-cache-dir -r requirements.txt python setup.py bdist_wheel diff --git a/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh b/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh index 49bafea8bd0..38e8c7604a4 100644 --- a/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh +++ b/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh @@ -13,6 +13,7 @@ echo "##[section]import check pass" echo "##[group]set up UT env..." export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH pip install -r /neural-compressor/test/3x/torch/requirements.txt +pip install torch==2.5.1 # For auto-round pip install pytest-cov pip install pytest-html echo "##[endgroup]" From 9cd265e5ba6131c6601afeb15824bd309fd90097 Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Tue, 11 Feb 2025 11:11:17 +0800 Subject: [PATCH 19/19] fix torchvision==0.20.1 Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/ut/3x/run_3x_pt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh b/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh index 38e8c7604a4..458292afa8d 100644 --- a/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh +++ b/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh @@ -13,7 +13,7 @@ echo "##[section]import check pass" echo "##[group]set up UT env..." export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH pip install -r /neural-compressor/test/3x/torch/requirements.txt -pip install torch==2.5.1 # For auto-round +pip install torch==2.5.1 torchvision==0.20.1 # For auto-round pip install pytest-cov pip install pytest-html echo "##[endgroup]"