From 5f5d688b5422290d946339face0f6ebd8aa2cf91 Mon Sep 17 00:00:00 2001 From: violetch24 Date: Thu, 4 Jul 2024 02:25:14 -0700 Subject: [PATCH 01/10] example update for 3.x ipex sq Signed-off-by: violetch24 --- .../smooth_quant/requirements.txt | 1 + .../smooth_quant/run_clm_no_trainer.py | 65 ++++++++++--------- 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt index f0b56e558d3..d4155dfbf75 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt @@ -11,3 +11,4 @@ neural-compressor intel-extension-for-transformers lm_eval==0.4.2 peft +optimum-intel diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index ef0590e2982..1d6fba4fecb 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -52,6 +52,7 @@ help="calibration iters.") parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", type=str, help="tasks for accuracy validation") +parser.add_argument("--max_new_tokens", default=32, type=int, help="output max new tokens") parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") # ============SmoothQuant configs============== parser.add_argument("--sq", action="store_true") @@ -203,20 +204,23 @@ def run_fn(model): if args.load: - # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result if args.int8 or args.int8_bf16_mixed: - print("load int8 model") + print("Loading SmoothQuant int8 model.") from neural_compressor.torch.quantization import load + from intel_extension_for_transformers.transformers.llm.evaluation.models import ( + TSModelCausalLMForITREX, + ) tokenizer = AutoTokenizer.from_pretrained(args.model) config = AutoConfig.from_pretrained(args.model) + origin_model_type = config.model_type user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) - setattr(user_model, "config", config) + user_model = TSModelCausalLMForITREX(user_model, config=config) + user_model.config.model_type = origin_model_type else: user_model, tokenizer = get_user_model() if args.accuracy: - user_model.eval() from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", @@ -229,36 +233,33 @@ def run_fn(model): results = evaluate(eval_args) for task_name in args.tasks.split(","): if task_name == "wikitext": - acc = results["results"][task_name]["word_perplexity,none"] + print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity,none"])) else: - acc = results["results"][task_name]["acc,none"] - print("Accuracy: %.5f" % acc) - print('Batch size = %d' % args.batch_size) + print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc,none"])) + if args.performance: - user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + batch_size, input_leng = args.batch_size, 512 + example_inputs = torch.ones((batch_size, input_leng), dtype=torch.long) + print("Batch size = {:d}".format(batch_size)) + print("The length of input tokens = {:d}".format(input_leng)) import time - samples = args.iters * args.batch_size - eval_args = LMEvalParser( - model="hf", - user_model=user_model, - tokenizer=tokenizer, - batch_size=args.batch_size, - tasks=args.tasks, - limit=samples, - device="cpu", - ) - start = time.time() - results = evaluate(eval_args) - end = time.time() - for task_name in args.tasks.split(","): - if task_name == "wikitext": - acc = results["results"][task_name]["word_perplexity,none"] - else: - acc = results["results"][task_name]["acc,none"] - print("Accuracy: %.5f" % acc) - print('Throughput: %.3f samples/sec' % (samples / (end - start))) - print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) - print('Batch size = %d' % args.batch_size) + total_iters = args.iters + warmup_iters = 5 + with torch.no_grad(): + for i in range(total_iters): + if i == warmup_iters: + start = time.time() + user_model.generate( + example_inputs, + max_new_tokens=args.max_new_tokens, + do_sample=False, + temperature=0.9, + num_beams=4, + ) + end = time.time() + latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size) + throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start) + print("Latency: {:.3f} ms".format(latency * 10**3)) + print("Throughput: {:.3f} samples/sec".format(throughput)) From 6bdcbb03cfea0823a9098888285bdd9ca08ac6a1 Mon Sep 17 00:00:00 2001 From: violetch24 Date: Thu, 4 Jul 2024 23:08:49 -0700 Subject: [PATCH 02/10] add autotune to sq Signed-off-by: violetch24 --- .../smooth_quant/run_clm_no_trainer.py | 31 ++++++++++++++++--- .../torch/quantization/config.py | 6 +++- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index 1d6fba4fecb..122b6854b9b 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -192,14 +192,35 @@ def run_fn(model): if calib_iter >= args.calib_iters: break return - + + def eval_func(model): + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + if args.tasks == "wikitext": + return results["results"][args.tasks]["word_perplexity,none"] + else: + return results["results"][args.tasks]["acc,none"] + from utils import get_example_inputs example_inputs = get_example_inputs(user_model, calib_dataloader) - from neural_compressor.torch.quantization import prepare, convert - user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) - run_fn(user_model) - user_model = convert(user_model) + from neural_compressor.torch.quantization import SmoothQuantConfig, autotune, TuningConfig + tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning()) + user_model = autotune( + user_model, + tune_config=tune_config, + eval_fn=eval_func, + run_fn=run_fn, + example_inputs=example_inputs, + ) user_model.save(args.output_dir) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 71b01353d5a..861b27c9f62 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -1191,7 +1191,11 @@ def get_model_info(self, model: torch.nn.Module, example_inputs) -> List[Tuple[s @classmethod def get_config_set_for_tuning(cls) -> Union[None, "SmoothQuantConfig", List["SmoothQuantConfig"]]: - return SmoothQuantConfig(alpha=[0.1, 0.5], folding=[True, False], scale_sharing=[True, False]) + import numpy as np + + return SmoothQuantConfig( + alpha=np.arange(0.1, 1.0, 0.1).tolist(), folding=[True, False], scale_sharing=[True, False] + ) def get_default_sq_config() -> SmoothQuantConfig: From c2e5eabeb87676fe6f3bbb8885948de35f35506d Mon Sep 17 00:00:00 2001 From: violetch24 Date: Wed, 17 Jul 2024 19:12:32 -0700 Subject: [PATCH 03/10] modify test dir Signed-off-by: violetch24 --- examples/.config/model_params_pytorch_3x.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index bbbab60bdbc..bf754a5bd75 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -85,7 +85,7 @@ "batch_size": 8 }, "gpt_j_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", @@ -99,7 +99,7 @@ "batch_size": 1 }, "llama2_7b_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", @@ -113,7 +113,7 @@ "batch_size": 1 }, "opt_125m_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", From 8a2f714a49868e9875691c09a1c98689d7bbaffe Mon Sep 17 00:00:00 2001 From: violetch24 Date: Tue, 23 Jul 2024 00:17:54 -0700 Subject: [PATCH 04/10] fix precision init Signed-off-by: violetch24 --- .../quantization/static_quant/ipex/run_clm_no_trainer.py | 6 +++--- test/3x/torch/quantization/test_static_quant.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py index 0ccb2093537..b56c01f20f5 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py @@ -164,9 +164,9 @@ def get_user_model(): ) - from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig - quant_config = get_default_static_config() - quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + from neural_compressor.torch.quantization import StaticQuantConfig + excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + quant_config = StaticQuantConfig(excluded_precisions=excluded_precisions) if re.search("gpt", user_model.config.model_type): quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) diff --git a/test/3x/torch/quantization/test_static_quant.py b/test/3x/torch/quantization/test_static_quant.py index 4aecd29eecf..5bc37180045 100644 --- a/test/3x/torch/quantization/test_static_quant.py +++ b/test/3x/torch/quantization/test_static_quant.py @@ -216,7 +216,7 @@ def test_static_quant_with_quantize_API(self): def test_static_quant_mixed_precision(self): fp32_model = copy.deepcopy(self.fp32_model) example_inputs = self.input - quant_config = get_default_static_config() + quant_config = StaticQuantConfig(excluded_precisions=["bf16"]) prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs) run_fn(prepared_model) q_model = convert(prepared_model) @@ -229,7 +229,6 @@ def test_static_quant_mixed_precision(self): q_model = convert(prepared_model) assert q_model is not None, "Quantization failed!" - quant_config.excluded_precisions = ["bf16"] prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs) run_fn(prepared_model) q_model = convert(prepared_model) From 37518e318061a4d144d5d46e5f0db5ebfa429fdd Mon Sep 17 00:00:00 2001 From: violetch24 Date: Tue, 30 Jul 2024 00:50:44 -0700 Subject: [PATCH 05/10] minor fix Signed-off-by: violetch24 --- .../quantization/smooth_quant/run_clm_no_trainer.py | 3 +++ neural_compressor/torch/quantization/autotune.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index 115be74000f..cb363431dde 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -191,6 +191,9 @@ def run_fn(model): return def eval_func(model): + config = AutoConfig.from_pretrained(args.model) + setattr(model, "config", config) + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", diff --git a/neural_compressor/torch/quantization/autotune.py b/neural_compressor/torch/quantization/autotune.py index 7a53b54b0d5..2c6dcaa768f 100644 --- a/neural_compressor/torch/quantization/autotune.py +++ b/neural_compressor/torch/quantization/autotune.py @@ -81,7 +81,7 @@ def autotune( best_quant_model = None eval_func_wrapper = EvaluationFuncWrapper(eval_fn, eval_args) config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) - baseline: float = eval_func_wrapper.evaluate(model) + baseline: float = eval_func_wrapper.evaluate(deepcopy(model)) tuning_monitor.set_baseline(baseline) tuning_logger.tuning_start() for trial_index, quant_config in enumerate(config_loader, 1): From 03eecf1bca3000fe20cea600bd6a24dd92fcc907 Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Tue, 30 Jul 2024 15:52:37 +0800 Subject: [PATCH 06/10] Update run_clm_no_trainer.py --- .../quantization/smooth_quant/run_clm_no_trainer.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index cb363431dde..8b19c325f4b 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -162,15 +162,6 @@ def get_user_model(): collate_fn=calib_evaluator.collate_batch, ) - from neural_compressor.torch.quantization import SmoothQuantConfig - - args.alpha = eval(args.alpha) - excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - quant_config = SmoothQuantConfig(alpha=args.alpha, folding=False, excluded_precisions=excluded_precisions) - - if re.search("gpt", user_model.config.model_type): - quant_config.set_local(torch.add, SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32")) - from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device from tqdm import tqdm From 2e55e40a8807f7b2077ef435b567c193da43900a Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Tue, 30 Jul 2024 15:53:47 +0800 Subject: [PATCH 07/10] Update config.py --- neural_compressor/torch/quantization/config.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 8396baa2930..ee1bdb10a42 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -1475,7 +1475,10 @@ def get_config_set_for_tuning(cls) -> Union[None, "SmoothQuantConfig", List["Smo import numpy as np return SmoothQuantConfig( - alpha=np.arange(0.1, 1.0, 0.1).tolist(), folding=[True, False], scale_sharing=[True, False] + alpha=np.arange(0.1, 1.0, 0.1).tolist(), + folding=[True, False], + scale_sharing=[True, False], + excluded_precisions=["bf16"] ) From f5998a68ac0d913d7060a2602ba37fc434f4b50c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 07:55:29 +0000 Subject: [PATCH 08/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/quantization/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index ee1bdb10a42..5fa47d30716 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -1478,7 +1478,7 @@ def get_config_set_for_tuning(cls) -> Union[None, "SmoothQuantConfig", List["Smo alpha=np.arange(0.1, 1.0, 0.1).tolist(), folding=[True, False], scale_sharing=[True, False], - excluded_precisions=["bf16"] + excluded_precisions=["bf16"], ) From bd82e023fbfcefe2760be065ba1345f866748d75 Mon Sep 17 00:00:00 2001 From: violetch24 Date: Tue, 30 Jul 2024 22:48:40 -0700 Subject: [PATCH 09/10] revert tsmodel Signed-off-by: violetch24 --- .../quantization/smooth_quant/run_clm_no_trainer.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index 8b19c325f4b..5918ba91c69 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -217,18 +217,15 @@ def eval_func(model): if args.load: + # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result if args.int8 or args.int8_bf16_mixed: - print("Loading SmoothQuant int8 model.") + print("load int8 model") from neural_compressor.torch.quantization import load - from intel_extension_for_transformers.transformers.llm.evaluation.models import ( - TSModelCausalLMForITREX, - ) + tokenizer = AutoTokenizer.from_pretrained(args.model) config = AutoConfig.from_pretrained(args.model) - origin_model_type = config.model_type user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) - user_model = TSModelCausalLMForITREX(user_model, config=config) - user_model.config.model_type = origin_model_type + setattr(user_model, "config", config) else: user_model, tokenizer = get_user_model() From c2cf0bf0d3f8e081cf89e6d6da43a0de56e32807 Mon Sep 17 00:00:00 2001 From: violetch24 Date: Tue, 30 Jul 2024 23:03:56 -0700 Subject: [PATCH 10/10] minor fix Signed-off-by: violetch24 --- .../quantization/smooth_quant/run_clm_no_trainer.py | 12 +++--------- neural_compressor/torch/quantization/config.py | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index 5918ba91c69..694c0505ea4 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -231,6 +231,7 @@ def eval_func(model): if args.accuracy: + user_model.eval() from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( @@ -250,6 +251,7 @@ def eval_func(model): if args.performance: + user_model.eval() batch_size, input_leng = args.batch_size, 512 example_inputs = torch.ones((batch_size, input_leng), dtype=torch.long) print("Batch size = {:d}".format(batch_size)) @@ -262,15 +264,7 @@ def eval_func(model): for i in range(total_iters): if i == warmup_iters: start = time.time() - - user_model.generate( - example_inputs, - max_new_tokens=args.max_new_tokens, - do_sample=False, - temperature=0.9, - num_beams=4, - ) - + user_model(example_inputs) end = time.time() latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size) throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 28f4172f806..a335864e87f 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -1478,7 +1478,7 @@ def get_config_set_for_tuning(cls) -> Union[None, "SmoothQuantConfig", List["Smo alpha=np.arange(0.1, 1.0, 0.1).tolist(), folding=[True, False], scale_sharing=[True, False], - excluded_precisions=["bf16"], + excluded_precisions=[["bf16"]], )