From 08e6578574c0cdf536f9a3ba7935b763f3f486d7 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Wed, 7 Dec 2022 09:47:35 +0800
Subject: [PATCH 1/2] add export for new API examples

Signed-off-by: Xin He
---
 .../quantization/ptq_dynamic/eager/README.md | 5 +++
 .../ptq_dynamic/eager/requirements.txt | 4 +-
 .../ptq_dynamic/eager/run_glue_tune.py | 35 ++++++++++++---
 .../ptq_dynamic/eager/run_tuning.sh | 1 +
 .../quantization/ptq_static/fx/README.md | 8 ++++
 .../ptq_static/fx/requirements.txt | 2 +
 .../quantization/ptq_static/fx/run_glue.py | 44 ++++++++++++++++++-
 .../quantization/ptq_static/fx/run_tuning.sh | 1 +
 .../quantization/qat/fx/README.md | 8 ++++
 .../quantization/qat/fx/requirements.txt | 2 +
 .../quantization/qat/fx/run_glue_tune.py | 40 +++++++++++++++++
 .../quantization/qat/fx/run_tuning.sh | 4 +-
 12 files changed, 145 insertions(+), 9 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md
index ac449cdb781..016d8d99456 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md
@@ -198,5 +198,10 @@ Shapley values originate from cooperative game theory that come with desirable p
 > **Note** : run_glue_tune_with_shap.py is the example of "SST2" task. If you want to execute other glue task, you may take some slight change under "ShapleyMSE" class.
 
+# Appendix
+## Export to ONNX
+Right now, we experimentally support exporting a PyTorch model to an ONNX model, including both FP32 and INT8 models.
+
+By passing the `--onnx` argument, Intel Neural Compressor exports the dynamically quantized model to an INT8 ONNX model.
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt
index 7ad9dc04d0c..688b5217718 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt
@@ -6,4 +6,6 @@ torch >= 1.3
 transformers>=4.10.0
 shap
 scipy
-sacremoses
\ No newline at end of file
+sacremoses
+onnx
+onnxruntime
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py
index 13812b30b4e..b41c077ac59 100755
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py
@@ -144,18 +144,25 @@ class ModelArguments:
     tune: bool = field(
         default=False,
         metadata={
-            "help": "tune quantized model with Intel Neural Compressor)."
-        },
+            "help": "tune quantized model with Intel Neural Compressor."},
     )
     benchmark: bool = field(
         default=False,
-        metadata={"help": "run benchmark."})
+        metadata={"help": "run benchmark."},
+    )
     int8: bool = field(
         default=False,
-        metadata={"help":"run benchmark."})
+        metadata={"help":"initialize int8 model."},
+    )
     accuracy_only: bool = field(
         default=False,
-        metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."})
+        metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."},
+    )
+    onnx: bool = field(
+        default=False, metadata={"help": "convert PyTorch model to ONNX"}
+    )
+
+
 def main():
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
@@ -439,6 +446,24 @@ def eval_func_for_nc(model_tuned):
         q_model = fit(model, conf=conf, eval_func=eval_func_for_nc)
         from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
         save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
+
+        if model_args.onnx:
+            eval_dataloader = trainer.get_eval_dataloader()
+            it = iter(eval_dataloader)
+            input = next(it)
+            input.pop('labels')
+            symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+            dynamic_axes = {k: symbolic_names for k in input.keys()}
+            from neural_compressor.config import Torch2ONNXConfig
+            int8_onnx_config = Torch2ONNXConfig(
+                dtype="int8",
+                opset_version=14,
+                example_inputs=tuple(input.values()),
+                input_names=list(input.keys()),
+                output_names=['labels'],
+                dynamic_axes=dynamic_axes,
+            )
+            q_model.export('int8-nlp-model.onnx', int8_onnx_config)
         exit(0)
 
     if model_args.accuracy_only:
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh
index e01add178fb..edc07713079 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh
@@ -86,6 +86,7 @@ function run_tuning {
         --no_cuda \
         --output_dir ${tuned_checkpoint} \
         --tune \
+        --onnx \
         ${extra_cmd}
 }
 
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md
index 881332a1314..d9b82bf907b 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md
@@ -187,3 +187,11 @@ quantizer.model = common.Model(model)
 model = quantizer.fit()
 model.save(training_args.output_dir)
 ```
+
+# Appendix
+
+## Export to ONNX
+
+Right now, we experimentally support exporting a PyTorch model to an ONNX model, including both FP32 and INT8 models.
+
+By passing the `--onnx` argument, Intel Neural Compressor exports an FP32 ONNX model, an INT8 QDQ ONNX model, and an INT8 QLinear ONNX model.
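For readers who want to try the exported files right away, a minimal smoke test with onnxruntime could look like the sketch below. The file name follows the `q_model.export(...)` calls added in this patch; the input names, the `int64` dtype, and the sequence length are assumptions that depend on the tokenizer and task, so adjust them to whatever `session.get_inputs()` reports.

```python
# Minimal sketch: load the exported INT8 QDQ model and run one dummy batch.
# Assumptions: BERT-style integer inputs (e.g. input_ids / attention_mask /
# token_type_ids), int64 dtype, and the dynamic batch/sequence axes configured
# by Torch2ONNXConfig above. Replace the dummy zeros with real tokenized data
# to compare logits against the PyTorch model.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("int8-nlp-qdq-model.onnx",
                               providers=["CPUExecutionProvider"])
print([inp.name for inp in session.get_inputs()])  # actual input names of the export

batch_size, seq_len = 1, 128  # arbitrary; both axes were exported as dynamic
feeds = {inp.name: np.zeros((batch_size, seq_len), dtype=np.int64)
         for inp in session.get_inputs()}

logits = session.run(None, feeds)[0]  # single output, named 'labels' in the export config
print(logits.shape)                   # expected: (batch_size, num_labels)
```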
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt index fbbce5e4433..01afab8e2ae 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt @@ -4,6 +4,8 @@ protobuf scipy scikit-learn Keras-Preprocessing +onnx +onnxruntime transformers >= 4.16.0 --find-links https://download.pytorch.org/whl/torch_stable.html torch >= 1.8.0+cpu diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py index 717ae91d886..113bfa69341 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py @@ -195,6 +195,9 @@ class ModelArguments: accuracy_only: bool = field( default=False, metadata={"help": "get accuracy"} ) + onnx: bool = field( + default=False, metadata={"help": "convert PyTorch model to ONNX"} + ) def main(): @@ -502,9 +505,46 @@ def eval_func(model): from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion tuning_criterion = TuningCriterion(max_trials=600) conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion) - model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func) + q_model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func) from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream - save_for_huggingface_upstream(model, tokenizer, training_args.output_dir) + save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir) + + if model_args.onnx: + it = iter(eval_dataloader) + input = next(it) + input.pop('labels') + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + dynamic_axes = {k: symbolic_names for k in input.keys()} + from neural_compressor.config import Torch2ONNXConfig + fp32_onnx_config = Torch2ONNXConfig( + dtype="fp32", + opset_version=14, + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('fp32-model.onnx', fp32_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('int8-nlp-qdq-model.onnx', int8_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QLinear", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('int8-nlp-qlinear-model.onnx', int8_onnx_config) return if model_args.benchmark or model_args.accuracy_only: diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh index a3f5c6934c7..45279ae5308 100644 --- 
a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh
@@ -47,6 +47,7 @@ function run_tuning {
     if [ "${topology}" = "bert_base_MRPC" ];then
         TASK_NAME='mrpc'
         model_name_or_path=${input_model}
+        extra_cmd='--onnx'
     elif [ "${topology}" = "bert_base_CoLA" ]; then
         TASK_NAME='cola'
         model_name_or_path=${input_model}
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md
index e1c802c7ff2..fc6d1ccd4e1 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md
@@ -117,3 +117,11 @@ model = OptimizedModel.from_pretrained(
 ```
 
 We also upstreamed several int8 models into HuggingFace [model hub](https://huggingface.co/models?other=Intel%C2%AE%20Neural%20Compressor) for users to ramp up.
+
+# Appendix
+
+## Export to ONNX
+
+Right now, we experimentally support exporting a PyTorch model to an ONNX model, including both FP32 and INT8 models.
+
+By passing the `--onnx` argument, Intel Neural Compressor exports an FP32 ONNX model, an INT8 QDQ ONNX model, and an INT8 QLinear ONNX model.
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt
index 5386769210e..2bb6fc03b2d 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt
@@ -4,5 +4,7 @@ datasets == 1.18.0
 sentencepiece != 0.1.92
 protobuf
 scipy
+onnx
+onnxruntime
 --find-links https://download.pytorch.org/whl/torch_stable.html
 torch >= 1.8.0+cpu
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py
index f5bc771e712..f9fe765dbc2 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py
@@ -194,6 +194,9 @@ class ModelArguments:
     benchmark: bool = field(
         default=False, metadata={"help": "get benchmark instead of accuracy"}
     )
+    onnx: bool = field(
+        default=False, metadata={"help": "convert PyTorch model to ONNX"}
+    )
 
 
 def main():
@@ -533,6 +536,43 @@ def benchmark(model):
         from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
         save_for_huggingface_upstream(model, tokenizer, training_args.output_dir)
+
+        if model_args.onnx:
+            it = iter(eval_dataloader)
+            input = next(it)
+            input.pop('labels')
+            symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+            dynamic_axes = {k: symbolic_names for k in input.keys()}
+            from neural_compressor.config import Torch2ONNXConfig
+            fp32_onnx_config = Torch2ONNXConfig(
+                dtype="fp32",
+                opset_version=14,
+                example_inputs=tuple(input.values()),
+                input_names=list(input.keys()),
+                output_names=['labels'],
+                dynamic_axes=dynamic_axes,
+            )
+            model.export('fp32-model.onnx', fp32_onnx_config)
+            int8_onnx_config = Torch2ONNXConfig(
+                dtype="int8",
+                opset_version=14,
+                quant_format="QDQ",
+
example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + model.export('int8-nlp-qdq-model.onnx', int8_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QLinear", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + model.export('int8-nlp-qlinear-model.onnx', int8_onnx_config) return if model_args.benchmark: diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh index 888a8968d24..55b74b334ee 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh @@ -14,6 +14,7 @@ function init_params { task_name="mrpc" output_model="saved_results" input_model="bert-base-cased" + extra_cmd='--onnx' for var in "$@" do case $var in @@ -60,7 +61,8 @@ function run_tuning { --save_strategy steps \ --metric_for_best_model f1 \ --save_total_limit 1 \ - --tune + --tune \ + ${extra_cmd} } main "$@" From 7d87133bf6100d44faac7286e8d437613dbd71cc Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 7 Dec 2022 09:53:04 +0800 Subject: [PATCH 2/2] test onnx for all topology Signed-off-by: Xin He --- .../quantization/ptq_static/fx/run_tuning.sh | 2 +- .../text-classification/quantization/qat/fx/run_tuning.sh | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh index 45279ae5308..19712872786 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh @@ -47,7 +47,6 @@ function run_tuning { if [ "${topology}" = "bert_base_MRPC" ];then TASK_NAME='mrpc' model_name_or_path=${input_model} - extra_cmd='--onnx' elif [ "${topology}" = "bert_base_CoLA" ]; then TASK_NAME='cola' model_name_or_path=${input_model} @@ -93,6 +92,7 @@ function run_tuning { --no_cuda \ --output_dir ${tuned_checkpoint} \ --tune \ + --onnx \ --overwrite_output_dir \ ${extra_cmd} } diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh index 55b74b334ee..31d6f314e8b 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh @@ -14,7 +14,6 @@ function init_params { task_name="mrpc" output_model="saved_results" input_model="bert-base-cased" - extra_cmd='--onnx' for var in "$@" do case $var in @@ -61,8 +60,8 @@ function run_tuning { --save_strategy steps \ --metric_for_best_model f1 \ --save_total_limit 1 \ - --tune \ - ${extra_cmd} + --onnx \ + --tune } main "$@"
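After running the tuning scripts above with `--onnx`, a quick structural check of the exported files can catch a broken graph early. Below is a minimal sketch; the file names are assumptions taken from the export calls in these patches (the dynamic eager example writes `int8-nlp-model.onnx` instead), so adjust the list to what your run actually produced.

```python
# Minimal sketch: verify that the exported ONNX files are structurally valid.
# File names are assumptions based on the export calls in the patches above.
import onnx

for path in ("fp32-model.onnx", "int8-nlp-qdq-model.onnx", "int8-nlp-qlinear-model.onnx"):
    model = onnx.load(path)
    onnx.checker.check_model(model)  # raises if the graph or opset is malformed
    print(f"{path}: opset {model.opset_import[0].version}, {len(model.graph.node)} nodes")
```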