From 08e6578574c0cdf536f9a3ba7935b763f3f486d7 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Wed, 7 Dec 2022 09:47:35 +0800
Subject: [PATCH 1/2] add export for new API examples

Signed-off-by: Xin He
---
 .../quantization/ptq_dynamic/eager/README.md | 5 +++
 .../ptq_dynamic/eager/requirements.txt | 4 +-
 .../ptq_dynamic/eager/run_glue_tune.py | 35 ++++++++++++---
 .../ptq_dynamic/eager/run_tuning.sh | 1 +
 .../quantization/ptq_static/fx/README.md | 8 ++++
 .../ptq_static/fx/requirements.txt | 2 +
 .../quantization/ptq_static/fx/run_glue.py | 44 ++++++++++++++++++-
 .../quantization/ptq_static/fx/run_tuning.sh | 1 +
 .../quantization/qat/fx/README.md | 8 ++++
 .../quantization/qat/fx/requirements.txt | 2 +
 .../quantization/qat/fx/run_glue_tune.py | 40 +++++++++++++++++
 .../quantization/qat/fx/run_tuning.sh | 4 +-
 12 files changed, 145 insertions(+), 9 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md
index ac449cdb781..016d8d99456 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md
@@ -198,5 +198,10 @@ Shapley values originate from cooperative game theory that come with desirable p
 > **Note** : run_glue_tune_with_shap.py is the example of "SST2" task. If you want to execute other glue task, you may take some slight change under "ShapleyMSE" class.
 
+# Appendix
+## Export to ONNX
+Right now, we experimentally support exporting a PyTorch model to an ONNX model, including both FP32 and INT8 models.
+
+By passing the `--onnx` argument, Intel Neural Compressor exports the dynamically quantized model to an INT8 ONNX model.
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt
index 7ad9dc04d0c..688b5217718 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt
@@ -6,4 +6,6 @@ torch >= 1.3
 transformers>=4.10.0
 shap
 scipy
-sacremoses
\ No newline at end of file
+sacremoses
+onnx
+onnxruntime
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py
index 13812b30b4e..b41c077ac59 100755
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py
@@ -144,18 +144,25 @@ class ModelArguments:
     tune: bool = field(
         default=False,
         metadata={
-            "help": "tune quantized model with Intel Neural Compressor)."
-        },
+            "help": "tune quantized model with Intel Neural Compressor."},
     )
     benchmark: bool = field(
         default=False,
-        metadata={"help": "run benchmark."})
+        metadata={"help": "run benchmark."},
+    )
     int8: bool = field(
         default=False,
-        metadata={"help":"run benchmark."})
+        metadata={"help":"initialize int8 model."},
+    )
     accuracy_only: bool = field(
         default=False,
-        metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."})
+        metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."},
+    )
+    onnx: bool = field(
+        default=False, metadata={"help": "convert PyTorch model to ONNX"}
+    )
+
+
 def main():
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
@@ -439,6 +446,24 @@ def eval_func_for_nc(model_tuned):
         q_model = fit(model, conf=conf, eval_func=eval_func_for_nc)
         from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
         save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
+
+        if model_args.onnx:
+            eval_dataloader = trainer.get_eval_dataloader()
+            it = iter(eval_dataloader)
+            input = next(it)
+            input.pop('labels')
+            symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+            dynamic_axes = {k: symbolic_names for k in input.keys()}
+            from neural_compressor.config import Torch2ONNXConfig
+            int8_onnx_config = Torch2ONNXConfig(
+                dtype="int8",
+                opset_version=14,
+                example_inputs=tuple(input.values()),
+                input_names=list(input.keys()),
+                output_names=['labels'],
+                dynamic_axes=dynamic_axes,
+            )
+            q_model.export('int8-nlp-model.onnx', int8_onnx_config)
         exit(0)
 
     if model_args.accuracy_only:
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh
index e01add178fb..edc07713079 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh
@@ -86,6 +86,7 @@ function run_tuning {
         --no_cuda \
         --output_dir ${tuned_checkpoint} \
         --tune \
+        --onnx \
         ${extra_cmd}
 }
 
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md
index 881332a1314..d9b82bf907b 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md
@@ -187,3 +187,11 @@ quantizer.model = common.Model(model)
 model = quantizer.fit()
 model.save(training_args.output_dir)
 ```
+
+# Appendix
+
+## Export to ONNX
+
+Right now, we experimentally support exporting a PyTorch model to an ONNX model, including both FP32 and INT8 models.
+
+By passing the `--onnx` argument, Intel Neural Compressor exports an FP32 ONNX model, an INT8 QDQ ONNX model, and an INT8 QLinear ONNX model.
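For readers who want to try the exported files right away, a minimal smoke test with onnxruntime could look like the sketch below. The file name follows the `q_model.export(...)` calls added in this patch; the input names, the `int64` dtype, and the sequence length are assumptions that depend on the tokenizer and task, so adjust them to whatever `session.get_inputs()` reports.

```python
# Minimal sketch: load the exported INT8 QDQ model and run one dummy batch.
# Assumptions: BERT-style integer inputs (e.g. input_ids / attention_mask /
# token_type_ids), int64 dtype, and the dynamic batch/sequence axes configured
# by Torch2ONNXConfig above. Replace the dummy zeros with real tokenized data
# to compare logits against the PyTorch model.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("int8-nlp-qdq-model.onnx",
                               providers=["CPUExecutionProvider"])
print([inp.name for inp in session.get_inputs()])  # actual input names of the export

batch_size, seq_len = 1, 128  # arbitrary; both axes were exported as dynamic
feeds = {inp.name: np.zeros((batch_size, seq_len), dtype=np.int64)
         for inp in session.get_inputs()}

logits = session.run(None, feeds)[0]  # single output, named 'labels' in the export config
print(logits.shape)                   # expected: (batch_size, num_labels)
```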
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt index fbbce5e4433..01afab8e2ae 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt @@ -4,6 +4,8 @@ protobuf scipy scikit-learn Keras-Preprocessing +onnx +onnxruntime transformers >= 4.16.0 --find-links https://download.pytorch.org/whl/torch_stable.html torch >= 1.8.0+cpu diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py index 717ae91d886..113bfa69341 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py @@ -195,6 +195,9 @@ class ModelArguments: accuracy_only: bool = field( default=False, metadata={"help": "get accuracy"} ) + onnx: bool = field( + default=False, metadata={"help": "convert PyTorch model to ONNX"} + ) def main(): @@ -502,9 +505,46 @@ def eval_func(model): from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion tuning_criterion = TuningCriterion(max_trials=600) conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion) - model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func) + q_model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func) from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream - save_for_huggingface_upstream(model, tokenizer, training_args.output_dir) + save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir) + + if model_args.onnx: + it = iter(eval_dataloader) + input = next(it) + input.pop('labels') + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + dynamic_axes = {k: symbolic_names for k in input.keys()} + from neural_compressor.config import Torch2ONNXConfig + fp32_onnx_config = Torch2ONNXConfig( + dtype="fp32", + opset_version=14, + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('fp32-model.onnx', fp32_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('int8-nlp-qdq-model.onnx', int8_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QLinear", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('int8-nlp-qlinear-model.onnx', int8_onnx_config) return if model_args.benchmark or model_args.accuracy_only: diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh index a3f5c6934c7..45279ae5308 100644 --- 
a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh
@@ -47,6 +47,7 @@ function run_tuning {
     if [ "${topology}" = "bert_base_MRPC" ];then
         TASK_NAME='mrpc'
         model_name_or_path=${input_model}
+        extra_cmd='--onnx'
     elif [ "${topology}" = "bert_base_CoLA" ]; then
         TASK_NAME='cola'
         model_name_or_path=${input_model}
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md
index e1c802c7ff2..fc6d1ccd4e1 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md
@@ -117,3 +117,11 @@ model = OptimizedModel.from_pretrained(
 ```
 
 We also upstreamed several int8 models into HuggingFace [model hub](https://huggingface.co/models?other=Intel%C2%AE%20Neural%20Compressor) for users to ramp up.
+
+# Appendix
+
+## Export to ONNX
+
+Right now, we experimentally support exporting a PyTorch model to an ONNX model, including both FP32 and INT8 models.
+
+By passing the `--onnx` argument, Intel Neural Compressor exports an FP32 ONNX model, an INT8 QDQ ONNX model, and an INT8 QLinear ONNX model.
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt
index 5386769210e..2bb6fc03b2d 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt
@@ -4,5 +4,7 @@ datasets == 1.18.0
 sentencepiece != 0.1.92
 protobuf
 scipy
+onnx
+onnxruntime
 --find-links https://download.pytorch.org/whl/torch_stable.html
 torch >= 1.8.0+cpu
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py
index f5bc771e712..f9fe765dbc2 100644
--- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py
+++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py
@@ -194,6 +194,9 @@ class ModelArguments:
     benchmark: bool = field(
         default=False, metadata={"help": "get benchmark instead of accuracy"}
     )
+    onnx: bool = field(
+        default=False, metadata={"help": "convert PyTorch model to ONNX"}
+    )
 
 
 def main():
@@ -533,6 +536,43 @@ def benchmark(model):
         from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
         save_for_huggingface_upstream(model, tokenizer, training_args.output_dir)
+
+        if model_args.onnx:
+            it = iter(eval_dataloader)
+            input = next(it)
+            input.pop('labels')
+            symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+            dynamic_axes = {k: symbolic_names for k in input.keys()}
+            from neural_compressor.config import Torch2ONNXConfig
+            fp32_onnx_config = Torch2ONNXConfig(
+                dtype="fp32",
+                opset_version=14,
+                example_inputs=tuple(input.values()),
+                input_names=list(input.keys()),
+                output_names=['labels'],
+                dynamic_axes=dynamic_axes,
+            )
+            model.export('fp32-model.onnx', fp32_onnx_config)
+            int8_onnx_config = Torch2ONNXConfig(
+                dtype="int8",
+                opset_version=14,
+                quant_format="QDQ",
+
example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + model.export('int8-nlp-qdq-model.onnx', int8_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QLinear", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + model.export('int8-nlp-qlinear-model.onnx', int8_onnx_config) return if model_args.benchmark: diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh index 888a8968d24..55b74b334ee 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh @@ -14,6 +14,7 @@ function init_params { task_name="mrpc" output_model="saved_results" input_model="bert-base-cased" + extra_cmd='--onnx' for var in "$@" do case $var in @@ -60,7 +61,8 @@ function run_tuning { --save_strategy steps \ --metric_for_best_model f1 \ --save_total_limit 1 \ - --tune + --tune \ + ${extra_cmd} } main "$@" From 7d87133bf6100d44faac7286e8d437613dbd71cc Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 7 Dec 2022 09:53:04 +0800 Subject: [PATCH 2/2] test onnx for all topology Signed-off-by: Xin He --- .../quantization/ptq_static/fx/run_tuning.sh | 2 +- .../text-classification/quantization/qat/fx/run_tuning.sh | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh index 45279ae5308..19712872786 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh @@ -47,7 +47,6 @@ function run_tuning { if [ "${topology}" = "bert_base_MRPC" ];then TASK_NAME='mrpc' model_name_or_path=${input_model} - extra_cmd='--onnx' elif [ "${topology}" = "bert_base_CoLA" ]; then TASK_NAME='cola' model_name_or_path=${input_model} @@ -93,6 +92,7 @@ function run_tuning { --no_cuda \ --output_dir ${tuned_checkpoint} \ --tune \ + --onnx \ --overwrite_output_dir \ ${extra_cmd} } diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh index 55b74b334ee..31d6f314e8b 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh @@ -14,7 +14,6 @@ function init_params { task_name="mrpc" output_model="saved_results" input_model="bert-base-cased" - extra_cmd='--onnx' for var in "$@" do case $var in @@ -61,8 +60,8 @@ function run_tuning { --save_strategy steps \ --metric_for_best_model f1 \ --save_total_limit 1 \ - --tune \ - ${extra_cmd} + --onnx \ + --tune } main "$@"
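After running the tuning scripts above with `--onnx`, a quick structural check of the exported files can catch a broken graph early. Below is a minimal sketch; the file names are assumptions taken from the export calls in these patches (the dynamic eager example writes `int8-nlp-model.onnx` instead), so adjust the list to what your run actually produced.

```python
# Minimal sketch: verify that the exported ONNX files are structurally valid.
# File names are assumptions based on the export calls in the patches above.
import onnx

for path in ("fp32-model.onnx", "int8-nlp-qdq-model.onnx", "int8-nlp-qlinear-model.onnx"):
    model = onnx.load(path)
    onnx.checker.check_model(model)  # raises if the graph or opset is malformed
    print(f"{path}: opset {model.opset_import[0].version}, {len(model.graph.node)} nodes")
```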