Commit ae8476e

update the sparse example doc/ transpose mode code (#247)
* update the readME
* fix the bug of layernorm fusion
* modify the layernorm fusion
* cleancode
* add the example file
* update README
* add the export tranpose ir
* add calibrate method and sample size to trainer
* add the new export
* add the typo
* fix typo
* update the readme

Co-authored-by: Xu, Zhenzhong <[email protected]>
1 parent 5847101 commit ae8476e

File tree

10 files changed: +771 / -55 lines

examples/deployment/neural_engine/sparse/bert_mini/README.md

Lines changed: 12 additions & 12 deletions
@@ -64,10 +64,18 @@ python prepare_dataset.py --dataset_name=glue --task_name=sst2 --output_dir=./da
### 2.2 Get sparse model

Neural Engine can parse Sparse ONNX model and Neural Engine IR.
-You can train a Bert mini sst2 sparse model with distillation through Neural Compressor [example](https://github.com/intel-innersource/frameworks.ai.lpot.intel-lpot/blob/28e9b1e66c23f4443a2be8f2926fee1e919f5a14/examples/pytorch/nlp/huggingface_models/text-classification/pruning_while_distillation/group_lasso/eager/README.md). and transpose the weight and activation to get better performance.
-Neural Engine will automatically detect weight structured sparse ratio, as long as it beyond 70% (since normaly get performance gain when sparse ratio beyond 70%), Neural Engine will call [SparseLib](https://github.com/intel-innersource/frameworks.ai.nlp-toolkit.intel-nlp-toolkit/tree/develop/nlp_toolkit/backends/neural_engine/SparseLib) kernels and high performance layernorm op with transpose mode to improve inference performance.
+You can train a Bert mini SST-2 sparse model with distillation through the Neural Compressor [example](https://github.com/intel-innersource/frameworks.ai.lpot.intel-lpot/blob/28e9b1e66c23f4443a2be8f2926fee1e919f5a14/examples/pytorch/nlp/huggingface_models/text-classification/pruning_while_distillation/group_lasso/eager/README.md), or use the [sparse model](https://huggingface.co/Intel/bert-mini-sst2-distilled-sparse-90-1X4-block) we published on Hugging Face, which is Bert mini fine-tuned on SST-2 with a 90% sparse ratio in a 1X4 block pattern.
+You can get an INT8 ONNX sparse model from the optimization module by setting precision=int8, using the following command:
+```shell
+bash prepare_model.sh --input_model=Intel/bert-mini-sst2-distilled-sparse-90-1X4-block --task_name=sst2 --output_dir=./model_and_tokenizer --precision=int8
+```
+Then generate the transposed sparse model for better performance:
+```shell
+python export_tranpose_ir.py --input_model=./model_and_tokenizer/int8-model.onnx
+```

### Benchmark
+Neural Engine automatically detects the weight structured sparse ratio. As long as it is beyond 70% (performance gains are normally seen when the sparse ratio is beyond 70%), Neural Engine will call [SparseLib](https://github.com/intel-innersource/frameworks.ai.nlp-toolkit.intel-nlp-toolkit/tree/develop/nlp_toolkit/backends/neural_engine/SparseLib) kernels and a high-performance LayerNorm op with transpose mode to improve inference performance.

2.1 accuracy
run python
@@ -92,15 +100,7 @@ Neural Engine will automatically detect weight structured sparse ratio, as long
bash run_benchmark.sh --input_model=./sparse_int8_ir --mode=performance --batch_size=8 --seq_len=128
```

-or compile framwork model to IR using python API
-
-```
-from nlp_toolkit.backends.neural_engine.compile import compile
-graph = compile('./sparse_int8_ir')
-graph.save('./ir')
-```
-
-and run C++
+Or run C++
The warmup below is recommended to be 1/10 of iterations and no less than 3.

```
@@ -110,5 +110,5 @@ Neural Engine will automatically detect weight structured sparse ratio, as long
export UNIFIED_BUFFER=1
numactl -C 0-<cpu_cores-1> <NLP_Toolkit_folder>/nlp_toolkit/backends/neural_engine/bin/neural_engine
--batch_size=<batch_size> --iterations=<iterations> --w=<warmup>
---seq_len=128 --config=./ir/conf.yaml --weight=./ir/model.bin
+--seq_len=128 --config=./sparse_int8_ir/conf.yaml --weight=./sparse_int8_ir/model.bin
```
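The 70% figure in the README refers to the fraction of weights that are zero in the structured 1X4 block pattern. As a rough illustration only (this helper is not part of the toolkit or of this commit), the ratio for a 2-D weight matrix could be estimated like this:

```python
# Illustrative sketch only: estimate the 1x4 structured sparse ratio of a weight matrix.
import numpy as np

def structured_sparse_ratio(weight, block=(1, 4)):
    """Return the fraction of (1, 4) blocks that are entirely zero."""
    rows, cols = weight.shape
    br, bc = block
    # Trim so the matrix tiles evenly into blocks.
    weight = weight[: rows - rows % br, : cols - cols % bc]
    blocks = weight.reshape(weight.shape[0] // br, br, weight.shape[1] // bc, bc)
    return float(np.all(blocks == 0, axis=(1, 3)).mean())

# A synthetic weight with ~90% of its 1x4 blocks zeroed clears the 70% bar.
w = np.random.randn(256, 256)
keep = np.random.rand(256, 64) > 0.9          # keep roughly 10% of the 1x4 blocks
w *= np.repeat(keep, 4, axis=1)               # expand the block mask to full columns
print(structured_sparse_ratio(w) >= 0.70)     # expected: True
```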
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+from nlp_toolkit.backends.neural_engine.compile import compile
+from nlp_toolkit.backends.neural_engine.compile.graph import Graph
+import os
+import argparse
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_model', default="./model_and_tokenizer/int8-model.onnx",
+                        type=str, help="Input model path.")
+    parser.add_argument('--output_dir',
+                        help='directory to save data to',
+                        type=str, default='./sparse_int8_ir')
+    args = parser.parse_args()
+
+    graph = compile(args.input_model)
+    graph.save()
+    model = Graph()
+    model.graph_init('./ir/conf.yaml', './ir/model.bin')
+    model.transpose_mode_int8()
+    model.save(args.output_dir)
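The script above compiles the ONNX model to the default `./ir` location, reloads that IR, applies `transpose_mode_int8()`, and saves the transposed IR to `--output_dir`. Below is a minimal sketch (not part of this commit) of consuming the result; it assumes the engine's `Graph.inference` method accepts a list of numpy inputs, and the input names, shapes, and dtypes are illustrative for the bert_mini SST-2 example:

```python
# Sketch: load the transposed IR written by export_tranpose_ir.py and run it.
# Assumes Graph.inference takes a list of numpy arrays; shapes/dtypes are illustrative.
import numpy as np
from nlp_toolkit.backends.neural_engine.compile.graph import Graph

model = Graph()
model.graph_init('./sparse_int8_ir/conf.yaml', './sparse_int8_ir/model.bin')

batch_size, seq_len = 8, 128
input_ids = np.zeros((batch_size, seq_len), dtype=np.int32)
token_type_ids = np.zeros((batch_size, seq_len), dtype=np.int32)
attention_mask = np.ones((batch_size, seq_len), dtype=np.int32)

outputs = model.inference([input_ids, token_type_ids, attention_mask])
print(type(outputs))
```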
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+#!/bin/bash
+# set -x
+function main {
+    init_params "$@"
+    prepare_model
+}
+
+# init params
+function init_params {
+    for var in "$@"
+    do
+        case $var in
+            --input_model=*)
+                input_model=$(echo $var |cut -f2 -d=)
+            ;;
+            --task_name=*)
+                task_name=$(echo $var |cut -f2 -d=)
+            ;;
+            --cache_dir=*)
+                cache_dir=$(echo $var |cut -f2 -d=)
+            ;;
+            --output_dir=*)
+                output_dir=$(echo $var |cut -f2 -d=)
+            ;;
+            --precision=*)
+                precision=$(echo $var |cut -f2 -d=)
+            ;;
+        esac
+    done
+}
+
+function prepare_model {
+
+    mode_cmd=""
+    if [[ ${precision} = 'int8' ]]; then
+        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+    fi
+    if [[ ${precision} = 'bf16' ]]; then
+        mode_cmd=$mode_cmd" --enable_bf16"
+    fi
+    echo ${mode_cmd}
+
+    cache="./tmp"
+    if [[ ${cache_dir} ]]; then
+        cache="$cache_dir"
+    fi
+    echo ${cache}
+
+    python run_glue.py \
+        --model_name_or_path ${input_model} \
+        --task_name ${task_name} \
+        --do_train \
+        --do_eval \
+        --cache_dir ${cache} \
+        --output_dir ${output_dir} \
+        --overwrite_output_dir \
+        --to_onnx \
+        ${mode_cmd}
+
+}
+
+main "$@"
+
