merge bench code into benchgc #199
@@ -2,32 +2,47 @@
## Description

Benchgc is a tool for verifying the correctness and performance of the graph compiler. Benchgc accepts MLIR files as test cases and prepares test data for them. For correctness verification, Benchgc uses PyTorch as the reference for comparison.

## Prerequisite
* python >= 3.10
* torch >= 2.2
* pybind11
* Enable the MLIR Python binding; refer to [`python/README.md`](../../python/README.md) for details

## Build
There are two ways to use benchgc:

* Build the `.whl` package and install benchgc
```
# Please execute at the top level of the project

mkdir build && cd build
cmake .. -DMLIR_DIR=$MLIR_PATH -DGC_TEST_ENABLE=ON -DGC_ENABLE_BINDINGS_PYTHON=ON -DGC_BENCH_ENABLE=ON
make -j benchgc

python -m pip install test/benchgc/dist/benchgc-*.whl
```

* Run benchgc from source code
```
# Please execute at the top level of the project

mkdir build && cd build
cmake .. -DMLIR_DIR=$MLIR_PATH -DGC_TEST_ENABLE=ON -DGC_ENABLE_BINDINGS_PYTHON=ON -DGC_BENCH_ENABLE=ON
make -j GcPythonModules
export PYTHONPATH=$(pwd)/python_packages/gc_mlir_core/:$(pwd)/../test/benchgc/src/
```

## Synopsis
```
python -m benchgc [OPTIONS] --mode [MODE] --driver [DRIVER] --case [CASE]
```
## Common Options
### --mode [str]
* C : correctness testing (default)
* P : performance testing

### --driver [str]
* linalg: test a single op from the linalg dialect
* mlir: run a user-provided MLIR file
@@ -38,11 +53,25 @@ python -m benchgc [OPTIONS] --driver [DRIVER] --case [CASE]
* if driver=pattern, please provide the name of a predefined pattern, such as mlp
* if driver is a dialect name, please provide the full op name to start a single-op test

### --entry [str]
* default : "entry"
* the name of the entry kernel in the input MLIR or the generated MLIR

### --seed [int]
* set the seed used to generate the test data and reproduce the test (see the sketch below)

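As a rough illustration of what a fixed seed buys you (this is not benchgc's actual data-filling code, just a sketch using PyTorch directly), seeding the RNG makes the generated test tensors identical across runs:

```
import torch

def make_test_data(seed: int, shape=(4, 5)) -> torch.Tensor:
    """Illustrative only: seed the RNG so the generated tensor is reproducible."""
    torch.manual_seed(seed)
    return torch.rand(shape)

a = make_test_data(seed=0)
b = make_test_data(seed=0)
print(torch.equal(a, b))  # True: same seed, same test data
```
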
### --verbose [int]
* set the verbose level, default : 0
* 0 : NO_VERBOSE
* 1 : MODULE_VERBOSE, + print the module to be executed
* 2 : ARG_VERBOSE, + print argument information
* 3 : COMPARE_VERBOSE, + print the threshold used for comparison
* 4 : ERROR_OUTPUT_VERBOSE, + print all error data points if the check fails
* 5 : OUTPUT_VERBOSE, + print all results, including passed tensors
* 6 : INPUT_VERBOSE, + print the input torch tensors
Review discussion on the verbose output:
* Do you think saving the tensor into a file will be better than printing them in the terminal?
* I agree with you; if the tensor is large, then dumping it into a file sounds better than printing. The printing thing is not added by this PR, I just added them on the
* @ciyongch @xurui1995
* For debug capability, we shall have a more convenient/flexible way to get the intermediate result.

### --ir_printing (action=store_true)
* Print the IR during the pass pipeline

### --md index:SHAPExTYPE
* Describe the shape and data type of an argument, e.g. `0:4x5xf32`
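The descriptor packs the argument index, the shape, and the element type into one token. A minimal parsing sketch (illustrative only; `parse_md` is a hypothetical helper, not a benchgc API):

```
from typing import List, Tuple

def parse_md(md: str) -> Tuple[int, List[int], str]:
    """Parse an 'index:SHAPExTYPE' descriptor such as '0:4x5xf32'."""
    index, desc = md.split(":", 1)
    *dims, dtype = desc.split("x")        # last token is the element type
    return int(index), [int(d) for d in dims], dtype

print(parse_md("0:4x5xf32"))  # (0, [4, 5], 'f32')
```
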
@@ -97,7 +126,28 @@ module {
| Norm check | N | threshold |
| Benchdnn driver | D | driver_name:dtype:case |

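The exact formulas and default thresholds are internal to benchgc; as a rough sketch, a norm check of the kind listed above typically compares the relative L2 distance between result and reference against the threshold:

```
import torch

def norm_check(res: torch.Tensor, ref: torch.Tensor, threshold: float) -> bool:
    """Illustrative relative L2-norm check: ||res - ref|| / ||ref|| <= threshold."""
    ref = ref.float()
    diff_norm = torch.linalg.vector_norm(res.float() - ref)
    ref_norm = torch.linalg.vector_norm(ref)
    if ref_norm == 0:
        return bool(diff_norm == 0)
    return bool(diff_norm / ref_norm <= threshold)

ref = torch.randn(512, 128)
res = ref * (1 + 1e-7)
print(norm_check(res, ref, threshold=1e-6))  # True for a tiny relative error
```
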
## Bench Options
### --bench_kind [str]
* py : use the MLIR Python API to invoke the kernel and measure the time cost in Python
* wrapper : modify the MLIR by wrapping the kernel into a new function that calls `nanoTime()` before and after the kernel call; the difference between the two timestamps is reported as the time cost

### --warm_up [int]
* number of warm-up executions before measurement

### --repeat [int]
* number of timed executions (see the timing sketch below)

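As a sketch of how `--warm_up` and `--repeat` typically combine with the `py` bench kind (the `measure` helper below is illustrative; benchgc's actual invocation of the compiled kernel and its averaging may differ):

```
import time
from typing import Callable

def measure(kernel: Callable[[], None], warm_up: int, repeat: int) -> float:
    """Run the kernel warm_up times untimed, then return the mean cost of repeat timed runs in ms."""
    for _ in range(warm_up):
        kernel()
    start = time.perf_counter()
    for _ in range(repeat):
        kernel()
    return (time.perf_counter() - start) / repeat * 1000.0

# Stand-in kernel; in benchgc this would be the compiled MLIR entry point.
print(measure(lambda: sum(range(10_000)), warm_up=100, repeat=100))
```
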
## Pattern Options
Each pattern has its own unique options; the `mlp` pattern's options are listed below, followed by a sketch of how they map to tensor shapes.
### mlp
* `--batch_size`: batch size of the input
* `--hidden_size_list`: hidden sizes of the mlp, example: 32x16x64
* `--has_bias`: whether each matmul op has a bias, example: 1x0
* `--act_type`: choices=["noop", "relu"]
* `--dtype`: choices=["bf16", "f32"]

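As an illustration of how these options map to shapes (not benchgc's generator code; `mlp_shapes` is a hypothetical helper), `--hidden_size_list` defines a chain of matmuls whose weight shapes are consecutive size pairs:

```
def mlp_shapes(batch_size: int, hidden_size_list: str, has_bias: str):
    """Derive activation/weight/bias shapes for each layer of the mlp pattern."""
    sizes = [int(s) for s in hidden_size_list.split("x")]   # e.g. [32, 16, 64]
    bias = [b == "1" for b in has_bias.split("x")]          # e.g. [False, False]
    layers = []
    for i, (in_f, out_f) in enumerate(zip(sizes[:-1], sizes[1:])):
        layers.append({
            "input": (batch_size, in_f),
            "weight": (in_f, out_f),
            "bias": (out_f,) if bias[i] else None,
        })
    return layers

# Matches the mlp perf example below: 32x32 @ 32x16 -> 32x16, then 32x16 @ 16x64 -> 32x64
print(mlp_shapes(32, "32x16x64", "0x0"))
```
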
## Example
### Correctness testing example
```
# single add op test
# using the same data filling / compare strategy as the benchdnn primitive driver if not set
@@ -254,4 +304,126 @@ p2p check: threshold: 0.0000000
(1, 0): ref: 25.1690636 res: 25.1690636 abs_diff: 0.0000000 rel_diff: 0.0000000
(1, 1): ref: -7.8600063 res: -7.8600044 abs_diff: 0.0000019 rel_diff: 0.0000002
FAIL: linalg.matmul_transpose_b
```

### Perf testing example
* single op example
```
python3 -m benchgc --verbose 1 --mode P --driver linalg --case add --md 0:4x5xf32 --md 1:4x5xf32 --md 2:4x5xf32

module {
  func.func @entry(%arg0: tensor<4x5xf32>, %arg1: tensor<4x5xf32>) -> tensor<4x5xf32> attributes {llvm.emit_c_interface} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<4x5xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x5xf32>) -> tensor<4x5xf32>
    %2 = linalg.add ins(%arg0, %arg1 : tensor<4x5xf32>, tensor<4x5xf32>) outs(%1 : tensor<4x5xf32>) -> tensor<4x5xf32>
    return %2 : tensor<4x5xf32>
  }
}

===========bench result===========
{
  "args": {
    "mode": "P",
    "driver": "linalg",
    "case": "add",
    "md": [
      "0:4x5xf32",
      "1:4x5xf32",
      "2:4x5xf32"
    ],
    "fill": [],
    "cmp": [],
    "seed": 0,
    "verbose": 1,
    "entry": "entry",
    "ir_printing": false,
    "cast": "cast_signed",
    "dimension": null,
    "dimensions": null,
    "dilations": null,
    "strides": null,
    "bench_kind": "py",
    "warm_up": 100,
    "repeat": 100
  },
  "compile_cost(ms)": 37.72595152258873,
  "execute_cost(ms)": 0.00022314488887786865
}
```

* mlir example
```
python3 -m benchgc --mode P --verbose 1 --driver mlir --case=./test.mlir --bench_kind wrapper --warm_up 50 --repeat 200

module {
  func.func @entry(%arg0: tensor<512x128xf32>) -> tensor<512x128xf32> attributes {llvm.emit_c_interface} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<512x128xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<512x128xf32>) -> tensor<512x128xf32>
    %2 = linalg.abs ins(%arg0 : tensor<512x128xf32>) outs(%1 : tensor<512x128xf32>) -> tensor<512x128xf32>
    return %2 : tensor<512x128xf32>
  }
}

===========bench result===========
{
  "args": {
    "mode": "P",
    "driver": "mlir",
    "case": "/home/xurui/gc_v2/test.mlir",
    "md": [],
    "fill": [],
    "cmp": [],
    "seed": 0,
    "verbose": 1,
    "entry": "entry",
    "ir_printing": false,
    "bench_kind": "wrapper",
    "warm_up": 50,
    "repeat": 200
  },
  "compile_cost(ms)": 70.6995539367199,
  "execute_cost(ms)": 0.029325044999999984
}
```
* mlp example
```
python3 -m benchgc --verbose 1 --mode P --driver pattern --case mlp --batch_size=32 --hidden_size_list=32x16x64 --has_bias=0x0 --act_type=noop --dtype=f32

module {
  func.func @entry(%arg0: tensor<32x32xf32>, %arg1: tensor<32x16xf32>, %arg2: tensor<16x64xf32>) -> tensor<32x64xf32> attributes {llvm.emit_c_interface} {
    %0 = tensor.empty() : tensor<32x16xf32>
    %1 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x16xf32>) outs(%0 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %2 = tensor.empty() : tensor<32x64xf32>
    %3 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1, %arg2 : tensor<32x16xf32>, tensor<16x64xf32>) outs(%2 : tensor<32x64xf32>) -> tensor<32x64xf32>
    return %3 : tensor<32x64xf32>
  }
}

===========bench result===========
{
  "args": {
    "mode": "P",
    "driver": "pattern",
    "case": "mlp",
    "md": [],
    "fill": [],
    "cmp": [],
    "seed": 0,
    "verbose": 1,
    "entry": "entry",
    "ir_printing": false,
    "bench_kind": "py",
    "warm_up": 100,
    "repeat": 100,
    "batch_size": 32,
    "hidden_size_list": "32x16x64",
    "has_bias": "0x0",
    "act_type": "noop",
    "dtype": "f32"
  },
  "compile_cost(ms)": 109.86808314919472,
  "execute_cost(ms)": 0.02944003790616989
}
```