add patterns tuning and graph tuning (#226)

Eason9393 · web-flow · commit f70394cba366 · 2022-09-16T10:34:35.000+08:00
diff --git a/nlp_toolkit/backends/neural_engine/compile/graph/graph.py b/nlp_toolkit/backends/neural_engine/compile/graph/graph.py
@@ -21,7 +21,8 @@
 import numpy as np
 import yaml
 import os
-
+import copy
+import time
 
 class Graph(object):
 
@@ -483,7 +484,94 @@ def dict_representer(dumper, data):
 
         logger.info("Emit done...")
 
-    def get_sparse_nodes_name(self, threshold=0.7):
+    def graph_dispatch(self, tune = True, inputs_shape = []):
+        sparse_nodes_name = self.get_sparse_nodes_name()
+        if tune:
+            logger.info("Tuning graph start ...")
+            self._tune_onednn_graph(inputs_shape)
+            self._tune_sparse_graph(inputs_shape, sparse_nodes_name)
+            logger.info("Tuning graph end ...")
+        else:
+            # if not tuning, map to the sparse graph directly
+            self.transpose_mode_int8(sparse_nodes_name)
+    
+    def _tune_onednn_graph(self, inputs_shape = []):
+        onednn_graph_nodes_map = self._get_onednn_graph_nodes()
+        if onednn_graph_nodes_map == {"InnerProduct": [], "Softmax": []}:
+            pass
+        else:
+            onednn_graph_nodes_name_list = self._generate_onednn_graph_nodes_name_list(onednn_graph_nodes_map)
+            golden_onednn_graph_nodes_name = []
+            min_latency = float("inf")
+            for onednn_graph_nodes_name in onednn_graph_nodes_name_list:
+                curr_latency = float("inf")
+                try:
+                    curr_model = copy.deepcopy(self)
+                    curr_model._generate_onednn_graph_nodes(onednn_graph_nodes_name)
+                    curr_result, curr_latency = curr_model._get_latency(inputs_shape)
+                except:
+                    logger.warning("Graph can not be inferenced, please check the graph!")
+                # update min latency and the best onednn graph nodes name
+                if curr_latency < min_latency:
+                    min_latency = curr_latency
+                    golden_onednn_graph_nodes_name = onednn_graph_nodes_name
+            self._generate_onednn_graph_nodes(golden_onednn_graph_nodes_name)
+
+    def _get_onednn_graph_nodes(self):
+        # onednn graph only supports fp32 inner_product and softmax
+        onednn_graph_nodes_map = {"InnerProduct": [], "Softmax": []}
+        for node in self.nodes:
+            if node.op_type == "InnerProduct":
+                weight = node.input_tensors[1]
+                if type(weight.data) == np.ndarray and \
+                    weight.data.dtype == "float32":
+                    onednn_graph_nodes_map["InnerProduct"].append(node.name)
+            elif node.op_type == "Softmax":
+                if node.attr.get("output_dtype", "float32") == "float32":
+                    onednn_graph_nodes_map["Softmax"].append(node.name)
+        return onednn_graph_nodes_map
+
+    def _generate_onednn_graph_nodes_name_list(self, onednn_graph_nodes_map):
+        # strategy:
+        # 1.softmax: all nodes map to onednn graph or not
+        # 2.innerproduct: tune according to weight shape
+        ip_nodes_name_list = self._generate_transpose_nodes_name_list(onednn_graph_nodes_map["InnerProduct"])
+        onednn_graph_nodes_name_list = []
+        for ip_nodes_name in ip_nodes_name_list:
+            onednn_graph_nodes_name_list.append(ip_nodes_name)
+            onednn_graph_nodes_name_list.append(ip_nodes_name + onednn_graph_nodes_map["Softmax"])
+        return onednn_graph_nodes_name_list
+
+    def _generate_onednn_graph_nodes(self, onednn_graph_nodes_name):
+        for node in self.nodes:
+            if node.name in onednn_graph_nodes_name:
+                if node.op_type == "InnerProduct":
+                    node.op_type = "InnerProductGraph"
+                elif node.op_type == "Softmax":
+                    node.op_type = "SoftmaxGraph" 
+
+    def _tune_sparse_graph(self, inputs_shape = [], sparse_nodes_name = []):
+        if sparse_nodes_name == []:
+            pass
+        else:
+            trans_nodes_name_list = self._generate_transpose_nodes_name_list(sparse_nodes_name)
+            golden_trans_nodes_name = []
+            min_latency = float("inf")
+            for trans_nodes_name in trans_nodes_name_list:
+                curr_latency = float("inf")
+                try:
+                    curr_model = copy.deepcopy(self)
+                    curr_model.transpose_mode_int8(trans_nodes_name)
+                    curr_result, curr_latency = curr_model._get_latency(inputs_shape)
+                except:
+                    logger.warning("Graph can not be inferenced, please check the graph!")
+                # update min latency and transpose nodes name
+                if curr_latency < min_latency:
+                    min_latency = curr_latency
+                    golden_trans_nodes_name = trans_nodes_name
+            self.transpose_mode_int8(golden_trans_nodes_name)
+    
+    def get_sparse_nodes_name(self, threshold = 0.7):
 
         def get_zero_ratio(matrix, block):
             sparse_ratio = -1
@@ -496,9 +584,9 @@ def get_zero_ratio(matrix, block):
                         is_zero_block = True
                         for br in range(block[0]):
                             for bc in range(block[1]):
-                                if matrix[mr * block[0] + br][mc * block[1] + bc] != 0:
-                                    is_zero_block = False
-                                    break
+                                if matrix[mr*block[0]+br][mc*block[1]+bc] != 0:
+                                   is_zero_block = False
+                                   break
                             if not is_zero_block:
                                 break
                         if is_zero_block == True:
@@ -511,7 +599,7 @@ def get_zero_ratio(matrix, block):
             if node.op_type == "InnerProduct":
                 # sparse kernel limitation:
                 # 1. int8
-                # 2. sparse_ratio > 0.5(1*4)
+                # 2. sparse_ratio >= threshold, 0.7 by default (1*4 block)
                 # 3. output channel of weight_shape = 4x
                 # 4. post op != tanh
                 if 'append_op' not in node.attr \
@@ -521,12 +609,84 @@ def get_zero_ratio(matrix, block):
                     if type(weight.data) == np.ndarray and \
                         (weight.data.dtype == 'int8' \
                         or weight.data.dtype == 'uint8') \
-                        and weight.data.shape[1] % 4 == 0:
-
+                        and weight.data.shape[1] % 4 == 0: # 1*4 sparse block
                         zero_ratio = get_zero_ratio(weight.data, [1, 4])
                         if zero_ratio >= threshold:
                             sparse_nodes_name.append(node.name)
-        return sparse_nodes_name
+
+        return sparse_nodes_name 
+    
+    def _generate_transpose_nodes_name_list(self, sparse_nodes_name):
+        transpose_nodes_list = []
+        if sparse_nodes_name == []:
+            return transpose_nodes_list
+        # switch the nodes which have the same weight shape and post op
+        weight_shape_map = {}
+        for node in self.nodes:
+            if node.name in sparse_nodes_name:
+                weight = node.input_tensors[1]
+                weight_shape = tuple(weight.shape) # list to tuple for dict key
+                if weight_shape in weight_shape_map.keys():
+                    weight_shape_map[weight_shape].append(node.name)
+                else:
+                    weight_shape_map[weight_shape] = [node.name]
+        
+        # binary reflected gray code to generate all on/off combinations of the n elements
+        def brgd(n):
+            if n==1:
+                return ["0","1"]
+            L1 = brgd(n-1)
+            L2 = copy.deepcopy(L1)
+            L2.reverse()
+            L1 = ["0" + l for l in L1]
+            L2 = ["1" + l for l in L2]
+            L = L1 + L2
+            return L
+
+        transpose_mask_list = brgd(len(weight_shape_map))
+        for transpose_mask in transpose_mask_list:
+            transpose_nodes = []
+            for idx, weight_shape in enumerate(weight_shape_map):
+                if transpose_mask[idx]=="1":
+                    transpose_nodes += weight_shape_map[weight_shape]
+            transpose_nodes_list.append(transpose_nodes)
+
+        return transpose_nodes_list
+
+
+    def _generate_inputs(self, inputs_shape = []):
+        dtype_map = {"float32": np.float32,
+               "int8": np.int8,
+               "int32": np.int32,
+               "int64": np.int64,
+               "uint8": np.uint8,
+               }
+        inputs = []
+        id = 0
+        for node in self.nodes:
+            if node.op_type == "Input":
+                for tensor in node.output_tensors:
+                    if not isinstance(tensor.data, np.ndarray):
+                        if inputs_shape == []:
+                            shape = [16 for s in tensor.shape if s == -1]
+                        else:
+                            shape = inputs_shape[id]
+                        dtype = dtype_map[tensor.dtype]
+                        input = np.random.uniform(low=0, high=10, size=shape).astype(dtype)
+                        inputs.append(input)     
+                        id += 1
+        return inputs
+    
+    def _get_latency(self, inputs_shape = [], iterations = 10, warm_up = 5):
+        inputs = self._generate_inputs(inputs_shape)
+        iter_latency = []
+        for _ in range(iterations):
+            start_time = time.time()
+            result = self.inference(inputs)
+            end_time = time.time()
+            iter_latency.append(end_time - start_time)
+        latency = np.array(iter_latency[warm_up:]).mean()
+        return result, latency
 
     def transpose_mode_int8(self, node_name_list=None):
         from ..ops import Tensor
diff --git a/nlp_toolkit/backends/neural_engine/compile/sub_graph/pattern.py b/nlp_toolkit/backends/neural_engine/compile/sub_graph/pattern.py
@@ -60,6 +60,9 @@
     'OutputData',
 ]
 
+# for superbert, superbert patterns are huge patterns based on supported patterns
+superbert_patterns = []
+
 PATTERNS = {}
 
 
diff --git a/nlp_toolkit/backends/neural_engine/compile/sub_graph/subgraph_matcher.py b/nlp_toolkit/backends/neural_engine/compile/sub_graph/subgraph_matcher.py
@@ -15,7 +15,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .pattern import supported_patterns, PATTERNS
+import time
+import copy
+import numpy as np
+from tqdm import tqdm
+from .pattern import supported_patterns, superbert_patterns, PATTERNS
 from .. import logger
 
 EXECUTOR_TYPE = {
@@ -47,25 +51,65 @@
     "_MklLayerNorm": "LayerNorm",
 }
 
-
 class SubGraphMatcher(object):
-    def __call__(self, model):
-        patterns_switch = {
-            'LayerNorm': True,
-            'TransposeBatchMatMul': True,
-            'MatMulWithBiasGelu': True,
-            'MatMulWithBiasAdd': True,
-            'MatMulWithBiasTanh': True,
-        }
-        logger.info('Start to implement Sub-Graph matching and replacing...')
-        for pattern in supported_patterns:
-            if pattern in PATTERNS:
-                if pattern in patterns_switch.keys() and not patterns_switch[pattern]:
-                    continue
-                else:
-                    p_fusion = PATTERNS[pattern]()
-                    model = p_fusion(model)
+    def __call__(self, model, tune = False):
+        logger.info('Start to implement Sub-Graph matching and replacing...') 
+        if tune:
+            self._tune_patterns(model)
+        else:
+            self._fuse_patterns(model)
+        logger.info('Sub-Graph match and replace done...')
+        return model
 
+    def _fuse_patterns(self, model, supported_patterns=supported_patterns, pattern_mask=None):
+        pattern_mask = [True for _ in range(len(supported_patterns))] \
+                        if pattern_mask == None else pattern_mask
+        for pattern_id, pattern in enumerate(supported_patterns):
+            if pattern in PATTERNS and pattern_mask[pattern_id]:
+                p_fusion = PATTERNS[pattern]()
+                model = p_fusion(model)
+        self._remove_identity(model) 
+         
+    def _tune_patterns(self, model, iterations = 10, warm_up = 5):
+        # pattern tuning strategy(for superbert): 
+        #    1. only one pattern off/on each time (pruning)
+        #    2. check accuracy with framework
+        #    3. and only save min latency config
+        logger.info('Start tuning pattern...')
+        all_patterns = supported_patterns + superbert_patterns
+        pattern_mask = [True for i in range(len(all_patterns))]
+        min_latency = float("inf")
+        # skip tuning input node fusion and output node fusion
+        for idx in tqdm(range(len(supported_patterns), len(all_patterns))):
+            # pattern on
+            on_latency = float("inf")
+            try:
+                on_model = copy.deepcopy(model)
+                self._fuse_patterns(on_model, all_patterns, pattern_mask)
+                on_result, on_latency = on_model._get_latency([], iterations, warm_up)
+            except:
+                logger.warning("Graph can not be inferenced, please check the graph!")
+            # pattern off
+            off_latency = float("inf")
+            try:
+                off_pattern_mask = copy.deepcopy(pattern_mask)
+                off_pattern_mask[idx] = False
+                off_model = copy.deepcopy(model)
+                self._fuse_patterns(off_model, all_patterns, off_pattern_mask)
+                off_result, off_latency = off_model._get_latency([], iterations, warm_up)
+            except:
+                logger.warning("Graph can not be inferenced, please check the graph!")
+            # update min latency and pattern mask
+            if off_latency < on_latency and off_latency < min_latency:
+                min_latency = off_latency
+                pattern_mask = off_pattern_mask
+        
+        # generate the model according to the pattern mask
+        self._fuse_patterns(model, all_patterns, pattern_mask)
+        logger.info('End tuning pattern...')
+        return model
+
+    def _remove_identity(self, model):
         rm_node_names = []
         rm_op_type = ['Identity']
         for i in range(len(model.nodes)):
@@ -77,6 +121,4 @@ def __call__(self, model):
                     op_type = EXECUTOR_TYPE[node.op_type]
                     model.nodes[i].op_type = op_type
         model.remove_nodes(rm_node_names)
-        logger.info('Sub-Graph match and replace done...')
 
-        return model
diff --git a/nlp_toolkit/backends/neural_engine/test/pytest/test_graph_dispatch.py b/nlp_toolkit/backends/neural_engine/test/pytest/test_graph_dispatch.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+from nlp_toolkit.backends.neural_engine.compile import compile
+
+class TestGraphDispatch(unittest.TestCase):
+    @classmethod
+    def setUpClass(self):
+        pass
+
+    @classmethod
+    def tearDownClass(self):
+        pass
+
+    def test_graph_dispatch(self):
+        # set input data
+        shape = [1, 128]
+        input_0 = np.random.uniform(low=0, high=128, size=shape).astype('int32')
+        input_1 = np.random.uniform(low=0, high=1, size=shape).astype('int32')
+        input_2 = np.random.uniform(low=0, high=1, size=shape).astype('int32')
+    
+        # validate int8 sparse graph tuning
+        int8_model_path = "/home/tensorflow/inc_ut/engine/bert_mini_int8_original_IR"
+        self.assertTrue(os.path.exists(int8_model_path),
+            'INT8 IR model is not found, please set your own model path!')
+        int8_model = compile(int8_model_path)
+        int8_output_dict = int8_model.inference([input_0, input_1, input_2])
+        int8_output = list(int8_output_dict.values())[0]
+        # sparse graph tuning
+        int8_model.graph_dispatch(inputs_shape = [shape, shape, shape])
+        int8_dispatch_output_dict = int8_model.inference([input_0, input_1, input_2])
+        int8_dispatch_output = list(int8_dispatch_output_dict.values())[0]
+        # compare outputs
+        self.assertTrue((int8_output == int8_dispatch_output).all())
+
+        # validate onednn graph tuning
+        fp32_model_path = "/home/tensorflow/inc_ut/engine/bert_mini_sst2_1x4_fp32.onnx"
+        self.assertTrue(os.path.exists(fp32_model_path),
+            'FP32 ONNX model is not found, please set your own model path!')
+        fp32_model = compile(fp32_model_path)
+        fp32_output_dict = fp32_model.inference([input_0, input_1, input_2])
+        fp32_output = list(fp32_output_dict.values())[0]
+        # onednn graph tuning
+        fp32_model.graph_dispatch(inputs_shape = [shape, shape, shape])
+        fp32_dispatch_output_dict = fp32_model.inference([input_0, input_1, input_2])
+        fp32_dispatch_output = list(fp32_dispatch_output_dict.values())[0]
+        # compare outputs
+        self.assertTrue((fp32_output == fp32_dispatch_output).all())
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/nlp_toolkit/backends/neural_engine/test/pytest/test_pattern_dispatch.py b/nlp_toolkit/backends/neural_engine/test/pytest/test_pattern_dispatch.py

Original file line number	Diff line number	Diff line change
`@@ -60,6 +60,9 @@`
`60`	`60`	`'OutputData',`
`61`	`61`	`]`
`62`	`62`
	`63`	`+# for superbert, superbert patterns are huge patterns based on supported patterns`
	`64`	`+superbert_patterns = []`
	`65`	`+`
`63`	`66`	`PATTERNS = {}`
`64`	`67`
`65`	`68`