From bc71fc561e03458ea97a065d7132ea737b321fe5 Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Thu, 15 Dec 2022 11:08:33 +0800
Subject: [PATCH 1/2] Add recipe for TRT EP

Signed-off-by: Mengni Wang
---
 neural_compressor/adaptor/onnxrt.py       | 66 ++++++++++++++++---
 neural_compressor/adaptor/onnxrt_qdq.yaml | 36 +++++++---
 .../adaptor/ox_utils/calibration.py       |  3 +-
 3 files changed, 85 insertions(+), 20 deletions(-)

diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index b5072688a7b..b0cbc407ae6 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -67,14 +67,6 @@ def __init__(self, framework_specific_info):
                              "supported backends: {}".format(ONNXRT_BACKENDS[self.backend],
                              [ONNXRT_BACKENDS[i] for i in ort.get_all_providers()]))
 
-        if self.backend == 'TensorrtExecutionProvider':
-            from neural_compressor import options
-            options.onnxrt.qdq_setting.AddQDQPairToWeight = True
-            options.onnxrt.qdq_setting.DedicatedQDQPair = True
-            options.onnxrt.graph_optimization.level = 'DISABLE_ALL'
-            self.static = True
-            self.dynamic = False
-
         if (not self.dynamic and "format" in framework_specific_info and \
             framework_specific_info["format"].lower() == 'qdq') or \
             self.backend == 'TensorrtExecutionProvider':
@@ -114,6 +106,16 @@ def __init__(self, framework_specific_info):
             self.quantizable_op_types += \
                 self.query_handler.get_op_types_by_precision(precision=precision)
 
+        if self.backend == 'TensorrtExecutionProvider':
+            from neural_compressor import options
+            options.onnxrt.qdq_setting.AddQDQPairToWeight = True
+            options.onnxrt.qdq_setting.DedicatedQDQPair = True
+            options.onnxrt.graph_optimization.level = 'DISABLE_ALL'
+            options.onnxrt.qdq_setting.OpTypesToExcludeOutputQuantizatioin = \
+                ['Conv', 'Gemm', 'Add', 'MatMul']
+            self.static = True
+            self.dynamic = False
+
         self.evaluate_nums = 0
         self.fp32_results = []
@@ -517,10 +519,46 @@ def _pre_optimize(self, model, level=1):
             if self.graph_optimization.gemm2matmul else tmp_model
         model.model = self._rename_node(model.model)
         model = self._revert_fusedconv(model)
+        if self.backend == 'TensorrtExecutionProvider':
+            model = self._revert_conv_add_fusion(model)
         model = split_shared_bias(model)
         model.topological_sort()
         self.pre_optimized_model = copy.deepcopy(model)
 
+    def _revert_conv_add_fusion(self, model):
+        from onnx import numpy_helper
+        from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg
+        from onnx import onnx_pb as onnx_proto
+        from neural_compressor.adaptor.ox_utils.util import find_by_name
+        add_nodes = []
+        remove_nodes = []
+        for node in model.model.graph.node:
+            if node.op_type == 'Conv' and len(node.input) == 3:
+                bias_tensor = model.get_initializer(node.input[2])
+                bias_array = numpy_helper.to_array(bias_tensor).reshape((-1, 1, 1))
+                model.remove_initializer(bias_tensor)
+                model.add_initializer(numpy_helper.from_array(bias_array, bias_tensor.name))
+                kwargs = {}
+                for attr in node.attribute:
+                    kwargs.update(attribute_to_kwarg(attr))
+                conv = onnx.helper.make_node(
+                    'Conv',
+                    node.input[0:2],
+                    [node.name + '_revert'],
+                    node.name, **kwargs)
+                add = onnx.helper.make_node(
+                    'Add',
+                    [conv.output[0], node.input[2]],
+                    node.output,
+                    node.name + '_add')
+                add_nodes.extend([conv, add])
+                remove_nodes.append(node)
+
+        model.remove_nodes(remove_nodes)
+        model.add_nodes(add_nodes)
+        model.update()
+        return model
+
     def _revert_fusedconv(self, model):
         from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg
         from onnx import onnx_pb as onnx_proto
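Note on the _revert_conv_add_fusion hunk: it un-fuses Conv's third (bias) input into an explicit Add node, because the TRT EP needs the bias addition visible in the graph to place Q/DQ pairs around a bias-free Conv. Below is a minimal self-contained sketch of the same rewrite on a toy model, using only stock onnx APIs: plain attribute copying stands in for attribute_to_kwarg, and the model, names, and shapes are illustrative.

    import numpy as np
    import onnx
    from onnx import TensorProto, helper, numpy_helper

    # Toy model: a single Conv carrying its bias as a third input,
    # i.e. the fused form the TRT EP cannot quantize cleanly.
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 3, 8, 8])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 4, 8, 8])
    w = numpy_helper.from_array(np.ones((4, 3, 3, 3), np.float32), 'w')
    b = numpy_helper.from_array(np.ones(4, np.float32), 'b')
    conv0 = helper.make_node('Conv', ['x', 'w', 'b'], ['y'], name='conv0',
                             kernel_shape=[3, 3], pads=[1, 1, 1, 1])
    model = helper.make_model(
        helper.make_graph([conv0], 'g', [x], [y], initializer=[w, b]))

    # The rewrite: reshape the bias to (C, 1, 1) so a standalone Add
    # broadcasts over NCHW, then split Conv(x, w, b) into
    # Conv(x, w) -> Add(conv_out, b).
    for node in [n for n in model.graph.node
                 if n.op_type == 'Conv' and len(n.input) == 3]:
        bias = next(t for t in model.graph.initializer
                    if t.name == node.input[2])
        arr = numpy_helper.to_array(bias).reshape((-1, 1, 1))
        bias.CopyFrom(numpy_helper.from_array(arr, bias.name))
        new_conv = helper.make_node('Conv', node.input[:2],
                                    [node.name + '_revert'], name=node.name)
        new_conv.attribute.extend(node.attribute)  # keep pads, strides, ...
        add = helper.make_node('Add', [new_conv.output[0], node.input[2]],
                               list(node.output), name=node.name + '_add')
        model.graph.node.remove(node)              # drop the fused node
        model.graph.node.extend([new_conv, add])   # append the split pair

    onnx.checker.check_model(model)
    print([n.op_type for n in model.graph.node])   # ['Conv', 'Add']

The reshape to (-1, 1, 1) is what keeps the explicit Add numerically identical to the fused bias: ONNX Conv accepts a 1-D bias of length C, while a broadcast Add over an NCHW activation needs the (C, 1, 1) layout.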
@@ -684,6 +722,10 @@ def query_fw_capability(self, model):
                 else:
                     continue
 
+            if self.backend == 'TensorrtExecutionProvider' and \
+                precision not in query.get_fallback_list():
+                optypes.append('Add')
+
             for op in optypes:
                 if op not in quantizable_optype:
                     continue
@@ -736,6 +778,14 @@ def query_fw_capability(self, model):
                 all_conv_matmul.append(node)
 
         for _, node in enumerate(self.pre_optimized_model.nodes()):
+            # for TRT EP, only insert Q/DQ to inputs of Add nodes followed by ReduceMean
+            if node.op_type == 'Add' and self.backend == 'TensorrtExecutionProvider':
+                children = self.pre_optimized_model.get_children(node)
+                if 'ReduceMean' not in [i.op_type for i in children]:
+                    op_wise.update({(node.name, node.op_type):
+                        [{'weight': {'dtype': 'fp32'}, 'activation': {'dtype': 'fp32'}}]})
+                    continue
+
             if node.op_type in optype_wise:
                 if (exclude_first_quantizable_op and node.name in first_quantizable_node) \
                     or (exclude_last_quantizable_op and node.name in last_quantizable_node):
diff --git a/neural_compressor/adaptor/onnxrt_qdq.yaml b/neural_compressor/adaptor/onnxrt_qdq.yaml
index 28f22724a31..33d3ba90ba9 100644
--- a/neural_compressor/adaptor/onnxrt_qdq.yaml
+++ b/neural_compressor/adaptor/onnxrt_qdq.yaml
@@ -75,7 +75,21 @@
     CPUExecutionProvider: *ref_1_7
     CUDAExecutionProvider: *ref_1_7
     TensorrtExecutionProvider: {
-      'Conv': &cap_s8_sym_pertensor_default {
+      'Conv': &cap_s8_sym_default {
+        'weight': {
+          'dtype': ['int8'],
+          'scheme': ['sym'],
+          'granularity': ['per_tensor', 'per_channel'],
+          'algorithm': ['minmax']
+        },
+        'activation': {
+          'dtype': ['int8'],
+          'scheme': ['sym'],
+          'granularity': ['per_tensor'],
+          'algorithm': ['minmax']
+        }
+      },
+      'MatMul': &cap_s8_sym_pertensor_default {
         'weight': {
           'dtype': ['int8'],
           'scheme': ['sym'],
@@ -89,16 +103,16 @@
           'algorithm': ['minmax']
         }
       },
-      'MatMul': *cap_s8_sym_pertensor_default,
       'Attention': *cap_s8_sym_pertensor_default,
       'LeakyRelu': *cap_s8_sym_pertensor_default,
-      'Gather': *cap_s8_sym_pertensor_default,
+      'Gather': *cap_s8_sym_default,
       'Sigmoid': *cap_s8_sym_pertensor_default,
       'MaxPool': *cap_s8_sym_pertensor_default,
       'EmbedLayerNormalization': *cap_s8_sym_pertensor_default,
       'GlobalAveragePool': *cap_s8_sym_pertensor_default,
       'Pad': *cap_s8_sym_pertensor_default,
       'Split': *cap_s8_sym_pertensor_default,
+      'Add': *cap_s8_sym_pertensor_default,
     }
 
     graph_optimization: &default_optimization # from onnxruntime graph_optimization_level
@@ -123,11 +137,11 @@
     CPUExecutionProvider: *ref_1_7
     CUDAExecutionProvider: *ref_1_7
     TensorrtExecutionProvider: &ref_1_8 {
-      'Conv': *cap_s8_sym_pertensor_default,
+      'Conv': *cap_s8_sym_default,
       'MatMul': *cap_s8_sym_pertensor_default,
       'Attention': *cap_s8_sym_pertensor_default,
       'LeakyRelu': *cap_s8_sym_pertensor_default,
-      'Gather': *cap_s8_sym_pertensor_default,
+      'Gather': *cap_s8_sym_default,
       'Sigmoid': *cap_s8_sym_pertensor_default,
       'MaxPool': *cap_s8_sym_pertensor_default,
       'EmbedLayerNormalization': *cap_s8_sym_pertensor_default,
@@ -140,7 +154,8 @@
       'AveragePool': *cap_s8_sym_pertensor_default,
      'Unsqueeze': *cap_s8_sym_pertensor_default,
       'Transpose': *cap_s8_sym_pertensor_default,
-      'Resize': *cap_s8_sym_pertensor_default
+      'Resize': *cap_s8_sym_pertensor_default,
+      'Add': *cap_s8_sym_pertensor_default,
     }
 
     graph_optimization:
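Note on the Add gating in query_fw_capability: for the TRT EP, an Add stays quantizable only when one of its direct consumers is a ReduceMean, the telltale of a LayerNorm-style pattern; every other Add is pinned to fp32 through op_wise. A stand-alone sketch of that gate against a plain onnx.GraphProto follows; the helper names are hypothetical, and the real code walks neural_compressor's ONNXModel.get_children instead.

    from onnx import GraphProto, NodeProto

    def add_feeds_reducemean(graph: GraphProto, add_node: NodeProto) -> bool:
        """True when any direct consumer of add_node is a ReduceMean."""
        outputs = set(add_node.output)
        consumers = [n for n in graph.node if outputs & set(n.input)]
        return any(n.op_type == 'ReduceMean' for n in consumers)

    # Mirror of the loop above: pin every other Add to fp32 for the TRT EP.
    FP32_CFG = [{'weight': {'dtype': 'fp32'}, 'activation': {'dtype': 'fp32'}}]

    def pin_plain_adds_to_fp32(graph: GraphProto, op_wise: dict) -> None:
        for node in graph.node:
            if node.op_type == 'Add' and not add_feeds_reducemean(graph, node):
                op_wise[(node.name, node.op_type)] = FP32_CFG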
@@ -317,11 +332,11 @@
     CPUExecutionProvider: *ref_1_11
     CUDAExecutionProvider: *ref_1_11
     TensorrtExecutionProvider: {
-      'Conv': *cap_s8_sym_pertensor_default,
-      'MatMul': *cap_s8_sym_pertensor_default,
+      'Conv': *cap_s8_sym_default,
+      'MatMul': *cap_s8_sym_default,
       'Attention': *cap_s8_sym_pertensor_default,
       'LeakyRelu': *cap_s8_sym_pertensor_default,
-      'Gather': *cap_s8_sym_pertensor_default,
+      'Gather': *cap_s8_sym_default,
       'Sigmoid': *cap_s8_sym_pertensor_default,
       'MaxPool': *cap_s8_sym_pertensor_default,
       'EmbedLayerNormalization': *cap_s8_sym_pertensor_default,
@@ -335,7 +350,8 @@
       'Unsqueeze': *cap_s8_sym_pertensor_default,
       'Transpose': *cap_s8_sym_pertensor_default,
       'Resize': *cap_s8_sym_pertensor_default,
-      'Gemm': *cap_s8_sym_pertensor_default
+      'Gemm': *cap_s8_sym_default,
+      'Add': *cap_s8_sym_pertensor_default,
     }
diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py
index 587c451fbc4..3a2261af688 100644
--- a/neural_compressor/adaptor/ox_utils/calibration.py
+++ b/neural_compressor/adaptor/ox_utils/calibration.py
@@ -436,8 +436,7 @@ def calculate_quantization_params(self, q_config, quantization_thresholds):
             if parent and parent.name in q_config and q_config[parent.name] not in ['fp32']:
                 scheme = q_config[parent.name]['activation']['scheme']
                 qType = q_config[parent.name]['activation']['dtype']
-            elif tensor_name in self.model_wrapper.input() and \
-                self.backend in ['TensorrtExecutionProvider']:
+            elif self.backend in ['TensorrtExecutionProvider']:
                 scheme = 'sym'
                 qType = 3
             node_thresholds = quantization_thresholds[tensor_name]

From c1c4f3ec6f3a025593f46b0337342029b48528b8 Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Thu, 15 Dec 2022 11:12:23 +0800
Subject: [PATCH 2/2] Remove unused imports

Signed-off-by: Mengni Wang
---
 neural_compressor/adaptor/onnxrt.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index b0cbc407ae6..a0df0445556 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -528,8 +528,6 @@ def _pre_optimize(self, model, level=1):
     def _revert_conv_add_fusion(self, model):
         from onnx import numpy_helper
         from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg
-        from onnx import onnx_pb as onnx_proto
-        from neural_compressor.adaptor.ox_utils.util import find_by_name
         add_nodes = []
         remove_nodes = []
         for node in model.model.graph.node:
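Note on the calibration.py hunk: with the model-input guard removed, every activation tensor whose parent is not quantized now takes the TRT branch, which forces scheme = 'sym' and qType = 3 — 3 being TensorProto.INT8, matching TensorRT's symmetric-int8-only requirement. Below is a sketch of the quantization parameters that branch implies, with rmin/rmax standing in for the calibration thresholds; it is an assumption-laden sketch of the standard symmetric mapping, not the adaptor's exact code path.

    import numpy as np
    from onnx import TensorProto

    assert TensorProto.INT8 == 3  # the literal used in the patch above

    def symmetric_int8_params(rmin: float, rmax: float):
        """Symmetric int8 scale/zero-point from calibration thresholds."""
        absmax = max(abs(rmin), abs(rmax))
        scale = np.float32(absmax / 127.0) if absmax > 0 else np.float32(1.0)
        return scale, np.int8(0)  # 'sym' pins the zero point at 0

    print(symmetric_int8_params(-2.5, 4.0))  # (0.031496063, 0)

And a usage sketch for the end product: running the QDQ model this recipe emits on the TensorRT EP with int8 enabled. trt_int8_enable is a documented TensorRT EP provider option; the model path is hypothetical.

    import onnxruntime as ort

    sess = ort.InferenceSession(
        'resnet50_qdq.onnx',
        providers=[('TensorrtExecutionProvider', {'trt_int8_enable': True}),
                   'CUDAExecutionProvider',
                   'CPUExecutionProvider'])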