64 changes: 56 additions & 8 deletions neural_compressor/adaptor/onnxrt.py
@@ -67,14 +67,6 @@ def __init__(self, framework_specific_info):
"supported backends: {}".format(ONNXRT_BACKENDS[self.backend],
[ONNXRT_BACKENDS[i] for i in ort.get_all_providers()]))

if self.backend == 'TensorrtExecutionProvider':
from neural_compressor import options
options.onnxrt.qdq_setting.AddQDQPairToWeight = True
options.onnxrt.qdq_setting.DedicatedQDQPair = True
options.onnxrt.graph_optimization.level = 'DISABLE_ALL'
self.static = True
self.dynamic = False

if (not self.dynamic and "format" in framework_specific_info and \
framework_specific_info["format"].lower() == 'qdq') or \
self.backend == 'TensorrtExecutionProvider':
@@ -114,6 +106,16 @@ def __init__(self, framework_specific_info):
self.quantizable_op_types += \
self.query_handler.get_op_types_by_precision(precision=precision)

if self.backend == 'TensorrtExecutionProvider':
from neural_compressor import options
options.onnxrt.qdq_setting.AddQDQPairToWeight = True
options.onnxrt.qdq_setting.DedicatedQDQPair = True
options.onnxrt.graph_optimization.level = 'DISABLE_ALL'
options.onnxrt.qdq_setting.OpTypesToExcludeOutputQuantizatioin = \
['Conv', 'Gemm', 'Add', 'MatMul']
self.static = True
self.dynamic = False
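# Note: with the TensorRT EP the adaptor pins QDQ-specific behavior. Per the option
# names, weights are kept in float with an explicit QuantizeLinear/DequantizeLinear
# pair, each consuming node gets its own dedicated Q/DQ pair, ORT graph optimizations
# are disabled (presumably so TensorRT can run its own fusions on the Q/DQ graph),
# the outputs of Conv/Gemm/Add/MatMul are left unquantized, and calibration is
# forced to be static.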

self.evaluate_nums = 0

self.fp32_results = []
@@ -517,10 +519,44 @@ def _pre_optimize(self, model, level=1):
if self.graph_optimization.gemm2matmul else tmp_model
model.model = self._rename_node(model.model)
model = self._revert_fusedconv(model)
if self.backend == 'TensorrtExecutionProvider':
model = self._revert_conv_add_fusion(model)
model = split_shared_bias(model)
model.topological_sort()
self.pre_optimized_model = copy.deepcopy(model)

def _revert_conv_add_fusion(self, model):
from onnx import numpy_helper
from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg
add_nodes = []
remove_nodes = []
for node in model.model.graph.node:
if node.op_type == 'Conv' and len(node.input) == 3:
bias_tensor = model.get_initializer(node.input[2])
bias_array = numpy_helper.to_array(bias_tensor).reshape((-1, 1, 1))
model.remove_initializer(bias_tensor)
model.add_initializer(numpy_helper.from_array(bias_array, bias_tensor.name))
kwargs = {}
activation_params = None
for attr in node.attribute:
kwargs.update(attribute_to_kwarg(attr))
conv = onnx.helper.make_node(
'Conv',
node.input[0:2],
[node.name + '_revert'],
node.name, **kwargs)
add = onnx.helper.make_node(
'Add',
[conv.output[0], node.input[2]],
node.output,
node.name + '_add')
add_nodes.extend([conv, add])
# drop the original fused Conv so node.output is produced only once
remove_nodes.append(node)

model.remove_nodes(remove_nodes)
model.add_nodes(add_nodes)
model.update()
return model

def _revert_fusedconv(self, model):
from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg
from onnx import onnx_pb as onnx_proto
@@ -684,6 +720,10 @@ def query_fw_capability(self, model):
else:
continue

if self.backend == 'TensorrtExecutionProvider' and \
precision not in query.get_fallback_list():
optypes.append('Add')

for op in optypes:
if op not in quantizable_optype:
continue
Expand Down Expand Up @@ -736,6 +776,14 @@ def query_fw_capability(self, model):
all_conv_matmul.append(node)

for _, node in enumerate(self.pre_optimized_model.nodes()):
# For TRT EP, only insert Q/DQ on the inputs of Add nodes that are followed by ReduceMean
if node.op_type == 'Add' and self.backend == 'TensorrtExecutionProvider':
children = self.pre_optimized_model.get_children(node)
if 'ReduceMean' not in [i.op_type for i in children]:
op_wise.update({(node.name, node.op_type):
[{'weight': {'dtype': 'fp32'}, 'activation': {'dtype': 'fp32'}}]})
continue

if node.op_type in optype_wise:
if (exclude_first_quantizable_op and node.name in first_quantizable_node) \
or (exclude_last_quantizable_op and node.name in last_quantizable_node):
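For readers skimming the diff, here is a standalone sketch (not part of the PR) of the graph pattern that _revert_conv_add_fusion produces: a Conv whose bias was fused in as a third input is split back into a bias-free Conv plus an explicit Add, with the bias reshaped so it still broadcasts over the NCHW output. The tensor names X, W, B and the channel count of 8 are illustrative only.

import numpy as np
import onnx
from onnx import helper, numpy_helper

# Fused form as produced by ONNX Runtime's optimizer: the bias rides along as the
# Conv's third input.
conv_fused = helper.make_node('Conv', ['X', 'W', 'B'], ['Y'], name='conv0')

# Reverted form: a bias-free Conv followed by an explicit Add; the bias initializer
# is reshaped to (C, 1, 1) so the Add broadcasts over the NCHW Conv output.
bias = numpy_helper.from_array(np.zeros((8, 1, 1), dtype=np.float32), 'B')
conv = helper.make_node('Conv', ['X', 'W'], ['conv0_revert'], name='conv0')
add = helper.make_node('Add', ['conv0_revert', 'B'], ['Y'], name='conv0_add')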
36 changes: 26 additions & 10 deletions neural_compressor/adaptor/onnxrt_qdq.yaml
@@ -75,7 +75,21 @@
CPUExecutionProvider: *ref_1_7
CUDAExecutionProvider: *ref_1_7
TensorrtExecutionProvider: {
'Conv': &cap_s8_sym_pertensor_default {
'Conv': &cap_s8_sym_default {
'weight': {
'dtype': ['int8'],
'scheme': ['sym'],
'granularity': ['per_tensor', 'per_channel'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['int8'],
'scheme': ['sym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'MatMul': &cap_s8_sym_pertensor_default {
'weight': {
'dtype': ['int8'],
'scheme': ['sym'],
@@ -89,16 +103,16 @@
'algorithm': ['minmax']
}
},
'MatMul': *cap_s8_sym_pertensor_default,
'Attention': *cap_s8_sym_pertensor_default,
'LeakyRelu': *cap_s8_sym_pertensor_default,
'Gather': *cap_s8_sym_pertensor_default,
'Gather': *cap_s8_sym_default,
'Sigmoid': *cap_s8_sym_pertensor_default,
'MaxPool': *cap_s8_sym_pertensor_default,
'EmbedLayerNormalization': *cap_s8_sym_pertensor_default,
'GlobalAveragePool': *cap_s8_sym_pertensor_default,
'Pad': *cap_s8_sym_pertensor_default,
'Split': *cap_s8_sym_pertensor_default,
'Add': *cap_s8_sym_pertensor_default,
}

graph_optimization: &default_optimization # from onnxruntime graph_optimization_level
@@ -123,11 +137,11 @@
CPUExecutionProvider: *ref_1_7
CUDAExecutionProvider: *ref_1_7
TensorrtExecutionProvider: &ref_1_8 {
'Conv': *cap_s8_sym_pertensor_default,
'Conv': *cap_s8_sym_default,
'MatMul': *cap_s8_sym_pertensor_default,
'Attention': *cap_s8_sym_pertensor_default,
'LeakyRelu': *cap_s8_sym_pertensor_default,
'Gather': *cap_s8_sym_pertensor_default,
'Gather': *cap_s8_sym_default,
'Sigmoid': *cap_s8_sym_pertensor_default,
'MaxPool': *cap_s8_sym_pertensor_default,
'EmbedLayerNormalization': *cap_s8_sym_pertensor_default,
@@ -140,7 +154,8 @@
'AveragePool': *cap_s8_sym_pertensor_default,
'Unsqueeze': *cap_s8_sym_pertensor_default,
'Transpose': *cap_s8_sym_pertensor_default,
'Resize': *cap_s8_sym_pertensor_default
'Resize': *cap_s8_sym_pertensor_default,
'Add': *cap_s8_sym_pertensor_default,
}

graph_optimization:
@@ -317,11 +332,11 @@
CPUExecutionProvider: *ref_1_11
CUDAExecutionProvider: *ref_1_11
TensorrtExecutionProvider: {
'Conv': *cap_s8_sym_pertensor_default,
'MatMul': *cap_s8_sym_pertensor_default,
'Conv': *cap_s8_sym_default,
'MatMul': *cap_s8_sym_default,
'Attention': *cap_s8_sym_pertensor_default,
'LeakyRelu': *cap_s8_sym_pertensor_default,
'Gather': *cap_s8_sym_pertensor_default,
'Gather': *cap_s8_sym_default,
'Sigmoid': *cap_s8_sym_pertensor_default,
'MaxPool': *cap_s8_sym_pertensor_default,
'EmbedLayerNormalization': *cap_s8_sym_pertensor_default,
@@ -335,7 +350,8 @@
'Unsqueeze': *cap_s8_sym_pertensor_default,
'Transpose': *cap_s8_sym_pertensor_default,
'Resize': *cap_s8_sym_pertensor_default,
'Gemm': *cap_s8_sym_pertensor_default
'Gemm': *cap_s8_sym_default,
'Add': *cap_s8_sym_pertensor_default,
}


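The YAML change above introduces a second anchor: cap_s8_sym_default allows both per_tensor and per_channel weight granularity, and ops such as Conv and Gather (plus MatMul and Gemm in the newest opset section) now alias it, while the remaining op types keep the per-tensor-only cap_s8_sym_pertensor_default; Add is newly listed for the TensorRT EP. As a reminder of how the &anchor / *alias notation resolves, the minimal check below (assuming PyYAML is installed; keys trimmed for brevity) shows that an alias reuses the anchored mapping, which is how one capability entry is shared across many op types.

import yaml

caps = yaml.safe_load("""
Conv: &cap_s8_sym_default
  weight: {dtype: [int8], granularity: [per_tensor, per_channel]}
Gather: *cap_s8_sym_default
""")
# The alias resolves to the very same mapping object as the anchor...
assert caps['Gather'] is caps['Conv']
# ...so Gather inherits the per-channel weight capability declared for Conv.
assert 'per_channel' in caps['Conv']['weight']['granularity']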
3 changes: 1 addition & 2 deletions neural_compressor/adaptor/ox_utils/calibration.py
@@ -436,8 +436,7 @@ def calculate_quantization_params(self, q_config, quantization_thresholds):
if parent and parent.name in q_config and q_config[parent.name] not in ['fp32']:
scheme = q_config[parent.name]['activation']['scheme']
qType = q_config[parent.name]['activation']['dtype']
elif tensor_name in self.model_wrapper.input() and \
self.backend in ['TensorrtExecutionProvider']:
elif self.backend in ['TensorrtExecutionProvider']:
scheme = 'sym'
qType = 3
node_thresholds = quantization_thresholds[tensor_name]
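A note on the constant in the branch changed above: qType = 3 is the ONNX TensorProto enum value for INT8. By dropping the "tensor is a model input" condition, the TensorRT EP now falls back to symmetric int8 quantization parameters for any activation tensor whose producer has no quantization config, not just for graph inputs. A trivial check of the enum value (assuming the onnx package is installed):

import onnx

# 3 is the TensorProto data-type code for INT8, matching qType = 3 above.
assert onnx.TensorProto.INT8 == 3
print(onnx.TensorProto.DataType.Name(3))  # prints 'INT8'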