From f66b33494976ca60193d263f2ab9e684e005f65b Mon Sep 17 00:00:00 2001 From: mengniwa Date: Wed, 7 Dec 2022 09:32:04 +0800 Subject: [PATCH 01/12] draft for qlinear to qdq Signed-off-by: mengniwa --- .../adaptor/ox_utils/operators/activation.py | 40 ++++++++++++++++- .../adaptor/ox_utils/operators/binary_op.py | 45 ++++++++++++++++++- .../adaptor/ox_utils/operators/concat.py | 40 +++++++++++++++++ .../adaptor/ox_utils/operators/conv.py | 42 +++++++++++++++++ .../adaptor/ox_utils/operators/gather.py | 34 +++++++++++++- .../adaptor/ox_utils/operators/gavgpool.py | 39 +++++++++++++++- .../adaptor/ox_utils/operators/gemm.py | 45 ++++++++++++++++++- .../adaptor/ox_utils/operators/matmul.py | 45 ++++++++++++++++++- .../adaptor/ox_utils/operators/ops.py | 42 ++++++++++++++++- .../adaptor/ox_utils/operators/pooling.py | 37 +++++++++++++++ neural_compressor/adaptor/ox_utils/util.py | 18 ++++++++ 11 files changed, 420 insertions(+), 7 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/operators/activation.py b/neural_compressor/adaptor/ox_utils/operators/activation.py index 5339f6834ad..a6f11b35cf8 100644 --- a/neural_compressor/adaptor/ox_utils/operators/activation.py +++ b/neural_compressor/adaptor/ox_utils/operators/activation.py @@ -87,4 +87,42 @@ def quantize(self): self.quantizer.dequantize_tensor(node, node.input[0]) else: self.quantizer.model.replace_input_of_all_nodes(node.output[0], node.input[0]) - self.quantizer.remove_nodes.append(node) \ No newline at end of file + self.quantizer.remove_nodes.append(node) + +@qop_registry(op_types="QLinearLeakyRelu, QLinearSigmoid") +class QActivationOperator(QOperator): + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def convert(self): + node = self.node + add_nodes = [] + # input dq + in_dq = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[:3], + [node.name + '_in_dequant']) + inputs = [node.name + '_in_dequant'] + add_nodes.append(in_dq) + # output q + if not self.disable_qdq_for_node_output: + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.inputs[3], node.inputs[4]], + node.outputs, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + else: + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + activation_node = onnx.helper.make_node( + node.op_type.split('QLinear')[-1], inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(activation_node) + return True, add_nodes + diff --git a/neural_compressor/adaptor/ox_utils/operators/binary_op.py b/neural_compressor/adaptor/ox_utils/operators/binary_op.py index 3848cd6ee9b..5b39f36bb0a 100644 --- a/neural_compressor/adaptor/ox_utils/operators/binary_op.py +++ b/neural_compressor/adaptor/ox_utils/operators/binary_op.py @@ -77,4 +77,47 @@ def convert(self, convert_format): self.quantizer.new_nodes += [qlinear_binary_math_node] self.quantizer.remove_nodes.extend(parents) self.quantizer.remove_nodes.append(child) - self.quantizer.remove_nodes.append(node) \ No newline at end of file + self.quantizer.remove_nodes.append(node) + +@qop_registry(op_types="QLinearAdd, QLinearMul") +class QBinaryOperator(QOperator): + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def convert(self): + node = self.node + add_nodes = [] + # input dq + in_dq1 = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[:3], + [node.name + '_in_dequant1']) + + in_dq2 = 
onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[3:6], + [node.name + '_in_dequant2']) + inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] + + add_nodes.extend([in_dq1, in_dq2]) + # output q + if not self.disable_qdq_for_node_output: + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.inputs[6], node.inputs[7]], + node.outputs, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + else: + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + binary_node = onnx.helper.make_node( + node.op_type.split('QLinear')[-1], inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(binary_node) + return True, add_nodes diff --git a/neural_compressor/adaptor/ox_utils/operators/concat.py b/neural_compressor/adaptor/ox_utils/operators/concat.py index 763ac8e6541..3070d772202 100644 --- a/neural_compressor/adaptor/ox_utils/operators/concat.py +++ b/neural_compressor/adaptor/ox_utils/operators/concat.py @@ -96,3 +96,43 @@ def cast(self): # pragma: no cover if node.input[0] not in [i.tensor_name for i in self.quantizer.new_value_info.values()]: return self.quantizer.dtype_cast(self.node, self.dtype) + +@qop_registry(op_types="QLinearConcat") +class QConcatOperator(QOperator): + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def convert(self): + node = self.node + add_nodes = [] + inputs = [] + # input dq + for i in range((len(node.inputs) - 2) / 3 - 1): + in_dq = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[2 + i*3 : 2 + (i+1)*3], + [node.name + '_in_dequant_' + str(i)]) + inputs.append(node.name + '_in_dequant_' + str(i)) + add_nodes.append(in_dq) + + # output q + if not self.disable_qdq_for_node_output: + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.inputs[0], node.inputs[1]], + node.outputs, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + else: + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + concat_node = onnx.helper.make_node( + 'Concat', inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(concat_node) + return True, add_nodes diff --git a/neural_compressor/adaptor/ox_utils/operators/conv.py b/neural_compressor/adaptor/ox_utils/operators/conv.py index 90b849bd9e6..ce9671b5f80 100644 --- a/neural_compressor/adaptor/ox_utils/operators/conv.py +++ b/neural_compressor/adaptor/ox_utils/operators/conv.py @@ -164,4 +164,46 @@ def convert(self, convert_format): self.quantizer.remove_nodes.append(child) self.quantizer.remove_nodes.append(node) +@qop_registry(op_types="QLinearConv") +class QConvOperator(QOperator): + def __init__(self, onnx_node): + super().__init__(onnx_node) + def convert(self): + # TODO + node = self.node + add_nodes = [] + # input dq + in_dq1 = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[:3], + [node.name + '_in_dequant1']) + + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[3:6], + [node.name + '_in_dequant2']) + inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] + + add_nodes.extend([in_dq1, in_dq2]) + # output q + if not self.disable_qdq_for_node_output: + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.inputs[6], node.inputs[7]], + 
node.outputs, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + else: + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + binary_node = onnx.helper.make_node( + node.op_type.split('QLinear')[-1], inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(binary_node) + return True, add_nodes diff --git a/neural_compressor/adaptor/ox_utils/operators/gather.py b/neural_compressor/adaptor/ox_utils/operators/gather.py index 93f98823047..1e81de4e6c0 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gather.py +++ b/neural_compressor/adaptor/ox_utils/operators/gather.py @@ -89,4 +89,36 @@ def convert(self, convert_format): for n in self.quantizer.model.get_children(child): self.quantizer.model.replace_node_input(n, child.output[0], gather_new_output) - self.quantizer.remove_nodes.extend([node, parents[0]]) \ No newline at end of file + self.quantizer.remove_nodes.extend([node, parents[0]]) + +@qop_registry(op_types="Gather") +class QGatherOperator(QOperator): + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def convert(self): + node = self.node + add_nodes = [] + inputs = [] + if all([i.op_type != 'DequantizeLinear' for i in self.children]): + return False, add_nodes + for child in self.children: + if child.op_type == 'DequantizeLinear': + in_dq = onnx.helper.make_node( + 'DequantizeLinear', + [node.input[0], child.input[1], child.input[2]], + [node.name + '_in_dequant_' + str(i)]) + inputs.append(node.name + '_in_dequant_' + str(i)) + add_nodes.append(in_dq) + break + outputs = node.outputs + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + gather_node = onnx.helper.make_node( + 'Gather', inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(gather_node) + return True, add_nodes diff --git a/neural_compressor/adaptor/ox_utils/operators/gavgpool.py b/neural_compressor/adaptor/ox_utils/operators/gavgpool.py index b4bafcafeae..5d9a949fed0 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gavgpool.py +++ b/neural_compressor/adaptor/ox_utils/operators/gavgpool.py @@ -58,4 +58,41 @@ def convert(self, convert_format): self.quantizer.new_nodes += [qnode] self.quantizer.remove_nodes.append(child) self.quantizer.remove_nodes.append(parent) - self.quantizer.remove_nodes.append(node) \ No newline at end of file + self.quantizer.remove_nodes.append(node) + +@qop_registry(op_types="QLinearGlobalAveragePool") +class QGlobalAveragePoolOperator(QOperator): + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def convert(self): + node = self.node + add_nodes = [] + # input dq + in_dq = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[:3], + [node.name + '_in_dequant']) + inputs = [node.name + '_in_dequant'] + add_nodes.append(in_dq) + # output q + if not self.disable_qdq_for_node_output: + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.inputs[3], node.inputs[4]], + node.outputs, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + else: + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + activation_node = onnx.helper.make_node( + 'GlobalAveragePool', inputs, + outputs, node.name + 
'_convert', **kwargs) + add_nodes.append(activation_node) + return True, add_nodes diff --git a/neural_compressor/adaptor/ox_utils/operators/gemm.py b/neural_compressor/adaptor/ox_utils/operators/gemm.py index 65aca2e8a7d..c523e37b696 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gemm.py +++ b/neural_compressor/adaptor/ox_utils/operators/gemm.py @@ -91,4 +91,47 @@ def convert(self, convert_format): self.quantizer.new_nodes.append(qgemm_node) self.quantizer.remove_nodes.extend(parents) self.quantizer.remove_nodes.append(child) - self.quantizer.remove_nodes.append(node) \ No newline at end of file + self.quantizer.remove_nodes.append(node) + +@qop_registry(op_types="QGemm") +class QGemmOperator(QOperator): + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def convert(self): + node = self.node + add_nodes = [] + # input dq + in_dq1 = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[:3], + [node.name + '_in_dequant1']) + + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[3:6], + [node.name + '_in_dequant2']) + inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] + + add_nodes.extend([in_dq1, in_dq2]) + # output q + if not self.disable_qdq_for_node_output: + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.inputs[6], node.inputs[7]], + node.outputs, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + else: + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + gemm_node = onnx.helper.make_node( + 'Gemm', inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(gemm_node) + return True, add_nodes diff --git a/neural_compressor/adaptor/ox_utils/operators/matmul.py b/neural_compressor/adaptor/ox_utils/operators/matmul.py index 988e157e323..a2b81af703d 100644 --- a/neural_compressor/adaptor/ox_utils/operators/matmul.py +++ b/neural_compressor/adaptor/ox_utils/operators/matmul.py @@ -122,4 +122,47 @@ def convert(self, convert_format): self.quantizer.new_nodes.append(qlinear_matmul_node) self.quantizer.remove_nodes.extend(parents) self.quantizer.remove_nodes.append(child) - self.quantizer.remove_nodes.append(node) \ No newline at end of file + self.quantizer.remove_nodes.append(node) + +@qop_registry(op_types="QLinearMatMul") +class QMatMulOperator(QOperator): + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def convert(self): + node = self.node + add_nodes = [] + # input dq + in_dq1 = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[:3], + [node.name + '_in_dequant1']) + + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[3:6], + [node.name + '_in_dequant2']) + inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] + + add_nodes.extend([in_dq1, in_dq2]) + # output q + if not self.disable_qdq_for_node_output: + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.inputs[6], node.inputs[7]], + node.outputs, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + else: + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + matmul_node = onnx.helper.make_node( + 'MatMul', inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(matmul_node) + return True, add_nodes diff --git 
a/neural_compressor/adaptor/ox_utils/operators/ops.py b/neural_compressor/adaptor/ox_utils/operators/ops.py index 33d4ecf7c5d..a3e9f73d968 100644 --- a/neural_compressor/adaptor/ox_utils/operators/ops.py +++ b/neural_compressor/adaptor/ox_utils/operators/ops.py @@ -17,6 +17,7 @@ # OPERATORS = {} +QOPERATORS= {} def op_registry(op_types): '''The class decorator used to register all Operator subclasses. @@ -34,6 +35,24 @@ def decorator_op(cls): return cls return decorator_op +def qop_registry(op_types): + '''The class decorator used to register all qOperator subclasses. + + Args: + cls (class): The class of register. + ''' + def decorator_op(cls): + assert cls.__name__.endswith( + 'Operator'), "The name of subclass of QOperator should end with \'Operator\' substring." + if cls.__name__[:-len('Operator')] in OPERATORS: # pragma: no cover + raise ValueError('Cannot have two operators with the same name.') + for single_op_type in [op_type.strip() for op_type in op_types.split(',')]: + if single_op_type.startswith('QLinear') or single_op_type in ['QGemm']: + QOPERATORS[single_op_type] = cls + return cls + return decorator_op + + class Operator(object): def __init__(self, onnx_quantizer, onnx_node): self.quantizer = onnx_quantizer @@ -81,4 +100,25 @@ def convert(self, convert_format): return def cast(self): # pragma: no cover - self.quantizer.dtype_cast(self.node, self.dtype) \ No newline at end of file + self.quantizer.dtype_cast(self.node, self.dtype) + +class QOperator(object): + def __init__(self, onnx_node, add_qdq_to_weight=False, + dedicated_qdq=False, optypes_to_exclude_output_quantization=[]): + self.node = onnx_node + self.add_qdq_to_weight = add_qdq_to_weight + self.dedicated_qdq = dedicated_qdq + self.disable_qdq_for_node_output = True if onnx_node.op_type in \ + optypes_to_exclude_output_quantization else False + self.per_channel = False + self.algorithm = 'minmax' + self.weight_scheme = 'sym' + self.weight_dtype = None + self.activation_dtype = None + self.activation_scheme = 'asym' + + def convert_check(self, convert_format): + return True + + def convert(self): + return diff --git a/neural_compressor/adaptor/ox_utils/operators/pooling.py b/neural_compressor/adaptor/ox_utils/operators/pooling.py index bba746129e6..e0d177fab67 100644 --- a/neural_compressor/adaptor/ox_utils/operators/pooling.py +++ b/neural_compressor/adaptor/ox_utils/operators/pooling.py @@ -80,3 +80,40 @@ def convert(self, convert_format): self.quantizer.new_nodes.append(qnode) self.quantizer.remove_nodes.append(node) + +@qop_registry(op_types="QLinearAveragePool") +class QPoolOperator(QOperator): + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def convert(self): + node = self.node + add_nodes = [] + # input dq + in_dq = onnx.helper.make_node( + 'DequantizeLinear', + node.inputs[:3], + [node.name + '_in_dequant']) + inputs = [node.name + '_in_dequant'] + add_nodes.append(in_dq) + # output q + if not self.disable_qdq_for_node_output: + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.inputs[3], node.inputs[4]], + node.outputs, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + else: + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + activation_node = onnx.helper.make_node( + 'AveragePool', inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(activation_node) + return True, add_nodes diff 
--git a/neural_compressor/adaptor/ox_utils/util.py b/neural_compressor/adaptor/ox_utils/util.py index d5041204efd..dc1be039ca7 100644 --- a/neural_compressor/adaptor/ox_utils/util.py +++ b/neural_compressor/adaptor/ox_utils/util.py @@ -379,3 +379,21 @@ def find_by_name(name, item_list): return items[0] else: return None + +def convert_qoperator_to_qdq(model, config): + from neural_compressor.adaptor.ox_utils/operators.ops import QOPERATORS + add_nodes = [] + remove_nodes = [] + inits = [] + for node in model.nodes(): + if node.op_type in QOPERATORS: + converter = QOPERATORS[node.op_type](node) + done, add_node = converter.convert() + if done: + add_nodes.extend(add_node) + remove_nodes.extend(node) + model.add_nodes(add_nodes) + model.remove_nodes(remove_nodes) + model.remove_unused_constant() + model.topological_sort() + return model From 51f3aecdd6cbf7abd08bd33529de60092a9340fa Mon Sep 17 00:00:00 2001 From: mengniwa Date: Thu, 8 Dec 2022 16:59:08 +0800 Subject: [PATCH 02/12] fix node conversion Signed-off-by: mengniwa --- .../adaptor/ox_utils/operators/activation.py | 20 +++--- .../adaptor/ox_utils/operators/attention.py | 71 ++++++++++++++++++- .../adaptor/ox_utils/operators/binary_op.py | 24 ++++--- .../adaptor/ox_utils/operators/concat.py | 19 ++--- .../adaptor/ox_utils/operators/conv.py | 54 ++++++++++---- .../adaptor/ox_utils/operators/direct_q8.py | 36 +++++++++- .../adaptor/ox_utils/operators/gather.py | 19 ++--- .../adaptor/ox_utils/operators/gavgpool.py | 22 +++--- .../adaptor/ox_utils/operators/gemm.py | 66 +++++++++++++---- .../adaptor/ox_utils/operators/matmul.py | 49 +++++++++---- .../adaptor/ox_utils/operators/ops.py | 15 ++-- .../adaptor/ox_utils/operators/pooling.py | 19 ++--- .../adaptor/ox_utils/operators/split.py | 36 +++++++++- neural_compressor/adaptor/ox_utils/util.py | 18 ----- neural_compressor/config.py | 25 +++++++ .../experimental/export/__init__.py | 1 + neural_compressor/model/onnx_model.py | 18 +++++ 17 files changed, 375 insertions(+), 137 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/operators/activation.py b/neural_compressor/adaptor/ox_utils/operators/activation.py index a6f11b35cf8..67c354d2164 100644 --- a/neural_compressor/adaptor/ox_utils/operators/activation.py +++ b/neural_compressor/adaptor/ox_utils/operators/activation.py @@ -17,7 +17,7 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain @op_registry(op_types="LeakyRelu, Sigmoid") @@ -91,25 +91,27 @@ def quantize(self): @qop_registry(op_types="QLinearLeakyRelu, QLinearSigmoid") class QActivationOperator(QOperator): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) def convert(self): node = self.node add_nodes = [] + inits = [] # input dq in_dq = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[:3], - [node.name + '_in_dequant']) + node.input[:3], + [node.name + '_in_dequant'], + node.name + '_in_dequant') inputs = [node.name + '_in_dequant'] add_nodes.append(in_dq) # output q if not self.disable_qdq_for_node_output: out_q = onnx.helper.make_node( 'QuantizeLinear', - [node.name + '_out', node.inputs[3], node.inputs[4]], - 
node.outputs, + [node.name + '_out', node.input[3], node.input[4]], + node.output, node.name + '_out_quant') outputs = [node.name + '_out'] add_nodes.append(out_q) @@ -118,11 +120,9 @@ def convert(self): kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain activation_node = onnx.helper.make_node( node.op_type.split('QLinear')[-1], inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(activation_node) - return True, add_nodes - + return True, add_nodes, inits \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/attention.py b/neural_compressor/adaptor/ox_utils/operators/attention.py index 9bd33ae4c26..bc45ecaa294 100644 --- a/neural_compressor/adaptor/ox_utils/operators/attention.py +++ b/neural_compressor/adaptor/ox_utils/operators/attention.py @@ -17,8 +17,8 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator -from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, qop_registry, QOperator +from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain, find_by_name @op_registry(op_types="Attention") class AttentionOperator(Operator): @@ -74,3 +74,70 @@ def convert(self, convert_format): self.quantizer.new_nodes.append(qattention_node) self.quantizer.remove_nodes.append(node) + +@qop_registry(op_types="QAttention") +class QAttentionOperator(QOperator): + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + + def convert(self): + node = self.node + add_nodes = [] + inputs = [] + inits = [] + # input dq + in_dq1 = onnx.helper.make_node( + 'DequantizeLinear', + [node.input[0], node.input[3], node.input[6]], + [node.name + '_in_dequant1'], + node.name + '_in_dequant1') + + weight_scale = onnx.numpy_helper.to_array( + find_by_name(node.input[4], self.initializers)) + if len(weight_scale) > 1: + if 'MatMul' not in self.axis: + from neural_compressor.utils import logger + logger.warning( + "Don't offer the axis of per-channel quantizd Attention, use default axis=1") + axis = 1 + else: + axis = self.axis['Attention'] + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + [node.input[1], node.input[4], node.input[7]], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2', + axis=axis) + else: + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + [node.input[1], node.input[4], node.input[7]], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') + inputs = [node.name + '_in_dequant1', + node.name + '_in_dequant2', + node.input[2], + node.input[5]] + + add_nodes.extend([in_dq1, in_dq2]) + # output q + if not self.disable_qdq_for_node_output: + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.input[6], node.input[7]], + node.output, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + else: + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + binary_node = onnx.helper.make_node( + 'Attention', inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(binary_node) + return True, add_nodes, inits \ No newline at end of file diff --git 
a/neural_compressor/adaptor/ox_utils/operators/binary_op.py b/neural_compressor/adaptor/ox_utils/operators/binary_op.py index 5b39f36bb0a..6235b121a41 100644 --- a/neural_compressor/adaptor/ox_utils/operators/binary_op.py +++ b/neural_compressor/adaptor/ox_utils/operators/binary_op.py @@ -17,7 +17,7 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain @op_registry(op_types="Add, Mul") @@ -81,22 +81,25 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearAdd, QLinearMul") class QBinaryOperator(QOperator): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) def convert(self): node = self.node add_nodes = [] + inits = [] # input dq in_dq1 = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[:3], - [node.name + '_in_dequant1']) + node.input[:3], + [node.name + '_in_dequant1'], + node.name + '_in_dequant1') in_dq2 = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[3:6], - [node.name + '_in_dequant2']) + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] add_nodes.extend([in_dq1, in_dq2]) @@ -104,8 +107,8 @@ def convert(self): if not self.disable_qdq_for_node_output: out_q = onnx.helper.make_node( 'QuantizeLinear', - [node.name + '_out', node.inputs[6], node.inputs[7]], - node.outputs, + [node.name + '_out', node.input[6], node.input[7]], + node.output, node.name + '_out_quant') outputs = [node.name + '_out'] add_nodes.append(out_q) @@ -114,10 +117,9 @@ def convert(self): kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain binary_node = onnx.helper.make_node( node.op_type.split('QLinear')[-1], inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(binary_node) - return True, add_nodes + return True, add_nodes, inits diff --git a/neural_compressor/adaptor/ox_utils/operators/concat.py b/neural_compressor/adaptor/ox_utils/operators/concat.py index 3070d772202..ceab401650c 100644 --- a/neural_compressor/adaptor/ox_utils/operators/concat.py +++ b/neural_compressor/adaptor/ox_utils/operators/concat.py @@ -17,7 +17,7 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain @op_registry(op_types="Concat") @@ -99,19 +99,21 @@ def cast(self): # pragma: no cover @qop_registry(op_types="QLinearConcat") class QConcatOperator(QOperator): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) def convert(self): node = self.node add_nodes = [] inputs = [] + inits = [] # input dq for i in range((len(node.inputs) - 2) / 3 - 1): in_dq = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[2 + i*3 : 2 + (i+1)*3], - 
[node.name + '_in_dequant_' + str(i)]) + node.input[2 + i*3 : 2 + (i+1)*3], + [node.name + '_in_dequant_' + str(i)], + node.name + '_in_dequant_' + str(i)) inputs.append(node.name + '_in_dequant_' + str(i)) add_nodes.append(in_dq) @@ -119,8 +121,8 @@ def convert(self): if not self.disable_qdq_for_node_output: out_q = onnx.helper.make_node( 'QuantizeLinear', - [node.name + '_out', node.inputs[0], node.inputs[1]], - node.outputs, + [node.name + '_out', node.input[0], node.input[1]], + node.output, node.name + '_out_quant') outputs = [node.name + '_out'] add_nodes.append(out_q) @@ -129,10 +131,9 @@ def convert(self): kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain concat_node = onnx.helper.make_node( 'Concat', inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(concat_node) - return True, add_nodes + return True, add_nodes, inits diff --git a/neural_compressor/adaptor/ox_utils/operators/conv.py b/neural_compressor/adaptor/ox_utils/operators/conv.py index ce9671b5f80..82acb52e201 100644 --- a/neural_compressor/adaptor/ox_utils/operators/conv.py +++ b/neural_compressor/adaptor/ox_utils/operators/conv.py @@ -19,7 +19,7 @@ import onnx from onnx import onnx_pb as onnx_proto -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry from neural_compressor.adaptor.ox_utils.util import find_by_name, attribute_to_kwarg @op_registry(op_types="Conv, FusedConv") @@ -166,32 +166,59 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearConv") class QConvOperator(QOperator): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) def convert(self): - # TODO node = self.node add_nodes = [] + inits = [] # input dq in_dq1 = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[:3], - [node.name + '_in_dequant1']) + node.input[:3], + [node.name + '_in_dequant1'], + node.name + '_in_dequant1') in_dq2 = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[3:6], - [node.name + '_in_dequant2']) - inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] - + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') add_nodes.extend([in_dq1, in_dq2]) + inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] + if len(node.input) == 9: + import numpy as np + input_scale = onnx.numpy_helper.to_array( + find_by_name(node.input[1], self.initializers)) + weight_scale = onnx.numpy_helper.to_array( + find_by_name(node.input[4], self.initializers)) + bias_scale = input_scale * weight_scale + + # update scale initializer + bias_scale_data = np.asarray(bias_scale, dtype=np.float32).reshape(-1) + bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, + node.input[8] + '_scale') + inits.extend([bias_scale_initializer]) + + # update zero initializer + bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1) + bias_zp_initializer = onnx.numpy_helper.from_array( + bias_zp_data, node.input[8] + '_zero_point') + inits.extend([bias_zp_initializer]) + in_dq3 = onnx.helper.make_node( + 'DequantizeLinear', + [node.input[8], bias_scale_initializer.name, bias_zp_initializer.name], + [node.name + '_in_dequant3'], + 
node.name + '_in_dequant3') + inputs.append(in_dq3.name) + add_nodes.append(in_dq3) # output q if not self.disable_qdq_for_node_output: out_q = onnx.helper.make_node( 'QuantizeLinear', - [node.name + '_out', node.inputs[6], node.inputs[7]], - node.outputs, + [node.name + '_out', node.input[6], node.input[7]], + node.output, node.name + '_out_quant') outputs = [node.name + '_out'] add_nodes.append(out_q) @@ -200,10 +227,9 @@ def convert(self): kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain binary_node = onnx.helper.make_node( node.op_type.split('QLinear')[-1], inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(binary_node) - return True, add_nodes + return True, add_nodes, inits diff --git a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py index 00522c178a1..0f75016667a 100644 --- a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py +++ b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py @@ -16,7 +16,8 @@ # limitations under the License. # -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, qop_registry, QOperator +from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain @op_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze") class Direct8BitOperator(Operator): @@ -83,3 +84,36 @@ def cast(self): if node.input[0] not in [i.tensor_name for i in self.quantizer.new_value_info.values()]: return self.quantizer.dtype_cast(self.node, self.dtype) + +@qop_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze") +class QDirectOperator(QOperator): + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + + def convert(self): + node = self.node + add_nodes = [] + inputs = [] + inits = [] + if all([i.op_type != 'DequantizeLinear' for i in self.children]): + return False, add_nodes + for child in self.children: + if child.op_type == 'DequantizeLinear': + in_dq = onnx.helper.make_node( + 'DequantizeLinear', + [node.input[0], child.input[1], child.input[2]], + [node.name + '_in_dequant_' + str(i)]) + inputs.append(node.name + '_in_dequant_' + str(i)) + add_nodes.append(in_dq) + break + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + + gather_node = onnx.helper.make_node( + node.op_type, inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(gather_node) + return True, add_nodes, inits \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/gather.py b/neural_compressor/adaptor/ox_utils/operators/gather.py index 1e81de4e6c0..cf506d52c90 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gather.py +++ b/neural_compressor/adaptor/ox_utils/operators/gather.py @@ -17,8 +17,8 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator -from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry +from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain 
@op_registry(op_types="Gather") class GatherOperator(Operator): @@ -93,13 +93,14 @@ def convert(self, convert_format): @qop_registry(op_types="Gather") class QGatherOperator(QOperator): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) def convert(self): node = self.node add_nodes = [] inputs = [] + inits = [] if all([i.op_type != 'DequantizeLinear' for i in self.children]): return False, add_nodes for child in self.children: @@ -107,18 +108,18 @@ def convert(self): in_dq = onnx.helper.make_node( 'DequantizeLinear', [node.input[0], child.input[1], child.input[2]], - [node.name + '_in_dequant_' + str(i)]) - inputs.append(node.name + '_in_dequant_' + str(i)) + [node.name + '_in_dequant'], + node.name + '_in_dequant') + inputs.append(node.name + '_in_dequant') add_nodes.append(in_dq) break - outputs = node.outputs + outputs = node.output kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain gather_node = onnx.helper.make_node( 'Gather', inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(gather_node) - return True, add_nodes + return True, add_nodes, inits diff --git a/neural_compressor/adaptor/ox_utils/operators/gavgpool.py b/neural_compressor/adaptor/ox_utils/operators/gavgpool.py index 5d9a949fed0..96d462a2a35 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gavgpool.py +++ b/neural_compressor/adaptor/ox_utils/operators/gavgpool.py @@ -17,7 +17,7 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain @op_registry(op_types="GlobalAveragePool") @@ -62,37 +62,35 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearGlobalAveragePool") class QGlobalAveragePoolOperator(QOperator): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) def convert(self): node = self.node add_nodes = [] + inits = [] # input dq in_dq = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[:3], - [node.name + '_in_dequant']) + node.input[:3], + [node.name + '_in_dequant'], + node.name + '_in_dequant') inputs = [node.name + '_in_dequant'] add_nodes.append(in_dq) # output q if not self.disable_qdq_for_node_output: out_q = onnx.helper.make_node( 'QuantizeLinear', - [node.name + '_out', node.inputs[3], node.inputs[4]], - node.outputs, + [node.name + '_out', node.input[3], node.input[4]], + node.output, node.name + '_out_quant') outputs = [node.name + '_out'] add_nodes.append(out_q) else: outputs = node.output kwargs = {} - for attribute in node.attribute: # pragma: no cover - kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain - activation_node = onnx.helper.make_node( 'GlobalAveragePool', inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(activation_node) - return True, add_nodes + return True, add_nodes, inits diff --git a/neural_compressor/adaptor/ox_utils/operators/gemm.py 
b/neural_compressor/adaptor/ox_utils/operators/gemm.py index c523e37b696..0ba76389f7c 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gemm.py +++ b/neural_compressor/adaptor/ox_utils/operators/gemm.py @@ -17,7 +17,7 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry from neural_compressor.adaptor.ox_utils.util import find_by_name, ms_domain, \ attribute_to_kwarg, is_B_transposed @@ -95,31 +95,68 @@ def convert(self, convert_format): @qop_registry(op_types="QGemm") class QGemmOperator(QOperator): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) def convert(self): + import numpy as np node = self.node add_nodes = [] + inits = [] + + input_scale = onnx.numpy_helper.to_array( + find_by_name(node.input[1], self.initializers)) + weight_scale = onnx.numpy_helper.to_array( + find_by_name(node.input[4], self.initializers)) + bias_scale = input_scale * weight_scale + # input dq in_dq1 = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[:3], - [node.name + '_in_dequant1']) - - in_dq2 = onnx.helper.make_node( + node.input[:3], + [node.name + '_in_dequant1'], + node.name + '_in_dequant1') + + if len(weight_scale) > 1: + axis = 0 if is_B_transposed(node) else 1 + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2', + axis=axis) + else: + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') + + # update scale initializer + bias_scale_data = np.asarray(bias_scale, dtype=np.float32).reshape(-1) + bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, + node.input[6] + '_scale') + inits.extend([bias_scale_initializer]) + + # update zero initializer + bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1) + bias_zp_initializer = onnx.numpy_helper.from_array( + bias_zp_data, node.input[6] + '_zero_point') + inits.extend([bias_zp_initializer]) + in_dq3 = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[3:6], - [node.name + '_in_dequant2']) - inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] + [node.input[8], bias_scale_initializer.name, bias_zp_initializer.name], + [node.name + '_in_dequant3']) - add_nodes.extend([in_dq1, in_dq2]) + inputs = [in_dq1.name, in_dq2.name, in_dq3.name] + add_nodes.extend([in_dq1, in_dq2, in_dq3]) + # output q if not self.disable_qdq_for_node_output: out_q = onnx.helper.make_node( 'QuantizeLinear', - [node.name + '_out', node.inputs[6], node.inputs[7]], - node.outputs, + [node.name + '_out', node.input[6], node.input[7]], + node.output, node.name + '_out_quant') outputs = [node.name + '_out'] add_nodes.append(out_q) @@ -128,10 +165,9 @@ def convert(self): kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain gemm_node = onnx.helper.make_node( 'Gemm', inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(gemm_node) - return True, add_nodes + return True, add_nodes, inits \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/matmul.py 
b/neural_compressor/adaptor/ox_utils/operators/matmul.py index a2b81af703d..6433d893e52 100644 --- a/neural_compressor/adaptor/ox_utils/operators/matmul.py +++ b/neural_compressor/adaptor/ox_utils/operators/matmul.py @@ -17,8 +17,8 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator -from neural_compressor.adaptor.ox_utils.util import find_by_name +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry +from neural_compressor.adaptor.ox_utils.util import find_by_name, ms_domain, attribute_to_kwarg from onnx import onnx_pb as onnx_proto @op_registry(op_types="MatMul") @@ -126,22 +126,42 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearMatMul") class QMatMulOperator(QOperator): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) def convert(self): node = self.node add_nodes = [] + inits = [] # input dq in_dq1 = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[:3], - [node.name + '_in_dequant1']) - - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - node.inputs[3:6], - [node.name + '_in_dequant2']) + node.input[:3], + [node.name + '_in_dequant1'], + node.name + '_in_dequant1') + + weight_scale = onnx.numpy_helper.to_array( + find_by_name(node.input[4], self.initializers)) + import pdb;pdb.set_trace() + if weight_scale is not None and len(weight_scale) > 1: + if 'MatMul' not in self.axis: + from neural_compressor.utils import logger + logger.warning("Don't offer the axis of per-channel quantizd MatMul, use default axis=1") + axis = 1 + else: + axis = self.axis['MatMul'] + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2', + axis=axis) + elif weight_scale is not None: + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] add_nodes.extend([in_dq1, in_dq2]) @@ -149,8 +169,8 @@ def convert(self): if not self.disable_qdq_for_node_output: out_q = onnx.helper.make_node( 'QuantizeLinear', - [node.name + '_out', node.inputs[6], node.inputs[7]], - node.outputs, + [node.name + '_out', node.input[6], node.input[7]], + node.output, node.name + '_out_quant') outputs = [node.name + '_out'] add_nodes.append(out_q) @@ -159,10 +179,9 @@ def convert(self): kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain matmul_node = onnx.helper.make_node( 'MatMul', inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(matmul_node) - return True, add_nodes + return True, add_nodes, inits diff --git a/neural_compressor/adaptor/ox_utils/operators/ops.py b/neural_compressor/adaptor/ox_utils/operators/ops.py index a3e9f73d968..609f68d9095 100644 --- a/neural_compressor/adaptor/ox_utils/operators/ops.py +++ b/neural_compressor/adaptor/ox_utils/operators/ops.py @@ -103,19 +103,14 @@ def cast(self): # pragma: no cover self.quantizer.dtype_cast(self.node, self.dtype) class QOperator(object): - def __init__(self, onnx_node, add_qdq_to_weight=False, - dedicated_qdq=False, optypes_to_exclude_output_quantization=[]): + def __init__(self, onnx_node, children, 
initializers, channel_axis, exclude_output_quantization): self.node = onnx_node - self.add_qdq_to_weight = add_qdq_to_weight - self.dedicated_qdq = dedicated_qdq + self.chilren = children + self.initializers = initializers self.disable_qdq_for_node_output = True if onnx_node.op_type in \ - optypes_to_exclude_output_quantization else False + exclude_output_quantization else False + self.axis = channel_axis self.per_channel = False - self.algorithm = 'minmax' - self.weight_scheme = 'sym' - self.weight_dtype = None - self.activation_dtype = None - self.activation_scheme = 'asym' def convert_check(self, convert_format): return True diff --git a/neural_compressor/adaptor/ox_utils/operators/pooling.py b/neural_compressor/adaptor/ox_utils/operators/pooling.py index e0d177fab67..544f7c4da2a 100644 --- a/neural_compressor/adaptor/ox_utils/operators/pooling.py +++ b/neural_compressor/adaptor/ox_utils/operators/pooling.py @@ -17,7 +17,7 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain @op_registry(op_types="AveragePool") @@ -83,25 +83,27 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearAveragePool") class QPoolOperator(QOperator): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) def convert(self): node = self.node add_nodes = [] + inits = [] # input dq in_dq = onnx.helper.make_node( 'DequantizeLinear', - node.inputs[:3], - [node.name + '_in_dequant']) + node.input[:3], + [node.name + '_in_dequant'], + node.name + '_in_dequant') inputs = [node.name + '_in_dequant'] add_nodes.append(in_dq) # output q if not self.disable_qdq_for_node_output: out_q = onnx.helper.make_node( 'QuantizeLinear', - [node.name + '_out', node.inputs[3], node.inputs[4]], - node.outputs, + [node.name + '_out', node.input[3], node.input[4]], + node.output, node.name + '_out_quant') outputs = [node.name + '_out'] add_nodes.append(out_q) @@ -110,10 +112,9 @@ def convert(self): kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain activation_node = onnx.helper.make_node( 'AveragePool', inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(activation_node) - return True, add_nodes + return True, add_nodes, inits diff --git a/neural_compressor/adaptor/ox_utils/operators/split.py b/neural_compressor/adaptor/ox_utils/operators/split.py index a5ec5532711..50fff613bc4 100644 --- a/neural_compressor/adaptor/ox_utils/operators/split.py +++ b/neural_compressor/adaptor/ox_utils/operators/split.py @@ -17,8 +17,8 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator -from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry +from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain @op_registry(op_types="Split") @@ -81,3 +81,35 @@ def cast(self): # pragma: no cover if node.input[0] not in [i.tensor_name for i in self.quantizer.new_value_info.values()]: return 
self.quantizer.dtype_cast(self.node, self.dtype) + +@qop_registry(op_types="QSplit") +class QDirectOperator(QOperator): + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + + def convert(self): + node = self.node + add_nodes = [] + inputs = [] + inits = [] + if all([i.op_type != 'DequantizeLinear' for i in self.children]): + return False, add_nodes + for child in self.children: + if child.op_type == 'DequantizeLinear': + in_dq = onnx.helper.make_node( + 'DequantizeLinear', + [node.input[0], child.input[1], child.input[2]], + [node.name + '_in_dequant_' + str(i)]) + inputs.append(node.name + '_in_dequant_' + str(i)) + add_nodes.append(in_dq) + break + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + + gather_node = onnx.helper.make_node( + node.op_type, inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(gather_node) + return True, add_nodes, inits \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/util.py b/neural_compressor/adaptor/ox_utils/util.py index dc1be039ca7..d5041204efd 100644 --- a/neural_compressor/adaptor/ox_utils/util.py +++ b/neural_compressor/adaptor/ox_utils/util.py @@ -379,21 +379,3 @@ def find_by_name(name, item_list): return items[0] else: return None - -def convert_qoperator_to_qdq(model, config): - from neural_compressor.adaptor.ox_utils/operators.ops import QOPERATORS - add_nodes = [] - remove_nodes = [] - inits = [] - for node in model.nodes(): - if node.op_type in QOPERATORS: - converter = QOPERATORS[node.op_type](node) - done, add_node = converter.convert() - if done: - add_nodes.extend(add_node) - remove_nodes.extend(node) - model.add_nodes(add_nodes) - model.remove_nodes(remove_nodes) - model.remove_unused_constant() - model.topological_sort() - return model diff --git a/neural_compressor/config.py b/neural_compressor/config.py index 6e45e9722b5..2d1fd81ab18 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -815,6 +815,31 @@ def dynamic_axes(self, dynamic_axes): self._dynamic_axes = dynamic_axes +class ONNXQlinear2QDQConfig: + def __init__( + self, + channel_axis={}, + exclude_output_quantization={} + ): + self._channel_axis = channel_axis + self._exclude_output_quantization = exclude_output_quantization + + @property + def channel_axis(self): + return self._channel_axis + + @channel_axis.setter + def channel_axis(self, channel_axis): + self._dtype = channel_axis + + @property + def exclude_output_quantization(self): + return self._exclude_output_quantization + + @exclude_output_quantization.setter + def exclude_output_quantization(self, exclude_output_quantization): + self._exclude_output_quantization = exclude_output_quantization + class Torch2ONNXConfig(ExportConfig): def __init__( self, diff --git a/neural_compressor/experimental/export/__init__.py b/neural_compressor/experimental/export/__init__.py index 2ccf049bf64..529ea48ed35 100644 --- a/neural_compressor/experimental/export/__init__.py +++ b/neural_compressor/experimental/export/__init__.py @@ -19,3 +19,4 @@ """Intel Neural Compressor Export.""" from .torch2onnx import torch_to_fp32_onnx, torch_to_int8_onnx +from .qlinear2qdq import onnx_qlinear_to_qdq diff --git a/neural_compressor/model/onnx_model.py b/neural_compressor/model/onnx_model.py index 90fcda508c6..b28fbd8eb35 100644 --- 
a/neural_compressor/model/onnx_model.py +++ b/neural_compressor/model/onnx_model.py @@ -154,6 +154,10 @@ def add_initializer(self, tensor): if ortq.find_by_name(tensor.name, self._model.graph.initializer) is None: self._model.graph.initializer.extend([tensor]) + def add_initializers(self, tensors): + for tensor in tensors: + self.add_initializer(tensor) + def get_initializer(self, name): for tensor in self._model.graph.initializer: if tensor.name == name: @@ -423,3 +427,17 @@ def get_nodes_chain(self, start_node, stop_node, result_chain=[]): start_node.append(parent.name) return result_chain + + def export(self, save_path, conf): + from neural_compressor.experimental.export import onnx_qlinear_to_qdq + add_nodes, remove_nodes, inits = onnx_qlinear_to_qdq(self._model, + self.initializer(), + self._output_name_to_node, + conf.channel_axis, + conf.exclude_output_quantization) + self.add_nodes(add_nodes) + self.remove_nodes(remove_nodes) + self.remove_unused_constant() + self.add_initializers(inits) + self.topological_sort() + self.save(save_path) From 907120e9286824d9439e587728742fcadc09ffcc Mon Sep 17 00:00:00 2001 From: mengniwa Date: Thu, 8 Dec 2022 21:02:10 +0800 Subject: [PATCH 03/12] fix node and add ut Signed-off-by: mengniwa --- .../adaptor/ox_utils/operators/argmax.py | 13 +- .../adaptor/ox_utils/operators/attention.py | 2 +- .../adaptor/ox_utils/operators/conv.py | 32 +- .../adaptor/ox_utils/operators/direct_q8.py | 30 +- .../ox_utils/operators/embed_layernorm.py | 37 +- .../adaptor/ox_utils/operators/gather.py | 30 +- .../adaptor/ox_utils/operators/gemm.py | 2 +- .../adaptor/ox_utils/operators/matmul.py | 3 +- .../adaptor/ox_utils/operators/maxpool.py | 9 +- .../adaptor/ox_utils/operators/ops.py | 49 +- .../adaptor/ox_utils/operators/pad.py | 9 +- .../adaptor/ox_utils/operators/resize.py | 6 +- .../adaptor/ox_utils/operators/split.py | 25 +- test/export/test_onnx_qlieanr_to_qdq.py | 813 ++++++++++++++++++ 14 files changed, 971 insertions(+), 89 deletions(-) create mode 100644 test/export/test_onnx_qlieanr_to_qdq.py diff --git a/neural_compressor/adaptor/ox_utils/operators/argmax.py b/neural_compressor/adaptor/ox_utils/operators/argmax.py index 9344498698e..69f310ee4d8 100644 --- a/neural_compressor/adaptor/ox_utils/operators/argmax.py +++ b/neural_compressor/adaptor/ox_utils/operators/argmax.py @@ -15,9 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - - -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +import onnx +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry +from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg @op_registry(op_types="ArgMax") class ArgMaxOperator(Operator): @@ -36,4 +36,9 @@ def convert(self, convert_format): if origin_name in self.quantizer.quantized_value_map: node.input[0] = self.quantizer.quantized_value_map[origin_name].q_name - node.name = node.name + '_quant' \ No newline at end of file + node.name = node.name + '_quant' + +@qop_registry(op_types="ArgMax") +class QArgMaxOperator(QOperator): + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/attention.py b/neural_compressor/adaptor/ox_utils/operators/attention.py index bc45ecaa294..ddaa970c6f2 100644 --- a/neural_compressor/adaptor/ox_utils/operators/attention.py +++ b/neural_compressor/adaptor/ox_utils/operators/attention.py @@ -94,7 +94,7 @@ def convert(self): weight_scale = onnx.numpy_helper.to_array( find_by_name(node.input[4], self.initializers)) - if len(weight_scale) > 1: + if len(weight_scale.shape) == 1: if 'MatMul' not in self.axis: from neural_compressor.utils import logger logger.warning( diff --git a/neural_compressor/adaptor/ox_utils/operators/conv.py b/neural_compressor/adaptor/ox_utils/operators/conv.py index 82acb52e201..e028b5caa17 100644 --- a/neural_compressor/adaptor/ox_utils/operators/conv.py +++ b/neural_compressor/adaptor/ox_utils/operators/conv.py @@ -179,20 +179,36 @@ def convert(self): node.input[:3], [node.name + '_in_dequant1'], node.name + '_in_dequant1') - - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - node.input[3:6], - [node.name + '_in_dequant2'], - node.name + '_in_dequant2') + + weight_scale = onnx.numpy_helper.to_array( + find_by_name(node.input[4], self.initializers)) + if weight_scale is not None and len(weight_scale.shape) == 1: + if 'Conv' not in self.axis: + from neural_compressor.utils import logger + logger.warning("Don't offer the axis of per-channel quantizd Conv, use default axis=0") + axis = 0 + else: + axis = self.axis['Conv'] + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2', + axis=axis) + elif weight_scale is not None: + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') + add_nodes.extend([in_dq1, in_dq2]) inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] if len(node.input) == 9: import numpy as np input_scale = onnx.numpy_helper.to_array( find_by_name(node.input[1], self.initializers)) - weight_scale = onnx.numpy_helper.to_array( - find_by_name(node.input[4], self.initializers)) + bias_scale = input_scale * weight_scale # update scale initializer diff --git a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py index 0f75016667a..410fcf9eb8c 100644 --- a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py +++ b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py @@ -88,32 +88,4 @@ def cast(self): @qop_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze") class QDirectOperator(QOperator): 
def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) - - def convert(self): - node = self.node - add_nodes = [] - inputs = [] - inits = [] - if all([i.op_type != 'DequantizeLinear' for i in self.children]): - return False, add_nodes - for child in self.children: - if child.op_type == 'DequantizeLinear': - in_dq = onnx.helper.make_node( - 'DequantizeLinear', - [node.input[0], child.input[1], child.input[2]], - [node.name + '_in_dequant_' + str(i)]) - inputs.append(node.name + '_in_dequant_' + str(i)) - add_nodes.append(in_dq) - break - outputs = node.output - kwargs = {} - for attribute in node.attribute: # pragma: no cover - kwargs.update(attribute_to_kwarg(attribute)) - kwargs["domain"] = ms_domain - - gather_node = onnx.helper.make_node( - node.op_type, inputs, - outputs, node.name + '_convert', **kwargs) - add_nodes.append(gather_node) - return True, add_nodes, inits \ No newline at end of file + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py b/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py index 256298b7142..b48204518c4 100644 --- a/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py +++ b/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py @@ -17,7 +17,7 @@ # import onnx -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain @op_registry(op_types="EmbedLayerNormalization") @@ -69,4 +69,37 @@ def convert(self, convert_format): inputs, node.output, node.name, **kwargs) self.quantizer.new_nodes.append(qembed_layer_norm_node) - self.quantizer.remove_nodes.extend(parents) \ No newline at end of file + self.quantizer.remove_nodes.extend(parents) + +@qop_registry(op_types="QEmbedLayerNormalization") +class QEmbedLayerNormalizationOperator(QOperator): + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + + def convert(self): + node = self.node + add_nodes = [] + inits = [] + inputs = [node.input[0], node.input[1]] + # input dq + for i in range(5): + in_dq = onnx.helper.make_node( + 'DequantizeLinear', + [node.input[2+i], node.input[-10+i], node.input[-5+i]], + [node.name + '_in_dequant_' + str(i)], + node.name + '_in_dequant_' + str(i)) + inputs.append(node.name + '_in_dequant_' + str(i)) + add_nodes.append(in_dq) + if len(node.input) > 17: + inputs.append(node.input[7]) + + outputs = node.output + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(attribute_to_kwarg(attribute)) + + binary_node = onnx.helper.make_node( + 'EmbedLayerNormalization', inputs, + outputs, node.name + '_convert', **kwargs) + add_nodes.append(binary_node) + return True, add_nodes, inits \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/gather.py b/neural_compressor/adaptor/ox_utils/operators/gather.py index cf506d52c90..8d315f07cce 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gather.py +++ b/neural_compressor/adaptor/ox_utils/operators/gather.py @@ 
-18,7 +18,7 @@ import onnx from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry -from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain +from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg @op_registry(op_types="Gather") class GatherOperator(Operator): @@ -95,31 +95,3 @@ def convert(self, convert_format): class QGatherOperator(QOperator): def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) - - def convert(self): - node = self.node - add_nodes = [] - inputs = [] - inits = [] - if all([i.op_type != 'DequantizeLinear' for i in self.children]): - return False, add_nodes - for child in self.children: - if child.op_type == 'DequantizeLinear': - in_dq = onnx.helper.make_node( - 'DequantizeLinear', - [node.input[0], child.input[1], child.input[2]], - [node.name + '_in_dequant'], - node.name + '_in_dequant') - inputs.append(node.name + '_in_dequant') - add_nodes.append(in_dq) - break - outputs = node.output - kwargs = {} - for attribute in node.attribute: # pragma: no cover - kwargs.update(attribute_to_kwarg(attribute)) - - gather_node = onnx.helper.make_node( - 'Gather', inputs, - outputs, node.name + '_convert', **kwargs) - add_nodes.append(gather_node) - return True, add_nodes, inits diff --git a/neural_compressor/adaptor/ox_utils/operators/gemm.py b/neural_compressor/adaptor/ox_utils/operators/gemm.py index 0ba76389f7c..8688e478e5a 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gemm.py +++ b/neural_compressor/adaptor/ox_utils/operators/gemm.py @@ -117,7 +117,7 @@ def convert(self): [node.name + '_in_dequant1'], node.name + '_in_dequant1') - if len(weight_scale) > 1: + if len(weight_scale.shape) == 1: axis = 0 if is_B_transposed(node) else 1 in_dq2 = onnx.helper.make_node( 'DequantizeLinear', diff --git a/neural_compressor/adaptor/ox_utils/operators/matmul.py b/neural_compressor/adaptor/ox_utils/operators/matmul.py index 6433d893e52..822438a4dc2 100644 --- a/neural_compressor/adaptor/ox_utils/operators/matmul.py +++ b/neural_compressor/adaptor/ox_utils/operators/matmul.py @@ -142,8 +142,7 @@ def convert(self): weight_scale = onnx.numpy_helper.to_array( find_by_name(node.input[4], self.initializers)) - import pdb;pdb.set_trace() - if weight_scale is not None and len(weight_scale) > 1: + if weight_scale is not None and len(weight_scale.shape) == 1: if 'MatMul' not in self.axis: from neural_compressor.utils import logger logger.warning("Don't offer the axis of per-channel quantizd MatMul, use default axis=1") diff --git a/neural_compressor/adaptor/ox_utils/operators/maxpool.py b/neural_compressor/adaptor/ox_utils/operators/maxpool.py index f93befc9a4f..2a521095fda 100644 --- a/neural_compressor/adaptor/ox_utils/operators/maxpool.py +++ b/neural_compressor/adaptor/ox_utils/operators/maxpool.py @@ -16,7 +16,7 @@ # limitations under the License. 
 #
 
-from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator
+from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry
 
 @op_registry(op_types="MaxPool")
 class MaxPoolOperator(Operator):
@@ -67,4 +67,9 @@ def convert(self, convert_format):
                 self.quantizer.model.replace_node_input(n, child.output[0], node.output[0])
 
-        self.quantizer.remove_nodes.append(parent)
\ No newline at end of file
+        self.quantizer.remove_nodes.append(parent)
+
+@qop_registry(op_types="MaxPool")
+class QMaxPoolOperator(QOperator):
+    def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization):
+        super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization)
\ No newline at end of file
diff --git a/neural_compressor/adaptor/ox_utils/operators/ops.py b/neural_compressor/adaptor/ox_utils/operators/ops.py
index 609f68d9095..8f410ab3143 100644
--- a/neural_compressor/adaptor/ox_utils/operators/ops.py
+++ b/neural_compressor/adaptor/ox_utils/operators/ops.py
@@ -15,6 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from neural_compressor.utils.utility import LazyImport
+from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg
+onnx = LazyImport('onnx')
 
 OPERATORS = {}
 QOPERATORS= {}
@@ -105,15 +108,51 @@ def cast(self): # pragma: no cover
 class QOperator(object):
     def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization):
         self.node = onnx_node
-        self.chilren = children
+        self.children = children
         self.initializers = initializers
         self.disable_qdq_for_node_output = True if onnx_node.op_type in \
             exclude_output_quantization else False
         self.axis = channel_axis
         self.per_channel = False
 
-    def convert_check(self, convert_format):
-        return True
-
     def convert(self):
-        return
+        node = self.node
+        add_nodes = []
+        inputs = []
+        inits = []
+        # output is not int8 tensor
+        if all([i.op_type != 'DequantizeLinear' for i in self.children]):
+            return False, add_nodes, inits
+        # input dq
+        for child in self.children:
+            if child.op_type == 'DequantizeLinear':
+                in_dq = onnx.helper.make_node(
+                    'DequantizeLinear',
+                    [node.input[0], child.input[1], child.input[2]],
+                    [node.name + '_in_dequant'],
+                    node.name + '_in_dequant')
+                inputs.append(node.name + '_in_dequant')
+                add_nodes.append(in_dq)
+                break
+        # output q
+        if not self.disable_qdq_for_node_output:
+            out_q = onnx.helper.make_node(
+                'QuantizeLinear',
+                [node.name + '_out', in_dq.input[1], in_dq.input[2]],
+                node.output,
+                node.name + '_out_quant')
+            outputs = [node.name + '_out']
+            add_nodes.append(out_q)
+        else:
+            outputs = node.output
+
+        kwargs = {}
+        for attribute in node.attribute: # pragma: no cover
+            kwargs.update(attribute_to_kwarg(attribute))
+
+        inputs.extend(node.input[1:])
+        new_node = onnx.helper.make_node(
+            node.op_type, inputs,
+            outputs, node.name + '_convert', **kwargs)
+        add_nodes.append(new_node)
+        return True, add_nodes, inits
\ No newline at end of file
diff --git a/neural_compressor/adaptor/ox_utils/operators/pad.py b/neural_compressor/adaptor/ox_utils/operators/pad.py
index 0f0acfcbec7..a2202b5d7fd 100644
--- a/neural_compressor/adaptor/ox_utils/operators/pad.py
+++ b/neural_compressor/adaptor/ox_utils/operators/pad.py
@@ -17,7 +17,7 @@
 #
 
 import onnx
-from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator
+from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator,
QOperator, qop_registry from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, quantize_nparray @op_registry(op_types="Pad") @@ -93,4 +93,9 @@ def convert(self, convert_format): # Create an entry for output quantized value node.input[0] = parent.input[0] node.output[0] = child.output[0] - self.quantizer.remove_nodes.extend([parent, child]) \ No newline at end of file + self.quantizer.remove_nodes.extend([parent, child]) + +@qop_registry(op_types="Pad") +class QPadOperator(QOperator): + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/resize.py b/neural_compressor/adaptor/ox_utils/operators/resize.py index d5f906f8372..bff3549ff8f 100644 --- a/neural_compressor/adaptor/ox_utils/operators/resize.py +++ b/neural_compressor/adaptor/ox_utils/operators/resize.py @@ -16,7 +16,7 @@ # limitations under the License. # -from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator +from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry @op_registry(op_types="Resize") class ResizeOperator(Operator): @@ -70,3 +70,7 @@ def convert(self, convert_format): child.output[0], node.output[0] + '_quantized') node.output[0] = node.output[0] + '_quantized' +@qop_registry(op_types="Resize") +class QResizeOperator(QOperator): + def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/split.py b/neural_compressor/adaptor/ox_utils/operators/split.py index 50fff613bc4..1699716abb2 100644 --- a/neural_compressor/adaptor/ox_utils/operators/split.py +++ b/neural_compressor/adaptor/ox_utils/operators/split.py @@ -18,7 +18,7 @@ import onnx from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry -from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain +from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg @op_registry(op_types="Split") @@ -92,17 +92,36 @@ def convert(self): add_nodes = [] inputs = [] inits = [] + if all([i.op_type != 'DequantizeLinear' for i in self.children]): return False, add_nodes + # input dq for child in self.children: if child.op_type == 'DequantizeLinear': in_dq = onnx.helper.make_node( 'DequantizeLinear', [node.input[0], child.input[1], child.input[2]], - [node.name + '_in_dequant_' + str(i)]) - inputs.append(node.name + '_in_dequant_' + str(i)) + [node.name + '_in_dequant'], + node.name + '_in_dequant') + inputs.append(node.name + '_in_dequant') add_nodes.append(in_dq) break + + # output q + outputs = [] + output_optype_mapping = dict((child.input[0], child.op_type) for child in self.children) + if not self.disable_qdq_for_node_output: + for i, out in enumerate(node.output): + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out_' + str(i), in_dq.input[1], in_dq.input[2]], + [node.output[i]], + node.name + '_out_quant_' + str(i)) + outputs.append([node.name + '_out_quant_' + str(i)]) + add_nodes.append(out_q) + else: + outputs = node.output + outputs = node.output kwargs = {} for attribute in node.attribute: # pragma: no cover diff --git 
a/test/export/test_onnx_qlieanr_to_qdq.py b/test/export/test_onnx_qlieanr_to_qdq.py new file mode 100644 index 00000000000..4164b8d17fc --- /dev/null +++ b/test/export/test_onnx_qlieanr_to_qdq.py @@ -0,0 +1,813 @@ +import os +import shutil +import unittest +import copy +import onnx +import numpy as np +from onnx import helper, TensorProto, numpy_helper, onnx_pb +from neural_compressor.adaptor.ox_utils.quantizer import Quantizer +from neural_compressor.adaptor.ox_utils.util import QuantizedInitializer, QuantizedValue, QuantizationMode +import onnxruntime as ort +from neural_compressor import options +from neural_compressor.config import ONNXQlinear2QDQConfig +from neural_compressor.experimental.common import Model + +def build_model(): + initializers = [] + input = helper.make_tensor_value_info('input', TensorProto.FLOAT, [1, 3, 15, 15]) + output = helper.make_tensor_value_info('reshape_output', TensorProto.FLOAT, [88, 11]) + + add_node = onnx.helper.make_node('Add', ['input', 'add_init'], ['add_out'], name='add') + + conv1_weight_initializer = numpy_helper.from_array( + np.random.randint(-1, 2, [3, 3, 3, 3]).astype(np.float32), name='conv1_weight') + conv1_node = helper.make_node('Conv', ['add_out', 'conv1_weight'], ['conv1_output'], name='conv1') + + conv2_weight_initializer = numpy_helper.from_array( + np.random.randint(-1, 2, [5, 3, 3, 3]).astype(np.float32), name='conv2_weight') + conv2_node = helper.make_node('Conv', ['add_out', 'conv2_weight'], ['conv2_output'], name='conv2') + + # 1, 8, 13, 13 + concat_node = helper.make_node('Concat', ['conv1_output', 'conv2_output'], [ + 'concat_output'], name='Concat', axis=1) + # 1, 8, 11, 11 + avg_args = {'kernel_shape': [3, 3]} + avgpool_node = helper.make_node('AveragePool', ['concat_output'], ['avg_output'], name='AveragePool', **avg_args) + reshape_node = onnx.helper.make_node('Reshape', ['avg_output', 'shape'], ['reshape_output'], name='Reshape') + + initializers = [conv1_weight_initializer, conv2_weight_initializer] + initializers.append(onnx.numpy_helper.from_array(np.array([88, 11], dtype=np.int64), name='shape')) + initializers.append(onnx.numpy_helper.from_array(np.zeros((1, 3, 15, 15)), name='add_init')) + graph = helper.make_graph([conv1_node, conv2_node, concat_node, avgpool_node, reshape_node, add_node], + 'test', [input], [output], initializer=initializers) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + return model + +class TestAdaptorONNXRT(unittest.TestCase): + + qlinear_backend = QuantizationMode.QLinearOps + qdq_backend = 'qdqops' + integer_backend = QuantizationMode.IntegerOps + static_q_config = {"weight":{'dtype': 3, + 'algorithm': 'minmax', + 'scheme':'sym', + 'granularity': 'per_tensor'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'static'} + } + dynamic_q_config = {"weight":{'dtype': 3, + 'algorithm': 'minmax', + 'scheme':'sym', + 'granularity': 'per_tensor'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'dynamic'}} + config = ONNXQlinear2QDQConfig() + + @classmethod + def setUpClass(cls): + os.makedirs('./onnxrt_test') + + @classmethod + def tearDownClass(cls): + shutil.rmtree("./onnxrt_test", ignore_errors=True) + os.remove("test.onnx") + + def qlinear_test(self, model, q_config, quantize_params, quantizable_op_types): + quantizer = Quantizer(copy.deepcopy(model), + q_config, + self.qlinear_backend, + True, + quantize_params, + 
quantizable_op_types) + model = quantizer.quantize_model() + return Model(model) + + def dynamic_test(self, model, q_config, quantize_params, quantizable_op_types): + quantizer = Quantizer(copy.deepcopy(model), + q_config, + self.integer_backend, + False, + quantize_params, + quantizable_op_types) + quantizer.quantize_model() + return Model(model) + + def test_resize(self): + input_tensor = helper.make_tensor_value_info('input', TensorProto.FLOAT, [1, 2, 26, 42]) + + conv_weight_arr = np.random.randint(-1, 2, [3, 2, 3, 3]).astype(np.float32) + conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name='conv1_weight') + conv_node = onnx.helper.make_node('Conv', ['input', 'conv1_weight'], ['conv_output'], name='conv_node') + + initializers = [conv_weight_initializer] + + output_tensor = helper.make_tensor_value_info('output', TensorProto.FLOAT, [1, 3, 48, 80]) + resize_inputs = ['conv_output'] # resize_roi_name, resize_scales_name, resize_sizes_name] + resize_attrs = {'coordinate_transformation_mode': 'asymmetric', 'mode': 'nearest', 'nearest_mode': 'floor'} + resize_node = helper.make_node('Resize', resize_inputs, ['output'], name='resize_node', **resize_attrs) + resize_roi = [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0] + resize_roi_name = 'resize_roi' + resize_roi_initializer = helper.make_tensor(resize_roi_name, TensorProto.FLOAT, [len(resize_roi)], resize_roi) + initializers.extend([resize_roi_initializer]) + resize_node.input.extend([resize_roi_name]) + + resize_scales = [1.0, 1.0, 2.0, 2.0] + resize_scales_name = 'resize_scales' + resize_scales_initializer = helper.make_tensor(resize_scales_name, TensorProto.FLOAT, [ + len(resize_scales)], resize_scales) + initializers.extend([resize_scales_initializer]) + resize_node.input.extend([resize_scales_name]) + + graph = helper.make_graph([conv_node, resize_node], 'TestOpQuantizerResize_test_model', + [input_tensor], [output_tensor], initializer=initializers) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + + q_config = {'conv_node': self.static_q_config, + 'resize_node': self.static_q_config} + quantize_params = {'input': [np.float32(10.), np.uint8(0)], + 'conv1_weight': [np.float32(10.), np.uint8(0)], + 'conv_output': [np.float32(10.), np.uint8(0)], + 'output': [np.float32(10.), np.uint8(0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ['Resize', 'Conv']) + q_model.export('./test.onnx', self.config) + + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 10)]) + model.ir_version = 7 # use stable onnx ir version + q_model = self.qlinear_test(model, q_config, quantize_params, ['Resize', 'Conv']) + q_model.export('./test.onnx', self.config) + + def test_argmax(self): + input_name = "input" + output_name = "output" + input_shape = [1, 256, 128, 128] + output_shape = [1, 32, 128] + initializers = [] + + # make Conv node + conv_weight_name = "conv_weight" + conv_weight_arr = np.random.randint(-1, 2, [32, 256, 1, 1]).astype(np.float32) + conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name=conv_weight_name) + conv_output_name = "conv_output" + conv_inputs = [input_name, conv_weight_name] + conv_outputs = [conv_output_name] + conv_name = "conv_node" + conv_node = onnx.helper.make_node( + "Conv", + conv_inputs, + conv_outputs, + dilations=[1, 1], + kernel_shape=[1, 1], + pads=[0, 0, 0, 0], + strides=[1, 1], + name=conv_name, + ) + + # make ArgMax node + argmax_inputs = 
[conv_output_name] + argmax_outputs = [output_name] + argmax_name = "argmax_node" + argmax_node = onnx.helper.make_node( + "ArgMax", + argmax_inputs, + argmax_outputs, + axis=3, + keepdims=0, + name=argmax_name, + ) + + initializers = [conv_weight_initializer] + + # make graph + input_tensor = helper.make_tensor_value_info(input_name, TensorProto.FLOAT, input_shape) + output_tensor = helper.make_tensor_value_info(output_name, TensorProto.INT64, output_shape) + graph_name = "ArgMax_Quant_Test" + graph = helper.make_graph( + [conv_node, argmax_node], + graph_name, + [input_tensor], + [output_tensor], + initializer=initializers, + ) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + q_config = {'conv_node': self.static_q_config, + 'argmax_node': self.static_q_config} + quantize_params = {'input': [np.float32(10.), np.uint8(0)], + 'conv_weight': [np.float32(10.), np.uint8(0)], + 'conv_output': [np.float32(10.), np.uint8(0)], + 'output': [np.float32(10.), np.uint8(0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ['Conv', 'ArgMax']) + q_model.export('./test.onnx', self.config) + + def test_gemm(self): + input_name = "input" + output_name = "output" + initializers = [] + weight_shape = [100, 10] + weight_name = "linear1.weight" + bias_shape = [100] + bias_name = "linear1.bias" + node_name = "gemm" + + weight_data = np.random.normal(0, 0.1, weight_shape).astype(np.float32) + initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) + + bias_data = np.random.normal(0, 0.1, bias_shape).astype(np.float32) + initializers.append(onnx.numpy_helper.from_array(bias_data, name=bias_name)) + + gemm1_node = onnx.helper.make_node( + "Gemm", + [input_name, weight_name, bias_name], + [output_name], + alpha=1.0, + beta=1.0, + transB=1, + name=node_name + ) + + gemm1_output_name = "gemm1_output" + input_tensor = helper.make_tensor_value_info(input_name, TensorProto.FLOAT, [-1, 10]) + output_tensor = helper.make_tensor_value_info(output_name, TensorProto.FLOAT, [-1, 100]) + graph_name = "gemm_test" + graph = helper.make_graph( + [gemm1_node], + graph_name, + [input_tensor], + [output_tensor], + initializer=initializers, + ) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + q_config = {'gemm': self.static_q_config} + quantize_params = {'input': [np.float32(10.), np.uint8(0)], + 'linear1.weight': [np.float32(10.), np.uint8(0)], + 'linear1.bias': [np.float32(10.), np.uint8(0)], + 'output': [np.float32(10.), np.uint8(0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ['Gemm']) + q_model.export('./test.onnx', self.config) + + bias_tensor = helper.make_tensor_value_info(bias_name, TensorProto.FLOAT, [100]) + gemm2_node = onnx.helper.make_node( + "Gemm", + [input_name, weight_name, bias_name], + [output_name], + alpha=1.0, + beta=1.0, + transB=1, + name=node_name + ) + initializers = [] + initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) + graph_name = "gemm_test" + graph = helper.make_graph( + [gemm2_node], + graph_name, + [input_tensor, bias_tensor], + [output_tensor], + initializer=initializers, + ) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + model.ir_version = 7 + q_model = self.qlinear_test(model, q_config, quantize_params, ['Gemm']) + q_model.export('./test.onnx', self.config) + + def test_embed(self): + 
input_ids_shape = [1, 4] + input_ids_tensor = helper.make_tensor_value_info('input_ids', TensorProto.INT32, input_ids_shape) + + segment_ids_shape = [1, 4] + segment_ids_tensor = helper.make_tensor_value_info('segment_ids', TensorProto.INT32, segment_ids_shape) + + # EmbedLayerNormalization Node Constants and Weights: + word_embed_shape = [32, 4] + word_embed_weights = np.random.random_sample(word_embed_shape).astype(dtype='float32') + word_embed_initializer = onnx.numpy_helper.from_array(word_embed_weights, name='word_embed') + + pos_embed_shape = [16, 4] + pos_embed_weights = np.random.random_sample(pos_embed_shape).astype(dtype='float32') + pos_embed_initializer = onnx.numpy_helper.from_array(pos_embed_weights, name='pos_embed') + + seg_embed_shape = [2, 4] + seg_embed_weights = np.random.random_sample(seg_embed_shape).astype(dtype='float32') + seg_embed_initializer = onnx.numpy_helper.from_array(seg_embed_weights, name='seg_embed') + + gamma_shape = [4] + gamma = np.random.random_sample(gamma_shape).astype(dtype='float32') + gamma_initializer = onnx.numpy_helper.from_array(gamma, name='gamma') + + beta_shape = [4] + beta = np.random.random_sample(beta_shape).astype(dtype='float32') + beta_initializer = onnx.numpy_helper.from_array(beta, name='beta') + + # EmbedLayerNormalization Outputs: + layernorm_out_shape = [1, 4, 4] + layernorm_out_tensor = helper.make_tensor_value_info('layernorm_out', TensorProto.FLOAT, layernorm_out_shape) + + mask_index_out_shape = [1] + mask_index_out_tensor = helper.make_tensor_value_info('mask_index_out', TensorProto.INT32, mask_index_out_shape) + + # EmbedLayerNormalization Node: + embed_layer_norm_inputs = [ + 'input_ids', 'segment_ids', 'word_embed', 'pos_embed', 'seg_embed', 'gamma', 'beta' + ] + embed_layer_norm_outputs = ['layernorm_out', 'mask_index_out'] + embed_layer_norm_node = helper.make_node('EmbedLayerNormalization', + embed_layer_norm_inputs, + embed_layer_norm_outputs, + domain='com.microsoft', + name='Embed') + + # Construct the Graph and Model: + nodes = [embed_layer_norm_node] + graph_name = 'embed_layernorm_graph' + inputs = [input_ids_tensor, segment_ids_tensor] + outputs = [layernorm_out_tensor, mask_index_out_tensor] + initializers = [ + word_embed_initializer, pos_embed_initializer, seg_embed_initializer, gamma_initializer, beta_initializer + ] + + graph = helper.make_graph(nodes, graph_name, inputs, outputs, initializer=initializers) + model = helper.make_model(graph, + opset_imports=[helper.make_opsetid("com.microsoft", 14), helper.make_opsetid("ai.onnx", 14)]) + model.ir_version = 7 # use stable onnx ir version + + q_config = {'Embed': self.static_q_config} + quantize_params = {'word_embed': [np.uint8(10.), np.float32(0)], + 'pos_embed': [np.uint8(10.), np.float32(0)], + 'seg_embed': [np.uint8(10.), np.float32(0)], + 'gamma': [np.uint8(10.), np.float32(0)], + 'beta': [np.uint8(10.), np.float32(0)], + 'layernorm_out': [np.uint8(10.), np.float32(0)], + 'mask_index_out': [np.uint8(10.), np.float32(0)], + 'input_ids': [np.uint8(10.), np.float32(0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ['EmbedLayerNormalization']) + q_model.export('./test.onnx', self.config) + + def test_concat_reshape_pooling(self): + model = build_model() + options.onnxrt.qdq_setting.DedicatedQDQPair = True + + q_config = {'Reshape':self.static_q_config, 'conv1':self.static_q_config, 'conv2':self.static_q_config, \ + 'Concat':self.static_q_config, 'AveragePool':self.static_q_config, 'add':self.static_q_config} + quantize_params = 
{'input': [np.uint8(10.), np.float32(0)], + 'conv1_weight': [np.uint8(10.), np.float32(0)], + 'conv1_output': [np.uint8(10.), np.float32(0)], + 'conv2_weight': [np.uint8(10.), np.float32(0)], + 'conv2_output': [np.uint8(10.), np.float32(0)], + 'concat_output': [np.uint8(10.), np.float32(0)], + 'avg_output': [np.uint8(10.), np.float32(0)], + 'add_out': [np.uint8(10.), np.float32(0)], + 'add_init': [np.uint8(10.), np.float32(0)], + 'shape': [np.uint8(10.), np.float32(0)], + 'reshape_output': [np.uint8(10.), np.float32(0)]} + quantizable_op_types = ['Reshape', 'Conv', 'Concat', 'AveragePool', 'Add'] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + options.onnxrt.qdq_setting.DedicatedQDQPair = False + + q_config = {'Reshape':self.static_q_config, 'conv1':'fp32', 'conv2':self.static_q_config, \ + 'Concat':self.static_q_config, 'AveragePool':self.static_q_config} + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + q_config = {'Reshape':self.static_q_config, 'conv1':'fp32', 'conv2':'fp32', \ + 'Concat':self.static_q_config, 'AveragePool':self.static_q_config} + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + q_config = {'Reshape':self.static_q_config, 'conv1':self.static_q_config, 'conv2':self.static_q_config, \ + 'Concat':self.static_q_config, 'AveragePool':'fp32'} + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + quantize_params = {'input': [np.uint8(10.), np.float32(0)], + 'conv1_weight': [np.uint8(10.), np.float32(0)], + 'conv1_output': [np.uint8(10.), np.float32(0)], + 'conv2_weight': [np.uint8(10.), np.float32(0)], + 'conv2_output': [np.uint8(10.), np.float32(0)], + 'concat_output': [np.uint8(10.), np.float32(0)], + 'avg_output': [np.uint8(10.), np.float32(0)], + 'shape': [np.uint8(10.), np.float32(0)], + 'add_out': [np.uint8(10.), np.float32(0)], + 'add_init': [np.uint8(10.), np.float32(0)], + 'reshape_output': [np.uint8(10.), np.float32(0)]} + q_config = {'Reshape':self.static_q_config, 'conv1':self.static_q_config, 'conv2':self.static_q_config, \ + 'Concat':self.static_q_config, 'AveragePool':self.static_q_config} + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + def test_conv(self): + for op in ['Conv', 'FusedConv']: + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 5, 5, 1]) + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 3, 3, 1]) + C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 5, 5, 1]) + D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 1, 5, 1]) + conv_node = onnx.helper.make_node(op, ['A', 'B', 'C'], ['D'], + name=op, + kernel_shape=[3, 3], + pads=[1, 1, 1, 1]) + graph = helper.make_graph([conv_node], 'test_graph_1', [A, B, C], [D]) + model = helper.make_model(graph) + q_config = {op: self.static_q_config}, + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)], + "C": [np.uint8(10.), np.float32(0)], + "D": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = [op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + def test_matmul(self): + A = helper.make_tensor_value_info('A', 
TensorProto.FLOAT, [1, 1, 5, 5]) + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 1, 5, 1]) + C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 1, 5, 1]) + matmul_node = onnx.helper.make_node('MatMul', ['A', 'B'], ['C'], name='Matmul') + graph = helper.make_graph([matmul_node], 'test_graph_1', [A, B], [C]) + model = helper.make_model(graph) + q_config = {"Matmul": self.static_q_config} + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)], + "C": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = ["Matmul"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + q_config = {"Matmul": self.dynamic_q_config} + q_model = self.dynamic_test(model, q_config, None, quantizable_op_types) + q_model.export('./test.onnx', self.config) + quantize_params = {"A": [np.float32(10.)], + "B": [np.float32(10.)], + "C": [np.float32(10.)]} + + q_config = {"Matmul": {"weight":{'dtype': 3, + 'algorithm': 'minmax', + 'scheme':'sym', + 'granularity': 'per_tensor'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'dynamic'}}} + quantize_params = {} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + def test_attention(self): + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 1, 5, 5]) + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 1, 5, 5]) + C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 1, 5, 5]) + D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 1, 5, 5]) + node = onnx.helper.make_node('Attention', ['A', 'B', 'C'], ['D'], name='Attention') + graph = helper.make_graph([node], 'test_graph_1', [A, B, C], [D]) + model = helper.make_model(graph) + q_config = {"Attention": self.static_q_config} + quantize_params = {"A": [np.uint8(0), np.float32(0.5)], + "B": [np.uint8(0), np.float32(0.5)], + "C": [np.uint8(0), np.float32(0.5)], + "D": [np.uint8(0), np.float32(0.5)]} + quantizable_op_types = ["Attention"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + q_config = {"Attention": self.dynamic_q_config} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + E = helper.make_tensor_value_info('E', TensorProto.INT32, [1, 1, 5, 5]) + F = helper.make_tensor_value_info('F', TensorProto.FLOAT, [1, 1, 5, 5]) + node = onnx.helper.make_node('Attention', ['A', 'B', 'C', 'F', 'E'], ['D'], name='Attention') + graph = helper.make_graph([node], 'test_graph_1', [A, B, C, F, E], [D]) + model = helper.make_model(graph) + q_config = {"Attention": self.static_q_config} + quantize_params = {"A": [np.uint8(0), np.float32(0.5)], + "B": [np.uint8(0), np.float32(0.5)], + "C": [np.uint8(0), np.float32(0.5)], + "D": [np.uint8(0), np.float32(0.5)]} + quantizable_op_types = ["Attention"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + q_config = {"Attention": self.dynamic_q_config} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + def test_gather(self): + a_value = np.random.randn(100, 4).astype(np.float32) + A_init = helper.make_tensor('A', TensorProto.FLOAT, [100, 
4], + a_value.reshape(400).tolist()) + b_value = np.random.randint(2, size=(1, 10)).astype(np.int32) + B_init = helper.make_tensor('B', TensorProto.INT32, [1, 10], + b_value.reshape(10).tolist()) + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [100, 4]) + B = helper.make_tensor_value_info('B', TensorProto.INT32, [1, 10]) + C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 10, 4]) + node = onnx.helper.make_node('Gather', ['A', 'B'], ['C'], name='Gather') + graph = helper.make_graph([node], 'test_graph_1', [A, B], [C], [A_init, B_init]) + model = helper.make_model(graph) + q_config = {'Gather': {"weight":{'dtype': 3, + 'algorithm': 'minmax', + 'scheme':'sym', + 'granularity': 'per_tensor'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'static'} + }} + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "C": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = ["Gather"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + q_config = {'Gather': {"weight":{'dtype': 3, + 'algorithm': 'minmax', + 'scheme':'sym', + 'granularity': 'per_tensor'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'dynamic'} + }} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + graph = helper.make_graph([node], 'test_graph_1', [A, B], [C]) + model = helper.make_model(graph) + q_config = {'Gather': {"weight":{'dtype': 3, + 'algorithm': 'minmax', + 'scheme':'sym', + 'granularity': 'per_tensor'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'dynamic'} + }} + quantize_params = {} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + def test_split(self): + a_value = np.random.randn(100, 4).astype(np.float32) + A_init = helper.make_tensor('A', TensorProto.FLOAT, [100, 4], + a_value.reshape(400).tolist()) + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [100, 4]) + B = helper.make_tensor_value_info('conv1_output', TensorProto.FLOAT, [50, 4]) + C = helper.make_tensor_value_info('conv2_output', TensorProto.FLOAT, [50, 4]) + + node = onnx.helper.make_node('Split', ['A'], ['B', 'C'], name='Split') + graph = helper.make_graph([node], 'test_graph_1', [A], [B, C], [A_init]) + model = helper.make_model(graph) + q_config = {'Split': {"weight":{'dtype': 3, + 'algorithm': 'minmax', + 'scheme':'sym', + 'granularity': 'per_tensor'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'static'} + }, + } + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)], + "C": [np.uint8(10.), np.float32(0)], + } + quantizable_op_types = ["Split"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + def test_pad(self): + b_value = np.array([0, 1, 1, 0, 1, 1]).astype(np.int64) + B_init = helper.make_tensor('B', TensorProto.INT64, [6], + b_value.reshape(6).tolist()) + B = helper.make_tensor_value_info('B', TensorProto.INT64, [6]) + C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 7, 7]) + + d_value = np.random.randn(1).astype(np.float32) + D_init = 
helper.make_tensor('D', TensorProto.FLOAT, [1], + d_value.reshape(1).tolist()) + D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1]) + + e_value = np.random.randn(1, 5, 5).astype(np.float32) + E_init = helper.make_tensor('E', TensorProto.FLOAT, [1, 1, 5, 5], + e_value.reshape(25).tolist()) + E = helper.make_tensor_value_info('E', TensorProto.FLOAT, [1, 1, 5, 5]) + f_value = np.random.randn(1, 3, 3).astype(np.float32) + F_init = helper.make_tensor('F', TensorProto.FLOAT, [1, 1, 3, 3], + f_value.reshape(9).tolist()) + F = helper.make_tensor_value_info('F', TensorProto.FLOAT, [1, 1, 3, 3]) + for mode in ["constant", "edge", "reflect", "constant_value", "constant_value_wo_init"]: + conv_node = onnx.helper.make_node('Conv', ['E', 'F'], ['A'], + name='Conv', + kernel=[3, 3], + padding=[1, 1, 1, 1]) + if mode == "constant_value": + node = onnx.helper.make_node('Pad', ['A', 'B', 'D'], ['C'], name='Pad', mode="constant") + graph = helper.make_graph([conv_node, node], 'test_graph_1', [E, F, B, D], [C], [E_init, F_init, B_init, D_init]) + elif mode == "constant_value_wo_init": + node = onnx.helper.make_node('Pad', ['A', 'B', 'D'], ['C'], name='Pad', mode="constant") + graph = helper.make_graph([conv_node, node], 'test_graph_1', [E, F, B, D], [C], [E_init, F_init, B_init]) + else: + node = onnx.helper.make_node('Pad', ['A', 'B'], ['C'], name='Pad', mode=mode) + graph = helper.make_graph([conv_node, node], 'test_graph_1', [E, F, B], [C], [E_init, F_init, B_init]) + model = helper.make_model(graph) + pad_config = {"weight":{'dtype': 3, + 'algorithm': 'minmax', + 'scheme':'sym', + 'granularity': 'per_tensor'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'static'}} + conv_config = {"weight":{'dtype': 3, + 'algorithm': 'minmax', + 'scheme':'sym', + 'granularity': 'per_channel'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'static'}} + q_config = {'Conv': conv_config, + 'Pad': pad_config} + quantize_params = {"A": [np.uint8(10.), np.float32(1)], + "C": [np.uint8(10.), np.float32(1)], + "D": [np.uint8(10.), np.float32(1)], + "E": [np.uint8(10.), np.float32(1)], + "F": [np.uint8(10.), np.float32(1)]} + quantizable_op_types = ["Conv", "Pad"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + options.onnxrt.qdq_setting.AddQDQPairToWeight = True + options.onnxrt.qdq_setting.AddQDQPairToWeight = False + + node = onnx.helper.make_node('Pad', ['E', 'B', 'D'], ['C'], name='Pad', mode="constant") + graph = helper.make_graph([node], 'test_graph_1', [E, B, D], [C], [E_init, B_init, D_init]) + model = helper.make_model(graph) + q_config = {'Pad': {'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'static'} + }} + quantize_params = {"C": [np.uint8(10.), np.float32(0)], + "E": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = ["Pad"] + q_model = self.qlinear_test(model, pad_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + def test_binary(self): + for op in ['Mul', 'Add']: + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 10]) + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1]) + C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 10]) + node = onnx.helper.make_node(op, ['A', 'B'], ['C'], name=op) + graph = helper.make_graph([node], 
'test_graph_1', [A, B], [C]) + model = helper.make_model(graph) + q_config = {op: self.static_q_config} + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)], + "C": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = [op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + def test_activation(self): + for op in ["Relu", "LeakyRelu", "Sigmoid"]: + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 10]) + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 10]) + node = onnx.helper.make_node(op, ['A'], ['B'], name=op) + graph = helper.make_graph([node], 'test_graph_1', [A], [B]) + model = helper.make_model(graph) + q_config = {op: self.static_q_config} + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = [op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + a_value = np.random.randn(1, 10).astype(np.float32) + A_init = helper.make_tensor('A', TensorProto.FLOAT, [1, 10], + a_value.reshape(10).tolist()) + graph = helper.make_graph([node], 'test_graph_1', [A], [B], [A_init]) + model = helper.make_model(graph) + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + def test_pooling(self): + op = "MaxPool" + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 5, 5, 1]) + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 5, 5, 1]) + node = onnx.helper.make_node(op, ['A'], ['B'], + name=op, + kernel_shape=[3, 3], + pads=[1, 1, 1, 1]) + graph = helper.make_graph([node], 'test_graph_1', [A], [B]) + q_config = {op: self.static_q_config} + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = [op] + for opset_version in [12, 13]: + opset = onnx.OperatorSetIdProto() + opset.version = opset_version + model = helper.make_model(graph, opset_imports=[opset]) + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 1, 5, 5]) + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 1, 3, 3]) + D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 1, 5, 5]) + conv_node = onnx.helper.make_node('Conv', ['A', 'B'], ['C'], + name='Conv', + kernel_shape=[3, 3], + pads=[1, 1, 1, 1]) + pool_node = onnx.helper.make_node(op, ['C'], ['D'], name=op) + graph = helper.make_graph([conv_node, pool_node], 'test_graph_1', [A, B], [D]) + model = helper.make_model(graph) + + q_config = {"Conv": self.static_q_config, op: self.static_q_config} + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)], + "C": [np.uint8(10.), np.float32(0)], + "D": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = ["Conv", op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + op = "GlobalAveragePool" + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 5, 1, 1]) 
+ A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 5, 5, 1]) + node = onnx.helper.make_node(op, ['A'], ['B'], + name=op, + kernel_shape=[3, 3], + pads=[1, 1, 1, 1]) + graph = helper.make_graph([node], 'test_graph_1', [A], [B]) + q_config = {op: self.static_q_config} + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = [op] + for opset_version in [12, 13]: + opset = onnx.OperatorSetIdProto() + opset.version = opset_version + model = helper.make_model(graph, opset_imports=[opset]) + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 1, 5, 5]) + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 1, 3, 3]) + D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 1, 1, 1]) + conv_node = onnx.helper.make_node('Conv', ['A', 'B'], ['C'], + name='Conv', + kernel_shape=[3, 3], + pads=[1, 1, 1, 1]) + pool_node = onnx.helper.make_node(op, ['C'], ['D'], name=op) + graph = helper.make_graph([conv_node, pool_node], 'test_graph_1', [A, B], [D]) + model = helper.make_model(graph) + + q_config = {"Conv": self.static_q_config, op: self.static_q_config} + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)], + "C": [np.uint8(10.), np.float32(0)], + "D": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = ["Conv", op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + + + def test_exclude_node(self): + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 5, 5, 1]) + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [3, 3, 1, 1]) + D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 1, 3, 3]) + conv_node = onnx.helper.make_node('Conv', ['A', 'B'], ['C'], + name='Conv', + kernel_shape=[3, 3], + pads=[1, 1, 1, 1]) + pool_node = onnx.helper.make_node("MaxPool", ['C'], ['D'], name="MaxPool") + graph = helper.make_graph([conv_node, pool_node], 'test_graph_1', [A, B], [D]) + model = helper.make_model(graph) + + q_config = {"Conv": self.static_q_config, "MaxPool": "fp32"} + quantize_params = {"A": [np.uint8(10.), np.float32(0)], + "B": [np.uint8(10.), np.float32(0)], + "C": [np.uint8(10.), np.float32(0)], + "D": [np.uint8(10.), np.float32(0)]} + quantizable_op_types = ["Conv", "MaxPool"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.export('./test.onnx', self.config) + +if __name__ == "__main__": + unittest.main() From 302f067872bb5d81771082c7507eab2cbe3ea397 Mon Sep 17 00:00:00 2001 From: mengniwa Date: Thu, 8 Dec 2022 21:26:10 +0800 Subject: [PATCH 04/12] fix bug and add file Signed-off-by: mengniwa --- .../adaptor/ox_utils/operators/split.py | 4 +- neural_compressor/config.py | 3 +- .../experimental/export/qlinear2qdq.py | 85 +++++++++++++++++++ neural_compressor/model/onnx_model.py | 1 - 4 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 neural_compressor/experimental/export/qlinear2qdq.py diff --git a/neural_compressor/adaptor/ox_utils/operators/split.py b/neural_compressor/adaptor/ox_utils/operators/split.py index 1699716abb2..3e9f62a57ac 100644 --- a/neural_compressor/adaptor/ox_utils/operators/split.py +++ b/neural_compressor/adaptor/ox_utils/operators/split.py @@ -82,8 +82,8 @@ def cast(self): # pragma: no cover return 
self.quantizer.dtype_cast(self.node, self.dtype)
 
-@qop_registry(op_types="QSplit")
-class QDirectOperator(QOperator):
+@qop_registry(op_types="Split")
+class QSplitOperator(QOperator):
     def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization):
         super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization)
 
diff --git a/neural_compressor/config.py b/neural_compressor/config.py
index 2d1fd81ab18..8a608a235d9 100644
--- a/neural_compressor/config.py
+++ b/neural_compressor/config.py
@@ -814,12 +814,11 @@ def dynamic_axes(self):
     def dynamic_axes(self, dynamic_axes):
         self._dynamic_axes = dynamic_axes
 
-
 class ONNXQlinear2QDQConfig:
     def __init__(
         self,
         channel_axis={},
-        exclude_output_quantization={}
+        exclude_output_quantization=[]
     ):
         self._channel_axis = channel_axis
         self._exclude_output_quantization = exclude_output_quantization
diff --git a/neural_compressor/experimental/export/qlinear2qdq.py b/neural_compressor/experimental/export/qlinear2qdq.py
new file mode 100644
index 00000000000..43bd6f5cd02
--- /dev/null
+++ b/neural_compressor/experimental/export/qlinear2qdq.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helper functions to export onnx model from QLinearops to QDQ."""
+
+from neural_compressor.utils import logger
+from neural_compressor.utils.utility import LazyImport
+
+torch = LazyImport('torch')
+onnx = LazyImport('onnx')
+ort = LazyImport('onnxruntime')
+ortq = LazyImport('onnxruntime.quantization')
+
+def check_model(model):
+    """Check optype for input model.
+
+    Args:
+        model (ModelProto): onnx model.
+    """
+    has_integerop = False
+    has_qlinearop = False
+    for node in model.nodes():
+        if node.op_type.endswith('Integer'):
+            has_integerop = True
+        elif node.op_type.startswith('QLinear'):
+            has_qlinearop = True
+        elif node.op_type in ['QAttention', 'QGemm', 'QEmbedLayerNormalization']:
+            has_qlinearop = True
+    if has_integerop:
+        logger.info("This model has Integer ops, these ops will be skipped.")
+    if has_qlinearop:
+        return True
+
+def onnx_qlinear_to_qdq(
+    model,
+    output_name_to_node,
+    channel_axis={},
+    exclude_output_quantization=[]
+):
+    """Export ONNX model from QLinear ops format to QDQ format.
+
+    Args:
+        model (ModelProto): int8 onnx model.
+        output_name_to_node (dict): the mapping of tensor name and its destination nodes.
+        channel_axis (dict, optional): quantization axis for per-channel quantized optypes,
+            the key is optype (str), the value is axis (int).
+        exclude_output_quantization (list, optional): optypes to exclude output quantization.
+ """ + from neural_compressor.adaptor.ox_utils.operators.ops import QOPERATORS + add_nodes = [] + remove_nodes = [] + inits = [] + for node in model.graph.node: + if node.op_type in QOPERATORS: + if node.output[0] not in output_name_to_node: + continue + children = [] + for out in node.output: + children.append(output_name_to_node[node.output[0]]) + converter = QOPERATORS[node.op_type]( + node, + children, + model.graph.initializer, + channel_axis, + exclude_output_quantization) + done, add_node, init = converter.convert() + if done: + add_nodes.extend(add_node) + inits.extend(init) + remove_nodes.append(node) + return add_nodes, remove_nodes, inits diff --git a/neural_compressor/model/onnx_model.py b/neural_compressor/model/onnx_model.py index b28fbd8eb35..87e717e6abe 100644 --- a/neural_compressor/model/onnx_model.py +++ b/neural_compressor/model/onnx_model.py @@ -431,7 +431,6 @@ def get_nodes_chain(self, start_node, stop_node, result_chain=[]): def export(self, save_path, conf): from neural_compressor.experimental.export import onnx_qlinear_to_qdq add_nodes, remove_nodes, inits = onnx_qlinear_to_qdq(self._model, - self.initializer(), self._output_name_to_node, conf.channel_axis, conf.exclude_output_quantization) From fb1eba3cb06d93233942e93ef13ad3a6435d406a Mon Sep 17 00:00:00 2001 From: mengniwa Date: Fri, 9 Dec 2022 20:52:22 +0800 Subject: [PATCH 05/12] fix ut Signed-off-by: mengniwa --- .../adaptor/ox_utils/operators/activation.py | 22 +- .../adaptor/ox_utils/operators/argmax.py | 5 +- .../adaptor/ox_utils/operators/attention.py | 46 +--- .../adaptor/ox_utils/operators/binary_op.py | 22 +- .../adaptor/ox_utils/operators/concat.py | 24 +- .../adaptor/ox_utils/operators/conv.py | 54 ++-- .../adaptor/ox_utils/operators/direct_q8.py | 4 +- .../ox_utils/operators/embed_layernorm.py | 5 +- .../adaptor/ox_utils/operators/gather.py | 4 +- .../adaptor/ox_utils/operators/gavgpool.py | 22 +- .../adaptor/ox_utils/operators/gemm.py | 42 ++- .../adaptor/ox_utils/operators/matmul.py | 48 ++-- .../adaptor/ox_utils/operators/maxpool.py | 4 +- .../adaptor/ox_utils/operators/ops.py | 40 +-- .../adaptor/ox_utils/operators/pad.py | 4 +- .../adaptor/ox_utils/operators/pooling.py | 22 +- .../adaptor/ox_utils/operators/resize.py | 4 +- .../adaptor/ox_utils/operators/split.py | 37 ++- neural_compressor/config.py | 10 - .../experimental/export/qlinear2qdq.py | 15 +- neural_compressor/model/onnx_model.py | 5 +- test/export/test_onnx_qlieanr_to_qdq.py | 253 ++++-------------- 22 files changed, 219 insertions(+), 473 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/operators/activation.py b/neural_compressor/adaptor/ox_utils/operators/activation.py index 67c354d2164..e9481c8a891 100644 --- a/neural_compressor/adaptor/ox_utils/operators/activation.py +++ b/neural_compressor/adaptor/ox_utils/operators/activation.py @@ -91,8 +91,8 @@ def quantize(self): @qop_registry(op_types="QLinearLeakyRelu, QLinearSigmoid") class QActivationOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node @@ -107,16 +107,14 @@ def convert(self): inputs = [node.name + '_in_dequant'] add_nodes.append(in_dq) # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 
'QuantizeLinear', - [node.name + '_out', node.input[3], node.input[4]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.input[3], node.input[4]], + node.output, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) diff --git a/neural_compressor/adaptor/ox_utils/operators/argmax.py b/neural_compressor/adaptor/ox_utils/operators/argmax.py index 69f310ee4d8..d11128fec94 100644 --- a/neural_compressor/adaptor/ox_utils/operators/argmax.py +++ b/neural_compressor/adaptor/ox_utils/operators/argmax.py @@ -35,10 +35,9 @@ def convert(self, convert_format): origin_name = node.input[0].split('_argmax_node')[0] if origin_name in self.quantizer.quantized_value_map: - node.input[0] = self.quantizer.quantized_value_map[origin_name].q_name node.name = node.name + '_quant' @qop_registry(op_types="ArgMax") class QArgMaxOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) \ No newline at end of file + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) diff --git a/neural_compressor/adaptor/ox_utils/operators/attention.py b/neural_compressor/adaptor/ox_utils/operators/attention.py index ddaa970c6f2..d94250183fe 100644 --- a/neural_compressor/adaptor/ox_utils/operators/attention.py +++ b/neural_compressor/adaptor/ox_utils/operators/attention.py @@ -77,14 +77,16 @@ def convert(self, convert_format): @qop_registry(op_types="QAttention") class QAttentionOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node add_nodes = [] inputs = [] inits = [] + if find_by_name(node.input[3], self.initializers) is None: + return False, add_nodes, inits # input dq in_dq1 = onnx.helper.make_node( 'DequantizeLinear', @@ -92,45 +94,19 @@ def convert(self): [node.name + '_in_dequant1'], node.name + '_in_dequant1') - weight_scale = onnx.numpy_helper.to_array( - find_by_name(node.input[4], self.initializers)) - if len(weight_scale.shape) == 1: - if 'MatMul' not in self.axis: - from neural_compressor.utils import logger - logger.warning( - "Don't offer the axis of per-channel quantizd Attention, use default axis=1") - axis = 1 - else: - axis = self.axis['Attention'] - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - [node.input[1], node.input[4], node.input[7]], - [node.name + '_in_dequant2'], - node.name + '_in_dequant2', - axis=axis) - else: - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - [node.input[1], node.input[4], node.input[7]], - [node.name + '_in_dequant2'], - node.name + '_in_dequant2') + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + [node.input[1], node.input[4], node.input[7]], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2', 
node.input[2], node.input[5]] add_nodes.extend([in_dq1, in_dq2]) - # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out', node.input[6], node.input[7]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + + outputs = node.output kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) diff --git a/neural_compressor/adaptor/ox_utils/operators/binary_op.py b/neural_compressor/adaptor/ox_utils/operators/binary_op.py index 6235b121a41..9419cc6b201 100644 --- a/neural_compressor/adaptor/ox_utils/operators/binary_op.py +++ b/neural_compressor/adaptor/ox_utils/operators/binary_op.py @@ -81,8 +81,8 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearAdd, QLinearMul") class QBinaryOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node @@ -104,16 +104,14 @@ def convert(self): add_nodes.extend([in_dq1, in_dq2]) # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out', node.input[6], node.input[7]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.input[6], node.input[7]], + node.output, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) diff --git a/neural_compressor/adaptor/ox_utils/operators/concat.py b/neural_compressor/adaptor/ox_utils/operators/concat.py index ceab401650c..fd68f95538c 100644 --- a/neural_compressor/adaptor/ox_utils/operators/concat.py +++ b/neural_compressor/adaptor/ox_utils/operators/concat.py @@ -99,8 +99,8 @@ def cast(self): # pragma: no cover @qop_registry(op_types="QLinearConcat") class QConcatOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node @@ -108,7 +108,7 @@ def convert(self): inputs = [] inits = [] # input dq - for i in range((len(node.inputs) - 2) / 3 - 1): + for i in range(int((len(node.input) - 2) / 3 - 1)): in_dq = onnx.helper.make_node( 'DequantizeLinear', node.input[2 + i*3 : 2 + (i+1)*3], @@ -118,16 +118,14 @@ def convert(self): add_nodes.append(in_dq) # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out', node.input[0], node.input[1]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.input[0], node.input[1]], + node.output, + node.name + '_out_quant') + 
outputs = [node.name + '_out'] + add_nodes.append(out_q) + kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) diff --git a/neural_compressor/adaptor/ox_utils/operators/conv.py b/neural_compressor/adaptor/ox_utils/operators/conv.py index e028b5caa17..a8de96deb88 100644 --- a/neural_compressor/adaptor/ox_utils/operators/conv.py +++ b/neural_compressor/adaptor/ox_utils/operators/conv.py @@ -156,6 +156,7 @@ def convert(self, convert_format): if attribute.name == 'activation_params': # pragma: no cover continue kwargs.update(attribute_to_kwarg(attribute)) + qlinear_conv_node = onnx.helper.make_node("QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], node.name, **kwargs) @@ -166,8 +167,8 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearConv") class QConvOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node @@ -180,35 +181,20 @@ def convert(self): [node.name + '_in_dequant1'], node.name + '_in_dequant1') - weight_scale = onnx.numpy_helper.to_array( - find_by_name(node.input[4], self.initializers)) - if weight_scale is not None and len(weight_scale.shape) == 1: - if 'Conv' not in self.axis: - from neural_compressor.utils import logger - logger.warning("Don't offer the axis of per-channel quantizd Conv, use default axis=0") - axis = 0 - else: - axis = self.axis['Conv'] - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - node.input[3:6], - [node.name + '_in_dequant2'], - node.name + '_in_dequant2', - axis=axis) - elif weight_scale is not None: - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - node.input[3:6], - [node.name + '_in_dequant2'], - node.name + '_in_dequant2') - + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') + add_nodes.extend([in_dq1, in_dq2]) inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] if len(node.input) == 9: import numpy as np input_scale = onnx.numpy_helper.to_array( find_by_name(node.input[1], self.initializers)) - + weight_scale = onnx.numpy_helper.to_array( + find_by_name(node.input[4], self.initializers)) bias_scale = input_scale * weight_scale # update scale initializer @@ -230,16 +216,14 @@ def convert(self): inputs.append(in_dq3.name) add_nodes.append(in_dq3) # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out', node.input[6], node.input[7]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.input[6], node.input[7]], + node.output, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) diff --git a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py index 410fcf9eb8c..22f8c44928b 100644 --- a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py +++ 
b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py @@ -87,5 +87,5 @@ def cast(self): @qop_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze") class QDirectOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) \ No newline at end of file + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py b/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py index b48204518c4..4045fac9d29 100644 --- a/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py +++ b/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py @@ -70,11 +70,12 @@ def convert(self, convert_format): node.name, **kwargs) self.quantizer.new_nodes.append(qembed_layer_norm_node) self.quantizer.remove_nodes.extend(parents) + self.quantizer.remove_nodes.append(node) @qop_registry(op_types="QEmbedLayerNormalization") class QEmbedLayerNormalizationOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node diff --git a/neural_compressor/adaptor/ox_utils/operators/gather.py b/neural_compressor/adaptor/ox_utils/operators/gather.py index 8d315f07cce..b6adf2ba03e 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gather.py +++ b/neural_compressor/adaptor/ox_utils/operators/gather.py @@ -93,5 +93,5 @@ def convert(self, convert_format): @qop_registry(op_types="Gather") class QGatherOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) diff --git a/neural_compressor/adaptor/ox_utils/operators/gavgpool.py b/neural_compressor/adaptor/ox_utils/operators/gavgpool.py index 96d462a2a35..f5aa36158e7 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gavgpool.py +++ b/neural_compressor/adaptor/ox_utils/operators/gavgpool.py @@ -62,8 +62,8 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearGlobalAveragePool") class QGlobalAveragePoolOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node @@ -78,16 +78,14 @@ def convert(self): inputs = [node.name + '_in_dequant'] add_nodes.append(in_dq) # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out', node.input[3], node.input[4]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + 
out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.input[3], node.input[4]], + node.output, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + kwargs = {} activation_node = onnx.helper.make_node( 'GlobalAveragePool', inputs, diff --git a/neural_compressor/adaptor/ox_utils/operators/gemm.py b/neural_compressor/adaptor/ox_utils/operators/gemm.py index 8688e478e5a..9a46e5078c1 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gemm.py +++ b/neural_compressor/adaptor/ox_utils/operators/gemm.py @@ -95,8 +95,8 @@ def convert(self, convert_format): @qop_registry(op_types="QGemm") class QGemmOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): import numpy as np @@ -117,20 +117,12 @@ def convert(self): [node.name + '_in_dequant1'], node.name + '_in_dequant1') - if len(weight_scale.shape) == 1: - axis = 0 if is_B_transposed(node) else 1 - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - node.input[3:6], - [node.name + '_in_dequant2'], - node.name + '_in_dequant2', - axis=axis) - else: - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - node.input[3:6], - [node.name + '_in_dequant2'], - node.name + '_in_dequant2') + + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') # update scale initializer bias_scale_data = np.asarray(bias_scale, dtype=np.float32).reshape(-1) @@ -152,16 +144,14 @@ def convert(self): add_nodes.extend([in_dq1, in_dq2, in_dq3]) # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out', node.input[6], node.input[7]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.input[6], node.input[7]], + node.output, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) diff --git a/neural_compressor/adaptor/ox_utils/operators/matmul.py b/neural_compressor/adaptor/ox_utils/operators/matmul.py index 822438a4dc2..a531c835f08 100644 --- a/neural_compressor/adaptor/ox_utils/operators/matmul.py +++ b/neural_compressor/adaptor/ox_utils/operators/matmul.py @@ -126,8 +126,8 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearMatMul") class QMatMulOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node @@ -140,41 +140,23 @@ def convert(self): [node.name + '_in_dequant1'], node.name + '_in_dequant1') - weight_scale = onnx.numpy_helper.to_array( - find_by_name(node.input[4], self.initializers)) - if weight_scale is not None and len(weight_scale.shape) == 1: - if 'MatMul' not in self.axis: - from 
neural_compressor.utils import logger - logger.warning("Don't offer the axis of per-channel quantizd MatMul, use default axis=1") - axis = 1 - else: - axis = self.axis['MatMul'] - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - node.input[3:6], - [node.name + '_in_dequant2'], - node.name + '_in_dequant2', - axis=axis) - elif weight_scale is not None: - in_dq2 = onnx.helper.make_node( - 'DequantizeLinear', - node.input[3:6], - [node.name + '_in_dequant2'], - node.name + '_in_dequant2') + in_dq2 = onnx.helper.make_node( + 'DequantizeLinear', + node.input[3:6], + [node.name + '_in_dequant2'], + node.name + '_in_dequant2') inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2'] add_nodes.extend([in_dq1, in_dq2]) # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out', node.input[6], node.input[7]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.input[6], node.input[7]], + node.output, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) diff --git a/neural_compressor/adaptor/ox_utils/operators/maxpool.py b/neural_compressor/adaptor/ox_utils/operators/maxpool.py index 2a521095fda..524d5e1e687 100644 --- a/neural_compressor/adaptor/ox_utils/operators/maxpool.py +++ b/neural_compressor/adaptor/ox_utils/operators/maxpool.py @@ -71,5 +71,5 @@ def convert(self, convert_format): @qop_registry(op_types="MaxPool") class QMaxPoolOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) \ No newline at end of file + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/ops.py b/neural_compressor/adaptor/ox_utils/operators/ops.py index 8f410ab3143..c4876f4b09c 100644 --- a/neural_compressor/adaptor/ox_utils/operators/ops.py +++ b/neural_compressor/adaptor/ox_utils/operators/ops.py @@ -47,10 +47,13 @@ def qop_registry(op_types): def decorator_op(cls): assert cls.__name__.endswith( 'Operator'), "The name of subclass of QOperator should end with \'Operator\' substring." 
- if cls.__name__[:-len('Operator')] in OPERATORS: # pragma: no cover + if cls.__name__[:-len('Operator')] in QOPERATORS: # pragma: no cover raise ValueError('Cannot have two operators with the same name.') for single_op_type in [op_type.strip() for op_type in op_types.split(',')]: - if single_op_type.startswith('QLinear') or single_op_type in ['QGemm']: + if single_op_type.startswith('QLinear') or \ + single_op_type in ['QGemm', 'QAttention', 'QEmbedLayerNormalization', 'ArgMax', + 'Reshape', 'Transpose', 'Squeeze', 'Unsqueeze', 'Gather', + 'MaxPool', 'Pad', 'Resize', 'Split']: QOPERATORS[single_op_type] = cls return cls return decorator_op @@ -106,23 +109,26 @@ def cast(self): # pragma: no cover self.quantizer.dtype_cast(self.node, self.dtype) class QOperator(object): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): + def __init__(self, onnx_node, children, initializers, channel_axis): self.node = onnx_node self.children = children self.initializers = initializers - self.disable_qdq_for_node_output = True if onnx_node.op_type in \ - exclude_output_quantization else False self.axis = channel_axis self.per_channel = False + self.qop_list = ['QGemm', 'QAttention', 'QEmbedLayerNormalization', + 'QLinearLeakyRelu', 'QLinearSigmoid', 'QLinearAdd','QLinearMul', + 'QLinearConcat', 'QLinearConv', 'QLinearGlobalAveragePool', + 'QLinearMatMul', 'QLinearAveragePool'] def convert(self): node = self.node add_nodes = [] inputs = [] inits = [] - # output is not int8 tensor - if all([i.op_type != 'DequantizeLinear' for i in self.children]): - return False, add_nodes + if all([child.op_type not in self.qop_list or \ + child.op_type != 'DequantizeLinear' for child in self.children]): + return False, add_nodes, inits + # input dq for child in self.children: if child.op_type == 'DequantizeLinear': @@ -134,17 +140,15 @@ def convert(self): inputs.append(node.name + '_in_dequant') add_nodes.append(in_dq) break + # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out', in_dq.input[1], in_dq.input[2]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', in_dq.input[1], in_dq.input[2]], + node.output, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) kwargs = {} for attribute in node.attribute: # pragma: no cover diff --git a/neural_compressor/adaptor/ox_utils/operators/pad.py b/neural_compressor/adaptor/ox_utils/operators/pad.py index a2202b5d7fd..df3fe90a474 100644 --- a/neural_compressor/adaptor/ox_utils/operators/pad.py +++ b/neural_compressor/adaptor/ox_utils/operators/pad.py @@ -97,5 +97,5 @@ def convert(self, convert_format): @qop_registry(op_types="Pad") class QPadOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) \ No newline at end of file + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/pooling.py b/neural_compressor/adaptor/ox_utils/operators/pooling.py index 544f7c4da2a..d6c6fe05f15 100644 --- a/neural_compressor/adaptor/ox_utils/operators/pooling.py 
+++ b/neural_compressor/adaptor/ox_utils/operators/pooling.py @@ -83,8 +83,8 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearAveragePool") class QPoolOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node @@ -99,16 +99,14 @@ def convert(self): inputs = [node.name + '_in_dequant'] add_nodes.append(in_dq) # output q - if not self.disable_qdq_for_node_output: - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out', node.input[3], node.input[4]], - node.output, - node.name + '_out_quant') - outputs = [node.name + '_out'] - add_nodes.append(out_q) - else: - outputs = node.output + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out', node.input[3], node.input[4]], + node.output, + node.name + '_out_quant') + outputs = [node.name + '_out'] + add_nodes.append(out_q) + kwargs = {} for attribute in node.attribute: # pragma: no cover kwargs.update(attribute_to_kwarg(attribute)) diff --git a/neural_compressor/adaptor/ox_utils/operators/resize.py b/neural_compressor/adaptor/ox_utils/operators/resize.py index bff3549ff8f..b7846888c0c 100644 --- a/neural_compressor/adaptor/ox_utils/operators/resize.py +++ b/neural_compressor/adaptor/ox_utils/operators/resize.py @@ -72,5 +72,5 @@ def convert(self, convert_format): @qop_registry(op_types="Resize") class QResizeOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) \ No newline at end of file + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/split.py b/neural_compressor/adaptor/ox_utils/operators/split.py index 3e9f62a57ac..ec7df1e1434 100644 --- a/neural_compressor/adaptor/ox_utils/operators/split.py +++ b/neural_compressor/adaptor/ox_utils/operators/split.py @@ -84,17 +84,29 @@ def cast(self): # pragma: no cover @qop_registry(op_types="Split") class QSplitOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis, exclude_output_quantization): - super().__init__(onnx_node, children, initializers, channel_axis, exclude_output_quantization) + def __init__(self, onnx_node, children, initializers, channel_axis): + super().__init__(onnx_node, children, initializers, channel_axis) def convert(self): node = self.node add_nodes = [] inputs = [] inits = [] - - if all([i.op_type != 'DequantizeLinear' for i in self.children]): - return False, add_nodes + + if all([child.op_type not in self.qop_list or \ + child.op_type != 'DequantizeLinear' for child in self.children]): + return False, add_nodes, inits + + outputs = [] + for i, out in enumerate(node.output): + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out_' + str(i), in_dq.input[1], in_dq.input[2]], + [node.output[i]], + node.name + '_out_quant_' + str(i)) + outputs.append([node.name + '_out_quant_' + str(i)]) + add_nodes.append(out_q) + # input dq for child in self.children: if child.op_type == 'DequantizeLinear': @@ -107,21 +119,6 @@ 
def convert(self): add_nodes.append(in_dq) break - # output q - outputs = [] - output_optype_mapping = dict((child.input[0], child.op_type) for child in self.children) - if not self.disable_qdq_for_node_output: - for i, out in enumerate(node.output): - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out_' + str(i), in_dq.input[1], in_dq.input[2]], - [node.output[i]], - node.name + '_out_quant_' + str(i)) - outputs.append([node.name + '_out_quant_' + str(i)]) - add_nodes.append(out_q) - else: - outputs = node.output - outputs = node.output kwargs = {} for attribute in node.attribute: # pragma: no cover diff --git a/neural_compressor/config.py b/neural_compressor/config.py index 8a608a235d9..c0a136a3bf5 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -818,10 +818,8 @@ class ONNXQlinear2QDQConfig: def __init__( self, channel_axis={}, - exclude_output_quantization=[] ): self._channel_axis = channel_axis - self._exclude_output_quantization = exclude_output_quantization @property def channel_axis(self): @@ -831,14 +829,6 @@ def channel_axis(self): def channel_axis(self, channel_axis): self._dtype = channel_axis - @property - def exclude_output_quantization(self): - return self._exclude_output_quantization - - @exclude_output_quantization.setter - def exclude_output_quantization(self, exclude_output_quantization): - self._exclude_output_quantization = exclude_output_quantization - class Torch2ONNXConfig(ExportConfig): def __init__( self, diff --git a/neural_compressor/experimental/export/qlinear2qdq.py b/neural_compressor/experimental/export/qlinear2qdq.py index 43bd6f5cd02..a1f48374895 100644 --- a/neural_compressor/experimental/export/qlinear2qdq.py +++ b/neural_compressor/experimental/export/qlinear2qdq.py @@ -47,18 +47,16 @@ def check_model(model): def onnx_qlinear_to_qdq( model, - output_name_to_node, - channel_axis={}, - exclude_output_quantization=[] + input_name_to_nodes, + channel_axis={} ): """Export FP32 PyTorch model into FP32 ONNX model. Args: model (ModelProto): int8 onnx model. - output_name_to_node (dict): the mapping of tensor name and its destination nodes. + input_name_to_nodes (dict): the mapping of tensor name and its destination nodes. channel_axis (dict, optional): quantization axis of for per-channel quantized optype, the key is optype (str), the value is axis (int). - exclude_output_quantization (list, optional): optypes to exclude output quantization. 
""" from neural_compressor.adaptor.ox_utils.operators.ops import QOPERATORS add_nodes = [] @@ -66,17 +64,16 @@ def onnx_qlinear_to_qdq( inits = [] for node in model.graph.node: if node.op_type in QOPERATORS: - if node.output[0] not in output_name_to_node: + if node.output[0] not in input_name_to_nodes: continue children = [] for out in node.output: - children.append(output_name_to_node[node.output[0]]) + children.extend(input_name_to_nodes[node.output[0]]) converter = QOPERATORS[node.op_type]( node, children, model.graph.initializer, - channel_axis, - exclude_output_quantization) + channel_axis) done, add_node, init = converter.convert() if done: add_nodes.extend(add_node) diff --git a/neural_compressor/model/onnx_model.py b/neural_compressor/model/onnx_model.py index 87e717e6abe..63dd97512aa 100644 --- a/neural_compressor/model/onnx_model.py +++ b/neural_compressor/model/onnx_model.py @@ -431,9 +431,8 @@ def get_nodes_chain(self, start_node, stop_node, result_chain=[]): def export(self, save_path, conf): from neural_compressor.experimental.export import onnx_qlinear_to_qdq add_nodes, remove_nodes, inits = onnx_qlinear_to_qdq(self._model, - self._output_name_to_node, - conf.channel_axis, - conf.exclude_output_quantization) + self._input_name_to_nodes, + conf.channel_axis) self.add_nodes(add_nodes) self.remove_nodes(remove_nodes) self.remove_unused_constant() diff --git a/test/export/test_onnx_qlieanr_to_qdq.py b/test/export/test_onnx_qlieanr_to_qdq.py index 4164b8d17fc..63018f12b4a 100644 --- a/test/export/test_onnx_qlieanr_to_qdq.py +++ b/test/export/test_onnx_qlieanr_to_qdq.py @@ -37,7 +37,7 @@ def build_model(): initializers = [conv1_weight_initializer, conv2_weight_initializer] initializers.append(onnx.numpy_helper.from_array(np.array([88, 11], dtype=np.int64), name='shape')) - initializers.append(onnx.numpy_helper.from_array(np.zeros((1, 3, 15, 15)), name='add_init')) + initializers.append(onnx.numpy_helper.from_array(np.zeros((1, 3, 15, 15), dtype=np.float32), name='add_init')) graph = helper.make_graph([conv1_node, conv2_node, concat_node, avgpool_node, reshape_node, add_node], 'test', [input], [output], initializer=initializers) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) @@ -97,53 +97,7 @@ def dynamic_test(self, model, q_config, quantize_params, quantizable_op_types): quantizable_op_types) quantizer.quantize_model() return Model(model) - - def test_resize(self): - input_tensor = helper.make_tensor_value_info('input', TensorProto.FLOAT, [1, 2, 26, 42]) - - conv_weight_arr = np.random.randint(-1, 2, [3, 2, 3, 3]).astype(np.float32) - conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name='conv1_weight') - conv_node = onnx.helper.make_node('Conv', ['input', 'conv1_weight'], ['conv_output'], name='conv_node') - - initializers = [conv_weight_initializer] - - output_tensor = helper.make_tensor_value_info('output', TensorProto.FLOAT, [1, 3, 48, 80]) - resize_inputs = ['conv_output'] # resize_roi_name, resize_scales_name, resize_sizes_name] - resize_attrs = {'coordinate_transformation_mode': 'asymmetric', 'mode': 'nearest', 'nearest_mode': 'floor'} - resize_node = helper.make_node('Resize', resize_inputs, ['output'], name='resize_node', **resize_attrs) - resize_roi = [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0] - resize_roi_name = 'resize_roi' - resize_roi_initializer = helper.make_tensor(resize_roi_name, TensorProto.FLOAT, [len(resize_roi)], resize_roi) - initializers.extend([resize_roi_initializer]) - 
resize_node.input.extend([resize_roi_name]) - - resize_scales = [1.0, 1.0, 2.0, 2.0] - resize_scales_name = 'resize_scales' - resize_scales_initializer = helper.make_tensor(resize_scales_name, TensorProto.FLOAT, [ - len(resize_scales)], resize_scales) - initializers.extend([resize_scales_initializer]) - resize_node.input.extend([resize_scales_name]) - - graph = helper.make_graph([conv_node, resize_node], 'TestOpQuantizerResize_test_model', - [input_tensor], [output_tensor], initializer=initializers) - model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) - model.ir_version = 7 # use stable onnx ir version - - q_config = {'conv_node': self.static_q_config, - 'resize_node': self.static_q_config} - quantize_params = {'input': [np.float32(10.), np.uint8(0)], - 'conv1_weight': [np.float32(10.), np.uint8(0)], - 'conv_output': [np.float32(10.), np.uint8(0)], - 'output': [np.float32(10.), np.uint8(0)], - } - q_model = self.qlinear_test(model, q_config, quantize_params, ['Resize', 'Conv']) - q_model.export('./test.onnx', self.config) - - model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 10)]) - model.ir_version = 7 # use stable onnx ir version - q_model = self.qlinear_test(model, q_config, quantize_params, ['Resize', 'Conv']) - q_model.export('./test.onnx', self.config) - + def test_argmax(self): input_name = "input" output_name = "output" @@ -200,10 +154,10 @@ def test_argmax(self): model.ir_version = 7 # use stable onnx ir version q_config = {'conv_node': self.static_q_config, 'argmax_node': self.static_q_config} - quantize_params = {'input': [np.float32(10.), np.uint8(0)], - 'conv_weight': [np.float32(10.), np.uint8(0)], - 'conv_output': [np.float32(10.), np.uint8(0)], - 'output': [np.float32(10.), np.uint8(0)], + quantize_params = {'input': [np.uint8(0), np.float32(10.)], + 'conv_weight': [np.uint8(0), np.float32(10.)], + 'conv_output': [np.uint8(0), np.float32(10.)], + 'output': [np.uint8(0), np.float32(10.)], } q_model = self.qlinear_test(model, q_config, quantize_params, ['Conv', 'ArgMax']) q_model.export('./test.onnx', self.config) @@ -248,10 +202,10 @@ def test_gemm(self): model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) model.ir_version = 7 # use stable onnx ir version q_config = {'gemm': self.static_q_config} - quantize_params = {'input': [np.float32(10.), np.uint8(0)], - 'linear1.weight': [np.float32(10.), np.uint8(0)], - 'linear1.bias': [np.float32(10.), np.uint8(0)], - 'output': [np.float32(10.), np.uint8(0)], + quantize_params = {'input': [np.uint8(0), np.float32(10.)], + 'linear1.weight': [np.uint8(0), np.float32(10.)], + 'linear1.bias': [np.uint8(0), np.float32(10.)], + 'output': [np.uint8(0), np.float32(10.)], } q_model = self.qlinear_test(model, q_config, quantize_params, ['Gemm']) q_model.export('./test.onnx', self.config) @@ -288,6 +242,9 @@ def test_embed(self): segment_ids_shape = [1, 4] segment_ids_tensor = helper.make_tensor_value_info('segment_ids', TensorProto.INT32, segment_ids_shape) + mask_shape = [1, 4] + mask_tensor = helper.make_tensor_value_info('mask', TensorProto.INT32, input_ids_shape) + # EmbedLayerNormalization Node Constants and Weights: word_embed_shape = [32, 4] word_embed_weights = np.random.random_sample(word_embed_shape).astype(dtype='float32') @@ -318,7 +275,7 @@ def test_embed(self): # EmbedLayerNormalization Node: embed_layer_norm_inputs = [ - 'input_ids', 'segment_ids', 'word_embed', 'pos_embed', 'seg_embed', 'gamma', 'beta' + 'input_ids', 'segment_ids', 'word_embed', 
'pos_embed', 'seg_embed', 'gamma', 'beta', 'mask' ] embed_layer_norm_outputs = ['layernorm_out', 'mask_index_out'] embed_layer_norm_node = helper.make_node('EmbedLayerNormalization', @@ -330,7 +287,7 @@ def test_embed(self): # Construct the Graph and Model: nodes = [embed_layer_norm_node] graph_name = 'embed_layernorm_graph' - inputs = [input_ids_tensor, segment_ids_tensor] + inputs = [input_ids_tensor, segment_ids_tensor, mask_tensor] outputs = [layernorm_out_tensor, mask_index_out_tensor] initializers = [ word_embed_initializer, pos_embed_initializer, seg_embed_initializer, gamma_initializer, beta_initializer @@ -338,7 +295,7 @@ def test_embed(self): graph = helper.make_graph(nodes, graph_name, inputs, outputs, initializer=initializers) model = helper.make_model(graph, - opset_imports=[helper.make_opsetid("com.microsoft", 14), helper.make_opsetid("ai.onnx", 14)]) + opset_imports=[helper.make_opsetid("com.microsoft", 1), helper.make_opsetid("ai.onnx", 12)]) model.ir_version = 7 # use stable onnx ir version q_config = {'Embed': self.static_q_config} @@ -408,7 +365,7 @@ def test_concat_reshape_pooling(self): q_model.export('./test.onnx', self.config) def test_conv(self): - for op in ['Conv', 'FusedConv']: + for op in ['Conv']: A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 5, 5, 1]) B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 3, 3, 1]) C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 5, 5, 1]) @@ -463,30 +420,15 @@ def test_matmul(self): q_model.export('./test.onnx', self.config) def test_attention(self): - A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 1, 5, 5]) - B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 1, 5, 5]) - C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 1, 5, 5]) - D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 1, 5, 5]) - node = onnx.helper.make_node('Attention', ['A', 'B', 'C'], ['D'], name='Attention') - graph = helper.make_graph([node], 'test_graph_1', [A, B, C], [D]) - model = helper.make_model(graph) - q_config = {"Attention": self.static_q_config} - quantize_params = {"A": [np.uint8(0), np.float32(0.5)], - "B": [np.uint8(0), np.float32(0.5)], - "C": [np.uint8(0), np.float32(0.5)], - "D": [np.uint8(0), np.float32(0.5)]} - quantizable_op_types = ["Attention"] - q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - q_model.export('./test.onnx', self.config) - q_config = {"Attention": self.dynamic_q_config} - q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) - q_model.export('./test.onnx', self.config) - - E = helper.make_tensor_value_info('E', TensorProto.INT32, [1, 1, 5, 5]) - F = helper.make_tensor_value_info('F', TensorProto.FLOAT, [1, 1, 5, 5]) - node = onnx.helper.make_node('Attention', ['A', 'B', 'C', 'F', 'E'], ['D'], name='Attention') - graph = helper.make_graph([node], 'test_graph_1', [A, B, C, F, E], [D]) - model = helper.make_model(graph) + A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 128, 768]) + B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [768, 2304]) + C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [2304]) + D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 128, 768]) + mask = helper.make_tensor_value_info('mask', TensorProto.INT32, [1, 128]) + + node = onnx.helper.make_node('Attention', ['A', 'B', 'C', 'mask'], ['D'], name='Attention', num_heads=1) + graph = helper.make_graph([node], 'test_graph_1', [A, B, C, mask], [D]) + model = 
helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) q_config = {"Attention": self.static_q_config} quantize_params = {"A": [np.uint8(0), np.float32(0.5)], "B": [np.uint8(0), np.float32(0.5)], @@ -496,8 +438,6 @@ def test_attention(self): q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) q_model.export('./test.onnx', self.config) q_config = {"Attention": self.dynamic_q_config} - q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) - q_model.export('./test.onnx', self.config) def test_gather(self): a_value = np.random.randn(100, 4).astype(np.float32) @@ -512,9 +452,9 @@ def test_gather(self): node = onnx.helper.make_node('Gather', ['A', 'B'], ['C'], name='Gather') graph = helper.make_graph([node], 'test_graph_1', [A, B], [C], [A_init, B_init]) model = helper.make_model(graph) - q_config = {'Gather': {"weight":{'dtype': 3, + q_config = {'Gather': {"weight":{'dtype': 2, 'algorithm': 'minmax', - 'scheme':'sym', + 'scheme':'asym', 'granularity': 'per_tensor'}, 'activation':{'dtype': 2, 'algorithm': 'minmax', @@ -555,117 +495,6 @@ def test_gather(self): q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) q_model.export('./test.onnx', self.config) - def test_split(self): - a_value = np.random.randn(100, 4).astype(np.float32) - A_init = helper.make_tensor('A', TensorProto.FLOAT, [100, 4], - a_value.reshape(400).tolist()) - A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [100, 4]) - B = helper.make_tensor_value_info('conv1_output', TensorProto.FLOAT, [50, 4]) - C = helper.make_tensor_value_info('conv2_output', TensorProto.FLOAT, [50, 4]) - - node = onnx.helper.make_node('Split', ['A'], ['B', 'C'], name='Split') - graph = helper.make_graph([node], 'test_graph_1', [A], [B, C], [A_init]) - model = helper.make_model(graph) - q_config = {'Split': {"weight":{'dtype': 3, - 'algorithm': 'minmax', - 'scheme':'sym', - 'granularity': 'per_tensor'}, - 'activation':{'dtype': 2, - 'algorithm': 'minmax', - 'scheme':'asym', - 'granularity':'per_tensor', - 'quant_mode': 'static'} - }, - } - quantize_params = {"A": [np.uint8(10.), np.float32(0)], - "B": [np.uint8(10.), np.float32(0)], - "C": [np.uint8(10.), np.float32(0)], - } - quantizable_op_types = ["Split"] - q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - q_model.export('./test.onnx', self.config) - - def test_pad(self): - b_value = np.array([0, 1, 1, 0, 1, 1]).astype(np.int64) - B_init = helper.make_tensor('B', TensorProto.INT64, [6], - b_value.reshape(6).tolist()) - B = helper.make_tensor_value_info('B', TensorProto.INT64, [6]) - C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 7, 7]) - - d_value = np.random.randn(1).astype(np.float32) - D_init = helper.make_tensor('D', TensorProto.FLOAT, [1], - d_value.reshape(1).tolist()) - D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1]) - - e_value = np.random.randn(1, 5, 5).astype(np.float32) - E_init = helper.make_tensor('E', TensorProto.FLOAT, [1, 1, 5, 5], - e_value.reshape(25).tolist()) - E = helper.make_tensor_value_info('E', TensorProto.FLOAT, [1, 1, 5, 5]) - f_value = np.random.randn(1, 3, 3).astype(np.float32) - F_init = helper.make_tensor('F', TensorProto.FLOAT, [1, 1, 3, 3], - f_value.reshape(9).tolist()) - F = helper.make_tensor_value_info('F', TensorProto.FLOAT, [1, 1, 3, 3]) - for mode in ["constant", "edge", "reflect", "constant_value", "constant_value_wo_init"]: - conv_node = onnx.helper.make_node('Conv', ['E', 'F'], 
['A'], - name='Conv', - kernel=[3, 3], - padding=[1, 1, 1, 1]) - if mode == "constant_value": - node = onnx.helper.make_node('Pad', ['A', 'B', 'D'], ['C'], name='Pad', mode="constant") - graph = helper.make_graph([conv_node, node], 'test_graph_1', [E, F, B, D], [C], [E_init, F_init, B_init, D_init]) - elif mode == "constant_value_wo_init": - node = onnx.helper.make_node('Pad', ['A', 'B', 'D'], ['C'], name='Pad', mode="constant") - graph = helper.make_graph([conv_node, node], 'test_graph_1', [E, F, B, D], [C], [E_init, F_init, B_init]) - else: - node = onnx.helper.make_node('Pad', ['A', 'B'], ['C'], name='Pad', mode=mode) - graph = helper.make_graph([conv_node, node], 'test_graph_1', [E, F, B], [C], [E_init, F_init, B_init]) - model = helper.make_model(graph) - pad_config = {"weight":{'dtype': 3, - 'algorithm': 'minmax', - 'scheme':'sym', - 'granularity': 'per_tensor'}, - 'activation':{'dtype': 2, - 'algorithm': 'minmax', - 'scheme':'asym', - 'granularity':'per_tensor', - 'quant_mode': 'static'}} - conv_config = {"weight":{'dtype': 3, - 'algorithm': 'minmax', - 'scheme':'sym', - 'granularity': 'per_channel'}, - 'activation':{'dtype': 2, - 'algorithm': 'minmax', - 'scheme':'asym', - 'granularity':'per_tensor', - 'quant_mode': 'static'}} - q_config = {'Conv': conv_config, - 'Pad': pad_config} - quantize_params = {"A": [np.uint8(10.), np.float32(1)], - "C": [np.uint8(10.), np.float32(1)], - "D": [np.uint8(10.), np.float32(1)], - "E": [np.uint8(10.), np.float32(1)], - "F": [np.uint8(10.), np.float32(1)]} - quantizable_op_types = ["Conv", "Pad"] - q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - q_model.export('./test.onnx', self.config) - options.onnxrt.qdq_setting.AddQDQPairToWeight = True - options.onnxrt.qdq_setting.AddQDQPairToWeight = False - - node = onnx.helper.make_node('Pad', ['E', 'B', 'D'], ['C'], name='Pad', mode="constant") - graph = helper.make_graph([node], 'test_graph_1', [E, B, D], [C], [E_init, B_init, D_init]) - model = helper.make_model(graph) - q_config = {'Pad': {'activation':{'dtype': 2, - 'algorithm': 'minmax', - 'scheme':'asym', - 'granularity':'per_tensor', - 'quant_mode': 'static'} - }} - quantize_params = {"C": [np.uint8(10.), np.float32(0)], - "E": [np.uint8(10.), np.float32(0)]} - quantizable_op_types = ["Pad"] - q_model = self.qlinear_test(model, pad_config, quantize_params, quantizable_op_types) - q_model.export('./test.onnx', self.config) - def test_binary(self): for op in ['Mul', 'Add']: A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 10]) @@ -685,13 +514,24 @@ def test_binary(self): q_model.export('./test.onnx', self.config) def test_activation(self): + config = {"weight":{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity': 'per_tensor'}, + 'activation':{'dtype': 2, + 'algorithm': 'minmax', + 'scheme':'asym', + 'granularity':'per_tensor', + 'quant_mode': 'static'} + } + for op in ["Relu", "LeakyRelu", "Sigmoid"]: B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 10]) A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 10]) node = onnx.helper.make_node(op, ['A'], ['B'], name=op) graph = helper.make_graph([node], 'test_graph_1', [A], [B]) model = helper.make_model(graph) - q_config = {op: self.static_q_config} + q_config = {op: config} quantize_params = {"A": [np.uint8(10.), np.float32(0)], "B": [np.uint8(10.), np.float32(0)]} quantizable_op_types = [op] @@ -705,8 +545,6 @@ def test_activation(self): model = helper.make_model(graph) q_model = self.qlinear_test(model, 
q_config, quantize_params, quantizable_op_types) q_model.export('./test.onnx', self.config) - q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) - q_model.export('./test.onnx', self.config) def test_pooling(self): op = "MaxPool" @@ -735,7 +573,7 @@ def test_pooling(self): name='Conv', kernel_shape=[3, 3], pads=[1, 1, 1, 1]) - pool_node = onnx.helper.make_node(op, ['C'], ['D'], name=op) + pool_node = onnx.helper.make_node(op, ['C'], ['D'], name=op, kernel_shape=[1, 1]) graph = helper.make_graph([conv_node, pool_node], 'test_graph_1', [A, B], [D]) model = helper.make_model(graph) @@ -752,9 +590,7 @@ def test_pooling(self): B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 5, 1, 1]) A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 5, 5, 1]) node = onnx.helper.make_node(op, ['A'], ['B'], - name=op, - kernel_shape=[3, 3], - pads=[1, 1, 1, 1]) + name=op) graph = helper.make_graph([node], 'test_graph_1', [A], [B]) q_config = {op: self.static_q_config} quantize_params = {"A": [np.uint8(10.), np.float32(0)], @@ -791,12 +627,12 @@ def test_pooling(self): def test_exclude_node(self): A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 5, 5, 1]) B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [3, 3, 1, 1]) - D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 1, 3, 3]) + D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 3, 5, 1]) conv_node = onnx.helper.make_node('Conv', ['A', 'B'], ['C'], name='Conv', kernel_shape=[3, 3], pads=[1, 1, 1, 1]) - pool_node = onnx.helper.make_node("MaxPool", ['C'], ['D'], name="MaxPool") + pool_node = onnx.helper.make_node("MaxPool", ['C'], ['D'], name="MaxPool", kernel_shape=[1, 1]) graph = helper.make_graph([conv_node, pool_node], 'test_graph_1', [A, B], [D]) model = helper.make_model(graph) @@ -806,6 +642,7 @@ def test_exclude_node(self): "C": [np.uint8(10.), np.float32(0)], "D": [np.uint8(10.), np.float32(0)]} quantizable_op_types = ["Conv", "MaxPool"] + self.config.exclude_output_quantization = ['Conv'] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) q_model.export('./test.onnx', self.config) From 118ad1ec964bb49a8a1bab6b33c31f0dd7e8475b Mon Sep 17 00:00:00 2001 From: mengniwa Date: Fri, 9 Dec 2022 21:27:01 +0800 Subject: [PATCH 06/12] fix bug Signed-off-by: mengniwa --- .../adaptor/ox_utils/operators/split.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/operators/split.py b/neural_compressor/adaptor/ox_utils/operators/split.py index ec7df1e1434..48d1f918fd3 100644 --- a/neural_compressor/adaptor/ox_utils/operators/split.py +++ b/neural_compressor/adaptor/ox_utils/operators/split.py @@ -96,16 +96,6 @@ def convert(self): if all([child.op_type not in self.qop_list or \ child.op_type != 'DequantizeLinear' for child in self.children]): return False, add_nodes, inits - - outputs = [] - for i, out in enumerate(node.output): - out_q = onnx.helper.make_node( - 'QuantizeLinear', - [node.name + '_out_' + str(i), in_dq.input[1], in_dq.input[2]], - [node.output[i]], - node.name + '_out_quant_' + str(i)) - outputs.append([node.name + '_out_quant_' + str(i)]) - add_nodes.append(out_q) # input dq for child in self.children: @@ -119,6 +109,16 @@ def convert(self): add_nodes.append(in_dq) break + outputs = [] + for i, out in enumerate(node.output): + out_q = onnx.helper.make_node( + 'QuantizeLinear', + [node.name + '_out_' + str(i), in_dq.input[1], in_dq.input[2]], + [node.output[i]], + 
node.name + '_out_quant_' + str(i)) + outputs.append([node.name + '_out_quant_' + str(i)]) + add_nodes.append(out_q) + outputs = node.output kwargs = {} for attribute in node.attribute: # pragma: no cover From f82c9be39131e169a1b2be22bdc4a44423f5f828 Mon Sep 17 00:00:00 2001 From: mengniwa Date: Fri, 9 Dec 2022 21:56:01 +0800 Subject: [PATCH 07/12] remove unused code Signed-off-by: mengniwa --- .../adaptor/ox_utils/operators/activation.py | 4 ++-- .../adaptor/ox_utils/operators/argmax.py | 5 +++-- .../adaptor/ox_utils/operators/attention.py | 4 ++-- .../adaptor/ox_utils/operators/binary_op.py | 4 ++-- .../adaptor/ox_utils/operators/concat.py | 4 ++-- .../adaptor/ox_utils/operators/conv.py | 4 ++-- .../adaptor/ox_utils/operators/direct_q8.py | 4 ++-- .../adaptor/ox_utils/operators/embed_layernorm.py | 4 ++-- .../adaptor/ox_utils/operators/gather.py | 4 ++-- .../adaptor/ox_utils/operators/gavgpool.py | 4 ++-- .../adaptor/ox_utils/operators/gemm.py | 4 ++-- .../adaptor/ox_utils/operators/matmul.py | 4 ++-- .../adaptor/ox_utils/operators/maxpool.py | 4 ++-- .../adaptor/ox_utils/operators/ops.py | 4 +--- .../adaptor/ox_utils/operators/pad.py | 4 ++-- .../adaptor/ox_utils/operators/pooling.py | 4 ++-- .../adaptor/ox_utils/operators/resize.py | 4 ++-- .../adaptor/ox_utils/operators/split.py | 4 ++-- neural_compressor/config.py | 15 ++------------- .../experimental/export/qlinear2qdq.py | 4 +--- neural_compressor/model/onnx_model.py | 3 +-- 21 files changed, 40 insertions(+), 55 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/operators/activation.py b/neural_compressor/adaptor/ox_utils/operators/activation.py index e9481c8a891..cf677e61881 100644 --- a/neural_compressor/adaptor/ox_utils/operators/activation.py +++ b/neural_compressor/adaptor/ox_utils/operators/activation.py @@ -91,8 +91,8 @@ def quantize(self): @qop_registry(op_types="QLinearLeakyRelu, QLinearSigmoid") class QActivationOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) def convert(self): node = self.node diff --git a/neural_compressor/adaptor/ox_utils/operators/argmax.py b/neural_compressor/adaptor/ox_utils/operators/argmax.py index d11128fec94..17bfaadbffa 100644 --- a/neural_compressor/adaptor/ox_utils/operators/argmax.py +++ b/neural_compressor/adaptor/ox_utils/operators/argmax.py @@ -35,9 +35,10 @@ def convert(self, convert_format): origin_name = node.input[0].split('_argmax_node')[0] if origin_name in self.quantizer.quantized_value_map: + #node.input[0] = self.quantizer.quantized_value_map[origin_name].q_name node.name = node.name + '_quant' @qop_registry(op_types="ArgMax") class QArgMaxOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/attention.py b/neural_compressor/adaptor/ox_utils/operators/attention.py index d94250183fe..26030e9284a 100644 --- a/neural_compressor/adaptor/ox_utils/operators/attention.py +++ b/neural_compressor/adaptor/ox_utils/operators/attention.py @@ -77,8 +77,8 @@ def convert(self, convert_format): @qop_registry(op_types="QAttention") class QAttentionOperator(QOperator): - 
def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) def convert(self): node = self.node diff --git a/neural_compressor/adaptor/ox_utils/operators/binary_op.py b/neural_compressor/adaptor/ox_utils/operators/binary_op.py index 9419cc6b201..72c92da3dcf 100644 --- a/neural_compressor/adaptor/ox_utils/operators/binary_op.py +++ b/neural_compressor/adaptor/ox_utils/operators/binary_op.py @@ -81,8 +81,8 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearAdd, QLinearMul") class QBinaryOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) def convert(self): node = self.node diff --git a/neural_compressor/adaptor/ox_utils/operators/concat.py b/neural_compressor/adaptor/ox_utils/operators/concat.py index fd68f95538c..eb85155421c 100644 --- a/neural_compressor/adaptor/ox_utils/operators/concat.py +++ b/neural_compressor/adaptor/ox_utils/operators/concat.py @@ -99,8 +99,8 @@ def cast(self): # pragma: no cover @qop_registry(op_types="QLinearConcat") class QConcatOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) def convert(self): node = self.node diff --git a/neural_compressor/adaptor/ox_utils/operators/conv.py b/neural_compressor/adaptor/ox_utils/operators/conv.py index a8de96deb88..7f95d548b2a 100644 --- a/neural_compressor/adaptor/ox_utils/operators/conv.py +++ b/neural_compressor/adaptor/ox_utils/operators/conv.py @@ -167,8 +167,8 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearConv") class QConvOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) def convert(self): node = self.node diff --git a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py index 22f8c44928b..a227687a985 100644 --- a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py +++ b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py @@ -87,5 +87,5 @@ def cast(self): @qop_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze") class QDirectOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) \ No newline at end of file + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) \ No newline at end of file diff --git a/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py b/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py index 4045fac9d29..91310f9e15d 100644 --- a/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py +++ b/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py @@ -74,8 +74,8 @@ def convert(self, convert_format): @qop_registry(op_types="QEmbedLayerNormalization") 
class QEmbedLayerNormalizationOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) def convert(self): node = self.node diff --git a/neural_compressor/adaptor/ox_utils/operators/gather.py b/neural_compressor/adaptor/ox_utils/operators/gather.py index b6adf2ba03e..7c3c6285b45 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gather.py +++ b/neural_compressor/adaptor/ox_utils/operators/gather.py @@ -93,5 +93,5 @@ def convert(self, convert_format): @qop_registry(op_types="Gather") class QGatherOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) diff --git a/neural_compressor/adaptor/ox_utils/operators/gavgpool.py b/neural_compressor/adaptor/ox_utils/operators/gavgpool.py index f5aa36158e7..eec48e6af19 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gavgpool.py +++ b/neural_compressor/adaptor/ox_utils/operators/gavgpool.py @@ -62,8 +62,8 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearGlobalAveragePool") class QGlobalAveragePoolOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) def convert(self): node = self.node diff --git a/neural_compressor/adaptor/ox_utils/operators/gemm.py b/neural_compressor/adaptor/ox_utils/operators/gemm.py index 9a46e5078c1..49f8eeaa6c7 100644 --- a/neural_compressor/adaptor/ox_utils/operators/gemm.py +++ b/neural_compressor/adaptor/ox_utils/operators/gemm.py @@ -95,8 +95,8 @@ def convert(self, convert_format): @qop_registry(op_types="QGemm") class QGemmOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) def convert(self): import numpy as np diff --git a/neural_compressor/adaptor/ox_utils/operators/matmul.py b/neural_compressor/adaptor/ox_utils/operators/matmul.py index a531c835f08..6ed795f869f 100644 --- a/neural_compressor/adaptor/ox_utils/operators/matmul.py +++ b/neural_compressor/adaptor/ox_utils/operators/matmul.py @@ -126,8 +126,8 @@ def convert(self, convert_format): @qop_registry(op_types="QLinearMatMul") class QMatMulOperator(QOperator): - def __init__(self, onnx_node, children, initializers, channel_axis): - super().__init__(onnx_node, children, initializers, channel_axis) + def __init__(self, onnx_node, children, initializers): + super().__init__(onnx_node, children, initializers) def convert(self): node = self.node diff --git a/neural_compressor/adaptor/ox_utils/operators/maxpool.py b/neural_compressor/adaptor/ox_utils/operators/maxpool.py index 524d5e1e687..3180a6a49f1 100644 --- a/neural_compressor/adaptor/ox_utils/operators/maxpool.py +++ b/neural_compressor/adaptor/ox_utils/operators/maxpool.py @@ -71,5 +71,5 @@ def convert(self, convert_format): @qop_registry(op_types="MaxPool") class QMaxPoolOperator(QOperator): - def __init__(self, onnx_node, children, 
-        super().__init__(onnx_node, children, initializers, channel_axis)
\ No newline at end of file
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
\ No newline at end of file
diff --git a/neural_compressor/adaptor/ox_utils/operators/ops.py b/neural_compressor/adaptor/ox_utils/operators/ops.py
index c4876f4b09c..ad6237b2d41 100644
--- a/neural_compressor/adaptor/ox_utils/operators/ops.py
+++ b/neural_compressor/adaptor/ox_utils/operators/ops.py
@@ -109,12 +109,10 @@ def cast(self): # pragma: no cover
         self.quantizer.dtype_cast(self.node, self.dtype)
 class QOperator(object):
-    def __init__(self, onnx_node, children, initializers, channel_axis):
+    def __init__(self, onnx_node, children, initializers):
         self.node = onnx_node
         self.children = children
         self.initializers = initializers
-        self.axis = channel_axis
-        self.per_channel = False
         self.qop_list = ['QGemm', 'QAttention', 'QEmbedLayerNormalization',
                          'QLinearLeakyRelu', 'QLinearSigmoid',
                          'QLinearAdd','QLinearMul', 'QLinearConcat',
                          'QLinearConv', 'QLinearGlobalAveragePool',
diff --git a/neural_compressor/adaptor/ox_utils/operators/pad.py b/neural_compressor/adaptor/ox_utils/operators/pad.py
index df3fe90a474..00bb38a3bbd 100644
--- a/neural_compressor/adaptor/ox_utils/operators/pad.py
+++ b/neural_compressor/adaptor/ox_utils/operators/pad.py
@@ -97,5 +97,5 @@ def convert(self, convert_format):
 @qop_registry(op_types="Pad")
 class QPadOperator(QOperator):
-    def __init__(self, onnx_node, children, initializers, channel_axis):
-        super().__init__(onnx_node, children, initializers, channel_axis)
\ No newline at end of file
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
\ No newline at end of file
diff --git a/neural_compressor/adaptor/ox_utils/operators/pooling.py b/neural_compressor/adaptor/ox_utils/operators/pooling.py
index d6c6fe05f15..a794dec7018 100644
--- a/neural_compressor/adaptor/ox_utils/operators/pooling.py
+++ b/neural_compressor/adaptor/ox_utils/operators/pooling.py
@@ -83,8 +83,8 @@ def convert(self, convert_format):
 @qop_registry(op_types="QLinearAveragePool")
 class QPoolOperator(QOperator):
-    def __init__(self, onnx_node, children, initializers, channel_axis):
-        super().__init__(onnx_node, children, initializers, channel_axis)
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
     def convert(self):
         node = self.node
diff --git a/neural_compressor/adaptor/ox_utils/operators/resize.py b/neural_compressor/adaptor/ox_utils/operators/resize.py
index b7846888c0c..7d266c7a5a5 100644
--- a/neural_compressor/adaptor/ox_utils/operators/resize.py
+++ b/neural_compressor/adaptor/ox_utils/operators/resize.py
@@ -72,5 +72,5 @@ def convert(self, convert_format):
 @qop_registry(op_types="Resize")
 class QResizeOperator(QOperator):
-    def __init__(self, onnx_node, children, initializers, channel_axis):
-        super().__init__(onnx_node, children, initializers, channel_axis)
\ No newline at end of file
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
\ No newline at end of file
diff --git a/neural_compressor/adaptor/ox_utils/operators/split.py b/neural_compressor/adaptor/ox_utils/operators/split.py
index 48d1f918fd3..d022fd3d4c1 100644
--- a/neural_compressor/adaptor/ox_utils/operators/split.py
+++ b/neural_compressor/adaptor/ox_utils/operators/split.py
@@ -84,8 +84,8 @@ def cast(self): # pragma: no cover
 @qop_registry(op_types="Split")
 class QSplitOperator(QOperator):
-    def __init__(self, onnx_node, children, initializers, channel_axis):
-        super().__init__(onnx_node, children, initializers, channel_axis)
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
     def convert(self):
         node = self.node
diff --git a/neural_compressor/config.py b/neural_compressor/config.py
index c0a136a3bf5..77a86e1486d 100644
--- a/neural_compressor/config.py
+++ b/neural_compressor/config.py
@@ -815,19 +815,8 @@ def dynamic_axes(self, dynamic_axes):
         self._dynamic_axes = dynamic_axes
 class ONNXQlinear2QDQConfig:
-    def __init__(
-        self,
-        channel_axis={},
-    ):
-        self._channel_axis = channel_axis
-
-    @property
-    def channel_axis(self):
-        return self._channel_axis
-
-    @channel_axis.setter
-    def channel_axis(self, channel_axis):
-        self._dtype = channel_axis
+    def __init__(self):
+        pass
 class Torch2ONNXConfig(ExportConfig):
     def __init__(
diff --git a/neural_compressor/experimental/export/qlinear2qdq.py b/neural_compressor/experimental/export/qlinear2qdq.py
index a1f48374895..e8e5227285f 100644
--- a/neural_compressor/experimental/export/qlinear2qdq.py
+++ b/neural_compressor/experimental/export/qlinear2qdq.py
@@ -48,7 +48,6 @@ def check_model(model):
 def onnx_qlinear_to_qdq(
     model,
     input_name_to_nodes,
-    channel_axis={}
 ):
     """Export FP32 PyTorch model into FP32 ONNX model.
@@ -72,8 +71,7 @@ def onnx_qlinear_to_qdq(
             converter = QOPERATORS[node.op_type](
                 node,
                 children,
-                model.graph.initializer,
-                channel_axis)
+                model.graph.initializer)
             done, add_node, init = converter.convert()
             if done:
                 add_nodes.extend(add_node)
diff --git a/neural_compressor/model/onnx_model.py b/neural_compressor/model/onnx_model.py
index 63dd97512aa..7bfdd9a1c38 100644
--- a/neural_compressor/model/onnx_model.py
+++ b/neural_compressor/model/onnx_model.py
@@ -431,8 +431,7 @@ def get_nodes_chain(self, start_node, stop_node, result_chain=[]):
     def export(self, save_path, conf):
         from neural_compressor.experimental.export import onnx_qlinear_to_qdq
         add_nodes, remove_nodes, inits = onnx_qlinear_to_qdq(self._model,
-                                                             self._input_name_to_nodes,
-                                                             conf.channel_axis)
+                                                             self._input_name_to_nodes)
         self.add_nodes(add_nodes)
         self.remove_nodes(remove_nodes)
         self.remove_unused_constant()

From 1bd855e5e71ebe355744edf932f814b6310d2dd7 Mon Sep 17 00:00:00 2001
From: mengniwa
Date: Sat, 10 Dec 2022 10:23:36 +0800
Subject: [PATCH 08/12] update comments

Signed-off-by: mengniwa
---
 neural_compressor/adaptor/ox_utils/operators/argmax.py | 3 +--
 neural_compressor/experimental/export/qlinear2qdq.py   | 8 +-------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/neural_compressor/adaptor/ox_utils/operators/argmax.py b/neural_compressor/adaptor/ox_utils/operators/argmax.py
index 17bfaadbffa..92a05498c54 100644
--- a/neural_compressor/adaptor/ox_utils/operators/argmax.py
+++ b/neural_compressor/adaptor/ox_utils/operators/argmax.py
@@ -35,10 +35,9 @@ def convert(self, convert_format):
         origin_name = node.input[0].split('_argmax_node')[0]
         if origin_name in self.quantizer.quantized_value_map:
-            #node.input[0] = self.quantizer.quantized_value_map[origin_name].q_name
             node.name = node.name + '_quant'
 @qop_registry(op_types="ArgMax")
 class QArgMaxOperator(QOperator):
     def __init__(self, onnx_node, children, initializers):
-        super().__init__(onnx_node, children, initializers)
\ No newline at end of file
+        super().__init__(onnx_node, children, initializers)
diff --git a/neural_compressor/experimental/export/qlinear2qdq.py b/neural_compressor/experimental/export/qlinear2qdq.py
index e8e5227285f..644715f50dc 100644
--- a/neural_compressor/experimental/export/qlinear2qdq.py
+++ b/neural_compressor/experimental/export/qlinear2qdq.py
@@ -18,12 +18,6 @@
 """Helper functions to export onnx model from QLinearops to QDQ."""
 from neural_compressor.utils import logger
-from neural_compressor.utils.utility import LazyImport
-
-torch = LazyImport('torch')
-onnx = LazyImport('onnx')
-ort = LazyImport('onnxruntime')
-ortq = LazyImport('onnxruntime.quantization')
 def check_model(model):
     """Check optype for input model.
@@ -49,7 +43,7 @@ def onnx_qlinear_to_qdq(
     model,
     input_name_to_nodes,
 ):
-    """Export FP32 PyTorch model into FP32 ONNX model.
+    """Export ONNX QLinearops model into QDQ model.
     Args:
         model (ModelProto): int8 onnx model.

From bd1e8d905ae0b0c29709db1e4197b49bbc53e85d Mon Sep 17 00:00:00 2001
From: mengniwa
Date: Sat, 10 Dec 2022 10:43:18 +0800
Subject: [PATCH 09/12] fix model check

Signed-off-by: mengniwa
---
 .../experimental/export/qlinear2qdq.py | 38 ++++++++++---------
 neural_compressor/model/onnx_model.py  |  3 +-
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/neural_compressor/experimental/export/qlinear2qdq.py b/neural_compressor/experimental/export/qlinear2qdq.py
index 644715f50dc..0eebadaaa6d 100644
--- a/neural_compressor/experimental/export/qlinear2qdq.py
+++ b/neural_compressor/experimental/export/qlinear2qdq.py
@@ -27,7 +27,7 @@ def check_model(model):
     """
     has_integerop = False
     has_qlinearop = False
-    for node in model.nodes():
+    for node in model.graph.node:
         if node.op_type.endswith('Integer'):
             has_integerop = True
         elif node.op_type.startswith('QLinear'):
             has_qlinearop = True
@@ -38,6 +38,9 @@ def check_model(model):
         logger.info("This model has Integer ops, these ops will be skipped.")
     if has_qlinearop:
         return True
+    else:
+        logger.info("This model has no QLinear ops, save the original model.")
+        return False
 def onnx_qlinear_to_qdq(
     model,
@@ -55,20 +58,21 @@ def onnx_qlinear_to_qdq(
     add_nodes = []
     remove_nodes = []
     inits = []
-    for node in model.graph.node:
-        if node.op_type in QOPERATORS:
-            if node.output[0] not in input_name_to_nodes:
-                continue
-            children = []
-            for out in node.output:
-                children.extend(input_name_to_nodes[node.output[0]])
-            converter = QOPERATORS[node.op_type](
-                node,
-                children,
-                model.graph.initializer)
-            done, add_node, init = converter.convert()
-            if done:
-                add_nodes.extend(add_node)
-                inits.extend(init)
-                remove_nodes.append(node)
+    if check_model(model):
+        for node in model.graph.node:
+            if node.op_type in QOPERATORS:
+                if node.output[0] not in input_name_to_nodes:
+                    continue
+                children = []
+                for out in node.output:
+                    children.extend(input_name_to_nodes[node.output[0]])
+                converter = QOPERATORS[node.op_type](
+                    node,
+                    children,
+                    model.graph.initializer)
+                done, add_node, init = converter.convert()
+                if done:
+                    add_nodes.extend(add_node)
+                    inits.extend(init)
+                    remove_nodes.append(node)
     return add_nodes, remove_nodes, inits
diff --git a/neural_compressor/model/onnx_model.py b/neural_compressor/model/onnx_model.py
index 7bfdd9a1c38..74f4a004f58 100644
--- a/neural_compressor/model/onnx_model.py
+++ b/neural_compressor/model/onnx_model.py
@@ -434,7 +434,8 @@ def export(self, save_path, conf):
                                                              self._input_name_to_nodes)
         self.add_nodes(add_nodes)
         self.remove_nodes(remove_nodes)
-        self.remove_unused_constant()
         self.add_initializers(inits)
+        self.update()
+        self.remove_unused_constant()
         self.topological_sort()
         self.save(save_path)

From c7562a616e95019357e33b60299c8c1d644f7e5d Mon Sep 17 00:00:00 2001
From: mengniwa
Date: Mon, 12 Dec 2022 12:43:59 +0800
Subject: [PATCH 10/12] fix code

Signed-off-by: mengniwa
---
 .../adaptor/ox_utils/operators/__init__.py  |  4 ++--
 .../adaptor/ox_utils/operators/direct_q8.py |  1 -
 .../experimental/export/qlinear2qdq.py      | 14 ++++++++++---
 neural_compressor/model/onnx_model.py       | 20 ++++++++++---------
 4 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/neural_compressor/adaptor/ox_utils/operators/__init__.py b/neural_compressor/adaptor/ox_utils/operators/__init__.py
index da48d428ac4..7b17ff45b5b 100644
--- a/neural_compressor/adaptor/ox_utils/operators/__init__.py
+++ b/neural_compressor/adaptor/ox_utils/operators/__init__.py
@@ -18,7 +18,7 @@
 from os.path import dirname, basename, isfile, join
 import glob
-from .ops import OPERATORS
+from .ops import OPERATORS, QOPERATORS
 modules = glob.glob(join(dirname(__file__), "*.py"))
@@ -26,4 +26,4 @@
     if isfile(f) and not f.startswith('__') and not f.endswith('__init__.py'):
         __import__(basename(f)[:-3], globals(), locals(), level=1)
-__all__ = ["OPERATORS"]
\ No newline at end of file
+__all__ = ["OPERATORS", "QOPERATORS"]
diff --git a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py
index a227687a985..08a6e5a326b 100644
--- a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py
+++ b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py
@@ -17,7 +17,6 @@
 #
 from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, qop_registry, QOperator
-from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain
 @op_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze")
 class Direct8BitOperator(Operator):
diff --git a/neural_compressor/experimental/export/qlinear2qdq.py b/neural_compressor/experimental/export/qlinear2qdq.py
index 644715f50dc..688c8ee344f 100644
--- a/neural_compressor/experimental/export/qlinear2qdq.py
+++ b/neural_compressor/experimental/export/qlinear2qdq.py
@@ -18,6 +18,10 @@
 """Helper functions to export onnx model from QLinearops to QDQ."""
 from neural_compressor.utils import logger
+from neural_compressor.adaptor.ox_utils.util import find_by_name
+from neural_compressor.utils.utility import LazyImport
+
+numpy_helper = LazyImport('onnx.numpy_helper')
 def check_model(model):
     """Check optype for input model.
     Args:
         model (ModelProto): onnx model.
     """
+
     has_integerop = False
     has_qlinearop = False
     for node in model.graph.node:
@@ -34,6 +39,11 @@ def check_model(model):
             has_qlinearop = True
         elif node.op_type in ['QAttention', 'QGemm', 'QEmbedLayerNormalization']:
             has_qlinearop = True
+        elif node.op_type in ['Gather']:
+            input_data = find_by_name(node.input[0], model.graph.initializer)
+            if input_data is not None and \
+                numpy_helper.to_array(input_data).dtype in ['int8', 'uint8']:
+                has_qlinearop = True
     if has_integerop:
         logger.info("This model has Integer ops, these ops will be skipped.")
     if has_qlinearop:
         return True
@@ -51,10 +61,8 @@ def onnx_qlinear_to_qdq(
     Args:
         model (ModelProto): int8 onnx model.
         input_name_to_nodes (dict): the mapping of tensor name and its destination nodes.
-        channel_axis (dict, optional): quantization axis of for per-channel quantized optype,
-            the key is optype (str), the value is axis (int).
""" - from neural_compressor.adaptor.ox_utils.operators.ops import QOPERATORS + from neural_compressor.adaptor.ox_utils.operators import QOPERATORS add_nodes = [] remove_nodes = [] inits = [] diff --git a/neural_compressor/model/onnx_model.py b/neural_compressor/model/onnx_model.py index 74f4a004f58..7b95aa9628a 100644 --- a/neural_compressor/model/onnx_model.py +++ b/neural_compressor/model/onnx_model.py @@ -430,12 +430,14 @@ def get_nodes_chain(self, start_node, stop_node, result_chain=[]): def export(self, save_path, conf): from neural_compressor.experimental.export import onnx_qlinear_to_qdq - add_nodes, remove_nodes, inits = onnx_qlinear_to_qdq(self._model, - self._input_name_to_nodes) - self.add_nodes(add_nodes) - self.remove_nodes(remove_nodes) - self.add_initializers(inits) - self.update() - self.remove_unused_constant() - self.topological_sort() - self.save(save_path) + from neural_compressor.config import ONNXQlinear2QDQConfig + if isinstance(conf, ONNXQlinear2QDQConfig): + add_nodes, remove_nodes, inits = onnx_qlinear_to_qdq(self._model, + self._input_name_to_nodes) + self.add_nodes(add_nodes) + self.remove_nodes(remove_nodes) + self.add_initializers(inits) + self.update() + self.remove_unused_constant() + self.topological_sort() + self.save(save_path) From fefe1244fc2da78d89a0a184c9a1a5ca9433eef8 Mon Sep 17 00:00:00 2001 From: mengniwa Date: Mon, 12 Dec 2022 13:29:55 +0800 Subject: [PATCH 11/12] fix bug Signed-off-by: mengniwa --- neural_compressor/adaptor/ox_utils/operators/argmax.py | 2 -- neural_compressor/adaptor/ox_utils/operators/matmul.py | 2 +- neural_compressor/experimental/export/qlinear2qdq.py | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/operators/argmax.py b/neural_compressor/adaptor/ox_utils/operators/argmax.py index 92a05498c54..65daf5b5523 100644 --- a/neural_compressor/adaptor/ox_utils/operators/argmax.py +++ b/neural_compressor/adaptor/ox_utils/operators/argmax.py @@ -15,9 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import onnx from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry -from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg @op_registry(op_types="ArgMax") class ArgMaxOperator(Operator): diff --git a/neural_compressor/adaptor/ox_utils/operators/matmul.py b/neural_compressor/adaptor/ox_utils/operators/matmul.py index 6ed795f869f..fbf6558bb02 100644 --- a/neural_compressor/adaptor/ox_utils/operators/matmul.py +++ b/neural_compressor/adaptor/ox_utils/operators/matmul.py @@ -18,7 +18,7 @@ import onnx from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry -from neural_compressor.adaptor.ox_utils.util import find_by_name, ms_domain, attribute_to_kwarg +from neural_compressor.adaptor.ox_utils.util import find_by_name, attribute_to_kwarg from onnx import onnx_pb as onnx_proto @op_registry(op_types="MatMul") diff --git a/neural_compressor/experimental/export/qlinear2qdq.py b/neural_compressor/experimental/export/qlinear2qdq.py index 688c8ee344f..10c0b74d7ef 100644 --- a/neural_compressor/experimental/export/qlinear2qdq.py +++ b/neural_compressor/experimental/export/qlinear2qdq.py @@ -29,7 +29,6 @@ def check_model(model): Args: model (ModelProto): onnx model. 
""" - has_integerop = False has_qlinearop = False for node in model.graph.node: From 8c15a9cc3a20c1490499494ff8cd9e477c5fa37b Mon Sep 17 00:00:00 2001 From: mengniwa Date: Mon, 12 Dec 2022 16:16:45 +0800 Subject: [PATCH 12/12] add warning Signed-off-by: mengniwa --- neural_compressor/model/onnx_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/neural_compressor/model/onnx_model.py b/neural_compressor/model/onnx_model.py index 7b95aa9628a..a090412b171 100644 --- a/neural_compressor/model/onnx_model.py +++ b/neural_compressor/model/onnx_model.py @@ -441,3 +441,7 @@ def export(self, save_path, conf): self.remove_unused_constant() self.topological_sort() self.save(save_path) + else: + logger.warning("Unsupported config for export, " + "only ONNXQlinear2QDQConfig is supported!") + exit(0)