@@ -157,70 +157,85 @@ def set_data_type(
     return activation_type, weight_type
 
 
-def get_quantizable_onnx_ops(
-    q_config,
+def get_node_mapping(
+    fp32_model,
     fp32_onnx_path,
 ):
164- """Get quantizable onnx ops .
164+ """Get PyTorch module and ONNX node mapping .
165165
166166 Args:
167- q_config (dict ): quantization configuration from PyTorch.
167+ fp32_model (torch.nn.Module ): quantization configuration from PyTorch.
168168 fp32_onnx_path (str): path to fp32 onnx model.
169169
170170 Returns:
171- quantize_nodes: all onnx node that should be quantized.
172171 module_node_mapping: op mapping from PyTorch to ONNX.
173172 linear_matmul_list: contains matmul that comes from linear.
174173 """
+    def check_data(op_type, data, module_dict):
+        for name, value in module_dict.items():
+            if value.shape == data.shape:
+                if (value == data).all():
+                    return name
+                # Convolution weight data mismatch.
+                elif op_type == 'Conv' and np.allclose(value, data):
+                    return name
+        return None
+
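+    # Collect the fp32 weights of every Conv/Embedding/Linear module for matching below.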
+    module_dict = {}
+    for name, module in fp32_model.named_modules():
+        if 'Conv' in str(module.__class__.__name__) or \
+           'Embedding' in str(module.__class__.__name__) or \
+           'Linear' in str(module.__class__.__name__):
+            if hasattr(module, 'weight'):
+                value = module.weight.detach().cpu().numpy()
+                module_dict[name] = value
+
+    module_node_mapping = {}
+    linear_matmul_list = []
     fp32_onnx_model = onnx.load(fp32_onnx_path)
-    # Clarify ONNX nodes that we can mapping from PyTorch
-    if 'dynamic' in q_config['approach']:
-        op_types_to_quantize = ['MatMul', 'Gather', "LSTM"]
-    else:
-        op_types_to_quantize = ['MatMul', 'Gather', 'Conv']
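+    # Index the ONNX initializers by name so each node's weight tensor can be looked up directly.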
+    initializer_data = {tensor.name: tensor for tensor in fp32_onnx_model.graph.initializer}
+    from onnx import numpy_helper
+    for node in fp32_onnx_model.graph.node:
+        if node.op_type in op_types_to_quantize:
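+            # nn.Linear weight is (out_features, in_features); the exported MatMul initializer
+            # typically stores its transpose, so transpose back before comparing.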
+            if node.op_type == 'MatMul' and node.input[1] in initializer_data:
+                data = numpy_helper.to_array(initializer_data[node.input[1]]).T
+            elif node.op_type == 'Gather' and node.input[0] in initializer_data:
+                data = numpy_helper.to_array(initializer_data[node.input[0]])
+            elif node.op_type in ['Conv', 'Gemm']:
+                data = numpy_helper.to_array(initializer_data[node.input[1]])
+            else:
+                continue
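+            # Record which PyTorch module this node's weight belongs to; MatMul nodes matched
+            # here come from Linear modules and are also collected in linear_matmul_list.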
+            pt_name = check_data(node.op_type, data, module_dict)
+            if pt_name:
+                module_node_mapping[pt_name] = node.name
+                if node.op_type == 'MatMul':
+                    linear_matmul_list.append(node.name)
+    return module_node_mapping, linear_matmul_list
 
-    from neural_compressor.adaptor.onnxrt import ONNXRTAdaptor
-    # pylint: disable=E1120
-    fp32_onnx_model = ONNXRTAdaptor._replace_gemm_with_matmul(fp32_onnx_model).model
-    onnx.save(fp32_onnx_model, fp32_onnx_path)
 
-    # Get weight name from onnx initializer
-    weight_name_list = []
-    for tensor in fp32_onnx_model.graph.initializer:
-        weight_name_list.append(tensor.name)
+def get_quantizable_onnx_ops(
+    int8_model,
+    module_node_mapping
+):
+    """Get quantizable onnx ops.
+
+    Args:
+        int8_model (torch.nn.Module): PyTorch int8 model.
+        module_node_mapping (dict): op mapping from PyTorch to ONNX.
 
-    # Match weight name with onnx node name
+    Returns:
+        quantize_nodes: all ONNX nodes that should be quantized.
+    """
     quantize_nodes = []
-    tmp_node_mapping = {}
-    module_node_mapping = {}
-    for node in fp32_onnx_model.graph.node:
-        if node.op_type not in op_types_to_quantize:
-            for inp in node.input:
-                if inp in weight_name_list and 'weight' in inp:
-                    tmp_node_mapping.update({node.output[0]: inp.split('.weight')[0]})
-                elif inp in tmp_node_mapping:
-                    tmp_node_mapping.update({node.output[0]: tmp_node_mapping[inp]})
-        else:
-            for inp in node.input:
-                if inp in weight_name_list and 'weight' in inp:
-                    module_node_mapping.update({inp.split('.weight')[0]: node.name})
-                elif inp in tmp_node_mapping:
-                    module_node_mapping.update({tmp_node_mapping[inp]: node.name})
-
-    quantize_nodes = list(module_node_mapping.values())
-    # Fetch all matmul in ONNX that comes from PyTorch Linear
-    # Match pytorch module name with onnx node name for fallbacked fp32 module
-    linear_matmul_list = []
-    for k, v in q_config['op'].items():  # pragma: no cover
-        if 'Linear' in k[1]:
-            k_0 = k[0].split('.module')[0] if k[0] not in module_node_mapping else k[0]
-            linear_matmul_list.append(module_node_mapping[k_0])
-        if not 'int8' in v['weight']['dtype']:
-            k_0 = k[0].split('.module')[0] if k[0] not in module_node_mapping else k[0]
-            if k[0] in module_node_mapping:
-                fallback_op = module_node_mapping[k_0]
-                quantize_nodes.remove(fallback_op)
-    return quantize_nodes, module_node_mapping, linear_matmul_list
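+    # Walk the int8 PyTorch model and keep only the ONNX nodes of modules that were actually quantized.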
+    for name, module in int8_model.named_modules():
+        if 'Conv' in str(module.__class__.__name__) or \
+           'Embedding' in str(module.__class__.__name__) or \
+           'Linear' in str(module.__class__.__name__):
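+            # Quantized PyTorch modules expose weight() as a method; a qint8 weight means
+            # the module was quantized rather than falling back to fp32.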
+            if hasattr(module, 'weight') and callable(module.weight):
+                if module.weight().dtype == torch.qint8:
+                    node = module_node_mapping[name.split('.module')[0]]
+                    quantize_nodes.append(node)
+    return quantize_nodes
 
 
 def get_scale_info(
@@ -257,7 +272,7 @@ def build_scale_mapping(
     module_node_mapping,
     int8_scale_info,
 ):
-    """_summary_
+    """Build scale mapping.
 
     Args:
         fp32_onnx_path (str): path to fp32 onnx model.
@@ -405,13 +420,17 @@ def torch_to_int8_onnx(
         quant_format (str, optional): quantization format of ONNX model. Defaults to 'QDQ'.
         dtype (str, optional): data types of activation and weight of ONNX model. Defaults to 'U8S8'.
     """
+    global op_types_to_quantize
+    if q_config['approach'] == 'post_training_dynamic_quant':
+        op_types_to_quantize = ['MatMul', 'Gemm', 'Gather']
+    else:
+        op_types_to_quantize = ['MatMul', 'Gemm', 'Gather', 'Conv']
+
     if quant_format == 'QDQ' and opset_version < 13:  # pragma: no cover
         opset_version = 13
         logger.warning("QDQ format requires opset_version >= 13, " +
                        "we reset opset_version={} here".format(opset_version))
 
-    activation_type, weight_type = set_data_type(dtype)
-
     # pylint: disable=E1101
     fp32_onnx_path = save_path + '.tmp' if save_path else 'int8-model.onnx.tmp'
     torch_to_fp32_onnx(
@@ -422,13 +441,12 @@ def torch_to_int8_onnx(
         input_names=input_names,
         output_names=output_names,
         dynamic_axes=dynamic_axes,
-        do_constant_folding=False,
         verbose=False,
     )
 
-    quantize_nodes, module_node_mapping, linear_matmul_list = get_quantizable_onnx_ops(
-        q_config, fp32_onnx_path
-    )
+    activation_type, weight_type = set_data_type(dtype)
+    module_node_mapping, linear_matmul_list = get_node_mapping(fp32_model, fp32_onnx_path)
+    quantize_nodes = get_quantizable_onnx_ops(int8_model, module_node_mapping)
 
     if q_config['approach'] == 'quant_aware_training':
         update_weight_bias(int8_model, fp32_onnx_path)
@@ -439,6 +457,7 @@ def torch_to_int8_onnx(
     quant_format = ortq.QuantFormat.QOperator if quant_format != 'QDQ' else ortq.QuantFormat.QDQ
 
     if q_config['approach'] == 'post_training_dynamic_quant':
+        logger.info("Quantization format is not available when executing dynamic quantization.")
         ortq.quantize_dynamic(
             fp32_onnx_path,
             save_path,