
Commit 255eb0c

enable scale mapping for fused module

Signed-off-by: Xin He <[email protected]>
1 parent eab46f7 commit 255eb0c

4 files changed: +54 −54 lines changed

neural_compressor/adaptor/pytorch.py

Lines changed: 5 additions & 6 deletions

@@ -2701,6 +2701,7 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
                              repr(e)))
             q_model = model
         q_model._model.eval()
+        hook_list = torch_utils.util._set_input_scale_hook(q_model._model, op_cfgs)
         if q_model.kwargs is not None:
             self.prepare_custom_config_dict = q_model.kwargs.get('prepare_custom_config_dict',
                                                                  None)
@@ -2738,7 +2739,6 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
             # q_func can be created by neural_compressor internal or passed by user. It's critical to
             # distinguish how q_func is passed since neural_compressor built-in functions accept
             # neural_compressor model and user defined func should accept framework model.
-            hook_list = torch_utils.util._set_input_scale_hook(q_model._model, op_cfgs)
             q_model._model = q_func(
                 q_model if getattr(q_func, 'builtin', None) else q_model._model)
             assert q_model._model is not None, "Please return a trained model in train function!"
@@ -2767,7 +2767,6 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
                 prefix='',
                 example_inputs=example_inputs)
         if self.approach in ['post_training_static_quant', 'post_training_auto_quant']:
-            hook_list = torch_utils.util._set_input_scale_hook(q_model._model, op_cfgs)
             iterations = tune_cfg.get('calib_iteration', 1)
             if q_func is not None:
                 q_func(q_model._model)
@@ -2778,7 +2777,7 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
                     calib_sampling_size=tune_cfg.get('calib_sampling_size', 1))

         if self.approach != 'post_training_dynamic_quant':
-            input_scale_info = torch_utils.util._get_input_scale(q_model._model, hook_list)
+            scale_info = torch_utils.util._get_input_scale(q_model._model, hook_list)

         if self.sub_module_list is None:
             if self.version > Version("1.12.1"):  # pragma: no cover
@@ -2802,7 +2801,7 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
         q_model.q_config = copy.deepcopy(self.tune_cfg)
         if self.approach != 'post_training_dynamic_quant':
             self._get_scale_zeropoint(q_model._model, q_model.q_config)
-            q_model.q_config['input_scale_info'] = input_scale_info
+            q_model.q_config['scale_info'] = scale_info

         self._dump_model_op_stats(q_model._model, q_model.q_config, self.approach)
         torch_utils.util.get_embedding_contiguous(q_model._model)
@@ -2940,8 +2939,8 @@ def _pre_hook_for_qat(self, dataloader=None):
         hook_list = torch_utils.util._set_input_scale_hook(self.model._model, quantized_ops)

     def _post_hook_for_qat(self):
-        input_scale_info = torch_utils.util._get_input_scale(self.model._model, hook_list)
-        self.model.q_config['input_scale_info'] = input_scale_info
+        scale_info = torch_utils.util._get_input_scale(self.model._model, hook_list)
+        self.model.q_config['scale_info'] = scale_info
         from torch.quantization.quantize_fx import convert_fx
         if self.sub_module_list is None:
             if self.version > Version("1.12.1"):  # pragma: no cover
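After these changes, calibration populates q_model.q_config['scale_info'] with one record per observed Conv/Linear module, holding both input and output qparams (see the util.py changes below). A minimal sketch of what a consumer of that dict sees; the module name and values here are hypothetical, not taken from the commit:

# Illustrative only: shape of the per-module record stored by quantize()
# under q_config['scale_info'] (module name and numbers are made up).
scale_info = {
    'backbone.conv1': {
        'input_scale': 0.0236,
        'input_zeropoint': 128,
        'output_scale': 0.0151,
        'output_zeropoint': 0,
    },
}

for name, record in scale_info.items():
    # e.g. an exporter can look up qparams by module name and attach them
    # to the matching ONNX node
    print(name, record['input_scale'], record['output_zeropoint'])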

neural_compressor/adaptor/torch_utils/util.py

Lines changed: 37 additions & 14 deletions

@@ -44,6 +44,23 @@ def contiguous_hook(module, input):
         child.register_forward_pre_hook(contiguous_hook)


+def is_fused_module(module):
+    """This is a helper function for `_propagate_qconfig_helper` to detect
+    if this module is fused.
+
+    Args:
+        module (object): input module
+
+    Returns:
+        (bool): is fused or not
+    """
+    op_type = str(type(module))
+    if 'fused' in op_type:
+        return True
+    else:
+        return False
+
+
 def _set_input_scale_hook(model, op_cfgs):
     """Insert hooks to observe input scale and zeropoint.

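As a quick standalone check of the heuristic just added (a sketch, not part of the commit): modules produced by torch.quantization.fuse_modules are instances of the torch.nn.intrinsic fused types, whose qualified class name contains 'fused', so the string test catches them while plain Conv/Linear modules pass through:

import torch
import torch.nn as nn

def is_fused_module(module):
    # same heuristic as the helper added above
    return 'fused' in str(type(module))

m = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU()).eval()
fused = torch.quantization.fuse_modules(m, [['0', '1']])

print(type(fused[0]))                        # <class '...intrinsic.modules.fused.ConvReLU2d'>
print(is_fused_module(fused[0]))             # True
print(is_fused_module(nn.Conv2d(3, 8, 3)))   # False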
@@ -55,19 +72,24 @@ def _set_input_scale_hook(model, op_cfgs):
         hook_list (list): input observer hooks
     """
     def input_scale_hook(module, input):
-        module.input_observer = module.input_config.activation()
+        module.input_observer = module.qconfig.activation()
         module.input_observer(input[0])
         return input

+    def output_scale_hook(module, input, output):
+        module.output_observer = module.qconfig.activation()
+        module.output_observer(output)
+        return output
+
     hook_list = []
     for name, module in model.named_modules():
         if 'Conv' in str(module.__class__.__name__) or \
           'Linear' in str(module.__class__.__name__):
-            if name not in op_cfgs or op_cfgs[name] is None:
+            if is_fused_module(module):
                 continue
-            module.input_config = op_cfgs[name]
-            handle = module.register_forward_pre_hook(input_scale_hook)
-            hook_list.append(handle)
+            handle_in = module.register_forward_pre_hook(input_scale_hook)
+            handle_out = module.register_forward_hook(output_scale_hook)
+            hook_list.extend([handle_in, handle_out])
     return hook_list


@@ -81,19 +103,20 @@ def _get_input_scale(model, hook_list):
     Returns:
         input_scale_info (dict): input scale and zero_point of each modules
     """
-    input_scale_info = {}
+    scale_info = {}
     for name, module in model.named_modules():
-        if hasattr(module, "input_observer"):
-            scale, zero_point = module.input_observer.calculate_qparams()
-            input_scale_info[name] = {
-                'scale': float(scale),
-                'zero_point': int(zero_point)
+        if hasattr(module, "input_observer") and hasattr(module, "output_observer"):
+            scale_in, zero_point_in = module.input_observer.calculate_qparams()
+            scale_out, zero_point_out = module.output_observer.calculate_qparams()
+            scale_info[name] = {
+                'input_scale': float(scale_in),
+                'input_zeropoint': int(zero_point_in),
+                'output_scale': float(scale_out),
+                'output_zeropoint': int(zero_point_out)
             }
-            if hasattr(module, "input_config"):
-                del module.input_config
     for h in hook_list:
         h.remove()
-    return input_scale_info
+    return scale_info


 def collate_torch_preds(results):
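The hook pair above relies on each eligible module carrying a qconfig whose activation field is an observer factory. A self-contained sketch of the same pattern on a bare nn.Linear; the fbgemm default qconfig and the random calibration tensor are assumptions made for illustration, not part of the commit:

import torch
import torch.nn as nn
from torch.quantization import get_default_qconfig

module = nn.Linear(4, 4)
module.qconfig = get_default_qconfig('fbgemm')

def input_scale_hook(mod, inp):
    # a fresh observer per forward call, as in _set_input_scale_hook above
    mod.input_observer = mod.qconfig.activation()
    mod.input_observer(inp[0])
    return inp

def output_scale_hook(mod, inp, out):
    mod.output_observer = mod.qconfig.activation()
    mod.output_observer(out)
    return out

h_in = module.register_forward_pre_hook(input_scale_hook)
h_out = module.register_forward_hook(output_scale_hook)

module(torch.randn(8, 4))   # one calibration pass
scale_in, zp_in = module.input_observer.calculate_qparams()
scale_out, zp_out = module.output_observer.calculate_qparams()
print(float(scale_in), int(zp_in), float(scale_out), int(zp_out))

h_in.remove()
h_out.remove()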

neural_compressor/experimental/export/torch2onnx.py

Lines changed: 11 additions & 33 deletions

@@ -195,35 +195,6 @@ def get_quantizable_onnx_ops(
     return quantize_nodes


-def get_scale_info(
-    int8_model,
-    q_config,
-):
-    """Fetch scale information from q_config.
-
-    Args:
-        int8_model (torch.nn.Module): PyTorch int8 model.
-        q_config (dict): quantization configuration.
-
-    Returns:
-        int8_scale_info: int8 scale infomation.
-    """
-    # get output scale and zp from module
-    int8_scale_info = {}
-    for name, scale_info in q_config['input_scale_info'].items():
-        int8_scale_info[name] = {
-            'input_scale': scale_info['scale'],
-            'input_zeropoint': scale_info['zero_point'],
-        }
-    for name, module in int8_model.named_modules():
-        if name in int8_scale_info:
-            int8_scale_info[name].update({
-                'output_scale': module.scale,
-                'output_zeropoint': module.zero_point,
-            })
-    return int8_scale_info
-
-
 def build_scale_mapping(
     fp32_onnx_path,
     module_node_mapping,
@@ -242,14 +213,21 @@
     node_module_mapping = {}
     for module_name, node_name in module_node_mapping.items():
         node_module_mapping[node_name] = module_name
-    # match scale and zeropoint from PyTorch to ONNX node
+    # Match scale and zeropoint from PyTorch to ONNX node
     scale_zp_dict = {}
     fp32_onnx_model = onnx.load(fp32_onnx_path)
     for node in fp32_onnx_model.graph.node:
         if node.name in node_module_mapping:
             module_name = node_module_mapping[node.name]
-            if module_name not in int8_scale_info:
+
+            # For fine-grained fx and fuse pattern
+            if module_name + '.module' in int8_scale_info:
                 module_name = module_name + '.module'
+            elif module_name + '.0' in int8_scale_info:
+                module_name = module_name + '.0'
+            elif module_name + '.module.0' in int8_scale_info:
+                module_name = module_name + '.module.0'
+
             if module_name in int8_scale_info:
                 recoder = int8_scale_info[module_name]
                 input_scale_args = node.input[0] + '_scale'
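The suffix fallbacks are needed because fx graph-mode quantization and module fusing rename things: the module an ONNX node maps back to as foo may appear in scale_info as foo.module (wrapped), foo.0 (first element of a fused block), or foo.module.0. A small sketch of the same lookup order against a hypothetical scale_info; all names below are made up for illustration:

# Hypothetical post-fusion module names, as _get_input_scale() would record them.
int8_scale_info = {
    'features.conv1.module': {'input_scale': 0.02, 'input_zeropoint': 0},
    'classifier.fc.0': {'input_scale': 0.05, 'input_zeropoint': 128},
}

def resolve(module_name, scale_info):
    # Try the wrapped / fused variants first, then fall back to the raw name,
    # mirroring the lookup in build_scale_mapping.
    for candidate in (module_name + '.module',
                      module_name + '.0',
                      module_name + '.module.0',
                      module_name):
        if candidate in scale_info:
            return candidate
    return None

print(resolve('features.conv1', int8_scale_info))   # features.conv1.module
print(resolve('classifier.fc', int8_scale_info))    # classifier.fc.0
print(resolve('missing.layer', int8_scale_info))    # None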
@@ -447,7 +425,7 @@ def qdq_model_use_output_scale_zp(
 def qop_model_default(
     int8_onnx_model
 ):
-    # nn.quantized.Lienar module will be converted to the following format:
+    # nn.quantized.Linear module will be converted to the following format:
     #   QuantizeLinear
     #        |
     #   MatMulIntegerToFloat
@@ -696,7 +674,7 @@ def torch_to_int8_onnx(
     if q_config['approach'] == 'quant_aware_training':
         update_weight_bias(int8_model, fp32_onnx_path)
     if q_config['approach'] != 'post_training_dynamic_quant':
-        int8_scale_info = get_scale_info(int8_model, q_config)
+        int8_scale_info = q_config['scale_info']
         scale_mapping = build_scale_mapping(fp32_onnx_path, module_node_mapping, int8_scale_info)

     quant_format = ortq.QuantFormat.QOperator if quant_format != 'QDQ' else ortq.QuantFormat.QDQ

neural_compressor/experimental/export/utils.py

Lines changed: 1 addition & 1 deletion

@@ -62,7 +62,7 @@ def __init__(self, fp32_onnx_path):
             for dim in node.shape:
                 shape.append(dim if isinstance(dim, int) else 1)
             dtype = ONNX2Numpy_dtype(node.type)
-            input[node.name] = np.ones(shape).astype(dtype)
+            input[node.name] = np.zeros(shape).astype(dtype)
         self.data = [input]
         self.data = iter(self.data)
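For context, this reader feeds a single dummy sample per graph input to the FP32 ONNX model; the change swaps all-ones for all-zeros as the fill value. A minimal sketch of the same idea written against onnxruntime's CalibrationDataReader interface; the float32 dtype and shape handling here are simplifying assumptions, not the class defined in this file:

import numpy as np
import onnx
from onnxruntime.quantization import CalibrationDataReader

class ZeroDataReader(CalibrationDataReader):
    """Feeds one all-zero sample per graph input (illustrative only)."""
    def __init__(self, fp32_onnx_path):
        model = onnx.load(fp32_onnx_path)
        feed = {}
        for inp in model.graph.input:
            # unknown / symbolic dims fall back to 1, as in the original reader
            shape = [d.dim_value if d.dim_value > 0 else 1
                     for d in inp.type.tensor_type.shape.dim]
            feed[inp.name] = np.zeros(shape, dtype=np.float32)
        self.data = iter([feed])

    def get_next(self):
        return next(self.data, None)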
