From 14f42a89e8f87468ddf294fa50c54f4a283d8ace Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 8 Nov 2022 22:31:53 +0800 Subject: [PATCH 001/128] Feat(ST): add a interface for hawq(stage1) --- neural_compressor/adaptor/pytorch.py | 13 +++++++++++++ neural_compressor/strategy/basic.py | 7 ++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index bd2f250a216..1201bb21fa4 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -1072,7 +1072,20 @@ def is_fused_module(self, module): return True else: return False + + def calculate_op_sensitivity(self, model, dataloader, method_args): + """Compute the op sensitivity by the specific method. + + Args: + model(INC model): The fp32 model. + dataloader: The calibration dataloader. + method_args(Dict): The parameters for specifying the method. + Returns: + ops_sensitivity(Dict[tuple, float]): The key is (op_name, op_type), + the value is the sensitivity under the specified method + """ + pass unify_op_type_mapping = { "ConvReLU2d": "Conv2d", diff --git a/neural_compressor/strategy/basic.py b/neural_compressor/strategy/basic.py index c35398dd4bb..3cc4e38bde2 100644 --- a/neural_compressor/strategy/basic.py +++ b/neural_compressor/strategy/basic.py @@ -143,7 +143,12 @@ def next_tune_cfg(self): if fallback_items_lst: logger.info(f"Start to fallback op to {target_dtype} one by one.") self._fallback_started() - fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + #fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, + self.calib_dataloader, + method_args = {'name': 'hessian_trace'}) + fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) + op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], From e0ff7328bbd3628f15c429bb257bd5c648b79486 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 10 Nov 2022 14:22:12 +0800 Subject: [PATCH 002/128] hawq_metric.py --- neural_compressor/strategy/hawq_metric.py | 291 ++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 neural_compressor/strategy/hawq_metric.py diff --git a/neural_compressor/strategy/hawq_metric.py b/neural_compressor/strategy/hawq_metric.py new file mode 100644 index 00000000000..acbcd98d740 --- /dev/null +++ b/neural_compressor/strategy/hawq_metric.py @@ -0,0 +1,291 @@ +""" + Copyright (c) 2022 Intel Corporation + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import logging +import torch +import numpy as np +from torch.autograd import Variable +import yaml +import torchvision.transforms as transforms +import torchvision +import random +import copy +from torch.quantization import get_default_qat_qconfig, quantize_jit,get_default_qconfig +from torch.quantization.quantize_fx import prepare_fx, convert_fx,fuse_fx +from torch.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig +import torch.quantization._numeric_suite as ns + + +def fixed_seed(seed): + """Fixed rand seed to make sure results are same in different times on different devices.Eg CPU/GPU + Args: + seed: an integer number + return: None + """ + np.random.seed(seed) #random + random.seed(seed) + torch.manual_seed(seed) #cpu + torch.cuda.manual_seed_all(seed) #parallel cpu + torch.backends.cudnn.deterministic = True #make sure results are same on cpu/gpu + torch.backends.cudnn.benchmark = True #accelerator +def calculate_params_gradients(model): + """ + get the gradients and parameters from given model + Args: + model: FP32 model specificed + return: + params: paratmeters of model + grads: gradients of model + """ + params=[] + grads=[] + for indx,(name, parm) in zip(enumerate(model.parameters()), model.named_parameters()): + logging.info('->tensor_index:', indx[0],'-->name:', name, '-->grad_requirs:',parm.requires_grad, '-->current tensor len:',parm.shape) + if not parm.requires_grad: + continue + params.append(parm) + grads.append(0. if parm.grad is None else parm.grad+0.) + return params, grads +def calculate_inner_product(list_x,list_y): + """Compute the inner product of two lists of variables list_x,list_y + Args: + list_x: input list variables + list_y: input list variables + return: + sum of inner product + """ + return sum([torch.sum(x * y) for (x, y) in zip(list_x, list_y)]) + +def calculate_vector_product(gradsH, params, v): + """compute the hessian vector product by torch.autograd.grad. 
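+    The full Hessian H is never materialized: with g denoting gradsH, the
+    identity H @ v = d(g . v)/dw lets torch.autograd.grad evaluate the
+    product directly (Pearlmutter's trick).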
+ Agrs: + gradsH: gradient at current point + params: corresponding variables + v: vector + return: + hv: hessian vector product + """ + hv=torch.autograd.grad( + gradsH, + params, + grad_outputs=v, + only_inputs=True, + retain_graph=True) + return hv +def ptq_calibrate(model, data_loader,num_cal): + """Calibrate model in post train quantization model + Args: + model: a pre_quantization model to calibrate + data_laoder: datasets + num_cal: maximization number of calibrated samples, such as images + return: + model: a calibrated model + """ + #Generate some samples to calibrate from data_loader + calibrate_samples=[] + i=0 + for inputs, targets in data_loader: + calibrate_samples.append(inputs) + i=i+1 + if i>=num_cal: + break + # model.cpu() + model.eval() + #calibration + with torch.no_grad(): + for sample in calibrate_samples: + model(sample) + return model +def calculate_perturbation(model_qnt,model_fp32)->dict: + """calculate weights quantized perturbation using L2 normal + Args: + model_qnt: quantized model + model_fp32: float model + return: + pertur_lst: dict,which contains layer_name and value + + """ + + wq_cmp_dict=ns.compare_weights(model_fp32.state_dict(), model_qnt.state_dict()) + pertur_lst=[] + for key in wq_cmp_dict: + pertur_pair={"layer_name":'',"value":0} + op_float_tensor=wq_cmp_dict[key]['float'] + op_qnt_tensor=wq_cmp_dict[key]['quantized'].dequantize() + diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) #Formula: L2=||Q(w)-w||p^2 + pertur_pair['layer_name']=key + pertur_pair['value']=diff_l2 + pertur_lst.append(pertur_pair) + return pertur_lst +class Hessian(): + """This class used to compute each layer hessian trace from given FP32 model + """ + def __init__(self,model,criterion, data=None, dataloader=None,device='cpu') -> None: + """Initial parameters + Args: + model: FP32 model specificed + criterion: loss function + data: a single batch of data, including inputs and its corresponding labels + dataloader: the data loader including bunch of batches of data + device: currently only supports cpu device + """ + #make sure we either pass a single batch or a dataloader + assert (data!=None and dataloader==None ) or (data==None and dataloader!=None) + #make mode is evaluation model + self.model=model.eval() + self.criterion=criterion + self.device=device + + if data!=None: + self.data=data + self.full_dataset=False + if not self.full_dataset: + self.inputs, self.targets=self.data + outputs=self.model(self.inputs) + loss=self.criterion(outputs,self.targets) + loss.backward(create_graph=True) + params, gradSH=calculate_params_gradients(self.model) + + self.params=params + self.gradSH=gradSH + def calculate_trace(self,max_Iter=100, tolerance=1e-3): + """Compute the hessian trace based on Hutchinson algorithm + Args: + max_Inter: number of maximization iteration + tolerance: minimum relative tolerance for stopping the algorithm. + return: + avg_traces_lst: return hessian trace per layer for given model + """ + avg_traces_lst=[] + for (i_grad, i_param,(module_name, _)) in zip(self.gradSH, self.params, self.model.named_parameters()): + v=[torch.randint_like(i_param,high=2, device=self.device)] + for v_i in v: + v_i[v_i==0]=-1 + i_v=v + trace_vhv=[] + trace=0. 
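+            # Hutchinson's estimator: for a Rademacher probe v (entries +/-1),
+            # E[v^T H v] = tr(H), so the v^T H v values formed from Hessian-vector
+            # products below estimate this layer's Hessian trace; iteration stops
+            # once their running mean changes by less than `tolerance`.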
+ trace_pair={"layer_name":" ", "trace":0} + self.model.zero_grad() + for i in range(max_Iter): + hv=calculate_vector_product(i_grad,i_param,i_v) # hessian vector + trace_vhv_cur=calculate_inner_product(hv,v).cpu().item()#current point + trace_vhv.append(trace_vhv_cur) + difference=(np.mean(trace_vhv)-trace)/(abs(trace)+1e-6) + if abs(difference) None: + self.dataloader=dataloader + if yaml_trace and yaml_cpu is not None: + with open(yaml_trace) as file: + params_config=yaml.load(file) + if params_config['loss']=='CrossEntropyLoss': + self.criterion=torch.nn.CrossEntropyLoss() + self.random_seed=params_config['random_seed'] + self.max_Iteration=params_config['max_Iteration'] + self.enable_op_fuse=params_config['enable_op_fuse'] + self.tolerance=float(params_config['tolerance']) + self.max_cal_sample=float(params_config['max_cal_smaple']) + self.quantize_mode=params_config['quantize_mode'] + with open(yaml_cpu,'r') as file: + yaml_config=yaml.load(file) + str_dtype=(yaml_config[0]['precisions']['names']) + self.list_dtype = str_dtype.split(",") + else: + self.criterion=torch.nn.CrossEntropyLoss() + self.random_seed=100 + self.max_Iteration=100 + self.enable_op_fuse=True + self.tolerance=1e-6 + self.max_cal_sample=100 + self.quantize_mode='ptq' + self.list_dtype=['int8','fp32'] + logging.info("Current parameters config for Hutchinson’s algorithm as below:") + logging.info("criterion:",self.criterion,"| random_seed:",self.random_seed,"| max_Iteration:", self.max_Iteration, \ + "| tolerance:", self.tolerance,"| en_op_fuse", self.enable_op_fuse,"| max_cal_sample:", self.max_cal_sample) + fixed_seed(self.random_seed) + self.model=model + self.model.eval() + model_tmp=copy.deepcopy(model) + model_tmp.eval() + self.model_fused= fuse_fx(model_tmp) + self.model_fused.eval() + + def get_init_config(self)->dict: + """ + """ + #Load a sample from dataloader to compute graident + for inputs, targets in self.dataloader: + break + #Hessian average trace computation + with torch.enable_grad(): + if self.enable_op_fuse: + hawq_cmp=Hessian(self.model_fused,criterion=self.criterion,data=(inputs,targets)) + else: + hawq_cmp=Hessian(self.model,criterion=self.criterion,data=(inputs,targets)) + avg_traces_lst=hawq_cmp.calculate_trace(max_Iter=self.max_Iteration,tolerance=self.tolerance) + + #fiter none weight layer and save weight layer to match perturbation computation + avg_traces_lst_weight=[] + for avg_trace_i in avg_traces_lst: + if 'weight' in avg_trace_i['layer_name']: + avg_traces_lst_weight.append(avg_trace_i) + # avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) + if self.quantize_mode=='ptq': + #PTQ quantization + qconfig = get_default_qconfig("fbgemm") + qconfig_dict={"":qconfig} #enable all layers/tensor to quantize + #calibrate + model_prepared=prepare_fx(self.model, qconfig_dict) + model_prepared=ptq_calibrate(model_prepared,data_loader=self.dataloader,num_cal=self.max_cal_sample) + model_prepared.cpu() + model_all_qnt=convert_fx(model_prepared) + #calculate perturbation + pertu_list=calculate_perturbation(model_fp32=self.model,model_qnt=model_all_qnt) + #calculate omiga + for omiga_i in pertu_list: + for avg_trace_i in avg_traces_lst: + if avg_trace_i['layer_name']==omiga_i['layer_name']: + avg_trace_i['trace']=avg_trace_i['trace']*omiga_i['value'] + # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): + # omig_pair={"layer_name":" ", "value":0} + # omig_val=avg_trace_i['trace']*omiga_i['value'] + # omig_pair['layer_name']=avg_trace_i['layer_name'] 
+ # omig_pair['value']=omig_val + # omig_list.append(omig_pair) + # omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) + omig_list_sorted=sorted(avg_traces_lst,key=lambda x:x['trace'],reverse=True) + tune_init_config_pairs=[] + # + for i in omig_list_sorted: + tune_init_config_pair={"op_name":'',"op_type":'','trace':0} + if i['layer_name']==omig_list_sorted[0]['layer_name']: + tune_init_config_pair['op_name']=i['layer_name'] + tune_init_config_pair['op_type']=self.list_dtype[-1] #setup as float op + tune_init_config_pair['trace']=float(i['trace']) + else: + tune_init_config_pair['op_name']=i['layer_name'] + tune_init_config_pair['op_type']=self.list_dtype[0] + tune_init_config_pair['trace']=float(i['trace']) + tune_init_config_pairs.append(tune_init_config_pair) + return tune_init_config_pairs From e81744e621635c8013fac83c03898a47ab121e69 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 10 Nov 2022 14:24:13 +0800 Subject: [PATCH 003/128] pytorch.py --- neural_compressor/adaptor/pytorch.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 1201bb21fa4..668c77246a6 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -30,6 +30,7 @@ from ..utils import logger from .query import QueryBackendCapability from ..experimental.data.dataloaders.base_dataloader import BaseDataLoader +from neural_compressor.strategy.hawq_metric import Hawq_top try: # pragma: no cover import intel_extension_for_pytorch as ipex IPEX = True @@ -1085,6 +1086,10 @@ def calculate_op_sensitivity(self, model, dataloader, method_args): ops_sensitivity(Dict[tuple, float]): The key is (op_name, op_type), the value is the sensitivity under the specified method """ + if method_args['name']=='hessian_trace': + Hawq_top(model=model,yaml_cpu=None,yaml_trace=None,dataloader=dataloader) + hessian_cmp=Hawq_top.get_init_config() + return hessian_cmp pass unify_op_type_mapping = { From 466ffb8bcb5a22f6cf79b5e9259de7f13bb21a5c Mon Sep 17 00:00:00 2001 From: root Date: Thu, 10 Nov 2022 15:31:11 +0800 Subject: [PATCH 004/128] disable line 33 --- neural_compressor/adaptor/pytorch.py | 2 +- test/strategy/test_hessian_trace_inc.py | 63 +++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 test/strategy/test_hessian_trace_inc.py diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 668c77246a6..097b9359f93 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -30,7 +30,7 @@ from ..utils import logger from .query import QueryBackendCapability from ..experimental.data.dataloaders.base_dataloader import BaseDataLoader -from neural_compressor.strategy.hawq_metric import Hawq_top +# from neural_compressor.strategy.hawq_metric import Hawq_top try: # pragma: no cover import intel_extension_for_pytorch as ipex IPEX = True diff --git a/test/strategy/test_hessian_trace_inc.py b/test/strategy/test_hessian_trace_inc.py new file mode 100644 index 00000000000..f05b47ca3aa --- /dev/null +++ b/test/strategy/test_hessian_trace_inc.py @@ -0,0 +1,63 @@ +import torch +import unittest +import os +import sys +import copy +import torchvision +import torchvision.transforms as transforms +from torch.utils.data import DataLoader +from neural_compressor.data import DATASETS +from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader +from 
neural_compressor.adaptor.pytorch import TemplateAdaptor +from neural_compressor.strategy.hawq_metric import Hawq_top + +def build_hessian_trace(): + hessian_trace_config_yaml=''' + loss: + CrossEntropyLoss + random_seed: + 1 + max_Iteration: + 100 + tolerance: + 1e-3 + enable_op_fuse: + True + max_cal_smaple: + 100 + quantize_mode: + ptq + ''' + with open('./hessian_trace_config_yaml','w+',encoding="utf-8") as f: + f.write(hessian_trace_config_yaml) +class Test_hessian_trace(unittest.TestCase): + #boot up test + @classmethod + def setUpClass(cls) -> None: + build_hessian_trace() + cls.model=torchvision.models.resnet18() + #shotdown test + @classmethod + def tearDownClass(cls) -> None: + os.remove('./hessian_trace_config_yaml') + #one test case + def test_run_hessian_trace(cls): + """ + hessian_trace_top + Inputs: + model: FP32 model + dataloader: imagenet + """ + model=cls.model + datasets = DATASETS('pytorch') + dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) + dummy_dataloader = PyTorchDataLoader(dummy_dataset) + # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' + # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) + hessian_cmp=Hawq_top(model,yaml_cpu=None,yaml_trace=None,dataloader=dummy_dataloader) + tuning_init_config=hessian_cmp.get_init_config() + #print tuning init_config + for i in tuning_init_config: + print(i) +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 3fb9a236a7a56fe625e5ff3389c901601b4fafb8 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Tue, 15 Nov 2022 11:44:48 +0800 Subject: [PATCH 005/128] add wenhuach test env --- neural_compressor/strategy/hawq.py | 311 ++++++++++++++++++++++++++++ test/strategy/test_hawq_wenhuach.py | 74 +++++++ 2 files changed, 385 insertions(+) create mode 100644 neural_compressor/strategy/hawq.py create mode 100644 test/strategy/test_hawq_wenhuach.py diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py new file mode 100644 index 00000000000..2dd0287fa2e --- /dev/null +++ b/neural_compressor/strategy/hawq.py @@ -0,0 +1,311 @@ +""" + Copyright (c) 2022 Intel Corporation + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from ..utils import logger +import torch +import numpy as np +from torch.autograd import Variable +import yaml +import torchvision.transforms as transforms +import torchvision +import random +import copy +from torch.quantization import get_default_qat_qconfig, quantize_jit, get_default_qconfig +from torch.quantization.quantize_fx import prepare_fx, convert_fx, fuse_fx +from torch.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig +import torch.quantization._numeric_suite as ns + + +def fix_seed(seed): + """Fixed rand seed to make sure results are same in different times on different devices.Eg CPU/GPU + Args: + seed: an integer number + return: None + """ + np.random.seed(seed) # random + random.seed(seed) + torch.manual_seed(seed) # cpu + torch.cuda.manual_seed_all(seed) # parallel cpu + torch.backends.cudnn.deterministic = True # make sure results are same on cpu/gpu + torch.backends.cudnn.benchmark = True # accelerator + + +def calculate_params_gradients(model): + """ + get the gradients and parameters from given model + Args: + model: FP32 model specificed + return: + params: paratmeters of model + grads: gradients of model + """ + params = [] + grads = [] + for indx, (name, parm) in zip(enumerate(model.parameters()), model.named_parameters()): + logger.info( + f'index:{indx[0]}-->name:{name}:{parm.shape}') + + if not parm.requires_grad: + continue + params.append(parm) + grads.append(0. if parm.grad is None else parm.grad + 0.) + return params, grads + + +def calculate_inner_product(list_x, list_y): + """Compute the inner product of two lists of variables list_x,list_y + Args: + list_x: input list variables + list_y: input list variables + return: + sum of inner product + """ + return sum([torch.sum(x * y) for (x, y) in zip(list_x, list_y)]) + + +def calculate_vector_product(gradsH, params, v): + """compute the hessian vector product by torch.autograd.grad. 
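+    Note: gradsH must come from a backward pass run with create_graph=True
+    (as done in Hessian.__init__ below), otherwise no graph is available
+    for this second differentiation.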
+ Agrs: + gradsH: gradient at current point + params: corresponding variables + v: vector + return: + hv: hessian vector product + """ + hv = torch.autograd.grad( + gradsH, + params, + grad_outputs=v, + only_inputs=True, + retain_graph=True) + return hv + + +def ptq_calibrate(model, data_loader, num_cal): + """Calibrate model in post train quantization model + Args: + model: a pre_quantization model to calibrate + data_laoder: datasets + num_cal: maximization number of calibrated samples, such as images + return: + model: a calibrated model + """ + # Generate some samples to calibrate from data_loader + calibrate_samples = [] + i = 0 + for inputs, targets in data_loader: + calibrate_samples.append(inputs) + i = i + 1 + if i >= num_cal: + break + # model.cpu() + model.eval() + # calibration + with torch.no_grad(): + for sample in calibrate_samples: + model(sample) + return model + + +def calculate_perturbation(model_qnt, model_fp32) -> dict: + """calculate weights quantized perturbation using L2 normal + Args: + model_qnt: quantized model + model_fp32: float model + return: + pertur_lst: dict,which contains layer_name and value + + """ + + wq_cmp_dict = ns.compare_weights(model_fp32.state_dict(), model_qnt.state_dict()) + pertur_lst = [] + for key in wq_cmp_dict: + pertur_pair = {"layer_name": '', "value": 0} + op_float_tensor = wq_cmp_dict[key]['float'] + op_qnt_tensor = wq_cmp_dict[key]['quantized'].dequantize() + diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 + pertur_pair['layer_name'] = key + pertur_pair['value'] = diff_l2 + pertur_lst.append(pertur_pair) + return pertur_lst + + +class Hessian(): + """This class used to compute each layer hessian trace from given FP32 model + """ + + def __init__(self, model, criterion, data=None, dataloader=None, device='cpu') -> None: + """Initial parameters + Args: + model: FP32 model specificed + criterion: loss function + data: a single batch of data, including inputs and its corresponding labels + dataloader: the data loader including bunch of batches of data + device: currently only supports cpu device + """ + # make sure we either pass a single batch or a dataloader + assert (data != None and dataloader == None) or (data == None and dataloader != None) + # make mode is evaluation model + self.model = model.eval() + self.criterion = criterion + self.device = device + + if data != None: + self.data = data + self.full_dataset = False + if not self.full_dataset: + self.inputs, self.targets = self.data + outputs = self.model(self.inputs) + loss = self.criterion(outputs, self.targets) + loss.backward(create_graph=True) + params, gradSH = calculate_params_gradients(self.model) + + self.params = params + self.gradSH = gradSH + + def calculate_trace(self, max_Iter=100, tolerance=1e-3): + """Compute the hessian trace based on Hutchinson algorithm + Args: + max_Inter: number of maximization iteration + tolerance: minimum relative tolerance for stopping the algorithm. + return: + avg_traces_lst: return hessian trace per layer for given model + """ + avg_traces_lst = [] + for (i_grad, i_param, (module_name, _)) in zip(self.gradSH, self.params, self.model.named_parameters()): + v = [torch.randint_like(i_param, high=2, device=self.device)] + for v_i in v: + v_i[v_i == 0] = -1 + i_v = v + trace_vhv = [] + trace = 0. 
+ trace_pair = {"layer_name": " ", "trace": 0} + self.model.zero_grad() + for i in range(max_Iter): + hv = calculate_vector_product(i_grad, i_param, i_v) # hessian vector + trace_vhv_cur = calculate_inner_product(hv, v).cpu().item() # current point + trace_vhv.append(trace_vhv_cur) + difference = (np.mean(trace_vhv) - trace) / (abs(trace) + 1e-6) + if abs(difference) < tolerance: + avg_trace_vhv = np.mean(trace_vhv) + trace_pair["layer_name"] = module_name + trace_pair["trace"] = avg_trace_vhv + avg_traces_lst.append(trace_pair) + break + else: + trace = np.mean(trace_vhv) + return avg_traces_lst + + +class Hawq_top(): + """This class is a interface of hessian + """ + + def __init__(self, model, yaml_trace=None, yaml_cpu=None, dataloader=None) -> None: + self.dataloader = dataloader + if yaml_trace and yaml_cpu is not None: + with open(yaml_trace) as file: + params_config = yaml.load(file) + if params_config['loss'] == 'CrossEntropyLoss': + self.criterion = torch.nn.CrossEntropyLoss() + self.random_seed = params_config['random_seed'] + self.max_Iteration = params_config['max_Iteration'] + self.enable_op_fuse = params_config['enable_op_fuse'] + self.tolerance = float(params_config['tolerance']) + self.max_cal_sample = float(params_config['max_cal_smaple']) + self.quantize_mode = params_config['quantize_mode'] + with open(yaml_cpu, 'r') as file: + yaml_config = yaml.load(file) + str_dtype = (yaml_config[0]['precisions']['names']) + self.list_dtype = str_dtype.split(",") + else: + self.criterion = torch.nn.CrossEntropyLoss() + self.random_seed = 100 + self.max_Iteration = 100 + self.enable_op_fuse = True + self.tolerance = 1e-6 + self.max_cal_sample = 100 + self.quantize_mode = 'ptq' + self.list_dtype = ['int8', 'fp32'] + # logger.info("Current parameters config for Hutchinson’s algorithm as below:") + logger.info( + f"criterion:{self.criterion}| random_seed:{self.random_seed}| max_Iteration:self.max_Iteration| tolerance:{self.tolerance}") + # logger.info("criterion:", self.criterion, "| random_seed:", self.random_seed, "| max_Iteration:", + # self.max_Iteration, \ + # "| tolerance:", self.tolerance, "| en_op_fuse", self.enable_op_fuse, "| max_cal_sample:", + # self.max_cal_sample) + fix_seed(self.random_seed) + self.model = model + self.model.eval() + model_tmp = copy.deepcopy(model) + model_tmp.eval() + self.model_fused = fuse_fx(model_tmp) + self.model_fused.eval() + + def get_init_config(self) -> dict: + """ + """ + # Load a sample from dataloader to compute graident + for inputs, targets in self.dataloader: + break + # Hessian average trace computation + with torch.enable_grad(): + if self.enable_op_fuse: + hawq_cmp = Hessian(self.model_fused, criterion=self.criterion, data=(inputs, targets)) + else: + hawq_cmp = Hessian(self.model, criterion=self.criterion, data=(inputs, targets)) + avg_traces_lst = hawq_cmp.calculate_trace(max_Iter=self.max_Iteration, tolerance=self.tolerance) + + # fiter none weight layer and save weight layer to match perturbation computation + avg_traces_lst_weight = [] + for avg_trace_i in avg_traces_lst: + if 'weight' in avg_trace_i['layer_name']: + avg_traces_lst_weight.append(avg_trace_i) + # avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) + if self.quantize_mode == 'ptq': + # PTQ quantization + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = {"": qconfig} # enable all layers/tensor to quantize + # calibrate + model_prepared = prepare_fx(self.model, qconfig_dict) + model_prepared = ptq_calibrate(model_prepared, 
data_loader=self.dataloader, num_cal=self.max_cal_sample) + model_prepared.cpu() + model_all_qnt = convert_fx(model_prepared) + # calculate perturbation + pertu_list = calculate_perturbation(model_fp32=self.model, model_qnt=model_all_qnt) + # calculate omiga + for omiga_i in pertu_list: + for avg_trace_i in avg_traces_lst: + if avg_trace_i['layer_name'] == omiga_i['layer_name']: + avg_trace_i['trace'] = avg_trace_i['trace'] * omiga_i['value'] + # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): + # omig_pair={"layer_name":" ", "value":0} + # omig_val=avg_trace_i['trace']*omiga_i['value'] + # omig_pair['layer_name']=avg_trace_i['layer_name'] + # omig_pair['value']=omig_val + # omig_list.append(omig_pair) + # omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) + omig_list_sorted = sorted(avg_traces_lst, key=lambda x: x['trace'], reverse=True) + tune_init_config_pairs = [] + # + for i in omig_list_sorted: + tune_init_config_pair = {"op_name": '', "op_type": '', 'trace': 0} + if i['layer_name'] == omig_list_sorted[0]['layer_name']: + tune_init_config_pair['op_name'] = i['layer_name'] + tune_init_config_pair['op_type'] = self.list_dtype[-1] # setup as float op + tune_init_config_pair['trace'] = float(i['trace']) + else: + tune_init_config_pair['op_name'] = i['layer_name'] + tune_init_config_pair['op_type'] = self.list_dtype[0] + tune_init_config_pair['trace'] = float(i['trace']) + tune_init_config_pairs.append(tune_init_config_pair) + return tune_init_config_pairs diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py new file mode 100644 index 00000000000..5affedc70ca --- /dev/null +++ b/test/strategy/test_hawq_wenhuach.py @@ -0,0 +1,74 @@ +import torch +import unittest +import os +import sys +import copy +import torchvision +import torchvision.transforms as transforms +from torch.utils.data import DataLoader +from neural_compressor.data import DATASETS +from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader +from neural_compressor.adaptor.pytorch import TemplateAdaptor + +from neural_compressor.strategy.hawq import Hawq_top, fix_seed + +fix_seed(1) + + +def build_hessian_trace(): + hessian_trace_config_yaml = ''' + loss: + CrossEntropyLoss + random_seed: + 1 + max_Iteration: + 100 + tolerance: + 1e-3 + enable_op_fuse: + True + max_cal_smaple: + 100 + quantize_mode: + ptq + ''' + with open('./hessian_trace_config_yaml', 'w+', encoding="utf-8") as f: + f.write(hessian_trace_config_yaml) + + +class Test_hessian_trace(unittest.TestCase): + # boot up test + @classmethod + def setUpClass(cls) -> None: + build_hessian_trace() + cls.model = torchvision.models.resnet18() + + # shotdown test + @classmethod + def tearDownClass(cls) -> None: + os.remove('./hessian_trace_config_yaml') + + # one test case + def test_run_hessian_trace(cls): + """ + hessian_trace_top + Inputs: + model: FP32 model + dataloader: imagenet + """ + + model = cls.model + datasets = DATASETS('pytorch') + dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) + dummy_dataloader = PyTorchDataLoader(dummy_dataset) + # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' + # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) + hessian_cmp = Hawq_top(model, yaml_cpu=None, yaml_trace=None, dataloader=dummy_dataloader) + tuning_init_config = hessian_cmp.get_init_config() + # print tuning 
init_config + for i in tuning_init_config: + print(i) + + +if __name__ == "__main__": + unittest.main() From 59bd29b40094f47ac3f7ac1feefe947a50fb0a3b Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Tue, 15 Nov 2022 14:00:06 +0800 Subject: [PATCH 006/128] try to test mes strategy, have bug now --- test/strategy/test_hawq_wenhuach.py | 162 +++++++++++++++++++--------- 1 file changed, 113 insertions(+), 49 deletions(-) diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index 5affedc70ca..4443cd8d486 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -9,66 +9,130 @@ from neural_compressor.data import DATASETS from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader from neural_compressor.adaptor.pytorch import TemplateAdaptor - +from neural_compressor.adaptor import FRAMEWORKS +import shutil from neural_compressor.strategy.hawq import Hawq_top, fix_seed fix_seed(1) +def build_ptq_yaml(): + fake_yaml = ''' + model: + name: imagenet + framework: pytorch + quantization: + calibration: + evaluation: + accuracy: + metric: + topk: 1 + tuning: + strategy: + name: mse + accuracy_criterion: + relative: -0.1 + random_seed: 9527 + exit_policy: + max_trials: 1 + workspace: + path: saved + ''' + with open('ptq_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_yaml) -def build_hessian_trace(): - hessian_trace_config_yaml = ''' - loss: - CrossEntropyLoss - random_seed: - 1 - max_Iteration: - 100 - tolerance: - 1e-3 - enable_op_fuse: - True - max_cal_smaple: - 100 - quantize_mode: - ptq - ''' - with open('./hessian_trace_config_yaml', 'w+', encoding="utf-8") as f: - f.write(hessian_trace_config_yaml) - +class TestPytorchAdaptor(unittest.TestCase): + framework_specific_info = {"device": "cpu", + "approach": "post_training_static_quant", + "random_seed": 1234, + "q_dataloader": None, + "workspace_path": None} + framework = "pytorch" + adaptor = FRAMEWORKS[framework](framework_specific_info) + model = torchvision.models.resnet18() -class Test_hessian_trace(unittest.TestCase): - # boot up test - @classmethod - def setUpClass(cls) -> None: - build_hessian_trace() - cls.model = torchvision.models.resnet18() + # model = torch.quantization.QuantWrapper(model) - # shotdown test @classmethod - def tearDownClass(cls) -> None: - os.remove('./hessian_trace_config_yaml') + def setUpClass(self): + build_ptq_yaml() - # one test case - def test_run_hessian_trace(cls): - """ - hessian_trace_top - Inputs: - model: FP32 model - dataloader: imagenet - """ - model = cls.model - datasets = DATASETS('pytorch') - dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) - dummy_dataloader = PyTorchDataLoader(dummy_dataset) - # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' - # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) - hessian_cmp = Hawq_top(model, yaml_cpu=None, yaml_trace=None, dataloader=dummy_dataloader) - tuning_init_config = hessian_cmp.get_init_config() - # print tuning init_config - for i in tuning_init_config: - print(i) + @classmethod + def tearDownClass(self): + os.remove('ptq_yaml.yaml') + shutil.rmtree('./saved', ignore_errors=True) + shutil.rmtree('runs', ignore_errors=True) + def test_run_hawq_one_trial(self): + from neural_compressor.experimental import Quantization, common + model = copy.deepcopy(self.model) + for fake_yaml in ['ptq_yaml.yaml']: + if fake_yaml == 'ptq_yaml.yaml': + model.eval() + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.calib_dataloader = 
common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.model = model + quantizer() if __name__ == "__main__": unittest.main() + +# def build_hessian_trace(): +# hessian_trace_config_yaml = ''' +# loss: +# CrossEntropyLoss +# random_seed: +# 1 +# max_Iteration: +# 100 +# tolerance: +# 1e-3 +# enable_op_fuse: +# True +# max_cal_smaple: +# 100 +# quantize_mode: +# ptq +# ''' +# with open('./hessian_trace_config_yaml', 'w+', encoding="utf-8") as f: +# f.write(hessian_trace_config_yaml) +# +# +# class Test_hessian_trace(unittest.TestCase): +# # boot up test +# @classmethod +# def setUpClass(cls) -> None: +# build_hessian_trace() +# cls.model = torchvision.models.resnet18() +# +# # shotdown test +# @classmethod +# def tearDownClass(cls) -> None: +# os.remove('./hessian_trace_config_yaml') +# +# # one test case +# def test_run_hessian_trace(cls): +# """ +# hessian_trace_top +# Inputs: +# model: FP32 model +# dataloader: imagenet +# """ +# +# model = cls.model +# datasets = DATASETS('pytorch') +# dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) +# dummy_dataloader = PyTorchDataLoader(dummy_dataset) +# # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' +# # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) +# hessian_cmp = Hawq_top(model, yaml_cpu=None, yaml_trace=None, dataloader=dummy_dataloader) +# tuning_init_config = hessian_cmp.get_init_config() +# # print tuning init_config +# for i in tuning_init_config: +# print(i) + + +# if __name__ == "__main__": +# unittest.main() From accec3ccfa207d8ac8a08dce39ea758161d61f68 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Tue, 15 Nov 2022 14:06:46 +0800 Subject: [PATCH 007/128] change name --- .../strategy/{hawq.py => hawq_wenhuach.py} | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) rename neural_compressor/strategy/{hawq.py => hawq_wenhuach.py} (95%) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq_wenhuach.py similarity index 95% rename from neural_compressor/strategy/hawq.py rename to neural_compressor/strategy/hawq_wenhuach.py index 2dd0287fa2e..6c74401c5fc 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq_wenhuach.py @@ -246,23 +246,25 @@ def __init__(self, model, yaml_trace=None, yaml_cpu=None, dataloader=None) -> No fix_seed(self.random_seed) self.model = model self.model.eval() - model_tmp = copy.deepcopy(model) - model_tmp.eval() - self.model_fused = fuse_fx(model_tmp) - self.model_fused.eval() + if self.enable_op_fuse: + self.model = fuse_fx(self.model) + + # model_tmp = copy.deepcopy(model) + # model_tmp.eval() + # self.model_fused = fuse_fx(model_tmp) + # self.model_fused.eval() def get_init_config(self) -> dict: """ """ # Load a sample from dataloader to compute graident - for inputs, targets in self.dataloader: - break - # Hessian average trace computation + inputs, targets = next(iter(self.dataloader)) + with torch.enable_grad(): - if self.enable_op_fuse: - hawq_cmp = Hessian(self.model_fused, criterion=self.criterion, data=(inputs, targets)) - else: - hawq_cmp = Hessian(self.model, criterion=self.criterion, data=(inputs, targets)) + # if self.enable_op_fuse: + # hawq_cmp = Hessian(self.model_fused, criterion=self.criterion, data=(inputs, targets)) + # else: + hawq_cmp = Hessian(self.model, criterion=self.criterion, data=(inputs, targets)) avg_traces_lst = hawq_cmp.calculate_trace(max_Iter=self.max_Iteration, tolerance=self.tolerance) # fiter none weight layer and save weight layer to match perturbation computation From 769cbc201fc37b1450c306b3074726990f06b875 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Tue, 15 Nov 2022 14:10:03 +0800 Subject: [PATCH 008/128] comment test --- test/strategy/test_hawq_wenhuach.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index 4443cd8d486..a470f679cf8 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -11,7 +11,7 @@ from neural_compressor.adaptor.pytorch import TemplateAdaptor from neural_compressor.adaptor import FRAMEWORKS import shutil -from neural_compressor.strategy.hawq import Hawq_top, fix_seed +from neural_compressor.strategy.hawq_wenhuach import Hawq_top, fix_seed fix_seed(1) @@ -77,7 +77,8 @@ def test_run_hawq_one_trial(self): quantizer() if __name__ == "__main__": - unittest.main() + pass + # unittest.main() # def build_hessian_trace(): # hessian_trace_config_yaml = ''' From a9fecbbe58a8c1eaac7a1c9caf7aef66f9763310 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:19:56 +0800 Subject: [PATCH 009/128] add activation quantized loss eval --- neural_compressor/strategy/hawq_metric.py | 146 ++++++++++++++-------- 1 file changed, 97 insertions(+), 49 deletions(-) diff --git a/neural_compressor/strategy/hawq_metric.py b/neural_compressor/strategy/hawq_metric.py index acbcd98d740..63db277ab14 100644 --- a/neural_compressor/strategy/hawq_metric.py +++ b/neural_compressor/strategy/hawq_metric.py @@ -37,7 +37,7 @@ def fixed_seed(seed): torch.cuda.manual_seed_all(seed) #parallel cpu torch.backends.cudnn.deterministic = True #make sure results are same on cpu/gpu torch.backends.cudnn.benchmark = True #accelerator -def calculate_params_gradients(model): +def cal_params_grad(model): """ get the gradients and parameters from given model Args: @@ -55,17 +55,7 @@ def calculate_params_gradients(model): params.append(parm) grads.append(0. if parm.grad is None else parm.grad+0.) return params, grads -def calculate_inner_product(list_x,list_y): - """Compute the inner product of two lists of variables list_x,list_y - Args: - list_x: input list variables - list_y: input list variables - return: - sum of inner product - """ - return sum([torch.sum(x * y) for (x, y) in zip(list_x, list_y)]) - -def calculate_vector_product(gradsH, params, v): +def cal_vector_product(gradsH, params, v): """compute the hessian vector product by torch.autograd.grad. 
Agrs: gradsH: gradient at current point @@ -105,7 +95,7 @@ def ptq_calibrate(model, data_loader,num_cal): for sample in calibrate_samples: model(sample) return model -def calculate_perturbation(model_qnt,model_fp32)->dict: +def cal_weights_pertubation(model_qnt,model_fp32)->dict: """calculate weights quantized perturbation using L2 normal Args: model_qnt: quantized model @@ -126,6 +116,44 @@ def calculate_perturbation(model_qnt,model_fp32)->dict: pertur_pair['value']=diff_l2 pertur_lst.append(pertur_pair) return pertur_lst +def cal_act_pertubation(model_fp32,model_qnt,data_loader,num_cal=100)->dict: + """calculate weights quantized perturbation using L2 normal + Args: + model_qunt: quantized model + model_fp32: float model + data_loader: path to datasets + return: + pretur_lst: dict + + """ + ns.prepare_model_outputs(model_fp32, model_qnt) + model_fp32.cpu() + model_fp32.eval() + model_qnt.cpu() + model_qnt.eval() + obv_samples=[] + i=0 + for inputs, targets in data_loader: + obv_samples.append(inputs) + i=i+1 + if i>=num_cal: + break + with torch.no_grad(): + for image in obv_samples: + model_fp32(image) + model_qnt(image) + act_qnt_pairs=[] + act_compare_dict = ns.get_matching_activations(model_fp32, q_module=model_qnt) + for key in act_compare_dict: + op_float_tensor=(act_compare_dict[key]['float'][0]) + op_qnt_tensor=act_compare_dict[key]['quantized'][0].dequantize() + diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) + pertur_pair={"layer_name":'',"value":0} + pertur_pair['layer_name']=key + pertur_pair['value']=diff_l2 + act_qnt_pairs.append(pertur_pair) + return act_qnt_pairs + class Hessian(): """This class used to compute each layer hessian trace from given FP32 model """ @@ -153,7 +181,7 @@ def __init__(self,model,criterion, data=None, dataloader=None,device='cpu') -> N outputs=self.model(self.inputs) loss=self.criterion(outputs,self.targets) loss.backward(create_graph=True) - params, gradSH=calculate_params_gradients(self.model) + params, gradSH=cal_params_grad(self.model) self.params=params self.gradSH=gradSH @@ -176,8 +204,8 @@ def calculate_trace(self,max_Iter=100, tolerance=1e-3): trace_pair={"layer_name":" ", "trace":0} self.model.zero_grad() for i in range(max_Iter): - hv=calculate_vector_product(i_grad,i_param,i_v) # hessian vector - trace_vhv_cur=calculate_inner_product(hv,v).cpu().item()#current point + hv=cal_vector_product(i_grad,i_param,i_v) # hessian vector + trace_vhv_cur=sum([torch.sum(x * y) for (x, y) in zip(hv, v)]) trace_vhv.append(trace_vhv_cur) difference=(np.mean(trace_vhv)-trace)/(abs(trace)+1e-6) if abs(difference) None: self.max_Iteration=100 self.enable_op_fuse=True self.tolerance=1e-6 - self.max_cal_sample=100 + self.max_cal_sample=1 self.quantize_mode='ptq' self.list_dtype=['int8','fp32'] logging.info("Current parameters config for Hutchinson’s algorithm as below:") @@ -230,6 +258,7 @@ def __init__(self,model,yaml_trace=None,yaml_cpu=None,dataloader=None) -> None: model_tmp.eval() self.model_fused= fuse_fx(model_tmp) self.model_fused.eval() + self.hawq_level='L3' #L1:top engievalue L2:avg_trace L3:avg_trace+pertubation def get_init_config(self)->dict: """ @@ -238,6 +267,7 @@ def get_init_config(self)->dict: for inputs, targets in self.dataloader: break #Hessian average trace computation + fixed_seed(self.random_seed) with torch.enable_grad(): if self.enable_op_fuse: hawq_cmp=Hessian(self.model_fused,criterion=self.criterion,data=(inputs,targets)) @@ -246,40 +276,58 @@ def get_init_config(self)->dict: 
avg_traces_lst=hawq_cmp.calculate_trace(max_Iter=self.max_Iteration,tolerance=self.tolerance) #fiter none weight layer and save weight layer to match perturbation computation - avg_traces_lst_weight=[] - for avg_trace_i in avg_traces_lst: - if 'weight' in avg_trace_i['layer_name']: - avg_traces_lst_weight.append(avg_trace_i) - # avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) - if self.quantize_mode=='ptq': - #PTQ quantization - qconfig = get_default_qconfig("fbgemm") - qconfig_dict={"":qconfig} #enable all layers/tensor to quantize - #calibrate - model_prepared=prepare_fx(self.model, qconfig_dict) - model_prepared=ptq_calibrate(model_prepared,data_loader=self.dataloader,num_cal=self.max_cal_sample) - model_prepared.cpu() - model_all_qnt=convert_fx(model_prepared) - #calculate perturbation - pertu_list=calculate_perturbation(model_fp32=self.model,model_qnt=model_all_qnt) - #calculate omiga - for omiga_i in pertu_list: - for avg_trace_i in avg_traces_lst: - if avg_trace_i['layer_name']==omiga_i['layer_name']: - avg_trace_i['trace']=avg_trace_i['trace']*omiga_i['value'] - # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): - # omig_pair={"layer_name":" ", "value":0} - # omig_val=avg_trace_i['trace']*omiga_i['value'] - # omig_pair['layer_name']=avg_trace_i['layer_name'] - # omig_pair['value']=omig_val - # omig_list.append(omig_pair) - # omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) - omig_list_sorted=sorted(avg_traces_lst,key=lambda x:x['trace'],reverse=True) + if self.hawq_level=='L2': + avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) + logging.info("avg_traces desending sorted is:") + for i in avg_traces_lst_sorted: + logging.info(i) + list_sorted=avg_traces_lst_sorted + if self.hawq_level=='L3': + if self.quantize_mode=='ptq': + #PTQ quantization + qconfig = get_default_qconfig("fbgemm") + qconfig_dict={"":qconfig} #enable all layers/tensor to quantize + #calibrate + model_prepared=prepare_fx(self.model, qconfig_dict) + model_prepared=ptq_calibrate(model_prepared,data_loader=self.dataloader,num_cal=self.max_cal_sample) + model_prepared.cpu() + model_all_qnt=convert_fx(model_prepared) + #calculate weights quantized perturbation + weights_pertu_lst=cal_weights_pertubation(model_fp32=self.model,model_qnt=model_all_qnt) + #merge weights quantized perturbation + #generally, fused ops=quantized weights+quantized activation + avg_trace_i=0 + omigs=[] + for wct_i in weights_pertu_lst: + omig_pair={"layer_name":" ", "trace":0} + tmp_value=avg_traces_lst[avg_trace_i]['trace']*wct_i['value'] + omig_pair['layer_name']=avg_traces_lst[avg_trace_i]['layer_name'] + omig_pair['trace']=tmp_value + avg_trace_i=avg_trace_i+2 + omigs.append(omig_pair) + act_pertu_lst=cal_act_pertubation(model_fp32=self.model, model_qnt=model_all_qnt,data_loader=self.dataloader,num_cal=self.max_cal_sample) + avg_trace_i=1 + for act_i in act_pertu_lst: + omig_pair={"layer_name":" ", "trace":0} + tmp_value=avg_traces_lst[avg_trace_i]['trace']+act_i['value'] + omig_pair['layer_name']=avg_traces_lst[avg_trace_i]['layer_name'] + omig_pair['trace']=tmp_value + avg_trace_i=avg_trace_i+2 + omigs.append(omig_pair) + + # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): + # omig_pair={"layer_name":" ", "value":0} + # omig_val=avg_trace_i['trace']*omiga_i['value'] + # omig_pair['layer_name']=avg_trace_i['layer_name'] + # omig_pair['value']=omig_val + # omig_list.append(omig_pair) + # 
omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) + omig_list_sorted=sorted(omigs,key=lambda x:x['trace'],reverse=True) + list_sorted=omig_list_sorted tune_init_config_pairs=[] - # - for i in omig_list_sorted: + for i in list_sorted: tune_init_config_pair={"op_name":'',"op_type":'','trace':0} - if i['layer_name']==omig_list_sorted[0]['layer_name']: + if i['layer_name']==list_sorted[0]['layer_name']: tune_init_config_pair['op_name']=i['layer_name'] tune_init_config_pair['op_type']=self.list_dtype[-1] #setup as float op tune_init_config_pair['trace']=float(i['trace']) From 8f9e355cba65346e954cf8fa34d2aad34bdc4893 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:21:17 +0800 Subject: [PATCH 010/128] fixed seed for dummy datasets --- test/strategy/test_hessian_trace_inc.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/test/strategy/test_hessian_trace_inc.py b/test/strategy/test_hessian_trace_inc.py index f05b47ca3aa..5285bc619c7 100644 --- a/test/strategy/test_hessian_trace_inc.py +++ b/test/strategy/test_hessian_trace_inc.py @@ -10,7 +10,21 @@ from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader from neural_compressor.adaptor.pytorch import TemplateAdaptor from neural_compressor.strategy.hawq_metric import Hawq_top - +import random +import numpy as np +def fixed_seed(seed): + """Fixed rand seed to make sure results are same in different times on different devices.Eg CPU/GPU + Args: + seed: an integer number + return: None + """ + np.random.seed(seed) #random + random.seed(seed) + torch.manual_seed(seed) #cpu + torch.cuda.manual_seed_all(seed) #parallel cpu + torch.backends.cudnn.deterministic = True #make sure results are same on cpu/gpu + torch.backends.cudnn.benchmark = True #accelerator +fixed_seed(100) def build_hessian_trace(): hessian_trace_config_yaml=''' loss: From 11c7592e3090dc724483eb62fdc824301c7d3340 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:24:18 +0800 Subject: [PATCH 011/128] for independence hawq tuning strategic --- neural_compressor/strategy/hawq.py | 202 +++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 neural_compressor/strategy/hawq.py diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py new file mode 100644 index 00000000000..17231ceec9d --- /dev/null +++ b/neural_compressor/strategy/hawq.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
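+"""HAWQ tuning strategy.
+
+A variant of the basic strategy whose fallback stage orders ops by the
+Hessian-trace sensitivity returned from adaptor.calculate_op_sensitivity,
+falling back the most sensitive ops first.
+"""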
+ +import copy +import numpy as np +from collections import OrderedDict +from .strategy import strategy_registry, TuneStrategy +from ..utils import logger + +from .st_utils.tuning_sampler import OpTypeWiseTuningSampler, FallbackTuningSampler, ModelWiseTuningSampler +from .st_utils.tuning_structs import OpTuningConfig +from .st_utils.tuning_space import TUNING_ITEMS_LST + +@strategy_registry +class HawqTuneStrategy(TuneStrategy): + """The basic tuning strategy which tunes the low precision model with below order. + + 1. modelwise tuning for all quantizable ops. + 2. fallback tuning from bottom to top to decide the priority of which op has biggest impact + on accuracy. + 3. incremental fallback tuning by fallbacking multiple ops with the order got from #2. + + Args: + model (object): The FP32 model specified for low precision tuning. + conf (Class): The Conf class instance initialized from user yaml + config file. + q_dataloader (generator): Data loader for calibration, mandatory for + post-training quantization. + It is iterable and should yield a tuple (input, + label) for calibration dataset containing label, + or yield (input, _) for label-free calibration + dataset. The input could be a object, list, tuple or + dict, depending on user implementation, as well as + it can be taken as model input. + q_func (function, optional): Reserved for future use. + eval_dataloader (generator, optional): Data loader for evaluation. It is iterable + and should yield a tuple of (input, label). + The input could be a object, list, tuple or dict, + depending on user implementation, as well as it can + be taken as model input. The label should be able + to take as input of supported metrics. If this + parameter is not None, user needs to specify + pre-defined evaluation metrics through configuration + file and should set "eval_func" parameter as None. + Tuner will combine model, eval_dataloader and + pre-defined metrics to run evaluation process. + eval_func (function, optional): The evaluation function provided by user. + This function takes model as parameter, and + evaluation dataset and metrics should be + encapsulated in this function implementation and + outputs a higher-is-better accuracy scalar value. + + The pseudo code should be something like: + + def eval_func(model): + input, label = dataloader() + output = model(input) + accuracy = metric(output, label) + return accuracy + dicts (dict, optional): The dict containing resume information. + Defaults to None. + + """ + + def __init__(self, model, conf, q_dataloader, q_func=None, + eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None): + super( + HawqTuneStrategy, + self).__init__( + model, + conf, + q_dataloader, + q_func, + eval_dataloader, + eval_func, + dicts, + q_hooks) + + def next_tune_cfg(self): + """The generator of yielding next tuning config to traverse by concrete strategies + according to last tuning result. + + Yields: + tune_config (dict): It's a dict containing the tuning configuration to run. 
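+
+        The configs are yielded in three stages: op-type-wise tuning, an
+        optional static-to-dynamic fallback for ops supporting both, and
+        finally a per-op fallback to bf16/fp32 ordered by Hessian-trace
+        sensitivity, followed by an accumulated fallback ordered by the
+        measured accuracy impact of each op.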
+ """ + from copy import deepcopy + tuning_space = self.tuning_space + calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options + for calib_sampling_size in calib_sampling_size_lst: + # Initialize the tuning config for each op according to the quantization approach + op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() + # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) + early_stop_tuning = False + stage1_cnt = 0 + quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] + quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] + stage1_max = 1e9 # TODO set a more appropriate value + op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], + op_item_dtype_dict, initial_op_tuning_cfg) + for op_tuning_cfg in op_wise_tuning_sampler: + stage1_cnt += 1 + if early_stop_tuning and stage1_cnt > stage1_max: + logger.info("Early stopping the stage 1.") + break + op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield op_tuning_cfg + # Fallback the ops supported both static and dynamic from static to dynamic + # Tuning items: None + if self.cfg.quantization.approach == 'post_training_auto_quant': + static_dynamic_items = [item for item in tuning_space.query_items_by_quant_mode('static') if + item in tuning_space.query_items_by_quant_mode('dynamic')] + if static_dynamic_items: + logger.info("Fallback all ops that support both dynamic and static to dynamic.") + else: + logger.info("Non ops that support both dynamic") + + new_op_tuning_cfg = deepcopy(self.cur_best_tuning_cfg) + for item in static_dynamic_items: + new_op_tuning_cfg[item.name] = self.initial_dynamic_cfg_based_on_static_cfg( + new_op_tuning_cfg[item.name]) + new_op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield new_op_tuning_cfg + best_op_tuning_cfg_stage1 = deepcopy(self.cur_best_tuning_cfg) + + # Fallback + for target_dtype in ['bf16', 'fp32']: + target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) + fallback_items_lst = [item for item in quant_ops if item in target_type_lst] + if fallback_items_lst: + logger.info(f"Start to fallback op to {target_dtype} one by one.") + self._fallback_started() + #fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, + self.calib_dataloader, + method_args = {'name': 'hessian_trace'}) + fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) + + op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) + initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=False) + op_fallback_acc_impact = OrderedDict() + for op_index, op_tuning_cfg in enumerate(fallback_sampler): + op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield op_tuning_cfg + acc, _ = self.last_tune_result + op_fallback_acc_impact[fallback_items_name_lst[op_index]] = acc + + + # do accumulated fallback according to the order in the previous stage + if len(op_fallback_acc_impact) > 0: + ordered_ops = sorted(op_fallback_acc_impact.keys(), + key=lambda key: op_fallback_acc_impact[key], + reverse=self.higher_is_better) + 
op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) + logger.info(f"Start to accumulate fallback to {target_dtype}.") + initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=True) + for op_tuning_cfg in fallback_sampler: + op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield op_tuning_cfg + + def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg:OpTuningConfig): + op_state = op_static_cfg.get_state() + op_name = op_static_cfg.op_name + op_type = op_static_cfg.op_type + op_quant_mode = 'dynamic' + tuning_space = self.tuning_space + dynamic_state = {} + for att in ['weight', 'activation']: + if att not in op_state: + continue + for item_name, item_val in op_state[att].items(): + att_item = (att, item_name) + if att_item not in TUNING_ITEMS_LST: + continue + if tuning_space.query_item_option((op_name, op_type), op_quant_mode, att_item, item_val): + dynamic_state[att_item] = item_val + else: + quant_mode_item = tuning_space.query_quant_mode_item((op_name, op_type), op_quant_mode) + tuning_item = quant_mode_item.get_option_by_name(att_item) + dynamic_state[att_item] = tuning_item.options[0] if tuning_item else None + return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) + + \ No newline at end of file From bf44c0e0be4e0f1d27d3fa8a1bc6ca1cc3ac6230 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 15 Nov 2022 14:38:19 +0800 Subject: [PATCH 012/128] add a fallback ut --- neural_compressor/strategy/basic.py | 10 +-- .../strategy/{ => st_utils}/hawq_metric.py | 0 .../strategy/{ => st_utils}/hawq_wenhuach.py | 0 test/strategy/test_basic_fallback.py | 73 +++++++++++++++++++ 4 files changed, 78 insertions(+), 5 deletions(-) rename neural_compressor/strategy/{ => st_utils}/hawq_metric.py (100%) rename neural_compressor/strategy/{ => st_utils}/hawq_wenhuach.py (100%) create mode 100644 test/strategy/test_basic_fallback.py diff --git a/neural_compressor/strategy/basic.py b/neural_compressor/strategy/basic.py index 3cc4e38bde2..184a15996f7 100644 --- a/neural_compressor/strategy/basic.py +++ b/neural_compressor/strategy/basic.py @@ -143,11 +143,11 @@ def next_tune_cfg(self): if fallback_items_lst: logger.info(f"Start to fallback op to {target_dtype} one by one.") self._fallback_started() - #fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up - ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, - self.calib_dataloader, - method_args = {'name': 'hessian_trace'}) - fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) + fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + # ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, + # self.calib_dataloader, + # method_args = {'name': 'hessian_trace'}) + #fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) diff --git a/neural_compressor/strategy/hawq_metric.py b/neural_compressor/strategy/st_utils/hawq_metric.py similarity index 100% rename from neural_compressor/strategy/hawq_metric.py rename to 
neural_compressor/strategy/st_utils/hawq_metric.py diff --git a/neural_compressor/strategy/hawq_wenhuach.py b/neural_compressor/strategy/st_utils/hawq_wenhuach.py similarity index 100% rename from neural_compressor/strategy/hawq_wenhuach.py rename to neural_compressor/strategy/st_utils/hawq_wenhuach.py diff --git a/test/strategy/test_basic_fallback.py b/test/strategy/test_basic_fallback.py new file mode 100644 index 00000000000..352c81850c4 --- /dev/null +++ b/test/strategy/test_basic_fallback.py @@ -0,0 +1,73 @@ +import torch +import unittest +import os +import sys +import copy +import torchvision +import torchvision.transforms as transforms +from torch.utils.data import DataLoader +from neural_compressor.data import DATASETS +from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader +from neural_compressor.adaptor.pytorch import TemplateAdaptor +from neural_compressor.adaptor import FRAMEWORKS +import shutil + + +def build_ptq_yaml(): + fake_yaml = ''' + model: + name: resnet18 + framework: pytorch_fx + tuning: + strategy: + name: basic + accuracy_criterion: + absolute: -1 + exit_policy: + timeout: 0 + ''' + with open('ptq_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_yaml) + +class TestPytorchAdaptor(unittest.TestCase): + framework_specific_info = {"device": "cpu", + "approach": "post_training_static_quant", + "random_seed": 1234, + "q_dataloader": None, + "workspace_path": None} + framework = "pytorch" + adaptor = FRAMEWORKS[framework](framework_specific_info) + model = torchvision.models.resnet18() + + # model = torch.quantization.QuantWrapper(model) + + @classmethod + def setUpClass(self): + self.i = 0 + build_ptq_yaml() + + + @classmethod + def tearDownClass(self): + os.remove('ptq_yaml.yaml') + shutil.rmtree('./saved', ignore_errors=True) + shutil.rmtree('runs', ignore_errors=True) + + def test_basic_fallback(self): + def eval_func(model): + self.i -= 1 + return self.i + + from neural_compressor.experimental import Quantization, common + model = copy.deepcopy(self.model) + quantizer = Quantization('ptq_yaml.yaml') + quantizer.eval_func = eval_func + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.model = model + q_model = quantizer() + self.assertTrue(q_model is None) + +if __name__ == "__main__": + unittest.main() From eff50653a4679beabde6054706fd8e3c8f1360bb Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Wed, 16 Nov 2022 14:03:42 +0800 Subject: [PATCH 013/128] update test file --- test/strategy/test_hawq_wenhuach.py | 41 ++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index a470f679cf8..ad7939d5d84 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -11,7 +11,7 @@ from neural_compressor.adaptor.pytorch import TemplateAdaptor from neural_compressor.adaptor import FRAMEWORKS import shutil -from neural_compressor.strategy.hawq_wenhuach import Hawq_top, fix_seed +from neural_compressor.strategy.st_utils.hawq_wenhuach import Hawq_top, fix_seed fix_seed(1) @@ -19,7 +19,7 @@ def build_ptq_yaml(): fake_yaml = ''' model: name: imagenet - framework: pytorch + framework: pytorch_fx quantization: calibration: evaluation: @@ -28,12 +28,12 @@ def build_ptq_yaml(): topk: 1 tuning: strategy: - name: mse + name: hawq accuracy_criterion: relative: -0.1 random_seed: 9527 exit_policy: - max_trials: 1 + max_trials: 3 workspace: path: saved ''' @@ -50,10 +50,17 @@ class TestPytorchAdaptor(unittest.TestCase): adaptor = FRAMEWORKS[framework](framework_specific_info) model = torchvision.models.resnet18() + # from collections import OrderedDict + # model = torch.nn.Sequential(OrderedDict([ + # ('conv1', torch.nn.Conv2d(3, 2, 1, 1)), + # ('conv2', torch.nn.Conv2d(2, 1, 1, 1)), + # ('flat', torch.nn.Flatten()), + # ])) # model = torch.quantization.QuantWrapper(model) @classmethod def setUpClass(self): + self.i = 0 build_ptq_yaml() @@ -63,22 +70,26 @@ def tearDownClass(self): shutil.rmtree('./saved', ignore_errors=True) shutil.rmtree('runs', ignore_errors=True) + + def test_run_hawq_one_trial(self): + def eval_func(model): + self.i -= 1 + return self.i from neural_compressor.experimental import Quantization, common model = copy.deepcopy(self.model) - for fake_yaml in ['ptq_yaml.yaml']: - if fake_yaml == 'ptq_yaml.yaml': - model.eval() - quantizer = Quantization(fake_yaml) - dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_dataloader = common.DataLoader(dataset) - quantizer.model = model - quantizer() + + quantizer = Quantization('ptq_yaml.yaml') + quantizer.eval_func = eval_func + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.model = model + quantizer() if __name__ == "__main__": - pass - # unittest.main() + + unittest.main() # def build_hessian_trace(): # hessian_trace_config_yaml = ''' From ed6a1fcd5eb2cf94da2f1f3483de4747166bd0be Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 17 Nov 2022 15:04:32 +0800 Subject: [PATCH 014/128] tiny update --- neural_compressor/strategy/hawq.py | 329 +++++++++++++++++++++------- neural_compressor/strategy/mse.py | 3 +- test/strategy/test_hawq_wenhuach.py | 10 +- 3 files changed, 263 insertions(+), 79 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 17231ceec9d..3db5cf0aed5 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -18,6 +18,9 @@ import copy import numpy as np from collections import OrderedDict + +import torch.nn + from .strategy import strategy_registry, TuneStrategy from ..utils import logger @@ -25,6 +28,154 @@ from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST + +class HessianTrace: + def __init__(self, model, conf, adaptor, op_cfgs_list, dataloader): + self.model = model + self.conf = conf ##config + self.op_cfgs_list = op_cfgs_list ##op to get + self.dataloader = dataloader + self.adaptor = adaptor + self.max_iter = 500 + self.tolerance = 1e-5 + self.eps = 1e-6 + self.index = 0 + + # def apply_init(self): + # trace_per_op = self._cal_trace() + # if not trace_per_op: + # raise RuntimeError('Failed to calculate hessian traces!') + # + # perturbations = self._calc_quantization_noise() + # configuration_metric = self._calc_hawq_metric_per_configuration( + # perturbations, trace_per_op) + # config_index = self.choose_configuration(configuration_metric) + # chosen_config = self.op_cfgs_list[config_index] + # return chosen_config, trace_per_op + + def get_device(self, model: torch.nn.Module): + for n, p in model.named_parameters(): + return p.data.device + + def get_gradient(self, model, data, criterion, op_list, device="cpu", retrain_graph=False): + model.zero_grad() + input = data[0] + target = data[1] + output = model(input) + loss = criterion(output, target) + loss.backward(retain_graph=retrain_graph) + gradients = {} + for n, p in model.named_parameters(): + if n in op_list: + continue + gradients[n] = 0 + if p.grad != None: + gradients[n] = p.grad + return gradients + + def get_avg_trace(self, num_batches=2): + """ + Estimates average hessian trace for each parameter + """ + assert num_batches > 0 + ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] + ##num_all_data = num_data_iter * self.dataloader.batch_size + op_list = [item.name for item in self.op_cfgs_list] + criterion = torch.nn.CrossEntropyLoss() ##TODO setting this in config + device = self.get_device(self.model) + + for step, batch in enumerate(self.dataloader): + gradient_dict = self.get_gradient(self.model, batch,criterion, op_list, device=device, retrain_graph=True) + tmp = 1 + if step == num_batches - 1: + break + + + weight_vhp = [] + w_avg_total_trace = 0. + w_avg_traces_per_iter = [] + mean_avg_traces_per_param = None + act_vhp = [] + a_avg_total_trace = 0. + a_avg_traces_per_iter = [] + mean_avg_traces_per_act = None + + for i in range(max_iter): + weight_vhp_list, w_v, \ + act_vhp_list, a_v, op_act_grad = self.adaptor.get_2order_grad(self.model, + criterion, + self.dataloader, + num_data_iter, + qop_list) + if not weight_vhp: + weight_vhp = [np.random.randn(*p.shape) for p in w_v] + for vhp_curr in weight_vhp_list: + weight_vhp = [a + b * float(self.dataloader.batch_size) + 0. 
\ + for a, b in zip(weight_vhp, vhp_curr)] + weight_vhp = [a / float(num_all_data) for a in weight_vhp] + avg_traces_per_param = [np.sum(a * b) / a.size for (a, b) in zip(weight_vhp, w_v)] + w_avg_traces_per_iter.append(avg_traces_per_param) + mean_avg_traces_per_param = np.mean(w_avg_traces_per_iter, axis=0) + w_mean_avg_total_trace = np.sum(mean_avg_traces_per_param) + + w_diff_avg = abs(w_mean_avg_total_trace - w_avg_total_trace) / \ + (w_avg_total_trace + diff_eps) + w_avg_total_trace = w_mean_avg_total_trace + logger.info( + '{}# weights difference_avg={} avg_trace={}'.format( + i, w_diff_avg, w_avg_total_trace)) + + if not act_vhp: + act_vhp = [np.random.randn(*p.shape) for p in a_v] + for vhp_curr in act_vhp_list: + act_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ + for a, b in zip(act_vhp, vhp_curr)] + act_vhp = [a / float(num_all_data) for a in act_vhp] + avg_traces_per_act = [np.sum(a * b) / a.size for (a, b) in zip(act_vhp, a_v)] + a_avg_traces_per_iter.append(avg_traces_per_act) + mean_avg_traces_per_act = np.mean(a_avg_traces_per_iter, axis=0) + a_mean_avg_total_trace = np.sum(mean_avg_traces_per_act) + + a_diff_avg = abs(a_mean_avg_total_trace - a_avg_total_trace) / \ + (a_avg_total_trace + diff_eps) + a_avg_total_trace = a_mean_avg_total_trace + logger.info( + '{}# activation difference_avg={} avg_trace={}'.format( + i, a_diff_avg, a_avg_total_trace)) + + if w_diff_avg < tolerance and a_diff_avg < tolerance: + return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad + + return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad + + def _cal_trace(self): + """ + Calculate the trace for both weight and activation per layer + """ + pass + # trace_estimator = HessianTraceEstimator(self.model, + # self.conf, + # self.adaptor, + # self.op_cfgs_list, + # self.dataloader) + # w_avg_trace, a_avg_trace, op_act_grad = trace_estimator.get_avg_trace() + # + # # mapping trace to op per op_weight_mapping + # weights_name = self.adaptor.get_all_weight_names(self.model) + # op_weight_mapping = self.get_op_weight_mapping() + # trace_per_op = OrderedDict() + # w_op_trace_info = np.zeros(len(op_weight_mapping)) + # for i, (op_name, w_name) in enumerate(op_weight_mapping.items()): + # index = weights_name.index(w_name) + # w_op_trace_info[i] = w_avg_trace[index] + # act_trace = 0.0 + # if op_name in op_act_grad: + # a_index = op_act_grad.index(op_name) + # act_trace = a_avg_trace[a_index] + # trace_per_op[op_name] = (w_avg_trace[index], act_trace) + # return trace_per_op + + @strategy_registry class HawqTuneStrategy(TuneStrategy): """The basic tuning strategy which tunes the low precision model with below order. 
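The HessianTrace helper added above follows Hutchinson's estimator: draw a Rademacher probe v, obtain the Hessian-vector product Hv by differentiating the gradients a second time with autograd, and average v*Hv over probes until the running per-layer trace estimate stabilizes. A minimal standalone sketch of that computation in plain PyTorch follows; model, loss_fn and data_iter are placeholders, and this is an illustration of the technique rather than the class above:

    import torch

    def hutchinson_layer_traces(model, loss_fn, data_iter, n_probes=8, num_batches=2):
        """Rough per-parameter Hessian-trace estimates via Hutchinson's method."""
        params = [p for p in model.parameters() if p.requires_grad]
        probe_results = []
        for _ in range(n_probes):
            # Rademacher probe: entries are +1/-1 with equal probability
            v = [torch.randint_like(p, high=2).mul_(2.0).sub_(1.0) for p in params]
            hv_sum = [torch.zeros_like(p) for p in params]
            seen = 0
            for step, (x, y) in enumerate(data_iter):
                loss = loss_fn(model(x), y)
                grads = torch.autograd.grad(loss, params, create_graph=True)
                # differentiating g.v once more gives the Hessian-vector product H v
                hv = torch.autograd.grad(grads, params, grad_outputs=v)
                hv_sum = [a + h * x.shape[0] for a, h in zip(hv_sum, hv)]
                seen += x.shape[0]
                if step == num_batches - 1:
                    break
            # E[v^T H v] over Rademacher v equals tr(H); the per-element mean mirrors the
            # normalization chosen in hutchinson_one_step above
            probe_results.append(torch.stack(
                [torch.mean((h / seen) * vi) for h, vi in zip(hv_sum, v)]))
        return torch.stack(probe_results).mean(dim=0)  # one estimate per parameter tensor

The resulting per-tensor traces can then be mapped back to ops (for example through a weight-name to op-name table like the fused-module mapping used in this series) and serve as the fallback ordering.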
@@ -91,6 +242,37 @@ def __init__(self, model, conf, q_dataloader, q_func=None, q_hooks) def next_tune_cfg(self): + from copy import deepcopy + tuning_space = self.tuning_space + calib_size = tuning_space.root_item.get_option_by_name('calib_sampling_size').options[0] ##TODO suppoprt list + + # Initialize the tuning config for each op according to the quantization approach + op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() + + quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] + quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] + + target_dtype = "fp32" ##TODO support bf16 + target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) + fp_op_list = [item for item in quant_ops if item in target_type_lst] + orig_eval = True + if self._fp32_model.training: + orig_eval = False + self._fp32_model.train() + ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, fp_op_list, self.calib_dataloader) + ht.get_avg_trace() + # if orig_eval: + # self._fp32_model.eval() + # ht.get_avg_trace() + # tmp = 1 + # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + # ops_sensitivity = self.adaptor.get_hessian_trace(self._fp32_model, + # self.calib_dataloader, + # self. + # method_args={'name': 'hessian_trace'}) + # tmp = 1 + + def next_tune_cfg_bk(self): """The generator of yielding next tuning config to traverse by concrete strategies according to last tuning result. @@ -100,84 +282,85 @@ def next_tune_cfg(self): from copy import deepcopy tuning_space = self.tuning_space calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options - for calib_sampling_size in calib_sampling_size_lst: - # Initialize the tuning config for each op according to the quantization approach - op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() - # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) - early_stop_tuning = False - stage1_cnt = 0 - quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] - quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - stage1_max = 1e9 # TODO set a more appropriate value - op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], - op_item_dtype_dict, initial_op_tuning_cfg) - for op_tuning_cfg in op_wise_tuning_sampler: - stage1_cnt += 1 - if early_stop_tuning and stage1_cnt > stage1_max: - logger.info("Early stopping the stage 1.") - break + + calib_sampling_size = calib_sampling_size_lst[0] + # Initialize the tuning config for each op according to the quantization approach + op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() + # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) + early_stop_tuning = False + stage1_cnt = 0 + quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] + quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] + stage1_max = 1e9 # TODO set a more appropriate value + op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], + op_item_dtype_dict, initial_op_tuning_cfg) + # for op_tuning_cfg in op_wise_tuning_sampler: + # stage1_cnt += 1 + # if early_stop_tuning and stage1_cnt > stage1_max: + # logger.info("Early stopping the stage 
1.") + # break + # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + # yield op_tuning_cfg + # Fallback the ops supported both static and dynamic from static to dynamic + # Tuning items: None + # if self.cfg.quantization.approach == 'post_training_auto_quant': + # static_dynamic_items = [item for item in tuning_space.query_items_by_quant_mode('static') if + # item in tuning_space.query_items_by_quant_mode('dynamic')] + # if static_dynamic_items: + # logger.info("Fallback all ops that support both dynamic and static to dynamic.") + # else: + # logger.info("Non ops that support both dynamic") + # + # new_op_tuning_cfg = deepcopy(self.cur_best_tuning_cfg) + # for item in static_dynamic_items: + # new_op_tuning_cfg[item.name] = self.initial_dynamic_cfg_based_on_static_cfg( + # new_op_tuning_cfg[item.name]) + # new_op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + # yield new_op_tuning_cfg + best_op_tuning_cfg_stage1 = deepcopy(self.cur_best_tuning_cfg) + + # Fallback + for target_dtype in ['bf16', 'fp32']: + target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) + fallback_items_lst = [item for item in quant_ops if item in target_type_lst] + if fallback_items_lst: + logger.info(f"Start to fallback op to {target_dtype} one by one.") + self._fallback_started() + # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, + self.calib_dataloader, + method_args={'name': 'hessian_trace'}) + + fallback_items_name_lst = sorted(ops_sensitivity, key=lambda items: items[1], reverse=True) + + op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) + initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=False) + op_fallback_acc_impact = OrderedDict() + for op_index, op_tuning_cfg in enumerate(fallback_sampler): op_tuning_cfg['calib_sampling_size'] = calib_sampling_size yield op_tuning_cfg - # Fallback the ops supported both static and dynamic from static to dynamic - # Tuning items: None - if self.cfg.quantization.approach == 'post_training_auto_quant': - static_dynamic_items = [item for item in tuning_space.query_items_by_quant_mode('static') if - item in tuning_space.query_items_by_quant_mode('dynamic')] - if static_dynamic_items: - logger.info("Fallback all ops that support both dynamic and static to dynamic.") - else: - logger.info("Non ops that support both dynamic") - - new_op_tuning_cfg = deepcopy(self.cur_best_tuning_cfg) - for item in static_dynamic_items: - new_op_tuning_cfg[item.name] = self.initial_dynamic_cfg_based_on_static_cfg( - new_op_tuning_cfg[item.name]) - new_op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield new_op_tuning_cfg - best_op_tuning_cfg_stage1 = deepcopy(self.cur_best_tuning_cfg) - - # Fallback - for target_dtype in ['bf16', 'fp32']: - target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) - fallback_items_lst = [item for item in quant_ops if item in target_type_lst] - if fallback_items_lst: - logger.info(f"Start to fallback op to {target_dtype} one by one.") - self._fallback_started() - #fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up - ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, - 
self.calib_dataloader, - method_args = {'name': 'hessian_trace'}) - fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) - - op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) + acc, _ = self.last_tune_result + op_fallback_acc_impact[fallback_items_name_lst[op_index]] = acc + + # do accumulated fallback according to the order in the previous stage + if len(op_fallback_acc_impact) > 0: + ordered_ops = sorted(op_fallback_acc_impact.keys(), + key=lambda key: op_fallback_acc_impact[key], + reverse=self.higher_is_better) + op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) + logger.info(f"Start to accumulate fallback to {target_dtype}.") initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=False) - op_fallback_acc_impact = OrderedDict() - for op_index, op_tuning_cfg in enumerate(fallback_sampler): + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=True) + for op_tuning_cfg in fallback_sampler: op_tuning_cfg['calib_sampling_size'] = calib_sampling_size yield op_tuning_cfg - acc, _ = self.last_tune_result - op_fallback_acc_impact[fallback_items_name_lst[op_index]] = acc - - - # do accumulated fallback according to the order in the previous stage - if len(op_fallback_acc_impact) > 0: - ordered_ops = sorted(op_fallback_acc_impact.keys(), - key=lambda key: op_fallback_acc_impact[key], - reverse=self.higher_is_better) - op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) - logger.info(f"Start to accumulate fallback to {target_dtype}.") - initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=True) - for op_tuning_cfg in fallback_sampler: - op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield op_tuning_cfg - - def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg:OpTuningConfig): + + def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig): op_state = op_static_cfg.get_state() op_name = op_static_cfg.op_name op_type = op_static_cfg.op_type @@ -198,5 +381,3 @@ def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg:OpTuningConfig): tuning_item = quant_mode_item.get_option_by_name(att_item) dynamic_state[att_item] = tuning_item.options[0] if tuning_item else None return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) - - \ No newline at end of file diff --git a/neural_compressor/strategy/mse.py b/neural_compressor/strategy/mse.py index 614984359ba..8dafa35759d 100644 --- a/neural_compressor/strategy/mse.py +++ b/neural_compressor/strategy/mse.py @@ -194,10 +194,11 @@ def initial_op_quant_mode(items_lst, target_quant_mode, op_item_dtype_dict): initial_op_quant_mode(quant_mode_items, quant_mode, op_item_dtype_dict) # step3. 
optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) - early_stop_tuning = False + early_stop_tuning = True stage1_cnt = 0 int8_ops = quant_mode_wise_items['dynamic'] + quant_mode_wise_items['static'] stage1_max = min(5, len(int8_ops)) # TODO set a more appropriate value + stage1_max=-1 op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], op_item_dtype_dict, initial_op_tuning_cfg) for op_tuning_cfg in op_wise_tuning_sampler: diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index ad7939d5d84..a6ee28b9d4a 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -12,7 +12,7 @@ from neural_compressor.adaptor import FRAMEWORKS import shutil from neural_compressor.strategy.st_utils.hawq_wenhuach import Hawq_top, fix_seed - +from torch.quantization.quantize_fx import fuse_fx fix_seed(1) def build_ptq_yaml(): @@ -41,7 +41,7 @@ def build_ptq_yaml(): f.write(fake_yaml) class TestPytorchAdaptor(unittest.TestCase): - framework_specific_info = {"device": "cpu", + framework_specific_info = {"device": "gpu", "approach": "post_training_static_quant", "random_seed": 1234, "q_dataloader": None, @@ -50,6 +50,7 @@ class TestPytorchAdaptor(unittest.TestCase): adaptor = FRAMEWORKS[framework](framework_specific_info) model = torchvision.models.resnet18() + # from collections import OrderedDict # model = torch.nn.Sequential(OrderedDict([ # ('conv1', torch.nn.Conv2d(3, 2, 1, 1)), @@ -78,10 +79,11 @@ def eval_func(model): return self.i from neural_compressor.experimental import Quantization, common model = copy.deepcopy(self.model) - + model.eval() + model = fuse_fx(model) quantizer = Quantization('ptq_yaml.yaml') quantizer.eval_func = eval_func - dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + dataset = quantizer.dataset('dummy', (32, 3, 224, 224), label=True) quantizer.calib_dataloader = common.DataLoader(dataset) quantizer.eval_dataloader = common.DataLoader(dataset) quantizer.model = model From 883c3a4c4293ff9ac692bd2c9e2bb68ce35d50d4 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 17 Nov 2022 20:01:48 +0800 Subject: [PATCH 015/128] weight hessian trace, not finished --- neural_compressor/strategy/hawq.py | 242 +++++++++++++++++++---------- 1 file changed, 161 insertions(+), 81 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 3db5cf0aed5..4d3b9489b8f 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -27,19 +27,22 @@ from .st_utils.tuning_sampler import OpTypeWiseTuningSampler, FallbackTuningSampler, ModelWiseTuningSampler from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST - +from torch.quantization.quantize_fx import fuse_fx +import torchvision class HessianTrace: - def __init__(self, model, conf, adaptor, op_cfgs_list, dataloader): + def __init__(self, model, conf, adaptor, weight_list, dataloader): self.model = model self.conf = conf ##config - self.op_cfgs_list = op_cfgs_list ##op to get + self.weight_list = weight_list ##op to get self.dataloader = dataloader self.adaptor = adaptor self.max_iter = 500 self.tolerance = 1e-5 self.eps = 1e-6 self.index = 0 + self.device = self.get_device(self.model) + self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config # def apply_init(self): # trace_per_op = self._cal_trace() @@ -57,22 +60,51 @@ def get_device(self, model: torch.nn.Module): for n, p in model.named_parameters(): return p.data.device - def get_gradient(self, model, data, criterion, op_list, device="cpu", retrain_graph=False): + def get_gradients(self, model, data, criterion, create_graph=False): model.zero_grad() - input = data[0] - target = data[1] + input = data[0].to(self.device) + target = data[1].to(self.device) output = model(input) loss = criterion(output, target) - loss.backward(retain_graph=retrain_graph) - gradients = {} + loss.backward(create_graph=create_graph) + gradients = [] for n, p in model.named_parameters(): - if n in op_list: - continue - gradients[n] = 0 if p.grad != None: - gradients[n] = p.grad + gradient = p.grad + gradients.append(gradient+0.0) ## add 0 to create a copy + model.zero_grad() return gradients + def get_params(self, model): + parameters = [p for p in model.parameters() if p.requires_grad] + return parameters + + def sample_rademacher(self, params): + samples = [] + for param in params: + r = torch.randint_like(param, high=2, device=self.device) + r.masked_fill_(r == 0, -1) + samples.append(r) + return samples + + def hutchinson_one_step(self, params, num_batches): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, batch in enumerate(self.dataloader): + batch_size = batch[0].shape[0] + cnt += batch_size + gradients = self.get_gradients(self.model, batch, self.criterion, create_graph=True) + H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) + H_v = [pre + cur * float(batch_size) + 0.0 for cur, pre in zip(H_v_one, H_v)] + if step == num_batches - 1: + break + if cnt > 0: + H_v = [item / cnt for item in H_v] + v_t_H_v = [torch.sum(h_v * v_t) / h_v.size().numel() for (h_v, v_t) in zip(H_v, v)] + return v_t_H_v + + def get_avg_trace(self, num_batches=2): """ Estimates average hessian trace for each parameter @@ -80,73 +112,75 @@ def get_avg_trace(self, num_batches=2): assert num_batches > 0 ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = num_data_iter * self.dataloader.batch_size - op_list = [item.name for item in self.op_cfgs_list] - criterion = 
torch.nn.CrossEntropyLoss() ##TODO setting this in config - device = self.get_device(self.model) + op_list = self.weight_list - for step, batch in enumerate(self.dataloader): - gradient_dict = self.get_gradient(self.model, batch,criterion, op_list, device=device, retrain_graph=True) - tmp = 1 - if step == num_batches - 1: - break + ##TODO setting this in config + + + params = [p for p in self.model.parameters() if p.requires_grad] + for i in range(self.max_iter): + trace_estimated = self.hutchinson_one_step(params, num_batches) - weight_vhp = [] - w_avg_total_trace = 0. - w_avg_traces_per_iter = [] - mean_avg_traces_per_param = None - act_vhp = [] - a_avg_total_trace = 0. - a_avg_traces_per_iter = [] - mean_avg_traces_per_act = None - - for i in range(max_iter): - weight_vhp_list, w_v, \ - act_vhp_list, a_v, op_act_grad = self.adaptor.get_2order_grad(self.model, - criterion, - self.dataloader, - num_data_iter, - qop_list) - if not weight_vhp: - weight_vhp = [np.random.randn(*p.shape) for p in w_v] - for vhp_curr in weight_vhp_list: - weight_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ - for a, b in zip(weight_vhp, vhp_curr)] - weight_vhp = [a / float(num_all_data) for a in weight_vhp] - avg_traces_per_param = [np.sum(a * b) / a.size for (a, b) in zip(weight_vhp, w_v)] - w_avg_traces_per_iter.append(avg_traces_per_param) - mean_avg_traces_per_param = np.mean(w_avg_traces_per_iter, axis=0) - w_mean_avg_total_trace = np.sum(mean_avg_traces_per_param) - - w_diff_avg = abs(w_mean_avg_total_trace - w_avg_total_trace) / \ - (w_avg_total_trace + diff_eps) - w_avg_total_trace = w_mean_avg_total_trace - logger.info( - '{}# weights difference_avg={} avg_trace={}'.format( - i, w_diff_avg, w_avg_total_trace)) - - if not act_vhp: - act_vhp = [np.random.randn(*p.shape) for p in a_v] - for vhp_curr in act_vhp_list: - act_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ - for a, b in zip(act_vhp, vhp_curr)] - act_vhp = [a / float(num_all_data) for a in act_vhp] - avg_traces_per_act = [np.sum(a * b) / a.size for (a, b) in zip(act_vhp, a_v)] - a_avg_traces_per_iter.append(avg_traces_per_act) - mean_avg_traces_per_act = np.mean(a_avg_traces_per_iter, axis=0) - a_mean_avg_total_trace = np.sum(mean_avg_traces_per_act) - - a_diff_avg = abs(a_mean_avg_total_trace - a_avg_total_trace) / \ - (a_avg_total_trace + diff_eps) - a_avg_total_trace = a_mean_avg_total_trace - logger.info( - '{}# activation difference_avg={} avg_trace={}'.format( - i, a_diff_avg, a_avg_total_trace)) - - if w_diff_avg < tolerance and a_diff_avg < tolerance: - return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad - - return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad + + tmp = 1 + # + # weight_vhp = [] + # w_avg_total_trace = 0. + # w_avg_traces_per_iter = [] + # mean_avg_traces_per_param = None + # act_vhp = [] + # a_avg_total_trace = 0. + # a_avg_traces_per_iter = [] + # mean_avg_traces_per_act = None + # + # for i in range(self.max_iter): + # weight_vhp_list, w_v, \ + # act_vhp_list, a_v, op_act_grad = self.adaptor.get_2order_grad(self.model, + # criterion, + # self.dataloader, + # num_data_iter, + # qop_list) + # if not weight_vhp: + # weight_vhp = [np.random.randn(*p.shape) for p in w_v] + # for vhp_curr in weight_vhp_list: + # weight_vhp = [a + b * float(self.dataloader.batch_size) + 0. 
\ + # for a, b in zip(weight_vhp, vhp_curr)] + # weight_vhp = [a / float(num_all_data) for a in weight_vhp] + # avg_traces_per_param = [np.sum(a * b) / a.size for (a, b) in zip(weight_vhp, w_v)] + # w_avg_traces_per_iter.append(avg_traces_per_param) + # mean_avg_traces_per_param = np.mean(w_avg_traces_per_iter, axis=0) + # w_mean_avg_total_trace = np.sum(mean_avg_traces_per_param) + # + # w_diff_avg = abs(w_mean_avg_total_trace - w_avg_total_trace) / \ + # (w_avg_total_trace + diff_eps) + # w_avg_total_trace = w_mean_avg_total_trace + # logger.info( + # '{}# weights difference_avg={} avg_trace={}'.format( + # i, w_diff_avg, w_avg_total_trace)) + # + # if not act_vhp: + # act_vhp = [np.random.randn(*p.shape) for p in a_v] + # for vhp_curr in act_vhp_list: + # act_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ + # for a, b in zip(act_vhp, vhp_curr)] + # act_vhp = [a / float(num_all_data) for a in act_vhp] + # avg_traces_per_act = [np.sum(a * b) / a.size for (a, b) in zip(act_vhp, a_v)] + # a_avg_traces_per_iter.append(avg_traces_per_act) + # mean_avg_traces_per_act = np.mean(a_avg_traces_per_iter, axis=0) + # a_mean_avg_total_trace = np.sum(mean_avg_traces_per_act) + # + # a_diff_avg = abs(a_mean_avg_total_trace - a_avg_total_trace) / \ + # (a_avg_total_trace + diff_eps) + # a_avg_total_trace = a_mean_avg_total_trace + # logger.info( + # '{}# activation difference_avg={} avg_trace={}'.format( + # i, a_diff_avg, a_avg_total_trace)) + # + # if w_diff_avg < tolerance and a_diff_avg < tolerance: + # return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad + # + # return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad def _cal_trace(self): """ @@ -241,6 +275,46 @@ def __init__(self, model, conf, q_dataloader, q_func=None, dicts, q_hooks) + def is_fused_module(self, module): + """This is a helper function for `_propagate_qconfig_helper` to detecte + if this module is fused. + Args: + module (object): input module + Returns: + (bool): is fused or not + """ + op_type = str(type(module)) + if 'fused' in op_type: + return True + else: + return False + + def get_fused_mapping(self): + # tmp = self.model + # if isinstance(self._fp32_model, torch.nn.Module): + # fx_model = self._fp32_model + # + # model = copy.deepcopy(self._fp32_model) ##orig model + # model.eval() + # fx_model = fuse_fx(model) + model = self._fp32_model + weights_info = dict(model.named_parameters()) + weight_to_op = {} + + module_dict = dict(model.named_modules()) + for op_name, child in model.named_modules(): + if self.is_fused_module(child): + for name, _ in child.named_children(): + if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right + weight_to_op[op_name + "." + name + ".weight"] = op_name + # module_prefix = op_name + '.' 
+ name + # if module_prefix in module_dict: + # module_dict.pop(module_prefix) # remove sub-modules of fused modules + else: + if op_name + ".weight" in weights_info: + weight_to_op[op_name + ".weight"] = op_name + return weight_to_op + def next_tune_cfg(self): from copy import deepcopy tuning_space = self.tuning_space @@ -254,16 +328,21 @@ def next_tune_cfg(self): target_dtype = "fp32" ##TODO support bf16 target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) - fp_op_list = [item for item in quant_ops if item in target_type_lst] + fp_op_list = [item.name for item in quant_ops if item in target_type_lst] + # for n, p in self._fp32_model.named_modules(): + # print(n) + # for n, p in self._fp32_model.named_parameters(): + # print(n) + weight_to_op = self.get_fused_mapping() orig_eval = True if self._fp32_model.training: orig_eval = False self._fp32_model.train() - ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, fp_op_list, self.calib_dataloader) + ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, weight_to_op.keys(), self.calib_dataloader) ht.get_avg_trace() - # if orig_eval: - # self._fp32_model.eval() - # ht.get_avg_trace() + if orig_eval: + self._fp32_model.eval() + # tmp = 1 # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up # ops_sensitivity = self.adaptor.get_hessian_trace(self._fp32_model, @@ -338,6 +417,7 @@ def next_tune_cfg_bk(self): fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], initial_op_tuning_cfg=initial_op_tuning_cfg, op_dtypes=op_dtypes, accumulate=False) + op_fallback_acc_impact = OrderedDict() for op_index, op_tuning_cfg in enumerate(fallback_sampler): op_tuning_cfg['calib_sampling_size'] = calib_sampling_size From a50cc143a84d1b35ab0a0baf16d05c819e7b6164 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Fri, 18 Nov 2022 11:16:25 +0800 Subject: [PATCH 016/128] bascially finished weight trace --- neural_compressor/strategy/hawq.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 4d3b9489b8f..39fd93fd3ff 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -96,19 +96,20 @@ def hutchinson_one_step(self, params, num_batches): cnt += batch_size gradients = self.get_gradients(self.model, batch, self.criterion, create_graph=True) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) - H_v = [pre + cur * float(batch_size) + 0.0 for cur, pre in zip(H_v_one, H_v)] + H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if step == num_batches - 1: break if cnt > 0: H_v = [item / cnt for item in H_v] - v_t_H_v = [torch.sum(h_v * v_t) / h_v.size().numel() for (h_v, v_t) in zip(H_v, v)] + v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)])##maybe sum is better return v_t_H_v - def get_avg_trace(self, num_batches=2): + + def get_avg_traces(self, num_batches=2): + """ + Estimates average hessian trace for each parameter """ - Estimates average hessian trace for each parameter - """ assert num_batches > 0 ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = num_data_iter * self.dataloader.batch_size @@ -119,8 +120,21 @@ def get_avg_trace(self, num_batches=2): params = [p for p in self.model.parameters() if p.requires_grad] + layer_traces_per_iter = [] + prev_avg_model_trace = 0 for i in range(self.max_iter): - trace_estimated = self.hutchinson_one_step(params, num_batches) + layer_traces = self.hutchinson_one_step(params, num_batches) + layer_traces_per_iter.append(layer_traces) + layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) + model_trace = torch.sum(layer_traces_estimate) + diff_ratio = abs(model_trace-prev_avg_model_trace)/(prev_avg_model_trace+self.eps) + if diff_ratio < self.tolerance and i > 10:##TODO magic number + break + prev_avg_model_trace = model_trace + + layer_traces = layer_traces_estimate + return layer_traces + tmp = 1 @@ -339,7 +353,7 @@ def next_tune_cfg(self): orig_eval = False self._fp32_model.train() ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, weight_to_op.keys(), self.calib_dataloader) - ht.get_avg_trace() + ht.get_avg_traces() if orig_eval: self._fp32_model.eval() From 2528605a655dad9bd9cde5f07fcd53d5e3e5ea50 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Fri, 18 Nov 2022 15:45:52 +0800 Subject: [PATCH 017/128] enable activation gradient hook, activation trace is not finished --- neural_compressor/strategy/hawq.py | 266 +++++++++++------------------ 1 file changed, 101 insertions(+), 165 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 39fd93fd3ff..34d94901167 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -28,42 +28,73 @@ from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST from torch.quantization.quantize_fx import fuse_fx -import torchvision +import torchvision + class HessianTrace: - def __init__(self, model, conf, adaptor, weight_list, dataloader): - self.model = model - self.conf = conf ##config - self.weight_list = weight_list ##op to get + """ + please refer to + Yao, Zhewei, et al. "Pyhessian: Neural networks through the lens of the hessian." 2020 IEEE international conference on big data (Big data). IEEE, 2020. + Dong, Zhen, et al. "Hawq-v2: Hessian aware trace-weighted quantization of neural networks." Advances in neural information processing systems 33 (2020): 18518-18529. + https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/hessian_trace.py + """ + + def __init__(self, model, dataloader, criterion=None): + self.model = model ##TODO need to check fused or not self.dataloader = dataloader - self.adaptor = adaptor self.max_iter = 500 self.tolerance = 1e-5 self.eps = 1e-6 self.index = 0 self.device = self.get_device(self.model) - self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config - - # def apply_init(self): - # trace_per_op = self._cal_trace() - # if not trace_per_op: - # raise RuntimeError('Failed to calculate hessian traces!') - # - # perturbations = self._calc_quantization_noise() - # configuration_metric = self._calc_hawq_metric_per_configuration( - # perturbations, trace_per_op) - # config_index = self.choose_configuration(configuration_metric) - # chosen_config = self.op_cfgs_list[config_index] - # return chosen_config, trace_per_op + self.criterion = criterion + if self.criterion == None: + self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config + self.criterion = self.criterion.to(self.device) + self.weight_to_op, self.op_list = self.get_fused_mapping() + + def is_fused_module(self, module): + """This is a helper function for `_propagate_qconfig_helper` to detecte + if this module is fused. + Args: + module (object): input module + Returns: + (bool): is fused or not + """ + op_type = str(type(module)) + if 'fused' in op_type: + return True + else: + return False + + def get_fused_mapping(self): + model = self.model + weights_info = dict(model.named_parameters()) + weight_to_op = {} + for op_name, child in model.named_modules(): + if self.is_fused_module(child): + for name, _ in child.named_children(): + if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right + weight_to_op[op_name + "." 
+ name + ".weight"] = op_name + break + else: + if op_name + ".weight" in weights_info: + weight_to_op[op_name + ".weight"] = op_name + op_list = [] + for key in weight_to_op.keys(): + op_list.append(weight_to_op[key]) + return weight_to_op, op_list def get_device(self, model: torch.nn.Module): for n, p in model.named_parameters(): return p.data.device - def get_gradients(self, model, data, criterion, create_graph=False): + def get_gradients(self, model, data, criterion, create_graph=False, enable_act=False): model.zero_grad() input = data[0].to(self.device) target = data[1].to(self.device) + if enable_act: + input.requires_grad = True output = model(input) loss = criterion(output, target) loss.backward(create_graph=create_graph) @@ -71,7 +102,7 @@ def get_gradients(self, model, data, criterion, create_graph=False): for n, p in model.named_parameters(): if p.grad != None: gradient = p.grad - gradients.append(gradient+0.0) ## add 0 to create a copy + gradients.append(gradient + 0.0) ## add 0 to create a copy model.zero_grad() return gradients @@ -87,143 +118,88 @@ def sample_rademacher(self, params): samples.append(r) return samples - def hutchinson_one_step(self, params, num_batches): + def hutchinson_one_step(self, params, enable_act, num_batches): v = self.sample_rademacher(params) H_v = [0] * len(v) cnt = 0 - for step, batch in enumerate(self.dataloader): - batch_size = batch[0].shape[0] + for step, data in enumerate(self.dataloader): + batch_size = data[0].shape[0] cnt += batch_size - gradients = self.get_gradients(self.model, batch, self.criterion, create_graph=True) + gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True,enable_act=enable_act) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) - H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] + H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if step == num_batches - 1: break if cnt > 0: H_v = [item / cnt for item in H_v] - v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)])##maybe sum is better + v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v + def backward_hook(self, name): + def grad_hook(model, grad_input, grad_output): + self.layer_acts_grads[name] = [grad_input, grad_output] + return grad_hook + + def forward_hook(self, name): + def enable_input_grad_hook(model, inputs, outputs): + try: + input = inputs[0]##TODO check whether this is right + except: + input = inputs - def get_avg_traces(self, num_batches=2): + if input.is_leaf == False: + if input.requires_grad is False: + input.requires_grad = True + self.layer_acts[name] = input + + return enable_input_grad_hook + + def register_hook(self): + for name, module in self.model.named_modules(): + if name in self.op_list: + forward_handle = module.register_forward_hook(self.forward_hook(name)) + backward_handle = module.register_backward_hook(self.backward_hook(name)) + self.hook_handlers.append(forward_handle) + self.hook_handlers.append(backward_handle) + + def unregister_hook(self): + for handel in self.hook_handlers: + handel.remove() + + def get_avg_traces(self, enable_act=True, num_batches=2): """ Estimates average hessian trace for each parameter """ assert num_batches > 0 + if enable_act: + self.hook_handlers = [] + self.layer_acts = {} + self.layer_acts_grads = {} + self.register_hook() ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = 
num_data_iter * self.dataloader.batch_size - op_list = self.weight_list - + ##op_list = self.op_list ##TODO setting this in config - - params = [p for p in self.model.parameters() if p.requires_grad] layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.hutchinson_one_step(params, num_batches) + layer_traces = self.hutchinson_one_step(params, enable_act, num_batches ) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = torch.sum(layer_traces_estimate) - diff_ratio = abs(model_trace-prev_avg_model_trace)/(prev_avg_model_trace+self.eps) - if diff_ratio < self.tolerance and i > 10:##TODO magic number + diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) + if diff_ratio < self.tolerance and i > 10: ##TODO magic number break prev_avg_model_trace = model_trace layer_traces = layer_traces_estimate + self.unregister_hook() return layer_traces - - tmp = 1 - # - # weight_vhp = [] - # w_avg_total_trace = 0. - # w_avg_traces_per_iter = [] - # mean_avg_traces_per_param = None - # act_vhp = [] - # a_avg_total_trace = 0. - # a_avg_traces_per_iter = [] - # mean_avg_traces_per_act = None - # - # for i in range(self.max_iter): - # weight_vhp_list, w_v, \ - # act_vhp_list, a_v, op_act_grad = self.adaptor.get_2order_grad(self.model, - # criterion, - # self.dataloader, - # num_data_iter, - # qop_list) - # if not weight_vhp: - # weight_vhp = [np.random.randn(*p.shape) for p in w_v] - # for vhp_curr in weight_vhp_list: - # weight_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ - # for a, b in zip(weight_vhp, vhp_curr)] - # weight_vhp = [a / float(num_all_data) for a in weight_vhp] - # avg_traces_per_param = [np.sum(a * b) / a.size for (a, b) in zip(weight_vhp, w_v)] - # w_avg_traces_per_iter.append(avg_traces_per_param) - # mean_avg_traces_per_param = np.mean(w_avg_traces_per_iter, axis=0) - # w_mean_avg_total_trace = np.sum(mean_avg_traces_per_param) - # - # w_diff_avg = abs(w_mean_avg_total_trace - w_avg_total_trace) / \ - # (w_avg_total_trace + diff_eps) - # w_avg_total_trace = w_mean_avg_total_trace - # logger.info( - # '{}# weights difference_avg={} avg_trace={}'.format( - # i, w_diff_avg, w_avg_total_trace)) - # - # if not act_vhp: - # act_vhp = [np.random.randn(*p.shape) for p in a_v] - # for vhp_curr in act_vhp_list: - # act_vhp = [a + b * float(self.dataloader.batch_size) + 0. 
\ - # for a, b in zip(act_vhp, vhp_curr)] - # act_vhp = [a / float(num_all_data) for a in act_vhp] - # avg_traces_per_act = [np.sum(a * b) / a.size for (a, b) in zip(act_vhp, a_v)] - # a_avg_traces_per_iter.append(avg_traces_per_act) - # mean_avg_traces_per_act = np.mean(a_avg_traces_per_iter, axis=0) - # a_mean_avg_total_trace = np.sum(mean_avg_traces_per_act) - # - # a_diff_avg = abs(a_mean_avg_total_trace - a_avg_total_trace) / \ - # (a_avg_total_trace + diff_eps) - # a_avg_total_trace = a_mean_avg_total_trace - # logger.info( - # '{}# activation difference_avg={} avg_trace={}'.format( - # i, a_diff_avg, a_avg_total_trace)) - # - # if w_diff_avg < tolerance and a_diff_avg < tolerance: - # return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad - # - # return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad - - def _cal_trace(self): - """ - Calculate the trace for both weight and activation per layer - """ - pass - # trace_estimator = HessianTraceEstimator(self.model, - # self.conf, - # self.adaptor, - # self.op_cfgs_list, - # self.dataloader) - # w_avg_trace, a_avg_trace, op_act_grad = trace_estimator.get_avg_trace() - # - # # mapping trace to op per op_weight_mapping - # weights_name = self.adaptor.get_all_weight_names(self.model) - # op_weight_mapping = self.get_op_weight_mapping() - # trace_per_op = OrderedDict() - # w_op_trace_info = np.zeros(len(op_weight_mapping)) - # for i, (op_name, w_name) in enumerate(op_weight_mapping.items()): - # index = weights_name.index(w_name) - # w_op_trace_info[i] = w_avg_trace[index] - # act_trace = 0.0 - # if op_name in op_act_grad: - # a_index = op_act_grad.index(op_name) - # act_trace = a_avg_trace[a_index] - # trace_per_op[op_name] = (w_avg_trace[index], act_trace) - # return trace_per_op - - @strategy_registry class HawqTuneStrategy(TuneStrategy): """The basic tuning strategy which tunes the low precision model with below order. @@ -289,46 +265,6 @@ def __init__(self, model, conf, q_dataloader, q_func=None, dicts, q_hooks) - def is_fused_module(self, module): - """This is a helper function for `_propagate_qconfig_helper` to detecte - if this module is fused. - Args: - module (object): input module - Returns: - (bool): is fused or not - """ - op_type = str(type(module)) - if 'fused' in op_type: - return True - else: - return False - - def get_fused_mapping(self): - # tmp = self.model - # if isinstance(self._fp32_model, torch.nn.Module): - # fx_model = self._fp32_model - # - # model = copy.deepcopy(self._fp32_model) ##orig model - # model.eval() - # fx_model = fuse_fx(model) - model = self._fp32_model - weights_info = dict(model.named_parameters()) - weight_to_op = {} - - module_dict = dict(model.named_modules()) - for op_name, child in model.named_modules(): - if self.is_fused_module(child): - for name, _ in child.named_children(): - if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right - weight_to_op[op_name + "." + name + ".weight"] = op_name - # module_prefix = op_name + '.' 
+ name - # if module_prefix in module_dict: - # module_dict.pop(module_prefix) # remove sub-modules of fused modules - else: - if op_name + ".weight" in weights_info: - weight_to_op[op_name + ".weight"] = op_name - return weight_to_op - def next_tune_cfg(self): from copy import deepcopy tuning_space = self.tuning_space @@ -347,12 +283,12 @@ def next_tune_cfg(self): # print(n) # for n, p in self._fp32_model.named_parameters(): # print(n) - weight_to_op = self.get_fused_mapping() + orig_eval = True if self._fp32_model.training: orig_eval = False self._fp32_model.train() - ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, weight_to_op.keys(), self.calib_dataloader) + ht = HessianTrace(self._fp32_model, self.calib_dataloader) ht.get_avg_traces() if orig_eval: self._fp32_model.eval() From abbc4ae53e66c32a8f2ff20ee0316b562c2dda92 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Fri, 18 Nov 2022 15:47:25 +0800 Subject: [PATCH 018/128] reformat code --- neural_compressor/strategy/hawq.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 34d94901167..8ec728337b9 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -102,7 +102,7 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F for n, p in model.named_parameters(): if p.grad != None: gradient = p.grad - gradients.append(gradient + 0.0) ## add 0 to create a copy + gradients.append(gradient + 0.0) ## add 0 to create a copy model.zero_grad() return gradients @@ -125,7 +125,7 @@ def hutchinson_one_step(self, params, enable_act, num_batches): for step, data in enumerate(self.dataloader): batch_size = data[0].shape[0] cnt += batch_size - gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True,enable_act=enable_act) + gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True, enable_act=enable_act) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if step == num_batches - 1: @@ -135,16 +135,16 @@ def hutchinson_one_step(self, params, enable_act, num_batches): v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v - def backward_hook(self, name): def grad_hook(model, grad_input, grad_output): self.layer_acts_grads[name] = [grad_input, grad_output] + return grad_hook def forward_hook(self, name): def enable_input_grad_hook(model, inputs, outputs): try: - input = inputs[0]##TODO check whether this is right + input = inputs[0] ##TODO check whether this is right except: input = inputs @@ -167,7 +167,7 @@ def unregister_hook(self): for handel in self.hook_handlers: handel.remove() - def get_avg_traces(self, enable_act=True, num_batches=2): + def get_avg_traces(self, enable_act=False, num_batches=2): """ Estimates average hessian trace for each parameter """ @@ -186,7 +186,7 @@ def get_avg_traces(self, enable_act=True, num_batches=2): layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.hutchinson_one_step(params, enable_act, num_batches ) + layer_traces = self.hutchinson_one_step(params, enable_act, num_batches) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = torch.sum(layer_traces_estimate) From 58128ec3db00e65f3a8ef0cb05b364038ecd2623 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Fri, 18 Nov 2022 15:54:48 +0800 Subject: [PATCH 019/128] fix a bug --- neural_compressor/strategy/hawq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 8ec728337b9..7d2331af345 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -196,7 +196,8 @@ def get_avg_traces(self, enable_act=False, num_batches=2): prev_avg_model_trace = model_trace layer_traces = layer_traces_estimate - self.unregister_hook() + if enable_act: + self.unregister_hook() return layer_traces From 26538ee995c7b22275342955fcc3ea9cb4c88f18 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Mon, 21 Nov 2022 11:06:43 +0800 Subject: [PATCH 020/128] when reset the required grad, something goes wrong --- neural_compressor/strategy/hawq.py | 67 ++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 7d2331af345..228bb249e2a 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -92,6 +92,7 @@ def get_device(self, model: torch.nn.Module): def get_gradients(self, model, data, criterion, create_graph=False, enable_act=False): model.zero_grad() input = data[0].to(self.device) + ##self._input_shape = input.shape ## for resetting input activation target = data[1].to(self.device) if enable_act: input.requires_grad = True @@ -102,7 +103,7 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F for n, p in model.named_parameters(): if p.grad != None: gradient = p.grad - gradients.append(gradient + 0.0) ## add 0 to create a copy + gradients.append(gradient + 0.0) ## add 0 to create a copy model.zero_grad() return gradients @@ -118,7 +119,7 @@ def sample_rademacher(self, params): samples.append(r) return samples - def hutchinson_one_step(self, params, enable_act, num_batches): + def get_hv_one_sample(self, params, enable_act, num_batches): v = self.sample_rademacher(params) H_v = [0] * len(v) cnt = 0 @@ -135,19 +136,17 @@ def hutchinson_one_step(self, params, enable_act, num_batches): v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v - def backward_hook(self, name): - def grad_hook(model, grad_input, grad_output): + def _get_input_grad_hook(self, name): + def input_grad_hook(model, grad_input, grad_output): self.layer_acts_grads[name] = [grad_input, grad_output] + return input_grad_hook - return grad_hook - - def forward_hook(self, name): + def _get_enable_input_grad_hook(self, name): def enable_input_grad_hook(model, inputs, outputs): try: input = inputs[0] ##TODO check whether this is right except: input = inputs - if input.is_leaf == False: if input.requires_grad is False: input.requires_grad = True @@ -155,28 +154,54 @@ def enable_input_grad_hook(model, inputs, outputs): return enable_input_grad_hook - def register_hook(self): + # def _get_disable_input_grad_hook(self, name): + # def disable_input_grad_hook(model, inputs, outputs): + # try: + # input = inputs[0] ##TODO check whether this is right + # except: + # input = inputs + # if input.is_leaf == False:## you can only change requires_grad flags of leaf variables + # if input.requires_grad is True: + # input.requires_grad = False + # + # + # return disable_input_grad_hook + + + def _unregister_hook(self): + for handel in self.hook_handles: + handel.remove() + + def register_input_grad_hooks(self): for name, module in self.model.named_modules(): if name in self.op_list: - forward_handle = module.register_forward_hook(self.forward_hook(name)) - backward_handle = module.register_backward_hook(self.backward_hook(name)) - self.hook_handlers.append(forward_handle) - self.hook_handlers.append(backward_handle) + hook_handle = module.register_forward_hook(self._get_enable_input_grad_hook(name)) + self.hook_handles.append(hook_handle) + hook_handle = module.register_forward_hook(self._get_input_grad_hook(name)) + self.hook_handles.append(hook_handle) + + + def reset_input_gradient_and_hooks(self): + # tmp_input = torch.zeros(self._input_shape, device=self.device) + # for name, module in 
self.model.named_modules(): + # if name in self.op_list: + # hook_handle = module.register_forward_hook(self._get_disable_input_grad_hook(name)) + # self.hook_handles.append(hook_handle) + # self.model(tmp_input) + self._unregister_hook() + - def unregister_hook(self): - for handel in self.hook_handlers: - handel.remove() - def get_avg_traces(self, enable_act=False, num_batches=2): + def get_avg_traces(self, enable_act=True, num_batches=2): """ Estimates average hessian trace for each parameter """ assert num_batches > 0 if enable_act: - self.hook_handlers = [] + self.hook_handles = [] self.layer_acts = {} self.layer_acts_grads = {} - self.register_hook() + self.register_input_grad_hooks() ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = num_data_iter * self.dataloader.batch_size ##op_list = self.op_list @@ -186,7 +211,7 @@ def get_avg_traces(self, enable_act=False, num_batches=2): layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.hutchinson_one_step(params, enable_act, num_batches) + layer_traces = self.get_hv_one_sample(params, enable_act, num_batches) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = torch.sum(layer_traces_estimate) @@ -197,7 +222,7 @@ def get_avg_traces(self, enable_act=False, num_batches=2): layer_traces = layer_traces_estimate if enable_act: - self.unregister_hook() + self.reset_input_gradient_and_hooks() return layer_traces From 8710a690fd1dde7ebb769623a35e4ada5911417a Mon Sep 17 00:00:00 2001 From: wenhuach
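The hook refactor above prepares activation-gradient collection: a forward hook forces requires_grad on each tracked op's input and stores the activation, while a second hook is meant to record the gradient flowing through that op during the backward pass, and all handles are removed once statistics are gathered. A condensed standalone sketch of that pattern follows; the toy model, the dictionary names, and the use of register_full_backward_hook (the non-deprecated counterpart of the register_backward_hook call used in the patch) are assumptions for illustration only.

    import torch

    acts, act_grads = {}, {}

    def make_forward_hook(name):
        def forward_hook(module, inputs, output):
            acts[name] = inputs[0]            # record the activation feeding this module
        return forward_hook

    def make_backward_hook(name):
        def backward_hook(module, grad_input, grad_output):
            act_grads[name] = grad_output[0]  # gradient of the loss w.r.t. the module output
        return backward_hook

    model = torch.nn.Sequential(torch.nn.Linear(8, 16), torch.nn.ReLU(), torch.nn.Linear(16, 4))
    handles = []
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            handles.append(module.register_forward_hook(make_forward_hook(name)))
            handles.append(module.register_full_backward_hook(make_backward_hook(name)))

    x, y = torch.randn(4, 8), torch.randint(0, 4, (4,))
    torch.nn.functional.cross_entropy(model(x), y).backward()
    for handle in handles:                    # detach hooks once the statistics are collected
        handle.remove()
    print({name: g.shape for name, g in act_grads.items()})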
Date: Mon, 21 Nov 2022 15:13:01 +0800 Subject: [PATCH 021/128] add trick imagenet dataset fix one issue --- .../experimental/quantization.py | 32 +++++++++++++++++++ neural_compressor/strategy/hawq.py | 3 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index 3a4f822c905..4fa143fc5c8 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -143,6 +143,38 @@ def pre_process(self): with open(self.resume_file, 'rb') as f: _resume = pickle.load(f).__dict__ + + import torchvision.datasets as datasets + import torchvision.transforms as transforms + data_path = "/mnt/data2/dataset/dataset/imagenet/img_raw" + traindir = os.path.join(data_path, 'train') + valdir = os.path.join(data_path, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + from torch.utils.data import DataLoader + + self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) + self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) + self.strategy = STRATEGIES[strategy]( self._model, self.conf, diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 228bb249e2a..2beef8668b4 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -98,6 +98,7 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F input.requires_grad = True output = model(input) loss = criterion(output, target) + # torch.autograd.backward(loss, create_graph=create_graph) loss.backward(create_graph=create_graph) gradients = [] for n, p in model.named_parameters(): @@ -177,7 +178,7 @@ def register_input_grad_hooks(self): if name in self.op_list: hook_handle = module.register_forward_hook(self._get_enable_input_grad_hook(name)) self.hook_handles.append(hook_handle) - hook_handle = module.register_forward_hook(self._get_input_grad_hook(name)) + hook_handle = module.register_backward_hook(self._get_input_grad_hook(name)) self.hook_handles.append(hook_handle) From cda302943bc532bdb3b80e6a3f9aeabeeab69acc Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Mon, 21 Nov 2022 15:13:22 +0800 Subject: [PATCH 022/128] fix fuse issue --- neural_compressor/adaptor/pytorch.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 097b9359f93..8e89de9fb8b 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -3104,8 +3104,15 @@ def _get_quantizable_ops_recursively(self, model, prefix, quantizable_ops): Returns: None """ - + module_dict = dict(model.named_modules()) for op_name, child in model.named_modules(): + if self.is_fused_module(child): + for name, _ in child.named_children(): + module_prefix = op_name + '.' + name + if module_prefix in module_dict: + module_dict.pop(module_prefix) # remove sub-modules of fused modules + + for op_name, child in module_dict.items(): if type(child) in self.white_list \ and type(child) != torch.nn.Sequential \ and type(child) != torch.quantization.stubs.DeQuantStub: From df3c6e059f713ded00dda7ed1d0c2195c49820dd Mon Sep 17 00:00:00 2001 From: wenhuach
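The fix above prunes the children of fused modules from the quantizable-op scan so that a fused Conv+BN+ReLU is counted once rather than three times. A rough standalone illustration is below; the tiny Sequential model and the string-based is_fused check (similar in spirit to the is_fused_module helpers used elsewhere in this series) are assumptions for demonstration only.

    import torch
    from torch.quantization.quantize_fx import fuse_fx

    def is_fused(module):
        # fused wrappers live under torch.nn.intrinsic.modules.fused
        return 'fused' in str(type(module)).lower()

    model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3),
                                torch.nn.BatchNorm2d(8),
                                torch.nn.ReLU()).eval()
    fused = fuse_fx(model)

    module_dict = dict(fused.named_modules())
    for op_name, child in list(module_dict.items()):
        if is_fused(child):
            for name, _ in child.named_children():
                # drop sub-modules of the fused wrapper so only the wrapper
                # itself is treated as a quantizable op
                module_dict.pop(op_name + '.' + name, None)
    print(list(module_dict.keys()))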
Date: Mon, 21 Nov 2022 18:51:51 +0800 Subject: [PATCH 023/128] change to eval model, remove bias --- .../quantization/ptq/cpu/fx/conf.yaml | 12 +++-- .../experimental/quantization.py | 2 +- .../strategy/auto_mixed_precision.py | 1 + neural_compressor/strategy/hawq.py | 46 ++++++++++++++----- .../strategy/st_utils/hawq_wenhuach.py | 2 +- neural_compressor/strategy/strategy.py | 3 +- test/strategy/test_hawq_wenhuach.py | 2 +- 7 files changed, 47 insertions(+), 21 deletions(-) diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml index d1dab0d2f43..064656e872b 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml @@ -21,10 +21,10 @@ quantization: # optional. tuning constrai calibration: sampling_size: 300 # optional. default value is 100. used to set how many samples should be used in calibration. dataloader: - batch_size: 30 + batch_size: 1 dataset: ImageFolder: - root: /path/to/calibration/dataset # NOTE: modify to calibration dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to calibration dataset location if needed transform: Resize: size: 256 @@ -40,10 +40,10 @@ evaluation: # optional. required if use metric: topk: 1 # built-in metrics are topk, map, f1, allow user to register new metric. dataloader: - batch_size: 30 + batch_size: 1 dataset: ImageFolder: - root: /path/to/evaluation/dataset # NOTE: modify to evaluation dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -61,7 +61,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /path/to/evaluation/dataset # NOTE: modify to evaluation dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -73,6 +73,8 @@ evaluation: # optional. required if use std: [0.229, 0.224, 0.225] tuning: + strategy: + name: hawq accuracy_criterion: relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%. 
exit_policy: diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index 4fa143fc5c8..7e8e8cfbbac 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -146,7 +146,7 @@ def pre_process(self): import torchvision.datasets as datasets import torchvision.transforms as transforms - data_path = "/mnt/data2/dataset/dataset/imagenet/img_raw" + data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" traindir = os.path.join(data_path, 'train') valdir = os.path.join(data_path, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], diff --git a/neural_compressor/strategy/auto_mixed_precision.py b/neural_compressor/strategy/auto_mixed_precision.py index 4b59cf2cced..7fbd759a87e 100644 --- a/neural_compressor/strategy/auto_mixed_precision.py +++ b/neural_compressor/strategy/auto_mixed_precision.py @@ -145,6 +145,7 @@ def traverse(self): if self.baseline is None and (self.eval_dataloader or self.eval_func): logger.info("Get FP32 model baseline.") self.baseline = self._evaluate(self.model) + self.baseline=[0.698,[700]] # record the FP32 baseline self._add_tuning_history() diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 2beef8668b4..09f0b1ef175 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -102,15 +102,15 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F loss.backward(create_graph=create_graph) gradients = [] for n, p in model.named_parameters(): - if p.grad != None: + if p.grad != None and n in self.weight_names: gradient = p.grad gradients.append(gradient + 0.0) ## add 0 to create a copy model.zero_grad() return gradients - def get_params(self, model): - parameters = [p for p in model.parameters() if p.requires_grad] - return parameters + # def get_params(self, model): + # parameters = [p for p in model.parameters() if p.requires_grad] + # return parameters def sample_rademacher(self, params): samples = [] @@ -191,9 +191,13 @@ def reset_input_gradient_and_hooks(self): # self.model(tmp_input) self._unregister_hook() + def get_params(self): + weight_names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias + params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias + self.weight_names = weight_names + self.params = params - - def get_avg_traces(self, enable_act=True, num_batches=2): + def get_avg_traces(self, enable_act=False, num_batches=2): """ Estimates average hessian trace for each parameter """ @@ -207,18 +211,22 @@ def get_avg_traces(self, enable_act=True, num_batches=2): ##num_all_data = num_data_iter * self.dataloader.batch_size ##op_list = self.op_list ##TODO setting this in config - params = [p for p in self.model.parameters() if p.requires_grad] + self.get_params() + # names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + # params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.get_hv_one_sample(params, enable_act, num_batches) + layer_traces = self.get_hv_one_sample(self.params, enable_act, num_batches) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = 
torch.sum(layer_traces_estimate) diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) if diff_ratio < self.tolerance and i > 10: ##TODO magic number break + if i==50:##TODO for debug + break prev_avg_model_trace = model_trace layer_traces = layer_traces_estimate @@ -314,11 +322,25 @@ def next_tune_cfg(self): orig_eval = True if self._fp32_model.training: orig_eval = False - self._fp32_model.train() + self._fp32_model.eval() ht = HessianTrace(self._fp32_model, self.calib_dataloader) - ht.get_avg_traces() - if orig_eval: - self._fp32_model.eval() + traces = ht.get_avg_traces() + if orig_eval==False: + self._fp32_model.train() + + ordered_ops = sorted(op_fallback_acc_impact.keys(), + key=lambda key: op_fallback_acc_impact[key], + reverse=self.higher_is_better) + op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) + logger.info(f"Start to accumulate fallback to {target_dtype}.") + initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=True) + for op_tuning_cfg in fallback_sampler: + op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield op_tuning_cfg + # tmp = 1 # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up diff --git a/neural_compressor/strategy/st_utils/hawq_wenhuach.py b/neural_compressor/strategy/st_utils/hawq_wenhuach.py index 6c74401c5fc..c0ced2af3f4 100644 --- a/neural_compressor/strategy/st_utils/hawq_wenhuach.py +++ b/neural_compressor/strategy/st_utils/hawq_wenhuach.py @@ -10,7 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -from ..utils import logger +from ...utils import logger import torch import numpy as np from torch.autograd import Variable diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py index 8c959023bf0..c5db10a4d1b 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -219,7 +219,8 @@ def traverse(self): if self.baseline is None: logger.info("Get FP32 model baseline.") self._fp32_model = self.model - self.baseline = self._evaluate(self.model) + ##self.baseline = self._evaluate(self.model) + self.baseline = [0.698,[700]] # record the FP32 baseline self._add_tuning_history() self.show_baseline_info() diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index a6ee28b9d4a..236d8219e71 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -11,7 +11,7 @@ from neural_compressor.adaptor.pytorch import TemplateAdaptor from neural_compressor.adaptor import FRAMEWORKS import shutil -from neural_compressor.strategy.st_utils.hawq_wenhuach import Hawq_top, fix_seed +from neural_compressor.strategy.st_utils.hawq_wenhuach import fix_seed from torch.quantization.quantize_fx import fuse_fx fix_seed(1) From 084b4def57518b000d5b31f794c8d9eb40e0ef9e Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Mon, 21 Nov 2022 19:16:10 +0800 Subject: [PATCH 024/128] fixed weight to op bug --- neural_compressor/strategy/hawq.py | 41 ++++++++++++++++++----------- test/strategy/test_hawq_wenhuach.py | 2 +- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 09f0b1ef175..015d9e678c1 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -75,11 +75,12 @@ def get_fused_mapping(self): if self.is_fused_module(child): for name, _ in child.named_children(): if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right - weight_to_op[op_name + "." + name + ".weight"] = op_name + weight_to_op[op_name + "." + name + ".weight"] = op_name[7:] break else: - if op_name + ".weight" in weights_info: - weight_to_op[op_name + ".weight"] = op_name + name = op_name + ".weight" + if name in weights_info and name not in weight_to_op.keys(): + weight_to_op[op_name + ".weight"] = op_name[7:] op_list = [] for key in weight_to_op.keys(): op_list.append(weight_to_op[key]) @@ -232,7 +233,15 @@ def get_avg_traces(self, enable_act=False, num_batches=2): layer_traces = layer_traces_estimate if enable_act: self.reset_input_gradient_and_hooks() - return layer_traces + weight_name_to_traces={} + + for weigth_name,trace in zip(self.weight_names, layer_traces): + weight_name_to_traces[weigth_name] = trace + op_name_to_trace={} + for weigth_name in self.weight_names: + op_name = self.weight_to_op[weigth_name] + op_name_to_trace[op_name] = weight_name_to_traces[weigth_name] + return op_name_to_trace @strategy_registry @@ -328,18 +337,18 @@ def next_tune_cfg(self): if orig_eval==False: self._fp32_model.train() - ordered_ops = sorted(op_fallback_acc_impact.keys(), - key=lambda key: op_fallback_acc_impact[key], - reverse=self.higher_is_better) - op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) - logger.info(f"Start to accumulate fallback to {target_dtype}.") - initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=True) - for op_tuning_cfg in fallback_sampler: - op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield op_tuning_cfg + # ordered_ops = sorted(op_fallback_acc_impact.keys(), + # key=lambda key: op_fallback_acc_impact[key], + # reverse=self.higher_is_better) + # op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) + # logger.info(f"Start to accumulate fallback to {target_dtype}.") + # initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + # fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + # initial_op_tuning_cfg=initial_op_tuning_cfg, + # op_dtypes=op_dtypes, accumulate=True) + # for op_tuning_cfg in fallback_sampler: + # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + # yield op_tuning_cfg # tmp = 1 diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index 236d8219e71..a09c83c3452 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -13,7 +13,7 @@ import shutil from neural_compressor.strategy.st_utils.hawq_wenhuach import fix_seed from torch.quantization.quantize_fx import fuse_fx -fix_seed(1) +# fix_seed(1) def build_ptq_yaml(): fake_yaml = ''' From 4f0961d11eecabb44d624bdf14ffc062d246eb6b Mon Sep 17 
00:00:00 2001 From: wenhuach
Date: Mon, 21 Nov 2022 19:44:42 +0800 Subject: [PATCH 025/128] still have issues --- neural_compressor/strategy/hawq.py | 16 +++++++++++++++- .../strategy/st_utils/tuning_sampler.py | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 015d9e678c1..dbbaa98e931 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -333,10 +333,24 @@ def next_tune_cfg(self): orig_eval = False self._fp32_model.eval() ht = HessianTrace(self._fp32_model, self.calib_dataloader) - traces = ht.get_avg_traces() + op_to_traces = ht.get_avg_traces() if orig_eval==False: self._fp32_model.train() + ordered_ops = sorted(op_to_traces.keys(), + key=lambda key: op_to_traces[key], + reverse=self.higher_is_better) + op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(ordered_ops))) + logger.info(f"Start to accumulate fallback to {target_dtype}.") + + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=None, + op_dtypes=op_dtypes, accumulate=True) + for op_tuning_cfg in fallback_sampler: + op_tuning_cfg['calib_sampling_size'] = calib_size + yield op_tuning_cfg + + # ordered_ops = sorted(op_fallback_acc_impact.keys(), # key=lambda key: op_fallback_acc_impact[key], # reverse=self.higher_is_better) diff --git a/neural_compressor/strategy/st_utils/tuning_sampler.py b/neural_compressor/strategy/st_utils/tuning_sampler.py index fea140a9e4d..c583f1c2764 100644 --- a/neural_compressor/strategy/st_utils/tuning_sampler.py +++ b/neural_compressor/strategy/st_utils/tuning_sampler.py @@ -263,7 +263,7 @@ def __init__(self, def __iter__(self): new_tune_cfg = copy.deepcopy(self.initial_op_tuning_cfg) - skip_first = True + skip_first = False for op_name_type, target_dtype in self.op_dtypes.items(): if not self.accumulate: new_tune_cfg = copy.deepcopy(self.initial_op_tuning_cfg) From 16bd68ecc3b73afdf6613c351ae2c149d2a51bcd Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 22 Nov 2022 11:01:22 +0800 Subject: [PATCH 026/128] WA for align the op name --- neural_compressor/strategy/hawq.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index dbbaa98e931..bc042f06b2c 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -316,7 +316,22 @@ def next_tune_cfg(self): # Initialize the tuning config for each op according to the quantization approach op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() - + # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) + early_stop_tuning = True + stage1_cnt = 0 + quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] + quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] + stage1_max = 2 # TODO set a more appropriate value + op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], + op_item_dtype_dict, initial_op_tuning_cfg) + for op_tuning_cfg in op_wise_tuning_sampler: + stage1_cnt += 1 + if early_stop_tuning and stage1_cnt > stage1_max: + logger.info("Early stopping the stage 1.") + break + op_tuning_cfg['calib_sampling_size'] = calib_size + yield op_tuning_cfg + # Fallback the ops supported both static and dynamic from static to dynamic quant_ops = quant_mode_wise_items['static'] if 'static' in 
quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] @@ -340,11 +355,16 @@ def next_tune_cfg(self): ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], reverse=self.higher_is_better) - op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(ordered_ops))) + # WA for add op type + op_info_map = {} + for op_info in list(initial_op_tuning_cfg.keys()): + op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) + tmp_ordered_ops = [op_info_map[op_name] for op_name in ordered_ops] + op_dtypes = OrderedDict(zip(tmp_ordered_ops, [target_dtype] * len(ordered_ops))) logger.info(f"Start to accumulate fallback to {target_dtype}.") fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=None, + initial_op_tuning_cfg=op_tuning_cfg, op_dtypes=op_dtypes, accumulate=True) for op_tuning_cfg in fallback_sampler: op_tuning_cfg['calib_sampling_size'] = calib_size From e0ae1cee6da110be432e6ed26359a980a4c6f531 Mon Sep 17 00:00:00 2001 From: wenhuach
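The workaround above bridges the plain op names returned by the trace estimator and the (op_name, op_type) keys used by the tuning config: ops are sorted by their estimated Hessian trace, each name is mapped back to its (name, type) tuple, and the ordered result is handed to an accumulating FallbackTuningSampler. The snippet below recaps that ordering logic with made-up trace values and op names.

    from collections import OrderedDict

    op_to_traces = {'conv1': 0.31, 'layer1.0.conv1': 2.70, 'fc': 0.05}   # made-up traces
    op_info_map = {'conv1': ('conv1', 'Conv2d'),                         # op_name -> (op_name, op_type)
                   'layer1.0.conv1': ('layer1.0.conv1', 'Conv2d'),
                   'fc': ('fc', 'Linear')}

    ordered_ops = sorted(op_to_traces, key=op_to_traces.get, reverse=True)  # most sensitive first
    op_dtypes = OrderedDict((op_info_map[name], 'fp32') for name in ordered_ops)
    print(op_dtypes)  # fed to FallbackTuningSampler(..., op_dtypes=op_dtypes, accumulate=True)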
Date: Tue, 22 Nov 2022 15:36:21 +0800 Subject: [PATCH 027/128] change entry point to main function fx model before entering into quantization --- .../quantization/ptq/cpu/fx/conf.yaml | 6 +- .../quantization/ptq/cpu/fx/main.py | 2 + .../experimental/quantization.py | 62 +++++++++---------- neural_compressor/strategy/hawq.py | 2 +- .../strategy/st_utils/tuning_sampler.py | 2 +- test/strategy/test_hawq_wenhuach.py | 8 +-- 6 files changed, 42 insertions(+), 40 deletions(-) diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml index 064656e872b..4b50b559e6a 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml @@ -24,7 +24,7 @@ quantization: # optional. tuning constrai batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to calibration dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to calibration dataset location if needed transform: Resize: size: 256 @@ -43,7 +43,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to evaluation dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -61,7 +61,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to evaluation dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py index 8646048ccf4..30008bfa3db 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py @@ -169,6 +169,8 @@ def main(): if args.tune: from neural_compressor.experimental import Quantization, common model.eval() + from torch.quantization.quantize_fx import fuse_fx + model = fuse_fx(model) quantizer = Quantization("./conf.yaml") quantizer.model = common.Model(model) q_model = quantizer.fit() diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index 7e8e8cfbbac..c6e4a8c3646 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -143,37 +143,37 @@ def pre_process(self): with open(self.resume_file, 'rb') as f: _resume = pickle.load(f).__dict__ - - import torchvision.datasets as datasets - import torchvision.transforms as transforms - data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" - traindir = os.path.join(data_path, 'train') - valdir = os.path.join(data_path, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - train_dataset = datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - val_dataset = 
datasets.ImageFolder( - valdir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - from torch.utils.data import DataLoader - - self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) - self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) + # + # import torchvision.datasets as datasets + # import torchvision.transforms as transforms + # data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" + # traindir = os.path.join(data_path, 'train') + # valdir = os.path.join(data_path, 'val') + # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + # std=[0.229, 0.224, 0.225]) + # + # train_dataset = datasets.ImageFolder( + # traindir, + # transforms.Compose([ + # transforms.RandomResizedCrop(224), + # transforms.RandomHorizontalFlip(), + # transforms.ToTensor(), + # normalize, + # ])) + # + # val_dataset = datasets.ImageFolder( + # valdir, + # transforms.Compose([ + # transforms.RandomResizedCrop(224), + # transforms.RandomHorizontalFlip(), + # transforms.ToTensor(), + # normalize, + # ])) + # + # from torch.utils.data import DataLoader + # + # self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) + # self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) self.strategy = STRATEGIES[strategy]( self._model, diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index bc042f06b2c..6db4757aa0c 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -321,7 +321,7 @@ def next_tune_cfg(self): stage1_cnt = 0 quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - stage1_max = 2 # TODO set a more appropriate value + stage1_max = -1 # TODO set a more appropriate value op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], op_item_dtype_dict, initial_op_tuning_cfg) for op_tuning_cfg in op_wise_tuning_sampler: diff --git a/neural_compressor/strategy/st_utils/tuning_sampler.py b/neural_compressor/strategy/st_utils/tuning_sampler.py index c583f1c2764..f311d7c16a4 100644 --- a/neural_compressor/strategy/st_utils/tuning_sampler.py +++ b/neural_compressor/strategy/st_utils/tuning_sampler.py @@ -272,7 +272,7 @@ def __iter__(self): if self.accumulate and skip_first: # skip the first one skip_first = False continue - logger.debug(f"fallback {op_name_type} to {target_dtype}") + logger.info(f"fallback {op_name_type} to {target_dtype}") yield new_tune_cfg # need to skip the first one diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index a09c83c3452..2adcd5a5812 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -74,15 +74,15 @@ def tearDownClass(self): def test_run_hawq_one_trial(self): - def eval_func(model): - self.i -= 1 - return self.i + # def eval_func(model): + # self.i -= 1 + # return self.i from neural_compressor.experimental import Quantization, common model = copy.deepcopy(self.model) model.eval() model = fuse_fx(model) quantizer = Quantization('ptq_yaml.yaml') - quantizer.eval_func = eval_func + ##quantizer.eval_func = eval_func dataset = quantizer.dataset('dummy', (32, 3, 224, 224), label=True) quantizer.calib_dataloader = common.DataLoader(dataset) quantizer.eval_dataloader = common.DataLoader(dataset) From 
3440ac5ee964fdf2fb1d876a096cf964ecefb4bf Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Wed, 23 Nov 2022 10:54:34 +0800 Subject: [PATCH 028/128] get activations and the corresponding gradients --- neural_compressor/strategy/hawq.py | 72 ++++++++++++++++-------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 6db4757aa0c..604008ac2d4 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -67,6 +67,13 @@ def is_fused_module(self, module): else: return False + def mapping_module_to_op(self, name): + length = len("_model.") + if len(name) < length: + return name + else: + return name[length:] + def get_fused_mapping(self): model = self.model weights_info = dict(model.named_parameters()) @@ -75,7 +82,8 @@ def get_fused_mapping(self): if self.is_fused_module(child): for name, _ in child.named_children(): if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right - weight_to_op[op_name + "." + name + ".weight"] = op_name[7:] + + weight_to_op[op_name + "." + name + ".weight"] = self.mapping_module_to_op(op_name) break else: name = op_name + ".weight" @@ -95,8 +103,8 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F input = data[0].to(self.device) ##self._input_shape = input.shape ## for resetting input activation target = data[1].to(self.device) - if enable_act: - input.requires_grad = True + # if enable_act: + # input.requires_grad = True output = model(input) loss = criterion(output, target) # torch.autograd.backward(loss, create_graph=create_graph) @@ -138,23 +146,24 @@ def get_hv_one_sample(self, params, enable_act, num_batches): v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v - def _get_input_grad_hook(self, name): - def input_grad_hook(model, grad_input, grad_output): + def _get_act_grad_hook(self, name): + def act_grad_hook(model, grad_input, grad_output): self.layer_acts_grads[name] = [grad_input, grad_output] - return input_grad_hook - def _get_enable_input_grad_hook(self, name): - def enable_input_grad_hook(model, inputs, outputs): + return act_grad_hook + + def _get_enable_act_grad_hook(self, name): + def enable_act_grad_hook(model, inputs, outputs): try: input = inputs[0] ##TODO check whether this is right except: input = inputs - if input.is_leaf == False: - if input.requires_grad is False: - input.requires_grad = True - self.layer_acts[name] = input - return enable_input_grad_hook + if input.requires_grad is False: + input.requires_grad = True + self.layer_acts[name] = input + + return enable_act_grad_hook # def _get_disable_input_grad_hook(self, name): # def disable_input_grad_hook(model, inputs, outputs): @@ -169,21 +178,19 @@ def enable_input_grad_hook(model, inputs, outputs): # # return disable_input_grad_hook - def _unregister_hook(self): for handel in self.hook_handles: handel.remove() - def register_input_grad_hooks(self): + def register_act_grad_hooks(self): for name, module in self.model.named_modules(): - if name in self.op_list: - hook_handle = module.register_forward_hook(self._get_enable_input_grad_hook(name)) + if self.mapping_module_to_op(name) in self.op_list: + hook_handle = module.register_forward_hook(self._get_enable_act_grad_hook(name)) self.hook_handles.append(hook_handle) - hook_handle = module.register_backward_hook(self._get_input_grad_hook(name)) + hook_handle = module.register_backward_hook(self._get_act_grad_hook(name)) self.hook_handles.append(hook_handle) - - def 
reset_input_gradient_and_hooks(self): + def reset_act_gradient_and_hooks(self): # tmp_input = torch.zeros(self._input_shape, device=self.device) # for name, module in self.model.named_modules(): # if name in self.op_list: @@ -193,12 +200,13 @@ def reset_input_gradient_and_hooks(self): self._unregister_hook() def get_params(self): - weight_names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias + weight_names = [n for n, p in self.model.named_parameters() if + p.requires_grad and "bias" not in n] ##remove bias params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias self.weight_names = weight_names self.params = params - def get_avg_traces(self, enable_act=False, num_batches=2): + def get_avg_traces(self, enable_act=True, num_batches=2): """ Estimates average hessian trace for each parameter """ @@ -207,7 +215,7 @@ def get_avg_traces(self, enable_act=False, num_batches=2): self.hook_handles = [] self.layer_acts = {} self.layer_acts_grads = {} - self.register_input_grad_hooks() + self.register_act_grad_hooks() ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = num_data_iter * self.dataloader.batch_size ##op_list = self.op_list @@ -226,18 +234,18 @@ def get_avg_traces(self, enable_act=False, num_batches=2): diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) if diff_ratio < self.tolerance and i > 10: ##TODO magic number break - if i==50:##TODO for debug + if i == 50: ##TODO for debug break prev_avg_model_trace = model_trace layer_traces = layer_traces_estimate if enable_act: self.reset_input_gradient_and_hooks() - weight_name_to_traces={} + weight_name_to_traces = {} - for weigth_name,trace in zip(self.weight_names, layer_traces): + for weigth_name, trace in zip(self.weight_names, layer_traces): weight_name_to_traces[weigth_name] = trace - op_name_to_trace={} + op_name_to_trace = {} for weigth_name in self.weight_names: op_name = self.weight_to_op[weigth_name] op_name_to_trace[op_name] = weight_name_to_traces[weigth_name] @@ -322,8 +330,8 @@ def next_tune_cfg(self): quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] stage1_max = -1 # TODO set a more appropriate value - op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], - op_item_dtype_dict, initial_op_tuning_cfg) + op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], + op_item_dtype_dict, initial_op_tuning_cfg) for op_tuning_cfg in op_wise_tuning_sampler: stage1_cnt += 1 if early_stop_tuning and stage1_cnt > stage1_max: @@ -349,7 +357,7 @@ def next_tune_cfg(self): self._fp32_model.eval() ht = HessianTrace(self._fp32_model, self.calib_dataloader) op_to_traces = ht.get_avg_traces() - if orig_eval==False: + if orig_eval == False: self._fp32_model.train() ordered_ops = sorted(op_to_traces.keys(), @@ -358,7 +366,7 @@ def next_tune_cfg(self): # WA for add op type op_info_map = {} for op_info in list(initial_op_tuning_cfg.keys()): - op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) + op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) tmp_ordered_ops = [op_info_map[op_name] for op_name in ordered_ops] op_dtypes = OrderedDict(zip(tmp_ordered_ops, [target_dtype] * len(ordered_ops))) logger.info(f"Start to accumulate fallback to {target_dtype}.") @@ -370,7 +378,6 @@ def next_tune_cfg(self): 
op_tuning_cfg['calib_sampling_size'] = calib_size yield op_tuning_cfg - # ordered_ops = sorted(op_fallback_acc_impact.keys(), # key=lambda key: op_fallback_acc_impact[key], # reverse=self.higher_is_better) @@ -384,7 +391,6 @@ def next_tune_cfg(self): # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size # yield op_tuning_cfg - # tmp = 1 # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up # ops_sensitivity = self.adaptor.get_hessian_trace(self._fp32_model, From f895fb4b6a0219bf0cb31ddbfce802a0be2d828d Mon Sep 17 00:00:00 2001 From: wenhuach
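The change above reworks the hooks so that each tracked op's input activation is made differentiable and both the activation and its gradient are recorded, which is the groundwork for estimating Hessian traces with respect to activations as well as weights. A compact sketch of a single Hessian-vector product taken with respect to an intermediate activation is shown below; the two-layer model, the single sample, and the Rademacher probe are illustrative assumptions.

    import torch

    lin1, lin2 = torch.nn.Linear(8, 16), torch.nn.Linear(16, 4)
    x, y = torch.randn(1, 8), torch.randint(0, 4, (1,))

    act = lin1(x)                                        # intermediate activation, batch size 1
    loss = torch.nn.functional.cross_entropy(lin2(act), y)
    (act_grad,) = torch.autograd.grad(loss, act, create_graph=True)

    v = torch.randint_like(act, high=2) * 2.0 - 1.0      # Rademacher probe
    (h_v,) = torch.autograd.grad(act_grad, act, grad_outputs=v)
    print(torch.sum(h_v * v))                            # one-sample v^T H v for this activation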
Date: Wed, 23 Nov 2022 11:54:27 +0800 Subject: [PATCH 029/128] change fusefx position --- .../quantization/ptq/cpu/fx/main.py | 2 -- neural_compressor/strategy/hawq.py | 18 ++++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py index 30008bfa3db..8646048ccf4 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py @@ -169,8 +169,6 @@ def main(): if args.tune: from neural_compressor.experimental import Quantization, common model.eval() - from torch.quantization.quantize_fx import fuse_fx - model = fuse_fx(model) quantizer = Quantization("./conf.yaml") quantizer.model = common.Model(model) q_model = quantizer.fit() diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 604008ac2d4..2cfac2b5815 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -40,7 +40,9 @@ class HessianTrace: """ def __init__(self, model, dataloader, criterion=None): - self.model = model ##TODO need to check fused or not + from torch.quantization.quantize_fx import fuse_fx + self.model = fuse_fx(model.model) + self.dataloader = dataloader self.max_iter = 500 self.tolerance = 1e-5 @@ -68,11 +70,11 @@ def is_fused_module(self, module): return False def mapping_module_to_op(self, name): - length = len("_model.") - if len(name) < length: - return name - else: - return name[length:] + # length = len("_model.") + # if len(name) < length: + # return name + # else: + return name def get_fused_mapping(self): model = self.model @@ -88,7 +90,7 @@ def get_fused_mapping(self): else: name = op_name + ".weight" if name in weights_info and name not in weight_to_op.keys(): - weight_to_op[op_name + ".weight"] = op_name[7:] + weight_to_op[op_name + ".weight"] = op_name op_list = [] for key in weight_to_op.keys(): op_list.append(weight_to_op[key]) @@ -240,7 +242,7 @@ def get_avg_traces(self, enable_act=True, num_batches=2): layer_traces = layer_traces_estimate if enable_act: - self.reset_input_gradient_and_hooks() + self.reset_act_gradient_and_hooks() weight_name_to_traces = {} for weigth_name, trace in zip(self.weight_names, layer_traces): From 4f7dd785e4b52882953183a7a44d1de9daa2b8d2 Mon Sep 17 00:00:00 2001 From: wenhuach

Date: Wed, 23 Nov 2022 17:20:16 +0800 Subject: [PATCH 030/128] add weight quant loss, the current key is from quant model --- neural_compressor/strategy/hawq.py | 342 +++++++++++++++++------------ 1 file changed, 201 insertions(+), 141 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 2cfac2b5815..2f6a2e7e074 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -29,6 +29,7 @@ from .st_utils.tuning_space import TUNING_ITEMS_LST from torch.quantization.quantize_fx import fuse_fx import torchvision +from typing import Dict, List, Optional, Any, Union, Callable, Set class HessianTrace: @@ -55,6 +56,22 @@ def __init__(self, model, dataloader, criterion=None): self.criterion = self.criterion.to(self.device) self.weight_to_op, self.op_list = self.get_fused_mapping() + def get_qnt_weight_loss(self, weights_name): + + fp32_model = self.fp32model + + qnt_model = self.q_model + + # print(self.model.state_dict()) + for n, p in self.model.named_parameters(): + print(n) + + print("*" * 20) + + for n, p in self.q_model._model.named_parameters(): + print(n) + pass + def is_fused_module(self, module): """This is a helper function for `_propagate_qconfig_helper` to detecte if this module is fused. @@ -100,7 +117,7 @@ def get_device(self, model: torch.nn.Module): for n, p in model.named_parameters(): return p.data.device - def get_gradients(self, model, data, criterion, create_graph=False, enable_act=False): + def get_gradients(self, model, data, criterion, create_graph=False): model.zero_grad() input = data[0].to(self.device) ##self._input_shape = input.shape ## for resetting input activation @@ -131,14 +148,15 @@ def sample_rademacher(self, params): samples.append(r) return samples - def get_hv_one_sample(self, params, enable_act, num_batches): + def get_vtHv_weight(self, params, num_samples): + num_batches = (num_samples + self.dataloader.batchsize - 1) // self.dataloader v = self.sample_rademacher(params) H_v = [0] * len(v) cnt = 0 for step, data in enumerate(self.dataloader): batch_size = data[0].shape[0] cnt += batch_size - gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True, enable_act=enable_act) + gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if step == num_batches - 1: @@ -148,6 +166,25 @@ def get_hv_one_sample(self, params, enable_act, num_batches): v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v + def get_vtHv_act(self, params, num_samples): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, data in enumerate(self.dataloader): + if cnt >= num_samples: + break + for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 + input = data[0][i:i + 1] + target = data[1][i:i + 1] + + self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) + layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] + layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] + hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) + cnt += 1 + if cnt >= num_samples: + break + def _get_act_grad_hook(self, name): def act_grad_hook(model, grad_input, grad_output): 
self.layer_acts_grads[name] = [grad_input, grad_output] @@ -208,28 +245,12 @@ def get_params(self): self.weight_names = weight_names self.params = params - def get_avg_traces(self, enable_act=True, num_batches=2): - """ - Estimates average hessian trace for each parameter - """ - assert num_batches > 0 - if enable_act: - self.hook_handles = [] - self.layer_acts = {} - self.layer_acts_grads = {} - self.register_act_grad_hooks() - ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] - ##num_all_data = num_data_iter * self.dataloader.batch_size - ##op_list = self.op_list - ##TODO setting this in config - self.get_params() - # names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias - # params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + def get_weight_traces(self, num_samples): layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.get_hv_one_sample(self.params, enable_act, num_batches) + layer_traces = self.get_vtHv_weight(self.params, num_samples) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = torch.sum(layer_traces_estimate) @@ -239,19 +260,152 @@ def get_avg_traces(self, enable_act=True, num_batches=2): if i == 50: ##TODO for debug break prev_avg_model_trace = model_trace - - layer_traces = layer_traces_estimate - if enable_act: - self.reset_act_gradient_and_hooks() weight_name_to_traces = {} - for weigth_name, trace in zip(self.weight_names, layer_traces): - weight_name_to_traces[weigth_name] = trace + for weight_name, trace in zip(self.weight_names, layer_traces): + weight_name_to_traces[weight_name] = trace op_name_to_trace = {} - for weigth_name in self.weight_names: - op_name = self.weight_to_op[weigth_name] - op_name_to_trace[op_name] = weight_name_to_traces[weigth_name] + for weight_name in self.weight_names: + op_name = self.weight_to_op[weight_name] + op_name_to_trace[op_name] = weight_name_to_traces[weight_name] return op_name_to_trace + return layer_traces_estimate + + def get_act_traces(self, num_samples): + self.hook_handles = [] + self.layer_acts = {} + self.layer_acts_grads = {} + self.register_act_grad_hooks() + for i in range(self.max_iter): + pass + + def get_avg_traces(self, enable_act=True, num_samples=100): + """ + Estimates average hessian trace for each parameter + """ + + assert num_samples > 0 + + ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] + ##num_all_data = num_data_iter * self.dataloader.batch_size + ##op_list = self.op_list + ##TODO setting this in config + self.get_params() + # names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + # params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + + ## handle activation + if enable_act: + self.get_act_traces(num_samples) + ##change batchsize to 1 + + # + # layer_traces = layer_traces_estimate + # if enable_act: + # self.reset_act_gradient_and_hooks() + + +##copy from torch.quantization._numeric_suite +def _find_match( + str_list: Union[Dict[str, Any], List[str]], key_str: str, + postfix: str, +) -> Optional[str]: + split_str = key_str.split(".") + if split_str[-1] == postfix: + match_string = "".join(key_str.split(".")[0:-1]) + for s2 in str_list: + pattern1 = "".join(s2.split(".")[0:-1]) + pattern2 = "".join(s2.split(".")[0:-2]) + if match_string == 
pattern1: + return s2 + if match_string == pattern2: + return s2 + + # For matching "fc.weight" and "fc._packed_params._packed_params" + if postfix == "_packed_params": + match_string = "".join(key_str.split(".")[0:-2]) + if len(match_string) == 0: + return None + for s2 in str_list: + pattern1 = "".join(s2.split(".")[0:-1]) + pattern2 = "".join(s2.split(".")[0:-2]) + if match_string == pattern1: + return s2 + if match_string == pattern2: + return s2 + return None + else: + return None + + +##copy form torch.quantization._numeric_suite +def compare_weights( + float_dict: Dict[str, Any], quantized_dict: Dict[str, Any] +) -> Dict[str, Dict[str, torch.Tensor]]: + r"""Compare the weights of the float module with its corresponding quantized + module. Return a dict with key corresponding to module names and each entry being + a dictionary with two keys 'float' and 'quantized', containing the float and + quantized weights. This dict can be used to compare and compute the quantization + error of the weights of float and quantized models. + + Example usage:: + + wt_compare_dict = compare_weights( + float_model.state_dict(), qmodel.state_dict()) + for key in wt_compare_dict: + print( + key, + compute_error( + wt_compare_dict[key]['float'], + wt_compare_dict[key]['quantized'].dequantize() + ) + ) + + Args: + float_dict: state dict of the float model + quantized_dict: state dict of the quantized model + + Return: + weight_dict: dict with key corresponding to module names and each entry being + a dictionary with two keys 'float' and 'quantized', containing the float and + quantized weights + """ + + weight_dict: Dict[str, Dict] = {} + for key in quantized_dict: + match_key = _find_match(float_dict, key, "weight") + if match_key is not None: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[match_key] + weight_dict[key]["quantized"] = quantized_dict[key] + continue + + # For matching "fc.weight" and "fc._packed_params._packed_params" + match_key = _find_match(float_dict, key, "_packed_params") + if match_key is not None: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[match_key] + weight_dict[key]["quantized"] = quantized_dict[key][0] + + # For LSTM + split_str = key.split(".") + if split_str[-1] == "param" and split_str[-3] == "_all_weight_values": + layer = split_str[-2] + module_name = ".".join(split_str[:-3]) + float_weight_ih_key = module_name + ".weight_ih_l" + layer + float_weight_hh_key = module_name + ".weight_hh_l" + layer + if float_weight_ih_key in float_dict and float_weight_hh_key in float_dict: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[float_weight_ih_key] + weight_dict[key]["quantized"] = ( + quantized_dict[key].__getstate__()[0][4][0].__getstate__()[0][0] + ) + weight_dict[key]["float"] = float_dict[float_weight_hh_key] + weight_dict[key]["quantized"] = ( + quantized_dict[key].__getstate__()[0][4][1].__getstate__()[0][0] + ) + + return weight_dict @strategy_registry @@ -331,7 +485,7 @@ def next_tune_cfg(self): stage1_cnt = 0 quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - stage1_max = -1 # TODO set a more appropriate value + stage1_max = 1 # TODO set a more appropriate value op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], op_item_dtype_dict, initial_op_tuning_cfg) for op_tuning_cfg in op_wise_tuning_sampler: @@ -341,6 +495,12 @@ def next_tune_cfg(self): break 
op_tuning_cfg['calib_sampling_size'] = calib_size yield op_tuning_cfg + + # import torch.quantization._numeric_suite as ns + # self.model.eval() + # fused_model = fuse_fx(self.model.model) + # res = compare_weights(fused_model.state_dict(), self.q_model.state_dict()) + # Fallback the ops supported both static and dynamic from static to dynamic quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] @@ -358,6 +518,16 @@ def next_tune_cfg(self): orig_eval = False self._fp32_model.eval() ht = HessianTrace(self._fp32_model, self.calib_dataloader) + + q_model_state_dict = { + } + for key in self.q_model.state_dict().keys(): + length = len("_model.") + new_key = key[length:] + q_model_state_dict[new_key] = self.q_model.state_dict()[key] + + weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) + op_to_traces = ht.get_avg_traces() if orig_eval == False: self._fp32_model.train() @@ -380,116 +550,6 @@ def next_tune_cfg(self): op_tuning_cfg['calib_sampling_size'] = calib_size yield op_tuning_cfg - # ordered_ops = sorted(op_fallback_acc_impact.keys(), - # key=lambda key: op_fallback_acc_impact[key], - # reverse=self.higher_is_better) - # op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) - # logger.info(f"Start to accumulate fallback to {target_dtype}.") - # initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - # fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - # initial_op_tuning_cfg=initial_op_tuning_cfg, - # op_dtypes=op_dtypes, accumulate=True) - # for op_tuning_cfg in fallback_sampler: - # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - # yield op_tuning_cfg - - # tmp = 1 - # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up - # ops_sensitivity = self.adaptor.get_hessian_trace(self._fp32_model, - # self.calib_dataloader, - # self. - # method_args={'name': 'hessian_trace'}) - # tmp = 1 - - def next_tune_cfg_bk(self): - """The generator of yielding next tuning config to traverse by concrete strategies - according to last tuning result. - - Yields: - tune_config (dict): It's a dict containing the tuning configuration to run. 
- """ - from copy import deepcopy - tuning_space = self.tuning_space - calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options - - calib_sampling_size = calib_sampling_size_lst[0] - # Initialize the tuning config for each op according to the quantization approach - op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() - # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) - early_stop_tuning = False - stage1_cnt = 0 - quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] - quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - stage1_max = 1e9 # TODO set a more appropriate value - op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], - op_item_dtype_dict, initial_op_tuning_cfg) - # for op_tuning_cfg in op_wise_tuning_sampler: - # stage1_cnt += 1 - # if early_stop_tuning and stage1_cnt > stage1_max: - # logger.info("Early stopping the stage 1.") - # break - # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - # yield op_tuning_cfg - # Fallback the ops supported both static and dynamic from static to dynamic - # Tuning items: None - # if self.cfg.quantization.approach == 'post_training_auto_quant': - # static_dynamic_items = [item for item in tuning_space.query_items_by_quant_mode('static') if - # item in tuning_space.query_items_by_quant_mode('dynamic')] - # if static_dynamic_items: - # logger.info("Fallback all ops that support both dynamic and static to dynamic.") - # else: - # logger.info("Non ops that support both dynamic") - # - # new_op_tuning_cfg = deepcopy(self.cur_best_tuning_cfg) - # for item in static_dynamic_items: - # new_op_tuning_cfg[item.name] = self.initial_dynamic_cfg_based_on_static_cfg( - # new_op_tuning_cfg[item.name]) - # new_op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - # yield new_op_tuning_cfg - best_op_tuning_cfg_stage1 = deepcopy(self.cur_best_tuning_cfg) - - # Fallback - for target_dtype in ['bf16', 'fp32']: - target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) - fallback_items_lst = [item for item in quant_ops if item in target_type_lst] - if fallback_items_lst: - logger.info(f"Start to fallback op to {target_dtype} one by one.") - self._fallback_started() - # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up - ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, - self.calib_dataloader, - method_args={'name': 'hessian_trace'}) - - fallback_items_name_lst = sorted(ops_sensitivity, key=lambda items: items[1], reverse=True) - - op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) - initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=False) - - op_fallback_acc_impact = OrderedDict() - for op_index, op_tuning_cfg in enumerate(fallback_sampler): - op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield op_tuning_cfg - acc, _ = self.last_tune_result - op_fallback_acc_impact[fallback_items_name_lst[op_index]] = acc - - # do accumulated fallback according to the order in the previous stage - if len(op_fallback_acc_impact) > 0: - ordered_ops = sorted(op_fallback_acc_impact.keys(), - key=lambda key: op_fallback_acc_impact[key], - 
reverse=self.higher_is_better) - op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) - logger.info(f"Start to accumulate fallback to {target_dtype}.") - initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=True) - for op_tuning_cfg in fallback_sampler: - op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield op_tuning_cfg - def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig): op_state = op_static_cfg.get_state() op_name = op_static_cfg.op_name From a7b58c776765e3289fd6c8cf75a5e78f5cde8bbb Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Wed, 23 Nov 2022 19:06:28 +0800 Subject: [PATCH 031/128] add weights_quant loss eval still bugs for get avg traces --- neural_compressor/strategy/hawq.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 2f6a2e7e074..897dfcffea2 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -527,8 +527,16 @@ def next_tune_cfg(self): q_model_state_dict[new_key] = self.q_model.state_dict()[key] weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) - + pertur_lst={} + for key in weight_quant_loss: + op_float_tensor=weight_quant_loss[key]['float'] + op_qnt_tensor=weight_quant_loss[key]['quantized'].dequantize() + diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) #Formula: L2=||Q(w)-w||p^2 + pertur_lst[key]=diff_l2 + # for i in pertur_lst: + # print(pertur_lst[i]) op_to_traces = ht.get_avg_traces() + print(op_to_traces) if orig_eval == False: self._fp32_model.train() From 356dc2bbfd758f3f3caeec558dd4d455c8709a50 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 24 Nov 2022 14:12:48 +0800 Subject: [PATCH 032/128] fixed weight trace issue --- neural_compressor/strategy/hawq.py | 50 ++++++++---------------------- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 897dfcffea2..65c7ab72d82 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -55,22 +55,7 @@ def __init__(self, model, dataloader, criterion=None): self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config self.criterion = self.criterion.to(self.device) self.weight_to_op, self.op_list = self.get_fused_mapping() - - def get_qnt_weight_loss(self, weights_name): - - fp32_model = self.fp32model - - qnt_model = self.q_model - - # print(self.model.state_dict()) - for n, p in self.model.named_parameters(): - print(n) - - print("*" * 20) - - for n, p in self.q_model._model.named_parameters(): - print(n) - pass + self.get_params() def is_fused_module(self, module): """This is a helper function for `_propagate_qconfig_helper` to detecte @@ -149,7 +134,6 @@ def sample_rademacher(self, params): return samples def get_vtHv_weight(self, params, num_samples): - num_batches = (num_samples + self.dataloader.batchsize - 1) // self.dataloader v = self.sample_rademacher(params) H_v = [0] * len(v) cnt = 0 @@ -159,7 +143,7 @@ def get_vtHv_weight(self, params, num_samples): gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] - if step == num_batches - 1: + if cnt >=num_samples: break if cnt > 0: H_v = [item / cnt for item in H_v] @@ -246,7 +230,6 @@ def get_params(self): self.params = params def get_weight_traces(self, num_samples): - layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): @@ -261,7 +244,7 @@ def get_weight_traces(self, num_samples): break prev_avg_model_trace = model_trace weight_name_to_traces = {} - + layer_traces = layer_traces_estimate for weight_name, trace in zip(self.weight_names, layer_traces): weight_name_to_traces[weight_name] = trace op_name_to_trace = {} @@ -269,7 +252,6 @@ def get_weight_traces(self, num_samples): op_name = self.weight_to_op[weight_name] op_name_to_trace[op_name] = weight_name_to_traces[weight_name] return op_name_to_trace - return layer_traces_estimate def get_act_traces(self, num_samples): self.hook_handles = [] @@ -279,24 +261,18 @@ def get_act_traces(self, num_samples): for i in range(self.max_iter): pass - def get_avg_traces(self, enable_act=True, num_samples=100): + def get_avg_traces(self, enable_act=True, num_samples=32): """ Estimates average hessian trace for each parameter """ assert num_samples > 0 - - ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] - ##num_all_data = num_data_iter * self.dataloader.batch_size - ##op_list = self.op_list - ##TODO setting this in config - self.get_params() - # names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias - # params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + weight_traces = self.get_weight_traces(num_samples) + return weight_traces ## handle activation - if enable_act: - self.get_act_traces(num_samples) + # if enable_act: + # self.get_act_traces(num_samples) ##change batchsize to 1 # @@ -527,12 +503,12 @@ def 
next_tune_cfg(self): q_model_state_dict[new_key] = self.q_model.state_dict()[key] weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) - pertur_lst={} + pertur_lst = {} for key in weight_quant_loss: - op_float_tensor=weight_quant_loss[key]['float'] - op_qnt_tensor=weight_quant_loss[key]['quantized'].dequantize() - diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) #Formula: L2=||Q(w)-w||p^2 - pertur_lst[key]=diff_l2 + op_float_tensor = weight_quant_loss[key]['float'] + op_qnt_tensor = weight_quant_loss[key]['quantized'].dequantize() + diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 + pertur_lst[key] = diff_l2 # for i in pertur_lst: # print(pertur_lst[i]) op_to_traces = ht.get_avg_traces() From 5f78a9c479c661880a6fb166979b6fb2b8d20c3d Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 24 Nov 2022 14:15:11 +0800 Subject: [PATCH 033/128] fixed weight trace issue --- .../experimental/quantization.py | 62 +++++++++---------- test/strategy/test_hawq_wenhuach.py | 10 +-- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index c6e4a8c3646..7e8e8cfbbac 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -143,37 +143,37 @@ def pre_process(self): with open(self.resume_file, 'rb') as f: _resume = pickle.load(f).__dict__ - # - # import torchvision.datasets as datasets - # import torchvision.transforms as transforms - # data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" - # traindir = os.path.join(data_path, 'train') - # valdir = os.path.join(data_path, 'val') - # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - # std=[0.229, 0.224, 0.225]) - # - # train_dataset = datasets.ImageFolder( - # traindir, - # transforms.Compose([ - # transforms.RandomResizedCrop(224), - # transforms.RandomHorizontalFlip(), - # transforms.ToTensor(), - # normalize, - # ])) - # - # val_dataset = datasets.ImageFolder( - # valdir, - # transforms.Compose([ - # transforms.RandomResizedCrop(224), - # transforms.RandomHorizontalFlip(), - # transforms.ToTensor(), - # normalize, - # ])) - # - # from torch.utils.data import DataLoader - # - # self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) - # self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) + + import torchvision.datasets as datasets + import torchvision.transforms as transforms + data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" + traindir = os.path.join(data_path, 'train') + valdir = os.path.join(data_path, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + from torch.utils.data import DataLoader + + self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) + self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) self.strategy = STRATEGIES[strategy]( self._model, diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index 2adcd5a5812..df70e32cd9e 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -74,15 +74,15 @@ def tearDownClass(self): def test_run_hawq_one_trial(self): - # def eval_func(model): - # self.i -= 1 - # return self.i + def eval_func(model): + self.i -= 1 + return self.i from neural_compressor.experimental import Quantization, common model = copy.deepcopy(self.model) model.eval() - model = fuse_fx(model) + # model = fuse_fx(model) quantizer = Quantization('ptq_yaml.yaml') - ##quantizer.eval_func = eval_func + quantizer.eval_func = eval_func dataset = quantizer.dataset('dummy', (32, 3, 224, 224), label=True) quantizer.calib_dataloader = common.DataLoader(dataset) quantizer.eval_dataloader = common.DataLoader(dataset) From df25db90edb3c7f5d8603e438f4886ea0db269f3 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 24 Nov 2022 17:12:30 +0800 Subject: [PATCH 034/128] act traces have some issues --- neural_compressor/strategy/hawq.py | 216 +++++++++++++++++------------ 1 file changed, 124 insertions(+), 92 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 65c7ab72d82..c9f8c4488da 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -28,7 +28,7 @@ from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST from torch.quantization.quantize_fx import fuse_fx -import torchvision + from typing import Dict, List, Optional, Any, Union, Callable, Set @@ -41,8 +41,7 @@ class HessianTrace: """ def __init__(self, model, dataloader, criterion=None): - from torch.quantization.quantize_fx import fuse_fx - self.model = fuse_fx(model.model) + self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused self.dataloader = dataloader self.max_iter = 500 @@ -102,89 +101,19 @@ def get_device(self, model: torch.nn.Module): for n, p in model.named_parameters(): return p.data.device - def get_gradients(self, model, data, criterion, create_graph=False): - model.zero_grad() - input = data[0].to(self.device) - ##self._input_shape = input.shape ## for resetting input activation - target = data[1].to(self.device) - # if enable_act: - # input.requires_grad = True - output = model(input) - loss = criterion(output, target) - # torch.autograd.backward(loss, create_graph=create_graph) - loss.backward(create_graph=create_graph) - gradients = [] - for n, p in model.named_parameters(): - if p.grad != None and n in self.weight_names: - gradient = p.grad - gradients.append(gradient + 0.0) ## add 0 to create a copy - model.zero_grad() - return gradients - - # def get_params(self, model): - # parameters = [p for p in model.parameters() if p.requires_grad] - # return parameters - - def sample_rademacher(self, params): - samples = [] - for param in params: - r = torch.randint_like(param, high=2, device=self.device) - r.masked_fill_(r == 0, -1) - samples.append(r) - return samples - - def get_vtHv_weight(self, params, num_samples): - v = self.sample_rademacher(params) - H_v = [0] * len(v) - cnt = 0 - for step, data in enumerate(self.dataloader): - batch_size = data[0].shape[0] - cnt += batch_size - gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True) - H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) - H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] - if cnt >=num_samples: - break - if cnt > 0: - H_v = [item / cnt for item in H_v] - v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better - return v_t_H_v - - def get_vtHv_act(self, params, num_samples): - v = self.sample_rademacher(params) - H_v = [0] * len(v) - cnt = 0 - for step, data in enumerate(self.dataloader): - if cnt >= num_samples: - break - for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 - input = data[0][i:i + 1] - target = data[1][i:i + 1] - - self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) - layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] - layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] - hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) - cnt += 1 - if cnt >= num_samples: - break - def 
_get_act_grad_hook(self, name): def act_grad_hook(model, grad_input, grad_output): + ##print(name, grad_input[0].shape, grad_output[0].shape) self.layer_acts_grads[name] = [grad_input, grad_output] return act_grad_hook def _get_enable_act_grad_hook(self, name): def enable_act_grad_hook(model, inputs, outputs): - try: - input = inputs[0] ##TODO check whether this is right - except: - input = inputs - - if input.requires_grad is False: - input.requires_grad = True - self.layer_acts[name] = input + for input in inputs: + if input.requires_grad is False: + input.requires_grad = True + self.layer_acts[name] = inputs return enable_act_grad_hook @@ -229,18 +158,87 @@ def get_params(self): self.weight_names = weight_names self.params = params + def forward_backward(self, data, create_graph=False, return_w_grad=True): + self.model.zero_grad() + input = data[0].to(self.device) + ##self._input_shape = input.shape ## for resetting input activation + target = data[1].to(self.device) + ##input.requires_grad = True + output = self.model(input) + loss = self.criterion(output, target) + torch.autograd.backward(loss, create_graph=create_graph) + ##loss.backward(create_graph=create_graph) + if return_w_grad: + gradients = [] + for n, p in self.model.named_parameters(): + if p.grad != None and n in self.weight_names: + gradient = p.grad + gradients.append(gradient + 0.0) ## add 0 to create a copy + self.model.zero_grad() + return gradients + else: + self.model.zero_grad() + + # def get_params(self, model): + # parameters = [p for p in model.parameters() if p.requires_grad] + # return parameters + + def sample_rademacher(self, params): + samples = [] + for param in params: + r = torch.randint_like(param, high=2, device=self.device) + r.masked_fill_(r == 0, -1) + samples.append(r) + return samples + + def get_vtHv_weight(self, params, num_samples): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, data in enumerate(self.dataloader): + batch_size = data[0].shape[0] + cnt += batch_size + gradients = self.forward_backward(data, create_graph=True) + H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) + H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] + if cnt >= num_samples: + break + if cnt > 0: + H_v = [item / cnt for item in H_v] + v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better + return v_t_H_v + + def get_vtHv_act(self, params, num_samples): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, data in enumerate(self.dataloader): + if cnt >= num_samples: + break + for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 + input = data[0][i:i + 1] + target = data[1][i:i + 1] + + self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) + layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] + layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] + hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) + cnt += 1 + if cnt >= num_samples: + break + def get_weight_traces(self, num_samples): layer_traces_per_iter = [] prev_avg_model_trace = 0 - for i in range(self.max_iter): + for iter in range(self.max_iter): layer_traces = self.get_vtHv_weight(self.params, num_samples) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = 
torch.sum(layer_traces_estimate) diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) - if diff_ratio < self.tolerance and i > 10: ##TODO magic number + if diff_ratio < self.tolerance and iter > 10: ##TODO magic number break - if i == 50: ##TODO for debug + if iter == 50: ##TODO for debug break prev_avg_model_trace = model_trace weight_name_to_traces = {} @@ -258,28 +256,62 @@ def get_act_traces(self, num_samples): self.layer_acts = {} self.layer_acts_grads = {} self.register_act_grad_hooks() - for i in range(self.max_iter): + cnt = 0 + for step, data in enumerate(self.dataloader): + if cnt >= num_samples: + break + bs = data[0].shape[0] + act_traces_sum = 0 + act_traces_per_iter = [] + prev_avg_model_trace = 0 + act_traces_sums = None + for i in range(bs): ##force the bs to be one + input = data[0][i:i + 1] + target = data[1][i:i + 1] + self.forward_backward((input, target), create_graph=True, return_w_grad=False) + acts = [self.layer_acts[key] for key in self.layer_acts.keys()] + if act_traces_sums == None: + act_traces_sums = [0] * len(acts) + acts_grad = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] ##same order with acts + # vt_H_v_sum_per_act = [0] * len(acts) + # + # prev_model_act_trace = 0 + # for iter in range(self.max_iter): + # v = self.sample_rademacher(acts) + # H_v = torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=False) + # vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] + # + # vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in + # enumerate(vt_H_v_sum_per_act)] + # vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] + # current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) + # + # diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( + # prev_model_act_trace + self.eps) + # if diff_ratio < self.tolerance and iter > 10: ##TODO magic number + # break + # if iter == 50: ##TODO for debug + # break + # + # prev_model_act_trace = current_vt_H_v_mean_per_model + # + # cnt += 1 + # if cnt >= num_samples: + # break pass + self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False + def get_avg_traces(self, enable_act=True, num_samples=32): """ Estimates average hessian trace for each parameter """ assert num_samples > 0 + ##self.get_act_traces(num_samples) weight_traces = self.get_weight_traces(num_samples) return weight_traces - ## handle activation - # if enable_act: - # self.get_act_traces(num_samples) - ##change batchsize to 1 - - # - # layer_traces = layer_traces_estimate - # if enable_act: - # self.reset_act_gradient_and_hooks() - ##copy from torch.quantization._numeric_suite def _find_match( From 5a266fff26f49ab13764e4298ae1dc725f52743a Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Thu, 24 Nov 2022 18:24:20 +0800 Subject: [PATCH 035/128] correct the qnt_weigths does't machted issue --- neural_compressor/strategy/hawq.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index c9f8c4488da..a1616c23dd9 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -391,9 +391,10 @@ def compare_weights( # For matching "fc.weight" and "fc._packed_params._packed_params" match_key = _find_match(float_dict, key, "_packed_params") if match_key is not None: - weight_dict[key] = {} - weight_dict[key]["float"] 
= float_dict[match_key] - weight_dict[key]["quantized"] = quantized_dict[key][0] + weight_dict[match_key] = {} + weight_dict[match_key]["float"] = float_dict[match_key] + weight_dict[match_key]["quantized"] = quantized_dict[key][0] + ##TODO:should consider more models in further work # For LSTM split_str = key.split(".") @@ -586,4 +587,4 @@ def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig) quant_mode_item = tuning_space.query_quant_mode_item((op_name, op_type), op_quant_mode) tuning_item = quant_mode_item.get_option_by_name(att_item) dynamic_state[att_item] = tuning_item.options[0] if tuning_item else None - return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) + return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) \ No newline at end of file From 523303f2d160c0cbbcc5e3ae15f5e1992d85532f Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 24 Nov 2022 20:10:25 +0800 Subject: [PATCH 036/128] support activation traces --- neural_compressor/strategy/hawq.py | 110 +++++++++++++++++------------ 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index c9f8c4488da..94745270ac1 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -41,6 +41,8 @@ class HessianTrace: """ def __init__(self, model, dataloader, criterion=None): + self.unfused_model = model.model + self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused self.dataloader = dataloader @@ -104,16 +106,19 @@ def get_device(self, model: torch.nn.Module): def _get_act_grad_hook(self, name): def act_grad_hook(model, grad_input, grad_output): ##print(name, grad_input[0].shape, grad_output[0].shape) - self.layer_acts_grads[name] = [grad_input, grad_output] + if type(model) == torch.nn.Linear: ##TODO very tricky + self.layer_acts_grads[name] = grad_input[1] + else: + self.layer_acts_grads[name] = grad_input[0] return act_grad_hook def _get_enable_act_grad_hook(self, name): def enable_act_grad_hook(model, inputs, outputs): - for input in inputs: - if input.requires_grad is False: - input.requires_grad = True - self.layer_acts[name] = inputs + input = inputs[0] + if input.requires_grad is False: + input.requires_grad = True + self.layer_acts[name] = input return enable_act_grad_hook @@ -134,8 +139,8 @@ def _unregister_hook(self): for handel in self.hook_handles: handel.remove() - def register_act_grad_hooks(self): - for name, module in self.model.named_modules(): + def register_act_grad_hooks(self, model): + for name, module in model.named_modules(): if self.mapping_module_to_op(name) in self.op_list: hook_handle = module.register_forward_hook(self._get_enable_act_grad_hook(name)) self.hook_handles.append(hook_handle) @@ -158,13 +163,13 @@ def get_params(self): self.weight_names = weight_names self.params = params - def forward_backward(self, data, create_graph=False, return_w_grad=True): - self.model.zero_grad() + def forward_backward(self, model, data, create_graph=False, return_w_grad=True): + model.zero_grad() input = data[0].to(self.device) ##self._input_shape = input.shape ## for resetting input activation target = data[1].to(self.device) - ##input.requires_grad = True - output = self.model(input) + input.requires_grad = True + output = model(input) loss = self.criterion(output, target) torch.autograd.backward(loss, create_graph=create_graph) ##loss.backward(create_graph=create_graph) @@ -174,10 +179,10 @@ def forward_backward(self, data, create_graph=False, return_w_grad=True): if p.grad != None and n in self.weight_names: gradient = p.grad gradients.append(gradient + 0.0) ## add 0 to create a copy - self.model.zero_grad() + model.zero_grad() return gradients else: - self.model.zero_grad() + model.zero_grad() # def get_params(self, model): # parameters = [p for p in model.parameters() if p.requires_grad] @@ -198,7 +203,7 @@ def get_vtHv_weight(self, params, num_samples): for step, data in enumerate(self.dataloader): batch_size = data[0].shape[0] cnt += batch_size - gradients = self.forward_backward(data, create_graph=True) + gradients = self.forward_backward(self.model, data, create_graph=True) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if cnt >= num_samples: @@ -252,11 +257,14 @@ def 
get_weight_traces(self, num_samples): return op_name_to_trace def get_act_traces(self, num_samples): + unfused_training = self.unfused_model.training + self.unfused_model.eval() self.hook_handles = [] self.layer_acts = {} self.layer_acts_grads = {} - self.register_act_grad_hooks() + self.register_act_grad_hooks(self.unfused_model) cnt = 0 + act_traces_per_sample = [] for step, data in enumerate(self.dataloader): if cnt >= num_samples: break @@ -268,39 +276,49 @@ def get_act_traces(self, num_samples): for i in range(bs): ##force the bs to be one input = data[0][i:i + 1] target = data[1][i:i + 1] - self.forward_backward((input, target), create_graph=True, return_w_grad=False) + self.forward_backward(self.unfused_model, (input, target), create_graph=True, return_w_grad=False) acts = [self.layer_acts[key] for key in self.layer_acts.keys()] if act_traces_sums == None: act_traces_sums = [0] * len(acts) acts_grad = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] ##same order with acts - # vt_H_v_sum_per_act = [0] * len(acts) - # - # prev_model_act_trace = 0 - # for iter in range(self.max_iter): - # v = self.sample_rademacher(acts) - # H_v = torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=False) - # vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] - # - # vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in - # enumerate(vt_H_v_sum_per_act)] - # vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] - # current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) - # - # diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( - # prev_model_act_trace + self.eps) - # if diff_ratio < self.tolerance and iter > 10: ##TODO magic number - # break - # if iter == 50: ##TODO for debug - # break - # - # prev_model_act_trace = current_vt_H_v_mean_per_model - # - # cnt += 1 - # if cnt >= num_samples: - # break - pass + vt_H_v_sum_per_act = [0] * len(acts) + + prev_model_act_trace = 0 + for iter in range(self.max_iter): + v = self.sample_rademacher(acts) + H_v = torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=True) + vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] + + vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in + enumerate(vt_H_v_sum_per_act)] + vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] + current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) + + diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( + prev_model_act_trace + self.eps) + if diff_ratio < self.tolerance and iter > 10: ##TODO magic number + break + if iter == 50: ##TODO for debug + break + prev_model_act_trace = current_model_act_trace + act_traces_per_sample.append(vt_H_v_mean_per_act) + cnt += 1 + if cnt >= num_samples: + break + + if unfused_training: + self.unfused_model.train() self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False + act_traces_stack = torch.stack([torch.stack(item) for item in act_traces_per_sample]) + act_traces = torch.mean(act_traces_stack, dim=0) + res_dict={} + for index, key in enumerate(self.layer_acts.keys()): + res_dict[key]=act_traces[index] + + self.layer_acts=[] + self.layer_acts_grads=[] + return act_traces def get_avg_traces(self, enable_act=True, num_samples=32): """ @@ -308,9 +326,13 @@ def get_avg_traces(self, enable_act=True, num_samples=32): """ assert num_samples > 0 - ##self.get_act_traces(num_samples) + traces = {} 
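# A self-contained sketch (toy model and data, independent of the commits above) of the
# activation-trace idea in get_act_traces: capture one layer input with a forward hook,
# differentiate the loss w.r.t. that activation with create_graph=True, then run a few
# Hutchinson iterations v^T H v with Rademacher probes v.
import torch

model = torch.nn.Sequential(torch.nn.Linear(16, 8), torch.nn.ReLU(), torch.nn.Linear(8, 4)).eval()
criterion = torch.nn.CrossEntropyLoss()
x, y = torch.randn(4, 16), torch.randint(0, 4, (4,))

acts = {}
def capture(name):
    def hook(module, inputs, outputs):
        inp = inputs[0]
        if not inp.requires_grad:          # usually already True for intermediate tensors
            inp.requires_grad_(True)
        acts[name] = inp
    return hook

handle = model[2].register_forward_hook(capture("fc2_input"))
loss = criterion(model(x), y)
act = acts["fc2_input"]
grad_act = torch.autograd.grad(loss, act, create_graph=True)[0]

n_iter, trace = 20, 0.0
for _ in range(n_iter):
    v = torch.randint_like(act, high=2) * 2.0 - 1.0          # Rademacher +/-1 probe
    hv = torch.autograd.grad(grad_act, act, grad_outputs=v, retain_graph=True)[0]
    trace += torch.sum(hv * v).item()
print("estimated activation Hessian trace:", trace / n_iter)
handle.remove()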
weight_traces = self.get_weight_traces(num_samples) - return weight_traces + traces['weight'] = weight_traces + if enable_act: + act_traces = self.get_act_traces(num_samples) + traces['activation']= act_traces + return traces ##copy from torch.quantization._numeric_suite From f56ab18ab869e79ca9792abf04582913ab1aa96d Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 24 Nov 2022 20:43:03 +0800 Subject: [PATCH 037/128] only enable weight traces currently --- neural_compressor/strategy/hawq.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 6575c21fccb..2e590c3f34b 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -312,12 +312,12 @@ def get_act_traces(self, num_samples): self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False act_traces_stack = torch.stack([torch.stack(item) for item in act_traces_per_sample]) act_traces = torch.mean(act_traces_stack, dim=0) - res_dict={} + res_dict = {} for index, key in enumerate(self.layer_acts.keys()): - res_dict[key]=act_traces[index] + res_dict[key] = act_traces[index] - self.layer_acts=[] - self.layer_acts_grads=[] + self.layer_acts = [] + self.layer_acts_grads = [] return act_traces def get_avg_traces(self, enable_act=True, num_samples=32): @@ -331,7 +331,7 @@ def get_avg_traces(self, enable_act=True, num_samples=32): traces['weight'] = weight_traces if enable_act: act_traces = self.get_act_traces(num_samples) - traces['activation']= act_traces + traces['activation'] = act_traces return traces @@ -566,7 +566,8 @@ def next_tune_cfg(self): pertur_lst[key] = diff_l2 # for i in pertur_lst: # print(pertur_lst[i]) - op_to_traces = ht.get_avg_traces() + traces = ht.get_avg_traces(enable_act=False) + op_to_traces = traces['weight'] print(op_to_traces) if orig_eval == False: self._fp32_model.train() @@ -609,4 +610,4 @@ def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig) quant_mode_item = tuning_space.query_quant_mode_item((op_name, op_type), op_quant_mode) tuning_item = quant_mode_item.get_option_by_name(att_item) dynamic_state[att_item] = tuning_item.options[0] if tuning_item else None - return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) \ No newline at end of file + return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) From 007b33606ef960af08a32b356b519cbc21835f66 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Fri, 25 Nov 2022 15:30:44 +0800 Subject: [PATCH 038/128] merge weights quantization loss and trace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Formula:pertubation=trace*weights_qnt_loss --- neural_compressor/strategy/hawq.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 2e590c3f34b..c000def9440 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -564,18 +564,17 @@ def next_tune_cfg(self): op_qnt_tensor = weight_quant_loss[key]['quantized'].dequantize() diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 pertur_lst[key] = diff_l2 - # for i in pertur_lst: - # print(pertur_lst[i]) traces = ht.get_avg_traces(enable_act=False) op_to_traces = traces['weight'] - print(op_to_traces) + for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): + op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 if orig_eval == False: self._fp32_model.train() - ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], reverse=self.higher_is_better) # WA for add op type + 
print("ordered_ops:",ordered_ops) op_info_map = {} for op_info in list(initial_op_tuning_cfg.keys()): op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) From 420fc95111bf12d33a0d539a1e18b35d77b3af19 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Mon, 28 Nov 2022 22:44:38 +0800 Subject: [PATCH 039/128] Update conf.yaml change root path to default config --- .../torchvision_models/quantization/ptq/cpu/fx/conf.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml index 4b50b559e6a..ef61c6c3e0b 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml @@ -24,7 +24,7 @@ quantization: # optional. tuning constrai batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to calibration dataset location if needed + root: /path/to/calibration/dataset # NOTE: modify to calibration dataset location if needed transform: Resize: size: 256 @@ -43,7 +43,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to evaluation dataset location if needed + root: /path/to/calibration/dataset # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -61,7 +61,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to evaluation dataset location if needed + root: /path/to/calibration/dataset # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 From 2707825b6706cac0c79860dc5940f77e4066e901 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 28 Nov 2022 22:28:42 +0800 Subject: [PATCH 040/128] WA add loss for strategy --- neural_compressor/conf/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index aae60416104..dba150cb7d3 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -837,7 +837,7 @@ def percent_to_float(data): Optional('model_conversion'): model_conversion_schema, Optional('tuning', default={ - 'strategy': {'name': 'basic'}, + 'strategy': {'name': 'basic', 'loss': 'CrossEntropyLoss'}, # TODO move loss to appropriate position 'accuracy_criterion': {'relative': 0.01, 'higher_is_better': True}, 'objective': 'performance', 'exit_policy': {'timeout': 0, 'max_trials': 100, 'performance_only': False}, From 36731bcbfa8d67b1efc7eba94eeb27d2815890c8 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 8 Nov 2022 22:31:53 +0800 Subject: [PATCH 041/128] Feat(ST): add a interface for hawq(stage1) --- neural_compressor/adaptor/pytorch.py | 13 +++++++++++++ neural_compressor/strategy/basic.py | 7 ++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 0a44fe2f5a3..4d7f4561ac8 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -1093,7 +1093,20 @@ def is_fused_module(self, module): return True else: return False + + def calculate_op_sensitivity(self, model, dataloader, 
method_args): + """Compute the op sensitivity by the specific method. + Args: + model(INC model): The fp32 model. + dataloader: The calibration dataloader. + method_args(Dict): The parameters for specifying the method. + + Returns: + ops_sensitivity(Dict[tuple, float]): The key is (op_name, op_type), + the value is the sensitivity under the specified method + """ + pass unify_op_type_mapping = { "ConvReLU2d": "Conv2d", diff --git a/neural_compressor/strategy/basic.py b/neural_compressor/strategy/basic.py index c35398dd4bb..3cc4e38bde2 100644 --- a/neural_compressor/strategy/basic.py +++ b/neural_compressor/strategy/basic.py @@ -143,7 +143,12 @@ def next_tune_cfg(self): if fallback_items_lst: logger.info(f"Start to fallback op to {target_dtype} one by one.") self._fallback_started() - fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + #fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, + self.calib_dataloader, + method_args = {'name': 'hessian_trace'}) + fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) + op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], From c02b5c1cbf2da762cc4db7aa7cf941411677c89b Mon Sep 17 00:00:00 2001 From: root Date: Thu, 10 Nov 2022 14:22:12 +0800 Subject: [PATCH 042/128] hawq_metric.py --- neural_compressor/strategy/hawq_metric.py | 291 ++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 neural_compressor/strategy/hawq_metric.py diff --git a/neural_compressor/strategy/hawq_metric.py b/neural_compressor/strategy/hawq_metric.py new file mode 100644 index 00000000000..acbcd98d740 --- /dev/null +++ b/neural_compressor/strategy/hawq_metric.py @@ -0,0 +1,291 @@ +""" + Copyright (c) 2022 Intel Corporation + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import logging +import torch +import numpy as np +from torch.autograd import Variable +import yaml +import torchvision.transforms as transforms +import torchvision +import random +import copy +from torch.quantization import get_default_qat_qconfig, quantize_jit,get_default_qconfig +from torch.quantization.quantize_fx import prepare_fx, convert_fx,fuse_fx +from torch.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig +import torch.quantization._numeric_suite as ns + + +def fixed_seed(seed): + """Fixed rand seed to make sure results are same in different times on different devices.Eg CPU/GPU + Args: + seed: an integer number + return: None + """ + np.random.seed(seed) #random + random.seed(seed) + torch.manual_seed(seed) #cpu + torch.cuda.manual_seed_all(seed) #parallel cpu + torch.backends.cudnn.deterministic = True #make sure results are same on cpu/gpu + torch.backends.cudnn.benchmark = True #accelerator +def calculate_params_gradients(model): + """ + get the gradients and parameters from given model + Args: + model: FP32 model specificed + return: + params: paratmeters of model + grads: gradients of model + """ + params=[] + grads=[] + for indx,(name, parm) in zip(enumerate(model.parameters()), model.named_parameters()): + logging.info('->tensor_index:', indx[0],'-->name:', name, '-->grad_requirs:',parm.requires_grad, '-->current tensor len:',parm.shape) + if not parm.requires_grad: + continue + params.append(parm) + grads.append(0. if parm.grad is None else parm.grad+0.) + return params, grads +def calculate_inner_product(list_x,list_y): + """Compute the inner product of two lists of variables list_x,list_y + Args: + list_x: input list variables + list_y: input list variables + return: + sum of inner product + """ + return sum([torch.sum(x * y) for (x, y) in zip(list_x, list_y)]) + +def calculate_vector_product(gradsH, params, v): + """compute the hessian vector product by torch.autograd.grad. 
+ Agrs: + gradsH: gradient at current point + params: corresponding variables + v: vector + return: + hv: hessian vector product + """ + hv=torch.autograd.grad( + gradsH, + params, + grad_outputs=v, + only_inputs=True, + retain_graph=True) + return hv +def ptq_calibrate(model, data_loader,num_cal): + """Calibrate model in post train quantization model + Args: + model: a pre_quantization model to calibrate + data_laoder: datasets + num_cal: maximization number of calibrated samples, such as images + return: + model: a calibrated model + """ + #Generate some samples to calibrate from data_loader + calibrate_samples=[] + i=0 + for inputs, targets in data_loader: + calibrate_samples.append(inputs) + i=i+1 + if i>=num_cal: + break + # model.cpu() + model.eval() + #calibration + with torch.no_grad(): + for sample in calibrate_samples: + model(sample) + return model +def calculate_perturbation(model_qnt,model_fp32)->dict: + """calculate weights quantized perturbation using L2 normal + Args: + model_qnt: quantized model + model_fp32: float model + return: + pertur_lst: dict,which contains layer_name and value + + """ + + wq_cmp_dict=ns.compare_weights(model_fp32.state_dict(), model_qnt.state_dict()) + pertur_lst=[] + for key in wq_cmp_dict: + pertur_pair={"layer_name":'',"value":0} + op_float_tensor=wq_cmp_dict[key]['float'] + op_qnt_tensor=wq_cmp_dict[key]['quantized'].dequantize() + diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) #Formula: L2=||Q(w)-w||p^2 + pertur_pair['layer_name']=key + pertur_pair['value']=diff_l2 + pertur_lst.append(pertur_pair) + return pertur_lst +class Hessian(): + """This class used to compute each layer hessian trace from given FP32 model + """ + def __init__(self,model,criterion, data=None, dataloader=None,device='cpu') -> None: + """Initial parameters + Args: + model: FP32 model specificed + criterion: loss function + data: a single batch of data, including inputs and its corresponding labels + dataloader: the data loader including bunch of batches of data + device: currently only supports cpu device + """ + #make sure we either pass a single batch or a dataloader + assert (data!=None and dataloader==None ) or (data==None and dataloader!=None) + #make mode is evaluation model + self.model=model.eval() + self.criterion=criterion + self.device=device + + if data!=None: + self.data=data + self.full_dataset=False + if not self.full_dataset: + self.inputs, self.targets=self.data + outputs=self.model(self.inputs) + loss=self.criterion(outputs,self.targets) + loss.backward(create_graph=True) + params, gradSH=calculate_params_gradients(self.model) + + self.params=params + self.gradSH=gradSH + def calculate_trace(self,max_Iter=100, tolerance=1e-3): + """Compute the hessian trace based on Hutchinson algorithm + Args: + max_Inter: number of maximization iteration + tolerance: minimum relative tolerance for stopping the algorithm. + return: + avg_traces_lst: return hessian trace per layer for given model + """ + avg_traces_lst=[] + for (i_grad, i_param,(module_name, _)) in zip(self.gradSH, self.params, self.model.named_parameters()): + v=[torch.randint_like(i_param,high=2, device=self.device)] + for v_i in v: + v_i[v_i==0]=-1 + i_v=v + trace_vhv=[] + trace=0. 
+ trace_pair={"layer_name":" ", "trace":0} + self.model.zero_grad() + for i in range(max_Iter): + hv=calculate_vector_product(i_grad,i_param,i_v) # hessian vector + trace_vhv_cur=calculate_inner_product(hv,v).cpu().item()#current point + trace_vhv.append(trace_vhv_cur) + difference=(np.mean(trace_vhv)-trace)/(abs(trace)+1e-6) + if abs(difference) None: + self.dataloader=dataloader + if yaml_trace and yaml_cpu is not None: + with open(yaml_trace) as file: + params_config=yaml.load(file) + if params_config['loss']=='CrossEntropyLoss': + self.criterion=torch.nn.CrossEntropyLoss() + self.random_seed=params_config['random_seed'] + self.max_Iteration=params_config['max_Iteration'] + self.enable_op_fuse=params_config['enable_op_fuse'] + self.tolerance=float(params_config['tolerance']) + self.max_cal_sample=float(params_config['max_cal_smaple']) + self.quantize_mode=params_config['quantize_mode'] + with open(yaml_cpu,'r') as file: + yaml_config=yaml.load(file) + str_dtype=(yaml_config[0]['precisions']['names']) + self.list_dtype = str_dtype.split(",") + else: + self.criterion=torch.nn.CrossEntropyLoss() + self.random_seed=100 + self.max_Iteration=100 + self.enable_op_fuse=True + self.tolerance=1e-6 + self.max_cal_sample=100 + self.quantize_mode='ptq' + self.list_dtype=['int8','fp32'] + logging.info("Current parameters config for Hutchinson’s algorithm as below:") + logging.info("criterion:",self.criterion,"| random_seed:",self.random_seed,"| max_Iteration:", self.max_Iteration, \ + "| tolerance:", self.tolerance,"| en_op_fuse", self.enable_op_fuse,"| max_cal_sample:", self.max_cal_sample) + fixed_seed(self.random_seed) + self.model=model + self.model.eval() + model_tmp=copy.deepcopy(model) + model_tmp.eval() + self.model_fused= fuse_fx(model_tmp) + self.model_fused.eval() + + def get_init_config(self)->dict: + """ + """ + #Load a sample from dataloader to compute graident + for inputs, targets in self.dataloader: + break + #Hessian average trace computation + with torch.enable_grad(): + if self.enable_op_fuse: + hawq_cmp=Hessian(self.model_fused,criterion=self.criterion,data=(inputs,targets)) + else: + hawq_cmp=Hessian(self.model,criterion=self.criterion,data=(inputs,targets)) + avg_traces_lst=hawq_cmp.calculate_trace(max_Iter=self.max_Iteration,tolerance=self.tolerance) + + #fiter none weight layer and save weight layer to match perturbation computation + avg_traces_lst_weight=[] + for avg_trace_i in avg_traces_lst: + if 'weight' in avg_trace_i['layer_name']: + avg_traces_lst_weight.append(avg_trace_i) + # avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) + if self.quantize_mode=='ptq': + #PTQ quantization + qconfig = get_default_qconfig("fbgemm") + qconfig_dict={"":qconfig} #enable all layers/tensor to quantize + #calibrate + model_prepared=prepare_fx(self.model, qconfig_dict) + model_prepared=ptq_calibrate(model_prepared,data_loader=self.dataloader,num_cal=self.max_cal_sample) + model_prepared.cpu() + model_all_qnt=convert_fx(model_prepared) + #calculate perturbation + pertu_list=calculate_perturbation(model_fp32=self.model,model_qnt=model_all_qnt) + #calculate omiga + for omiga_i in pertu_list: + for avg_trace_i in avg_traces_lst: + if avg_trace_i['layer_name']==omiga_i['layer_name']: + avg_trace_i['trace']=avg_trace_i['trace']*omiga_i['value'] + # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): + # omig_pair={"layer_name":" ", "value":0} + # omig_val=avg_trace_i['trace']*omiga_i['value'] + # omig_pair['layer_name']=avg_trace_i['layer_name'] 
+ # omig_pair['value']=omig_val + # omig_list.append(omig_pair) + # omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) + omig_list_sorted=sorted(avg_traces_lst,key=lambda x:x['trace'],reverse=True) + tune_init_config_pairs=[] + # + for i in omig_list_sorted: + tune_init_config_pair={"op_name":'',"op_type":'','trace':0} + if i['layer_name']==omig_list_sorted[0]['layer_name']: + tune_init_config_pair['op_name']=i['layer_name'] + tune_init_config_pair['op_type']=self.list_dtype[-1] #setup as float op + tune_init_config_pair['trace']=float(i['trace']) + else: + tune_init_config_pair['op_name']=i['layer_name'] + tune_init_config_pair['op_type']=self.list_dtype[0] + tune_init_config_pair['trace']=float(i['trace']) + tune_init_config_pairs.append(tune_init_config_pair) + return tune_init_config_pairs From 399c732d12ed66ec11e4b359442fefa379866b5a Mon Sep 17 00:00:00 2001 From: root Date: Thu, 10 Nov 2022 14:24:13 +0800 Subject: [PATCH 043/128] pytorch.py --- neural_compressor/adaptor/pytorch.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 4d7f4561ac8..e89f687f81a 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -29,6 +29,7 @@ from ..utils import logger from .query import QueryBackendCapability from ..experimental.data.dataloaders.base_dataloader import BaseDataLoader +from neural_compressor.strategy.hawq_metric import Hawq_top torch = LazyImport("torch") @@ -1106,6 +1107,10 @@ def calculate_op_sensitivity(self, model, dataloader, method_args): ops_sensitivity(Dict[tuple, float]): The key is (op_name, op_type), the value is the sensitivity under the specified method """ + if method_args['name']=='hessian_trace': + Hawq_top(model=model,yaml_cpu=None,yaml_trace=None,dataloader=dataloader) + hessian_cmp=Hawq_top.get_init_config() + return hessian_cmp pass unify_op_type_mapping = { From 3b5abbf116b4bc72a2a4a549c3863a0964bbb140 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 30 Nov 2022 14:29:49 +0800 Subject: [PATCH 044/128] resolve conflicts Signed-off-by: yiliu30 --- test/strategy/test_hessian_trace_inc.py | 63 +++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 test/strategy/test_hessian_trace_inc.py diff --git a/test/strategy/test_hessian_trace_inc.py b/test/strategy/test_hessian_trace_inc.py new file mode 100644 index 00000000000..f05b47ca3aa --- /dev/null +++ b/test/strategy/test_hessian_trace_inc.py @@ -0,0 +1,63 @@ +import torch +import unittest +import os +import sys +import copy +import torchvision +import torchvision.transforms as transforms +from torch.utils.data import DataLoader +from neural_compressor.data import DATASETS +from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader +from neural_compressor.adaptor.pytorch import TemplateAdaptor +from neural_compressor.strategy.hawq_metric import Hawq_top + +def build_hessian_trace(): + hessian_trace_config_yaml=''' + loss: + CrossEntropyLoss + random_seed: + 1 + max_Iteration: + 100 + tolerance: + 1e-3 + enable_op_fuse: + True + max_cal_smaple: + 100 + quantize_mode: + ptq + ''' + with open('./hessian_trace_config_yaml','w+',encoding="utf-8") as f: + f.write(hessian_trace_config_yaml) +class Test_hessian_trace(unittest.TestCase): + #boot up test + @classmethod + def setUpClass(cls) -> None: + build_hessian_trace() + cls.model=torchvision.models.resnet18() + #shotdown test + @classmethod + def tearDownClass(cls) -> 
None: + os.remove('./hessian_trace_config_yaml') + #one test case + def test_run_hessian_trace(cls): + """ + hessian_trace_top + Inputs: + model: FP32 model + dataloader: imagenet + """ + model=cls.model + datasets = DATASETS('pytorch') + dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) + dummy_dataloader = PyTorchDataLoader(dummy_dataset) + # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' + # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) + hessian_cmp=Hawq_top(model,yaml_cpu=None,yaml_trace=None,dataloader=dummy_dataloader) + tuning_init_config=hessian_cmp.get_init_config() + #print tuning init_config + for i in tuning_init_config: + print(i) +if __name__ == "__main__": + unittest.main() \ No newline at end of file From c7c16988c8ccd4a9fc4b6c06ffb5b18f2647e9e0 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Tue, 15 Nov 2022 11:44:48 +0800 Subject: [PATCH 045/128] add wenhuach test env --- neural_compressor/strategy/hawq.py | 311 ++++++++++++++++++++++++++++ test/strategy/test_hawq_wenhuach.py | 74 +++++++ 2 files changed, 385 insertions(+) create mode 100644 neural_compressor/strategy/hawq.py create mode 100644 test/strategy/test_hawq_wenhuach.py diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py new file mode 100644 index 00000000000..2dd0287fa2e --- /dev/null +++ b/neural_compressor/strategy/hawq.py @@ -0,0 +1,311 @@ +""" + Copyright (c) 2022 Intel Corporation + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from ..utils import logger +import torch +import numpy as np +from torch.autograd import Variable +import yaml +import torchvision.transforms as transforms +import torchvision +import random +import copy +from torch.quantization import get_default_qat_qconfig, quantize_jit, get_default_qconfig +from torch.quantization.quantize_fx import prepare_fx, convert_fx, fuse_fx +from torch.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig +import torch.quantization._numeric_suite as ns + + +def fix_seed(seed): + """Fixed rand seed to make sure results are same in different times on different devices.Eg CPU/GPU + Args: + seed: an integer number + return: None + """ + np.random.seed(seed) # random + random.seed(seed) + torch.manual_seed(seed) # cpu + torch.cuda.manual_seed_all(seed) # parallel cpu + torch.backends.cudnn.deterministic = True # make sure results are same on cpu/gpu + torch.backends.cudnn.benchmark = True # accelerator + + +def calculate_params_gradients(model): + """ + get the gradients and parameters from given model + Args: + model: FP32 model specificed + return: + params: paratmeters of model + grads: gradients of model + """ + params = [] + grads = [] + for indx, (name, parm) in zip(enumerate(model.parameters()), model.named_parameters()): + logger.info( + f'index:{indx[0]}-->name:{name}:{parm.shape}') + + if not parm.requires_grad: + continue + params.append(parm) + grads.append(0. if parm.grad is None else parm.grad + 0.) + return params, grads + + +def calculate_inner_product(list_x, list_y): + """Compute the inner product of two lists of variables list_x,list_y + Args: + list_x: input list variables + list_y: input list variables + return: + sum of inner product + """ + return sum([torch.sum(x * y) for (x, y) in zip(list_x, list_y)]) + + +def calculate_vector_product(gradsH, params, v): + """compute the hessian vector product by torch.autograd.grad. 
+ Agrs: + gradsH: gradient at current point + params: corresponding variables + v: vector + return: + hv: hessian vector product + """ + hv = torch.autograd.grad( + gradsH, + params, + grad_outputs=v, + only_inputs=True, + retain_graph=True) + return hv + + +def ptq_calibrate(model, data_loader, num_cal): + """Calibrate model in post train quantization model + Args: + model: a pre_quantization model to calibrate + data_laoder: datasets + num_cal: maximization number of calibrated samples, such as images + return: + model: a calibrated model + """ + # Generate some samples to calibrate from data_loader + calibrate_samples = [] + i = 0 + for inputs, targets in data_loader: + calibrate_samples.append(inputs) + i = i + 1 + if i >= num_cal: + break + # model.cpu() + model.eval() + # calibration + with torch.no_grad(): + for sample in calibrate_samples: + model(sample) + return model + + +def calculate_perturbation(model_qnt, model_fp32) -> dict: + """calculate weights quantized perturbation using L2 normal + Args: + model_qnt: quantized model + model_fp32: float model + return: + pertur_lst: dict,which contains layer_name and value + + """ + + wq_cmp_dict = ns.compare_weights(model_fp32.state_dict(), model_qnt.state_dict()) + pertur_lst = [] + for key in wq_cmp_dict: + pertur_pair = {"layer_name": '', "value": 0} + op_float_tensor = wq_cmp_dict[key]['float'] + op_qnt_tensor = wq_cmp_dict[key]['quantized'].dequantize() + diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 + pertur_pair['layer_name'] = key + pertur_pair['value'] = diff_l2 + pertur_lst.append(pertur_pair) + return pertur_lst + + +class Hessian(): + """This class used to compute each layer hessian trace from given FP32 model + """ + + def __init__(self, model, criterion, data=None, dataloader=None, device='cpu') -> None: + """Initial parameters + Args: + model: FP32 model specificed + criterion: loss function + data: a single batch of data, including inputs and its corresponding labels + dataloader: the data loader including bunch of batches of data + device: currently only supports cpu device + """ + # make sure we either pass a single batch or a dataloader + assert (data != None and dataloader == None) or (data == None and dataloader != None) + # make mode is evaluation model + self.model = model.eval() + self.criterion = criterion + self.device = device + + if data != None: + self.data = data + self.full_dataset = False + if not self.full_dataset: + self.inputs, self.targets = self.data + outputs = self.model(self.inputs) + loss = self.criterion(outputs, self.targets) + loss.backward(create_graph=True) + params, gradSH = calculate_params_gradients(self.model) + + self.params = params + self.gradSH = gradSH + + def calculate_trace(self, max_Iter=100, tolerance=1e-3): + """Compute the hessian trace based on Hutchinson algorithm + Args: + max_Inter: number of maximization iteration + tolerance: minimum relative tolerance for stopping the algorithm. + return: + avg_traces_lst: return hessian trace per layer for given model + """ + avg_traces_lst = [] + for (i_grad, i_param, (module_name, _)) in zip(self.gradSH, self.params, self.model.named_parameters()): + v = [torch.randint_like(i_param, high=2, device=self.device)] + for v_i in v: + v_i[v_i == 0] = -1 + i_v = v + trace_vhv = [] + trace = 0. 
+ trace_pair = {"layer_name": " ", "trace": 0} + self.model.zero_grad() + for i in range(max_Iter): + hv = calculate_vector_product(i_grad, i_param, i_v) # hessian vector + trace_vhv_cur = calculate_inner_product(hv, v).cpu().item() # current point + trace_vhv.append(trace_vhv_cur) + difference = (np.mean(trace_vhv) - trace) / (abs(trace) + 1e-6) + if abs(difference) < tolerance: + avg_trace_vhv = np.mean(trace_vhv) + trace_pair["layer_name"] = module_name + trace_pair["trace"] = avg_trace_vhv + avg_traces_lst.append(trace_pair) + break + else: + trace = np.mean(trace_vhv) + return avg_traces_lst + + +class Hawq_top(): + """This class is a interface of hessian + """ + + def __init__(self, model, yaml_trace=None, yaml_cpu=None, dataloader=None) -> None: + self.dataloader = dataloader + if yaml_trace and yaml_cpu is not None: + with open(yaml_trace) as file: + params_config = yaml.load(file) + if params_config['loss'] == 'CrossEntropyLoss': + self.criterion = torch.nn.CrossEntropyLoss() + self.random_seed = params_config['random_seed'] + self.max_Iteration = params_config['max_Iteration'] + self.enable_op_fuse = params_config['enable_op_fuse'] + self.tolerance = float(params_config['tolerance']) + self.max_cal_sample = float(params_config['max_cal_smaple']) + self.quantize_mode = params_config['quantize_mode'] + with open(yaml_cpu, 'r') as file: + yaml_config = yaml.load(file) + str_dtype = (yaml_config[0]['precisions']['names']) + self.list_dtype = str_dtype.split(",") + else: + self.criterion = torch.nn.CrossEntropyLoss() + self.random_seed = 100 + self.max_Iteration = 100 + self.enable_op_fuse = True + self.tolerance = 1e-6 + self.max_cal_sample = 100 + self.quantize_mode = 'ptq' + self.list_dtype = ['int8', 'fp32'] + # logger.info("Current parameters config for Hutchinson’s algorithm as below:") + logger.info( + f"criterion:{self.criterion}| random_seed:{self.random_seed}| max_Iteration:self.max_Iteration| tolerance:{self.tolerance}") + # logger.info("criterion:", self.criterion, "| random_seed:", self.random_seed, "| max_Iteration:", + # self.max_Iteration, \ + # "| tolerance:", self.tolerance, "| en_op_fuse", self.enable_op_fuse, "| max_cal_sample:", + # self.max_cal_sample) + fix_seed(self.random_seed) + self.model = model + self.model.eval() + model_tmp = copy.deepcopy(model) + model_tmp.eval() + self.model_fused = fuse_fx(model_tmp) + self.model_fused.eval() + + def get_init_config(self) -> dict: + """ + """ + # Load a sample from dataloader to compute graident + for inputs, targets in self.dataloader: + break + # Hessian average trace computation + with torch.enable_grad(): + if self.enable_op_fuse: + hawq_cmp = Hessian(self.model_fused, criterion=self.criterion, data=(inputs, targets)) + else: + hawq_cmp = Hessian(self.model, criterion=self.criterion, data=(inputs, targets)) + avg_traces_lst = hawq_cmp.calculate_trace(max_Iter=self.max_Iteration, tolerance=self.tolerance) + + # fiter none weight layer and save weight layer to match perturbation computation + avg_traces_lst_weight = [] + for avg_trace_i in avg_traces_lst: + if 'weight' in avg_trace_i['layer_name']: + avg_traces_lst_weight.append(avg_trace_i) + # avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) + if self.quantize_mode == 'ptq': + # PTQ quantization + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = {"": qconfig} # enable all layers/tensor to quantize + # calibrate + model_prepared = prepare_fx(self.model, qconfig_dict) + model_prepared = ptq_calibrate(model_prepared, 
data_loader=self.dataloader, num_cal=self.max_cal_sample) + model_prepared.cpu() + model_all_qnt = convert_fx(model_prepared) + # calculate perturbation + pertu_list = calculate_perturbation(model_fp32=self.model, model_qnt=model_all_qnt) + # calculate omiga + for omiga_i in pertu_list: + for avg_trace_i in avg_traces_lst: + if avg_trace_i['layer_name'] == omiga_i['layer_name']: + avg_trace_i['trace'] = avg_trace_i['trace'] * omiga_i['value'] + # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): + # omig_pair={"layer_name":" ", "value":0} + # omig_val=avg_trace_i['trace']*omiga_i['value'] + # omig_pair['layer_name']=avg_trace_i['layer_name'] + # omig_pair['value']=omig_val + # omig_list.append(omig_pair) + # omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) + omig_list_sorted = sorted(avg_traces_lst, key=lambda x: x['trace'], reverse=True) + tune_init_config_pairs = [] + # + for i in omig_list_sorted: + tune_init_config_pair = {"op_name": '', "op_type": '', 'trace': 0} + if i['layer_name'] == omig_list_sorted[0]['layer_name']: + tune_init_config_pair['op_name'] = i['layer_name'] + tune_init_config_pair['op_type'] = self.list_dtype[-1] # setup as float op + tune_init_config_pair['trace'] = float(i['trace']) + else: + tune_init_config_pair['op_name'] = i['layer_name'] + tune_init_config_pair['op_type'] = self.list_dtype[0] + tune_init_config_pair['trace'] = float(i['trace']) + tune_init_config_pairs.append(tune_init_config_pair) + return tune_init_config_pairs diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py new file mode 100644 index 00000000000..5affedc70ca --- /dev/null +++ b/test/strategy/test_hawq_wenhuach.py @@ -0,0 +1,74 @@ +import torch +import unittest +import os +import sys +import copy +import torchvision +import torchvision.transforms as transforms +from torch.utils.data import DataLoader +from neural_compressor.data import DATASETS +from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader +from neural_compressor.adaptor.pytorch import TemplateAdaptor + +from neural_compressor.strategy.hawq import Hawq_top, fix_seed + +fix_seed(1) + + +def build_hessian_trace(): + hessian_trace_config_yaml = ''' + loss: + CrossEntropyLoss + random_seed: + 1 + max_Iteration: + 100 + tolerance: + 1e-3 + enable_op_fuse: + True + max_cal_smaple: + 100 + quantize_mode: + ptq + ''' + with open('./hessian_trace_config_yaml', 'w+', encoding="utf-8") as f: + f.write(hessian_trace_config_yaml) + + +class Test_hessian_trace(unittest.TestCase): + # boot up test + @classmethod + def setUpClass(cls) -> None: + build_hessian_trace() + cls.model = torchvision.models.resnet18() + + # shotdown test + @classmethod + def tearDownClass(cls) -> None: + os.remove('./hessian_trace_config_yaml') + + # one test case + def test_run_hessian_trace(cls): + """ + hessian_trace_top + Inputs: + model: FP32 model + dataloader: imagenet + """ + + model = cls.model + datasets = DATASETS('pytorch') + dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) + dummy_dataloader = PyTorchDataLoader(dummy_dataset) + # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' + # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) + hessian_cmp = Hawq_top(model, yaml_cpu=None, yaml_trace=None, dataloader=dummy_dataloader) + tuning_init_config = hessian_cmp.get_init_config() + # print tuning 
init_config
+        for i in tuning_init_config:
+            print(i)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 581b21e56cf8baf0766d4a3c980a86cef3339ef7 Mon Sep 17 00:00:00 2001
From: wenhuach
Date: Tue, 15 Nov 2022 14:00:06 +0800 Subject: [PATCH 046/128] try to test mes strategy, have bug now --- test/strategy/test_hawq_wenhuach.py | 162 +++++++++++++++++++--------- 1 file changed, 113 insertions(+), 49 deletions(-) diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index 5affedc70ca..4443cd8d486 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -9,66 +9,130 @@ from neural_compressor.data import DATASETS from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader from neural_compressor.adaptor.pytorch import TemplateAdaptor - +from neural_compressor.adaptor import FRAMEWORKS +import shutil from neural_compressor.strategy.hawq import Hawq_top, fix_seed fix_seed(1) +def build_ptq_yaml(): + fake_yaml = ''' + model: + name: imagenet + framework: pytorch + quantization: + calibration: + evaluation: + accuracy: + metric: + topk: 1 + tuning: + strategy: + name: mse + accuracy_criterion: + relative: -0.1 + random_seed: 9527 + exit_policy: + max_trials: 1 + workspace: + path: saved + ''' + with open('ptq_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_yaml) -def build_hessian_trace(): - hessian_trace_config_yaml = ''' - loss: - CrossEntropyLoss - random_seed: - 1 - max_Iteration: - 100 - tolerance: - 1e-3 - enable_op_fuse: - True - max_cal_smaple: - 100 - quantize_mode: - ptq - ''' - with open('./hessian_trace_config_yaml', 'w+', encoding="utf-8") as f: - f.write(hessian_trace_config_yaml) - +class TestPytorchAdaptor(unittest.TestCase): + framework_specific_info = {"device": "cpu", + "approach": "post_training_static_quant", + "random_seed": 1234, + "q_dataloader": None, + "workspace_path": None} + framework = "pytorch" + adaptor = FRAMEWORKS[framework](framework_specific_info) + model = torchvision.models.resnet18() -class Test_hessian_trace(unittest.TestCase): - # boot up test - @classmethod - def setUpClass(cls) -> None: - build_hessian_trace() - cls.model = torchvision.models.resnet18() + # model = torch.quantization.QuantWrapper(model) - # shotdown test @classmethod - def tearDownClass(cls) -> None: - os.remove('./hessian_trace_config_yaml') + def setUpClass(self): + build_ptq_yaml() - # one test case - def test_run_hessian_trace(cls): - """ - hessian_trace_top - Inputs: - model: FP32 model - dataloader: imagenet - """ - model = cls.model - datasets = DATASETS('pytorch') - dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) - dummy_dataloader = PyTorchDataLoader(dummy_dataset) - # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' - # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) - hessian_cmp = Hawq_top(model, yaml_cpu=None, yaml_trace=None, dataloader=dummy_dataloader) - tuning_init_config = hessian_cmp.get_init_config() - # print tuning init_config - for i in tuning_init_config: - print(i) + @classmethod + def tearDownClass(self): + os.remove('ptq_yaml.yaml') + shutil.rmtree('./saved', ignore_errors=True) + shutil.rmtree('runs', ignore_errors=True) + def test_run_hawq_one_trial(self): + from neural_compressor.experimental import Quantization, common + model = copy.deepcopy(self.model) + for fake_yaml in ['ptq_yaml.yaml']: + if fake_yaml == 'ptq_yaml.yaml': + model.eval() + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.calib_dataloader = 
common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.model = model + quantizer() if __name__ == "__main__": unittest.main() + +# def build_hessian_trace(): +# hessian_trace_config_yaml = ''' +# loss: +# CrossEntropyLoss +# random_seed: +# 1 +# max_Iteration: +# 100 +# tolerance: +# 1e-3 +# enable_op_fuse: +# True +# max_cal_smaple: +# 100 +# quantize_mode: +# ptq +# ''' +# with open('./hessian_trace_config_yaml', 'w+', encoding="utf-8") as f: +# f.write(hessian_trace_config_yaml) +# +# +# class Test_hessian_trace(unittest.TestCase): +# # boot up test +# @classmethod +# def setUpClass(cls) -> None: +# build_hessian_trace() +# cls.model = torchvision.models.resnet18() +# +# # shotdown test +# @classmethod +# def tearDownClass(cls) -> None: +# os.remove('./hessian_trace_config_yaml') +# +# # one test case +# def test_run_hessian_trace(cls): +# """ +# hessian_trace_top +# Inputs: +# model: FP32 model +# dataloader: imagenet +# """ +# +# model = cls.model +# datasets = DATASETS('pytorch') +# dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) +# dummy_dataloader = PyTorchDataLoader(dummy_dataset) +# # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' +# # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) +# hessian_cmp = Hawq_top(model, yaml_cpu=None, yaml_trace=None, dataloader=dummy_dataloader) +# tuning_init_config = hessian_cmp.get_init_config() +# # print tuning init_config +# for i in tuning_init_config: +# print(i) + + +# if __name__ == "__main__": +# unittest.main() From 7bb75cc69b5702fc59b52842205bd65ea1130172 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Tue, 15 Nov 2022 14:06:46 +0800 Subject: [PATCH 047/128] change name --- .../strategy/{hawq.py => hawq_wenhuach.py} | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) rename neural_compressor/strategy/{hawq.py => hawq_wenhuach.py} (95%) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq_wenhuach.py similarity index 95% rename from neural_compressor/strategy/hawq.py rename to neural_compressor/strategy/hawq_wenhuach.py index 2dd0287fa2e..6c74401c5fc 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq_wenhuach.py @@ -246,23 +246,25 @@ def __init__(self, model, yaml_trace=None, yaml_cpu=None, dataloader=None) -> No fix_seed(self.random_seed) self.model = model self.model.eval() - model_tmp = copy.deepcopy(model) - model_tmp.eval() - self.model_fused = fuse_fx(model_tmp) - self.model_fused.eval() + if self.enable_op_fuse: + self.model = fuse_fx(self.model) + + # model_tmp = copy.deepcopy(model) + # model_tmp.eval() + # self.model_fused = fuse_fx(model_tmp) + # self.model_fused.eval() def get_init_config(self) -> dict: """ """ # Load a sample from dataloader to compute graident - for inputs, targets in self.dataloader: - break - # Hessian average trace computation + inputs, targets = next(iter(self.dataloader)) + with torch.enable_grad(): - if self.enable_op_fuse: - hawq_cmp = Hessian(self.model_fused, criterion=self.criterion, data=(inputs, targets)) - else: - hawq_cmp = Hessian(self.model, criterion=self.criterion, data=(inputs, targets)) + # if self.enable_op_fuse: + # hawq_cmp = Hessian(self.model_fused, criterion=self.criterion, data=(inputs, targets)) + # else: + hawq_cmp = Hessian(self.model, criterion=self.criterion, data=(inputs, targets)) avg_traces_lst = hawq_cmp.calculate_trace(max_Iter=self.max_Iteration, tolerance=self.tolerance) # fiter none weight layer and save weight layer to match perturbation computation From 312b8aaa85ccf2f5ecbc7bdfbeea5d8e78dbc829 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Tue, 15 Nov 2022 14:10:03 +0800 Subject: [PATCH 048/128] comment test --- test/strategy/test_hawq_wenhuach.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index 4443cd8d486..a470f679cf8 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -11,7 +11,7 @@ from neural_compressor.adaptor.pytorch import TemplateAdaptor from neural_compressor.adaptor import FRAMEWORKS import shutil -from neural_compressor.strategy.hawq import Hawq_top, fix_seed +from neural_compressor.strategy.hawq_wenhuach import Hawq_top, fix_seed fix_seed(1) @@ -77,7 +77,8 @@ def test_run_hawq_one_trial(self): quantizer() if __name__ == "__main__": - unittest.main() + pass + # unittest.main() # def build_hessian_trace(): # hessian_trace_config_yaml = ''' From 90ef088d6364e88577bb100c23f1f3d94fc8c0d6 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:19:56 +0800 Subject: [PATCH 049/128] add activation quantized loss eval --- neural_compressor/strategy/hawq_metric.py | 146 ++++++++++++++-------- 1 file changed, 97 insertions(+), 49 deletions(-) diff --git a/neural_compressor/strategy/hawq_metric.py b/neural_compressor/strategy/hawq_metric.py index acbcd98d740..63db277ab14 100644 --- a/neural_compressor/strategy/hawq_metric.py +++ b/neural_compressor/strategy/hawq_metric.py @@ -37,7 +37,7 @@ def fixed_seed(seed): torch.cuda.manual_seed_all(seed) #parallel cpu torch.backends.cudnn.deterministic = True #make sure results are same on cpu/gpu torch.backends.cudnn.benchmark = True #accelerator -def calculate_params_gradients(model): +def cal_params_grad(model): """ get the gradients and parameters from given model Args: @@ -55,17 +55,7 @@ def calculate_params_gradients(model): params.append(parm) grads.append(0. if parm.grad is None else parm.grad+0.) return params, grads -def calculate_inner_product(list_x,list_y): - """Compute the inner product of two lists of variables list_x,list_y - Args: - list_x: input list variables - list_y: input list variables - return: - sum of inner product - """ - return sum([torch.sum(x * y) for (x, y) in zip(list_x, list_y)]) - -def calculate_vector_product(gradsH, params, v): +def cal_vector_product(gradsH, params, v): """compute the hessian vector product by torch.autograd.grad. 
Agrs: gradsH: gradient at current point @@ -105,7 +95,7 @@ def ptq_calibrate(model, data_loader,num_cal): for sample in calibrate_samples: model(sample) return model -def calculate_perturbation(model_qnt,model_fp32)->dict: +def cal_weights_pertubation(model_qnt,model_fp32)->dict: """calculate weights quantized perturbation using L2 normal Args: model_qnt: quantized model @@ -126,6 +116,44 @@ def calculate_perturbation(model_qnt,model_fp32)->dict: pertur_pair['value']=diff_l2 pertur_lst.append(pertur_pair) return pertur_lst +def cal_act_pertubation(model_fp32,model_qnt,data_loader,num_cal=100)->dict: + """calculate weights quantized perturbation using L2 normal + Args: + model_qunt: quantized model + model_fp32: float model + data_loader: path to datasets + return: + pretur_lst: dict + + """ + ns.prepare_model_outputs(model_fp32, model_qnt) + model_fp32.cpu() + model_fp32.eval() + model_qnt.cpu() + model_qnt.eval() + obv_samples=[] + i=0 + for inputs, targets in data_loader: + obv_samples.append(inputs) + i=i+1 + if i>=num_cal: + break + with torch.no_grad(): + for image in obv_samples: + model_fp32(image) + model_qnt(image) + act_qnt_pairs=[] + act_compare_dict = ns.get_matching_activations(model_fp32, q_module=model_qnt) + for key in act_compare_dict: + op_float_tensor=(act_compare_dict[key]['float'][0]) + op_qnt_tensor=act_compare_dict[key]['quantized'][0].dequantize() + diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) + pertur_pair={"layer_name":'',"value":0} + pertur_pair['layer_name']=key + pertur_pair['value']=diff_l2 + act_qnt_pairs.append(pertur_pair) + return act_qnt_pairs + class Hessian(): """This class used to compute each layer hessian trace from given FP32 model """ @@ -153,7 +181,7 @@ def __init__(self,model,criterion, data=None, dataloader=None,device='cpu') -> N outputs=self.model(self.inputs) loss=self.criterion(outputs,self.targets) loss.backward(create_graph=True) - params, gradSH=calculate_params_gradients(self.model) + params, gradSH=cal_params_grad(self.model) self.params=params self.gradSH=gradSH @@ -176,8 +204,8 @@ def calculate_trace(self,max_Iter=100, tolerance=1e-3): trace_pair={"layer_name":" ", "trace":0} self.model.zero_grad() for i in range(max_Iter): - hv=calculate_vector_product(i_grad,i_param,i_v) # hessian vector - trace_vhv_cur=calculate_inner_product(hv,v).cpu().item()#current point + hv=cal_vector_product(i_grad,i_param,i_v) # hessian vector + trace_vhv_cur=sum([torch.sum(x * y) for (x, y) in zip(hv, v)]) trace_vhv.append(trace_vhv_cur) difference=(np.mean(trace_vhv)-trace)/(abs(trace)+1e-6) if abs(difference) None: self.max_Iteration=100 self.enable_op_fuse=True self.tolerance=1e-6 - self.max_cal_sample=100 + self.max_cal_sample=1 self.quantize_mode='ptq' self.list_dtype=['int8','fp32'] logging.info("Current parameters config for Hutchinson’s algorithm as below:") @@ -230,6 +258,7 @@ def __init__(self,model,yaml_trace=None,yaml_cpu=None,dataloader=None) -> None: model_tmp.eval() self.model_fused= fuse_fx(model_tmp) self.model_fused.eval() + self.hawq_level='L3' #L1:top engievalue L2:avg_trace L3:avg_trace+pertubation def get_init_config(self)->dict: """ @@ -238,6 +267,7 @@ def get_init_config(self)->dict: for inputs, targets in self.dataloader: break #Hessian average trace computation + fixed_seed(self.random_seed) with torch.enable_grad(): if self.enable_op_fuse: hawq_cmp=Hessian(self.model_fused,criterion=self.criterion,data=(inputs,targets)) @@ -246,40 +276,58 @@ def get_init_config(self)->dict: 
avg_traces_lst=hawq_cmp.calculate_trace(max_Iter=self.max_Iteration,tolerance=self.tolerance) #fiter none weight layer and save weight layer to match perturbation computation - avg_traces_lst_weight=[] - for avg_trace_i in avg_traces_lst: - if 'weight' in avg_trace_i['layer_name']: - avg_traces_lst_weight.append(avg_trace_i) - # avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) - if self.quantize_mode=='ptq': - #PTQ quantization - qconfig = get_default_qconfig("fbgemm") - qconfig_dict={"":qconfig} #enable all layers/tensor to quantize - #calibrate - model_prepared=prepare_fx(self.model, qconfig_dict) - model_prepared=ptq_calibrate(model_prepared,data_loader=self.dataloader,num_cal=self.max_cal_sample) - model_prepared.cpu() - model_all_qnt=convert_fx(model_prepared) - #calculate perturbation - pertu_list=calculate_perturbation(model_fp32=self.model,model_qnt=model_all_qnt) - #calculate omiga - for omiga_i in pertu_list: - for avg_trace_i in avg_traces_lst: - if avg_trace_i['layer_name']==omiga_i['layer_name']: - avg_trace_i['trace']=avg_trace_i['trace']*omiga_i['value'] - # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): - # omig_pair={"layer_name":" ", "value":0} - # omig_val=avg_trace_i['trace']*omiga_i['value'] - # omig_pair['layer_name']=avg_trace_i['layer_name'] - # omig_pair['value']=omig_val - # omig_list.append(omig_pair) - # omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) - omig_list_sorted=sorted(avg_traces_lst,key=lambda x:x['trace'],reverse=True) + if self.hawq_level=='L2': + avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) + logging.info("avg_traces desending sorted is:") + for i in avg_traces_lst_sorted: + logging.info(i) + list_sorted=avg_traces_lst_sorted + if self.hawq_level=='L3': + if self.quantize_mode=='ptq': + #PTQ quantization + qconfig = get_default_qconfig("fbgemm") + qconfig_dict={"":qconfig} #enable all layers/tensor to quantize + #calibrate + model_prepared=prepare_fx(self.model, qconfig_dict) + model_prepared=ptq_calibrate(model_prepared,data_loader=self.dataloader,num_cal=self.max_cal_sample) + model_prepared.cpu() + model_all_qnt=convert_fx(model_prepared) + #calculate weights quantized perturbation + weights_pertu_lst=cal_weights_pertubation(model_fp32=self.model,model_qnt=model_all_qnt) + #merge weights quantized perturbation + #generally, fused ops=quantized weights+quantized activation + avg_trace_i=0 + omigs=[] + for wct_i in weights_pertu_lst: + omig_pair={"layer_name":" ", "trace":0} + tmp_value=avg_traces_lst[avg_trace_i]['trace']*wct_i['value'] + omig_pair['layer_name']=avg_traces_lst[avg_trace_i]['layer_name'] + omig_pair['trace']=tmp_value + avg_trace_i=avg_trace_i+2 + omigs.append(omig_pair) + act_pertu_lst=cal_act_pertubation(model_fp32=self.model, model_qnt=model_all_qnt,data_loader=self.dataloader,num_cal=self.max_cal_sample) + avg_trace_i=1 + for act_i in act_pertu_lst: + omig_pair={"layer_name":" ", "trace":0} + tmp_value=avg_traces_lst[avg_trace_i]['trace']+act_i['value'] + omig_pair['layer_name']=avg_traces_lst[avg_trace_i]['layer_name'] + omig_pair['trace']=tmp_value + avg_trace_i=avg_trace_i+2 + omigs.append(omig_pair) + + # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): + # omig_pair={"layer_name":" ", "value":0} + # omig_val=avg_trace_i['trace']*omiga_i['value'] + # omig_pair['layer_name']=avg_trace_i['layer_name'] + # omig_pair['value']=omig_val + # omig_list.append(omig_pair) + # 
omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) + omig_list_sorted=sorted(omigs,key=lambda x:x['trace'],reverse=True) + list_sorted=omig_list_sorted tune_init_config_pairs=[] - # - for i in omig_list_sorted: + for i in list_sorted: tune_init_config_pair={"op_name":'',"op_type":'','trace':0} - if i['layer_name']==omig_list_sorted[0]['layer_name']: + if i['layer_name']==list_sorted[0]['layer_name']: tune_init_config_pair['op_name']=i['layer_name'] tune_init_config_pair['op_type']=self.list_dtype[-1] #setup as float op tune_init_config_pair['trace']=float(i['trace']) From 84fe8829ab4f018eb9de84aee182be96a345e47a Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:21:17 +0800 Subject: [PATCH 050/128] fixed seed for dummy datasets --- test/strategy/test_hessian_trace_inc.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/test/strategy/test_hessian_trace_inc.py b/test/strategy/test_hessian_trace_inc.py index f05b47ca3aa..5285bc619c7 100644 --- a/test/strategy/test_hessian_trace_inc.py +++ b/test/strategy/test_hessian_trace_inc.py @@ -10,7 +10,21 @@ from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader from neural_compressor.adaptor.pytorch import TemplateAdaptor from neural_compressor.strategy.hawq_metric import Hawq_top - +import random +import numpy as np +def fixed_seed(seed): + """Fixed rand seed to make sure results are same in different times on different devices.Eg CPU/GPU + Args: + seed: an integer number + return: None + """ + np.random.seed(seed) #random + random.seed(seed) + torch.manual_seed(seed) #cpu + torch.cuda.manual_seed_all(seed) #parallel cpu + torch.backends.cudnn.deterministic = True #make sure results are same on cpu/gpu + torch.backends.cudnn.benchmark = True #accelerator +fixed_seed(100) def build_hessian_trace(): hessian_trace_config_yaml=''' loss: From f221068657823d6615c5a782dadc0698c4e4275f Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:24:18 +0800 Subject: [PATCH 051/128] for independence hawq tuning strategic --- neural_compressor/strategy/hawq.py | 202 +++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 neural_compressor/strategy/hawq.py diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py new file mode 100644 index 00000000000..17231ceec9d --- /dev/null +++ b/neural_compressor/strategy/hawq.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
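+#
+# A minimal sketch of the op-ordering metric this strategy builds on (an
+# illustration inferred from the hawq_metric utilities, not a definitive
+# upstream API): each quantizable op i is scored by
+#     Omega_i ~= Tr(H_i) * ||Q(W_i) - W_i||_2^2,
+# i.e. the average Hessian trace of the op's parameters scaled by its weight
+# quantization perturbation, and ops are fallen back to fp32 starting from the
+# largest Omega_i. The helper name rank_ops_by_hawq below is hypothetical:
+#
+#     def rank_ops_by_hawq(avg_traces, perturbations):
+#         """Both args map op_name -> float; return op names, most sensitive first."""
+#         omega = {op: avg_traces[op] * perturbations.get(op, 1.0) for op in avg_traces}
+#         return sorted(omega, key=omega.get, reverse=True)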
+ +import copy +import numpy as np +from collections import OrderedDict +from .strategy import strategy_registry, TuneStrategy +from ..utils import logger + +from .st_utils.tuning_sampler import OpTypeWiseTuningSampler, FallbackTuningSampler, ModelWiseTuningSampler +from .st_utils.tuning_structs import OpTuningConfig +from .st_utils.tuning_space import TUNING_ITEMS_LST + +@strategy_registry +class HawqTuneStrategy(TuneStrategy): + """The basic tuning strategy which tunes the low precision model with below order. + + 1. modelwise tuning for all quantizable ops. + 2. fallback tuning from bottom to top to decide the priority of which op has biggest impact + on accuracy. + 3. incremental fallback tuning by fallbacking multiple ops with the order got from #2. + + Args: + model (object): The FP32 model specified for low precision tuning. + conf (Class): The Conf class instance initialized from user yaml + config file. + q_dataloader (generator): Data loader for calibration, mandatory for + post-training quantization. + It is iterable and should yield a tuple (input, + label) for calibration dataset containing label, + or yield (input, _) for label-free calibration + dataset. The input could be a object, list, tuple or + dict, depending on user implementation, as well as + it can be taken as model input. + q_func (function, optional): Reserved for future use. + eval_dataloader (generator, optional): Data loader for evaluation. It is iterable + and should yield a tuple of (input, label). + The input could be a object, list, tuple or dict, + depending on user implementation, as well as it can + be taken as model input. The label should be able + to take as input of supported metrics. If this + parameter is not None, user needs to specify + pre-defined evaluation metrics through configuration + file and should set "eval_func" parameter as None. + Tuner will combine model, eval_dataloader and + pre-defined metrics to run evaluation process. + eval_func (function, optional): The evaluation function provided by user. + This function takes model as parameter, and + evaluation dataset and metrics should be + encapsulated in this function implementation and + outputs a higher-is-better accuracy scalar value. + + The pseudo code should be something like: + + def eval_func(model): + input, label = dataloader() + output = model(input) + accuracy = metric(output, label) + return accuracy + dicts (dict, optional): The dict containing resume information. + Defaults to None. + + """ + + def __init__(self, model, conf, q_dataloader, q_func=None, + eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None): + super( + HawqTuneStrategy, + self).__init__( + model, + conf, + q_dataloader, + q_func, + eval_dataloader, + eval_func, + dicts, + q_hooks) + + def next_tune_cfg(self): + """The generator of yielding next tuning config to traverse by concrete strategies + according to last tuning result. + + Yields: + tune_config (dict): It's a dict containing the tuning configuration to run. 
+ """ + from copy import deepcopy + tuning_space = self.tuning_space + calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options + for calib_sampling_size in calib_sampling_size_lst: + # Initialize the tuning config for each op according to the quantization approach + op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() + # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) + early_stop_tuning = False + stage1_cnt = 0 + quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] + quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] + stage1_max = 1e9 # TODO set a more appropriate value + op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], + op_item_dtype_dict, initial_op_tuning_cfg) + for op_tuning_cfg in op_wise_tuning_sampler: + stage1_cnt += 1 + if early_stop_tuning and stage1_cnt > stage1_max: + logger.info("Early stopping the stage 1.") + break + op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield op_tuning_cfg + # Fallback the ops supported both static and dynamic from static to dynamic + # Tuning items: None + if self.cfg.quantization.approach == 'post_training_auto_quant': + static_dynamic_items = [item for item in tuning_space.query_items_by_quant_mode('static') if + item in tuning_space.query_items_by_quant_mode('dynamic')] + if static_dynamic_items: + logger.info("Fallback all ops that support both dynamic and static to dynamic.") + else: + logger.info("Non ops that support both dynamic") + + new_op_tuning_cfg = deepcopy(self.cur_best_tuning_cfg) + for item in static_dynamic_items: + new_op_tuning_cfg[item.name] = self.initial_dynamic_cfg_based_on_static_cfg( + new_op_tuning_cfg[item.name]) + new_op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield new_op_tuning_cfg + best_op_tuning_cfg_stage1 = deepcopy(self.cur_best_tuning_cfg) + + # Fallback + for target_dtype in ['bf16', 'fp32']: + target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) + fallback_items_lst = [item for item in quant_ops if item in target_type_lst] + if fallback_items_lst: + logger.info(f"Start to fallback op to {target_dtype} one by one.") + self._fallback_started() + #fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, + self.calib_dataloader, + method_args = {'name': 'hessian_trace'}) + fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) + + op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) + initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=False) + op_fallback_acc_impact = OrderedDict() + for op_index, op_tuning_cfg in enumerate(fallback_sampler): + op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield op_tuning_cfg + acc, _ = self.last_tune_result + op_fallback_acc_impact[fallback_items_name_lst[op_index]] = acc + + + # do accumulated fallback according to the order in the previous stage + if len(op_fallback_acc_impact) > 0: + ordered_ops = sorted(op_fallback_acc_impact.keys(), + key=lambda key: op_fallback_acc_impact[key], + reverse=self.higher_is_better) + 
op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) + logger.info(f"Start to accumulate fallback to {target_dtype}.") + initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=True) + for op_tuning_cfg in fallback_sampler: + op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield op_tuning_cfg + + def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg:OpTuningConfig): + op_state = op_static_cfg.get_state() + op_name = op_static_cfg.op_name + op_type = op_static_cfg.op_type + op_quant_mode = 'dynamic' + tuning_space = self.tuning_space + dynamic_state = {} + for att in ['weight', 'activation']: + if att not in op_state: + continue + for item_name, item_val in op_state[att].items(): + att_item = (att, item_name) + if att_item not in TUNING_ITEMS_LST: + continue + if tuning_space.query_item_option((op_name, op_type), op_quant_mode, att_item, item_val): + dynamic_state[att_item] = item_val + else: + quant_mode_item = tuning_space.query_quant_mode_item((op_name, op_type), op_quant_mode) + tuning_item = quant_mode_item.get_option_by_name(att_item) + dynamic_state[att_item] = tuning_item.options[0] if tuning_item else None + return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) + + \ No newline at end of file From c6ebf79959d6405e5e92f53064c4657a77a23b21 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 15 Nov 2022 14:38:19 +0800 Subject: [PATCH 052/128] add a fallback ut --- neural_compressor/strategy/basic.py | 10 +-- .../strategy/{ => st_utils}/hawq_metric.py | 0 .../strategy/{ => st_utils}/hawq_wenhuach.py | 0 test/strategy/test_basic_fallback.py | 73 +++++++++++++++++++ 4 files changed, 78 insertions(+), 5 deletions(-) rename neural_compressor/strategy/{ => st_utils}/hawq_metric.py (100%) rename neural_compressor/strategy/{ => st_utils}/hawq_wenhuach.py (100%) create mode 100644 test/strategy/test_basic_fallback.py diff --git a/neural_compressor/strategy/basic.py b/neural_compressor/strategy/basic.py index 3cc4e38bde2..184a15996f7 100644 --- a/neural_compressor/strategy/basic.py +++ b/neural_compressor/strategy/basic.py @@ -143,11 +143,11 @@ def next_tune_cfg(self): if fallback_items_lst: logger.info(f"Start to fallback op to {target_dtype} one by one.") self._fallback_started() - #fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up - ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, - self.calib_dataloader, - method_args = {'name': 'hessian_trace'}) - fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) + fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + # ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, + # self.calib_dataloader, + # method_args = {'name': 'hessian_trace'}) + #fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) diff --git a/neural_compressor/strategy/hawq_metric.py b/neural_compressor/strategy/st_utils/hawq_metric.py similarity index 100% rename from neural_compressor/strategy/hawq_metric.py rename to 
neural_compressor/strategy/st_utils/hawq_metric.py diff --git a/neural_compressor/strategy/hawq_wenhuach.py b/neural_compressor/strategy/st_utils/hawq_wenhuach.py similarity index 100% rename from neural_compressor/strategy/hawq_wenhuach.py rename to neural_compressor/strategy/st_utils/hawq_wenhuach.py diff --git a/test/strategy/test_basic_fallback.py b/test/strategy/test_basic_fallback.py new file mode 100644 index 00000000000..352c81850c4 --- /dev/null +++ b/test/strategy/test_basic_fallback.py @@ -0,0 +1,73 @@ +import torch +import unittest +import os +import sys +import copy +import torchvision +import torchvision.transforms as transforms +from torch.utils.data import DataLoader +from neural_compressor.data import DATASETS +from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader +from neural_compressor.adaptor.pytorch import TemplateAdaptor +from neural_compressor.adaptor import FRAMEWORKS +import shutil + + +def build_ptq_yaml(): + fake_yaml = ''' + model: + name: resnet18 + framework: pytorch_fx + tuning: + strategy: + name: basic + accuracy_criterion: + absolute: -1 + exit_policy: + timeout: 0 + ''' + with open('ptq_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_yaml) + +class TestPytorchAdaptor(unittest.TestCase): + framework_specific_info = {"device": "cpu", + "approach": "post_training_static_quant", + "random_seed": 1234, + "q_dataloader": None, + "workspace_path": None} + framework = "pytorch" + adaptor = FRAMEWORKS[framework](framework_specific_info) + model = torchvision.models.resnet18() + + # model = torch.quantization.QuantWrapper(model) + + @classmethod + def setUpClass(self): + self.i = 0 + build_ptq_yaml() + + + @classmethod + def tearDownClass(self): + os.remove('ptq_yaml.yaml') + shutil.rmtree('./saved', ignore_errors=True) + shutil.rmtree('runs', ignore_errors=True) + + def test_basic_fallback(self): + def eval_func(model): + self.i -= 1 + return self.i + + from neural_compressor.experimental import Quantization, common + model = copy.deepcopy(self.model) + quantizer = Quantization('ptq_yaml.yaml') + quantizer.eval_func = eval_func + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.model = model + q_model = quantizer() + self.assertTrue(q_model is None) + +if __name__ == "__main__": + unittest.main() From 69f6c2a3c1944e16bb709464c32b630568c3ed2e Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Wed, 16 Nov 2022 14:03:42 +0800 Subject: [PATCH 053/128] update test file --- test/strategy/test_hawq_wenhuach.py | 41 ++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index a470f679cf8..ad7939d5d84 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -11,7 +11,7 @@ from neural_compressor.adaptor.pytorch import TemplateAdaptor from neural_compressor.adaptor import FRAMEWORKS import shutil -from neural_compressor.strategy.hawq_wenhuach import Hawq_top, fix_seed +from neural_compressor.strategy.st_utils.hawq_wenhuach import Hawq_top, fix_seed fix_seed(1) @@ -19,7 +19,7 @@ def build_ptq_yaml(): fake_yaml = ''' model: name: imagenet - framework: pytorch + framework: pytorch_fx quantization: calibration: evaluation: @@ -28,12 +28,12 @@ def build_ptq_yaml(): topk: 1 tuning: strategy: - name: mse + name: hawq accuracy_criterion: relative: -0.1 random_seed: 9527 exit_policy: - max_trials: 1 + max_trials: 3 workspace: path: saved ''' @@ -50,10 +50,17 @@ class TestPytorchAdaptor(unittest.TestCase): adaptor = FRAMEWORKS[framework](framework_specific_info) model = torchvision.models.resnet18() + # from collections import OrderedDict + # model = torch.nn.Sequential(OrderedDict([ + # ('conv1', torch.nn.Conv2d(3, 2, 1, 1)), + # ('conv2', torch.nn.Conv2d(2, 1, 1, 1)), + # ('flat', torch.nn.Flatten()), + # ])) # model = torch.quantization.QuantWrapper(model) @classmethod def setUpClass(self): + self.i = 0 build_ptq_yaml() @@ -63,22 +70,26 @@ def tearDownClass(self): shutil.rmtree('./saved', ignore_errors=True) shutil.rmtree('runs', ignore_errors=True) + + def test_run_hawq_one_trial(self): + def eval_func(model): + self.i -= 1 + return self.i from neural_compressor.experimental import Quantization, common model = copy.deepcopy(self.model) - for fake_yaml in ['ptq_yaml.yaml']: - if fake_yaml == 'ptq_yaml.yaml': - model.eval() - quantizer = Quantization(fake_yaml) - dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_dataloader = common.DataLoader(dataset) - quantizer.model = model - quantizer() + + quantizer = Quantization('ptq_yaml.yaml') + quantizer.eval_func = eval_func + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.model = model + quantizer() if __name__ == "__main__": - pass - # unittest.main() + + unittest.main() # def build_hessian_trace(): # hessian_trace_config_yaml = ''' From 85f1d203461f5919841f5135e9795a30ae8e804a Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 17 Nov 2022 15:04:32 +0800 Subject: [PATCH 054/128] tiny update --- neural_compressor/strategy/hawq.py | 329 +++++++++++++++++++++------- neural_compressor/strategy/mse.py | 3 +- test/strategy/test_hawq_wenhuach.py | 10 +- 3 files changed, 263 insertions(+), 79 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 17231ceec9d..3db5cf0aed5 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -18,6 +18,9 @@ import copy import numpy as np from collections import OrderedDict + +import torch.nn + from .strategy import strategy_registry, TuneStrategy from ..utils import logger @@ -25,6 +28,154 @@ from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST + +class HessianTrace: + def __init__(self, model, conf, adaptor, op_cfgs_list, dataloader): + self.model = model + self.conf = conf ##config + self.op_cfgs_list = op_cfgs_list ##op to get + self.dataloader = dataloader + self.adaptor = adaptor + self.max_iter = 500 + self.tolerance = 1e-5 + self.eps = 1e-6 + self.index = 0 + + # def apply_init(self): + # trace_per_op = self._cal_trace() + # if not trace_per_op: + # raise RuntimeError('Failed to calculate hessian traces!') + # + # perturbations = self._calc_quantization_noise() + # configuration_metric = self._calc_hawq_metric_per_configuration( + # perturbations, trace_per_op) + # config_index = self.choose_configuration(configuration_metric) + # chosen_config = self.op_cfgs_list[config_index] + # return chosen_config, trace_per_op + + def get_device(self, model: torch.nn.Module): + for n, p in model.named_parameters(): + return p.data.device + + def get_gradient(self, model, data, criterion, op_list, device="cpu", retrain_graph=False): + model.zero_grad() + input = data[0] + target = data[1] + output = model(input) + loss = criterion(output, target) + loss.backward(retain_graph=retrain_graph) + gradients = {} + for n, p in model.named_parameters(): + if n in op_list: + continue + gradients[n] = 0 + if p.grad != None: + gradients[n] = p.grad + return gradients + + def get_avg_trace(self, num_batches=2): + """ + Estimates average hessian trace for each parameter + """ + assert num_batches > 0 + ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] + ##num_all_data = num_data_iter * self.dataloader.batch_size + op_list = [item.name for item in self.op_cfgs_list] + criterion = torch.nn.CrossEntropyLoss() ##TODO setting this in config + device = self.get_device(self.model) + + for step, batch in enumerate(self.dataloader): + gradient_dict = self.get_gradient(self.model, batch,criterion, op_list, device=device, retrain_graph=True) + tmp = 1 + if step == num_batches - 1: + break + + + weight_vhp = [] + w_avg_total_trace = 0. + w_avg_traces_per_iter = [] + mean_avg_traces_per_param = None + act_vhp = [] + a_avg_total_trace = 0. + a_avg_traces_per_iter = [] + mean_avg_traces_per_act = None + + for i in range(max_iter): + weight_vhp_list, w_v, \ + act_vhp_list, a_v, op_act_grad = self.adaptor.get_2order_grad(self.model, + criterion, + self.dataloader, + num_data_iter, + qop_list) + if not weight_vhp: + weight_vhp = [np.random.randn(*p.shape) for p in w_v] + for vhp_curr in weight_vhp_list: + weight_vhp = [a + b * float(self.dataloader.batch_size) + 0. 
\ + for a, b in zip(weight_vhp, vhp_curr)] + weight_vhp = [a / float(num_all_data) for a in weight_vhp] + avg_traces_per_param = [np.sum(a * b) / a.size for (a, b) in zip(weight_vhp, w_v)] + w_avg_traces_per_iter.append(avg_traces_per_param) + mean_avg_traces_per_param = np.mean(w_avg_traces_per_iter, axis=0) + w_mean_avg_total_trace = np.sum(mean_avg_traces_per_param) + + w_diff_avg = abs(w_mean_avg_total_trace - w_avg_total_trace) / \ + (w_avg_total_trace + diff_eps) + w_avg_total_trace = w_mean_avg_total_trace + logger.info( + '{}# weights difference_avg={} avg_trace={}'.format( + i, w_diff_avg, w_avg_total_trace)) + + if not act_vhp: + act_vhp = [np.random.randn(*p.shape) for p in a_v] + for vhp_curr in act_vhp_list: + act_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ + for a, b in zip(act_vhp, vhp_curr)] + act_vhp = [a / float(num_all_data) for a in act_vhp] + avg_traces_per_act = [np.sum(a * b) / a.size for (a, b) in zip(act_vhp, a_v)] + a_avg_traces_per_iter.append(avg_traces_per_act) + mean_avg_traces_per_act = np.mean(a_avg_traces_per_iter, axis=0) + a_mean_avg_total_trace = np.sum(mean_avg_traces_per_act) + + a_diff_avg = abs(a_mean_avg_total_trace - a_avg_total_trace) / \ + (a_avg_total_trace + diff_eps) + a_avg_total_trace = a_mean_avg_total_trace + logger.info( + '{}# activation difference_avg={} avg_trace={}'.format( + i, a_diff_avg, a_avg_total_trace)) + + if w_diff_avg < tolerance and a_diff_avg < tolerance: + return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad + + return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad + + def _cal_trace(self): + """ + Calculate the trace for both weight and activation per layer + """ + pass + # trace_estimator = HessianTraceEstimator(self.model, + # self.conf, + # self.adaptor, + # self.op_cfgs_list, + # self.dataloader) + # w_avg_trace, a_avg_trace, op_act_grad = trace_estimator.get_avg_trace() + # + # # mapping trace to op per op_weight_mapping + # weights_name = self.adaptor.get_all_weight_names(self.model) + # op_weight_mapping = self.get_op_weight_mapping() + # trace_per_op = OrderedDict() + # w_op_trace_info = np.zeros(len(op_weight_mapping)) + # for i, (op_name, w_name) in enumerate(op_weight_mapping.items()): + # index = weights_name.index(w_name) + # w_op_trace_info[i] = w_avg_trace[index] + # act_trace = 0.0 + # if op_name in op_act_grad: + # a_index = op_act_grad.index(op_name) + # act_trace = a_avg_trace[a_index] + # trace_per_op[op_name] = (w_avg_trace[index], act_trace) + # return trace_per_op + + @strategy_registry class HawqTuneStrategy(TuneStrategy): """The basic tuning strategy which tunes the low precision model with below order. 
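The get_avg_trace scaffolding above is still unfinished in this work-in-progress patch; as a point of reference, here is a minimal self-contained sketch of the Hutchinson-style estimator it appears to be heading toward (the function name hutchinson_avg_trace and the n_samples default are illustrative assumptions, not part of the patch):

import torch

def hutchinson_avg_trace(model, loss, n_samples=20):
    """Estimate the average Hessian trace per parameter tensor via Hutchinson's method."""
    params = [p for p in model.parameters() if p.requires_grad]
    grads = torch.autograd.grad(loss, params, create_graph=True)
    traces = [0.0] * len(params)
    for _ in range(n_samples):
        # Rademacher probe vectors v with entries in {-1, +1}
        vs = [torch.randint_like(p, high=2) * 2.0 - 1.0 for p in params]
        # Hessian-vector products H_i v_i for every parameter tensor
        hvs = torch.autograd.grad(grads, params, grad_outputs=vs, retain_graph=True)
        for i, (hv, v) in enumerate(zip(hvs, vs)):
            # v^T (H v) / numel estimates Tr(H_i) / numel, the per-element average trace
            traces[i] += (hv * v).sum().item() / v.numel()
    return [t / n_samples for t in traces]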
@@ -91,6 +242,37 @@ def __init__(self, model, conf, q_dataloader, q_func=None, q_hooks) def next_tune_cfg(self): + from copy import deepcopy + tuning_space = self.tuning_space + calib_size = tuning_space.root_item.get_option_by_name('calib_sampling_size').options[0] ##TODO suppoprt list + + # Initialize the tuning config for each op according to the quantization approach + op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() + + quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] + quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] + + target_dtype = "fp32" ##TODO support bf16 + target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) + fp_op_list = [item for item in quant_ops if item in target_type_lst] + orig_eval = True + if self._fp32_model.training: + orig_eval = False + self._fp32_model.train() + ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, fp_op_list, self.calib_dataloader) + ht.get_avg_trace() + # if orig_eval: + # self._fp32_model.eval() + # ht.get_avg_trace() + # tmp = 1 + # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + # ops_sensitivity = self.adaptor.get_hessian_trace(self._fp32_model, + # self.calib_dataloader, + # self. + # method_args={'name': 'hessian_trace'}) + # tmp = 1 + + def next_tune_cfg_bk(self): """The generator of yielding next tuning config to traverse by concrete strategies according to last tuning result. @@ -100,84 +282,85 @@ def next_tune_cfg(self): from copy import deepcopy tuning_space = self.tuning_space calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options - for calib_sampling_size in calib_sampling_size_lst: - # Initialize the tuning config for each op according to the quantization approach - op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() - # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) - early_stop_tuning = False - stage1_cnt = 0 - quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] - quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - stage1_max = 1e9 # TODO set a more appropriate value - op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], - op_item_dtype_dict, initial_op_tuning_cfg) - for op_tuning_cfg in op_wise_tuning_sampler: - stage1_cnt += 1 - if early_stop_tuning and stage1_cnt > stage1_max: - logger.info("Early stopping the stage 1.") - break + + calib_sampling_size = calib_sampling_size_lst[0] + # Initialize the tuning config for each op according to the quantization approach + op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() + # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) + early_stop_tuning = False + stage1_cnt = 0 + quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] + quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] + stage1_max = 1e9 # TODO set a more appropriate value + op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], + op_item_dtype_dict, initial_op_tuning_cfg) + # for op_tuning_cfg in op_wise_tuning_sampler: + # stage1_cnt += 1 + # if early_stop_tuning and stage1_cnt > stage1_max: + # logger.info("Early stopping the stage 
1.") + # break + # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + # yield op_tuning_cfg + # Fallback the ops supported both static and dynamic from static to dynamic + # Tuning items: None + # if self.cfg.quantization.approach == 'post_training_auto_quant': + # static_dynamic_items = [item for item in tuning_space.query_items_by_quant_mode('static') if + # item in tuning_space.query_items_by_quant_mode('dynamic')] + # if static_dynamic_items: + # logger.info("Fallback all ops that support both dynamic and static to dynamic.") + # else: + # logger.info("Non ops that support both dynamic") + # + # new_op_tuning_cfg = deepcopy(self.cur_best_tuning_cfg) + # for item in static_dynamic_items: + # new_op_tuning_cfg[item.name] = self.initial_dynamic_cfg_based_on_static_cfg( + # new_op_tuning_cfg[item.name]) + # new_op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + # yield new_op_tuning_cfg + best_op_tuning_cfg_stage1 = deepcopy(self.cur_best_tuning_cfg) + + # Fallback + for target_dtype in ['bf16', 'fp32']: + target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) + fallback_items_lst = [item for item in quant_ops if item in target_type_lst] + if fallback_items_lst: + logger.info(f"Start to fallback op to {target_dtype} one by one.") + self._fallback_started() + # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up + ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, + self.calib_dataloader, + method_args={'name': 'hessian_trace'}) + + fallback_items_name_lst = sorted(ops_sensitivity, key=lambda items: items[1], reverse=True) + + op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) + initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=False) + op_fallback_acc_impact = OrderedDict() + for op_index, op_tuning_cfg in enumerate(fallback_sampler): op_tuning_cfg['calib_sampling_size'] = calib_sampling_size yield op_tuning_cfg - # Fallback the ops supported both static and dynamic from static to dynamic - # Tuning items: None - if self.cfg.quantization.approach == 'post_training_auto_quant': - static_dynamic_items = [item for item in tuning_space.query_items_by_quant_mode('static') if - item in tuning_space.query_items_by_quant_mode('dynamic')] - if static_dynamic_items: - logger.info("Fallback all ops that support both dynamic and static to dynamic.") - else: - logger.info("Non ops that support both dynamic") - - new_op_tuning_cfg = deepcopy(self.cur_best_tuning_cfg) - for item in static_dynamic_items: - new_op_tuning_cfg[item.name] = self.initial_dynamic_cfg_based_on_static_cfg( - new_op_tuning_cfg[item.name]) - new_op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield new_op_tuning_cfg - best_op_tuning_cfg_stage1 = deepcopy(self.cur_best_tuning_cfg) - - # Fallback - for target_dtype in ['bf16', 'fp32']: - target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) - fallback_items_lst = [item for item in quant_ops if item in target_type_lst] - if fallback_items_lst: - logger.info(f"Start to fallback op to {target_dtype} one by one.") - self._fallback_started() - #fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up - ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, - 
self.calib_dataloader, - method_args = {'name': 'hessian_trace'}) - fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) - - op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) + acc, _ = self.last_tune_result + op_fallback_acc_impact[fallback_items_name_lst[op_index]] = acc + + # do accumulated fallback according to the order in the previous stage + if len(op_fallback_acc_impact) > 0: + ordered_ops = sorted(op_fallback_acc_impact.keys(), + key=lambda key: op_fallback_acc_impact[key], + reverse=self.higher_is_better) + op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) + logger.info(f"Start to accumulate fallback to {target_dtype}.") initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=False) - op_fallback_acc_impact = OrderedDict() - for op_index, op_tuning_cfg in enumerate(fallback_sampler): + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=True) + for op_tuning_cfg in fallback_sampler: op_tuning_cfg['calib_sampling_size'] = calib_sampling_size yield op_tuning_cfg - acc, _ = self.last_tune_result - op_fallback_acc_impact[fallback_items_name_lst[op_index]] = acc - - - # do accumulated fallback according to the order in the previous stage - if len(op_fallback_acc_impact) > 0: - ordered_ops = sorted(op_fallback_acc_impact.keys(), - key=lambda key: op_fallback_acc_impact[key], - reverse=self.higher_is_better) - op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) - logger.info(f"Start to accumulate fallback to {target_dtype}.") - initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=True) - for op_tuning_cfg in fallback_sampler: - op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield op_tuning_cfg - - def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg:OpTuningConfig): + + def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig): op_state = op_static_cfg.get_state() op_name = op_static_cfg.op_name op_type = op_static_cfg.op_type @@ -198,5 +381,3 @@ def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg:OpTuningConfig): tuning_item = quant_mode_item.get_option_by_name(att_item) dynamic_state[att_item] = tuning_item.options[0] if tuning_item else None return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) - - \ No newline at end of file diff --git a/neural_compressor/strategy/mse.py b/neural_compressor/strategy/mse.py index 614984359ba..8dafa35759d 100644 --- a/neural_compressor/strategy/mse.py +++ b/neural_compressor/strategy/mse.py @@ -194,10 +194,11 @@ def initial_op_quant_mode(items_lst, target_quant_mode, op_item_dtype_dict): initial_op_quant_mode(quant_mode_items, quant_mode, op_item_dtype_dict) # step3. 
optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) - early_stop_tuning = False + early_stop_tuning = True stage1_cnt = 0 int8_ops = quant_mode_wise_items['dynamic'] + quant_mode_wise_items['static'] stage1_max = min(5, len(int8_ops)) # TODO set a more appropriate value + stage1_max=-1 op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], op_item_dtype_dict, initial_op_tuning_cfg) for op_tuning_cfg in op_wise_tuning_sampler: diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index ad7939d5d84..a6ee28b9d4a 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -12,7 +12,7 @@ from neural_compressor.adaptor import FRAMEWORKS import shutil from neural_compressor.strategy.st_utils.hawq_wenhuach import Hawq_top, fix_seed - +from torch.quantization.quantize_fx import fuse_fx fix_seed(1) def build_ptq_yaml(): @@ -41,7 +41,7 @@ def build_ptq_yaml(): f.write(fake_yaml) class TestPytorchAdaptor(unittest.TestCase): - framework_specific_info = {"device": "cpu", + framework_specific_info = {"device": "gpu", "approach": "post_training_static_quant", "random_seed": 1234, "q_dataloader": None, @@ -50,6 +50,7 @@ class TestPytorchAdaptor(unittest.TestCase): adaptor = FRAMEWORKS[framework](framework_specific_info) model = torchvision.models.resnet18() + # from collections import OrderedDict # model = torch.nn.Sequential(OrderedDict([ # ('conv1', torch.nn.Conv2d(3, 2, 1, 1)), @@ -78,10 +79,11 @@ def eval_func(model): return self.i from neural_compressor.experimental import Quantization, common model = copy.deepcopy(self.model) - + model.eval() + model = fuse_fx(model) quantizer = Quantization('ptq_yaml.yaml') quantizer.eval_func = eval_func - dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + dataset = quantizer.dataset('dummy', (32, 3, 224, 224), label=True) quantizer.calib_dataloader = common.DataLoader(dataset) quantizer.eval_dataloader = common.DataLoader(dataset) quantizer.model = model From a490187d23bd50e0b5025be21009bfe5ea36124a Mon Sep 17 00:00:00 2001 From: wenhuach
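For reference, a minimal standalone sketch of how the per-op sensitivity scores returned by an interface like calculate_op_sensitivity can be turned into a fallback order. The (op_name, op_type) keys and the score values below are invented for illustration; note that sorting the dict object itself iterates its keys, so keying the sort on the dict's values (as here) is what yields a most-sensitive-first ordering.

from collections import OrderedDict

# Hypothetical sensitivities keyed by (op_name, op_type), mirroring the
# calculate_op_sensitivity docstring; the entries are made up.
ops_sensitivity = {
    ("conv1", "Conv2d"): 0.42,
    ("layer1.0.conv1", "Conv2d"): 1.37,
    ("fc", "Linear"): 0.05,
}

# Sort by the sensitivity value, largest first, so the most sensitive op is
# the first candidate to fall back.
fallback_items_name_lst = sorted(ops_sensitivity,
                                 key=lambda op: ops_sensitivity[op],
                                 reverse=True)

target_dtype = "fp32"
op_dtypes = OrderedDict(zip(fallback_items_name_lst,
                            [target_dtype] * len(fallback_items_name_lst)))
print(op_dtypes)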
Date: Thu, 17 Nov 2022 20:01:48 +0800 Subject: [PATCH 055/128] weight hessian trace, not finished --- neural_compressor/strategy/hawq.py | 242 +++++++++++++++++++---------- 1 file changed, 161 insertions(+), 81 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 3db5cf0aed5..4d3b9489b8f 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -27,19 +27,22 @@ from .st_utils.tuning_sampler import OpTypeWiseTuningSampler, FallbackTuningSampler, ModelWiseTuningSampler from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST - +from torch.quantization.quantize_fx import fuse_fx +import torchvision class HessianTrace: - def __init__(self, model, conf, adaptor, op_cfgs_list, dataloader): + def __init__(self, model, conf, adaptor, weight_list, dataloader): self.model = model self.conf = conf ##config - self.op_cfgs_list = op_cfgs_list ##op to get + self.weight_list = weight_list ##op to get self.dataloader = dataloader self.adaptor = adaptor self.max_iter = 500 self.tolerance = 1e-5 self.eps = 1e-6 self.index = 0 + self.device = self.get_device(self.model) + self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config # def apply_init(self): # trace_per_op = self._cal_trace() @@ -57,22 +60,51 @@ def get_device(self, model: torch.nn.Module): for n, p in model.named_parameters(): return p.data.device - def get_gradient(self, model, data, criterion, op_list, device="cpu", retrain_graph=False): + def get_gradients(self, model, data, criterion, create_graph=False): model.zero_grad() - input = data[0] - target = data[1] + input = data[0].to(self.device) + target = data[1].to(self.device) output = model(input) loss = criterion(output, target) - loss.backward(retain_graph=retrain_graph) - gradients = {} + loss.backward(create_graph=create_graph) + gradients = [] for n, p in model.named_parameters(): - if n in op_list: - continue - gradients[n] = 0 if p.grad != None: - gradients[n] = p.grad + gradient = p.grad + gradients.append(gradient+0.0) ## add 0 to create a copy + model.zero_grad() return gradients + def get_params(self, model): + parameters = [p for p in model.parameters() if p.requires_grad] + return parameters + + def sample_rademacher(self, params): + samples = [] + for param in params: + r = torch.randint_like(param, high=2, device=self.device) + r.masked_fill_(r == 0, -1) + samples.append(r) + return samples + + def hutchinson_one_step(self, params, num_batches): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, batch in enumerate(self.dataloader): + batch_size = batch[0].shape[0] + cnt += batch_size + gradients = self.get_gradients(self.model, batch, self.criterion, create_graph=True) + H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) + H_v = [pre + cur * float(batch_size) + 0.0 for cur, pre in zip(H_v_one, H_v)] + if step == num_batches - 1: + break + if cnt > 0: + H_v = [item / cnt for item in H_v] + v_t_H_v = [torch.sum(h_v * v_t) / h_v.size().numel() for (h_v, v_t) in zip(H_v, v)] + return v_t_H_v + + def get_avg_trace(self, num_batches=2): """ Estimates average hessian trace for each parameter @@ -80,73 +112,75 @@ def get_avg_trace(self, num_batches=2): assert num_batches > 0 ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = num_data_iter * self.dataloader.batch_size - op_list = [item.name for item in self.op_cfgs_list] - criterion = 
torch.nn.CrossEntropyLoss() ##TODO setting this in config - device = self.get_device(self.model) + op_list = self.weight_list - for step, batch in enumerate(self.dataloader): - gradient_dict = self.get_gradient(self.model, batch,criterion, op_list, device=device, retrain_graph=True) - tmp = 1 - if step == num_batches - 1: - break + ##TODO setting this in config + + + params = [p for p in self.model.parameters() if p.requires_grad] + for i in range(self.max_iter): + trace_estimated = self.hutchinson_one_step(params, num_batches) - weight_vhp = [] - w_avg_total_trace = 0. - w_avg_traces_per_iter = [] - mean_avg_traces_per_param = None - act_vhp = [] - a_avg_total_trace = 0. - a_avg_traces_per_iter = [] - mean_avg_traces_per_act = None - - for i in range(max_iter): - weight_vhp_list, w_v, \ - act_vhp_list, a_v, op_act_grad = self.adaptor.get_2order_grad(self.model, - criterion, - self.dataloader, - num_data_iter, - qop_list) - if not weight_vhp: - weight_vhp = [np.random.randn(*p.shape) for p in w_v] - for vhp_curr in weight_vhp_list: - weight_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ - for a, b in zip(weight_vhp, vhp_curr)] - weight_vhp = [a / float(num_all_data) for a in weight_vhp] - avg_traces_per_param = [np.sum(a * b) / a.size for (a, b) in zip(weight_vhp, w_v)] - w_avg_traces_per_iter.append(avg_traces_per_param) - mean_avg_traces_per_param = np.mean(w_avg_traces_per_iter, axis=0) - w_mean_avg_total_trace = np.sum(mean_avg_traces_per_param) - - w_diff_avg = abs(w_mean_avg_total_trace - w_avg_total_trace) / \ - (w_avg_total_trace + diff_eps) - w_avg_total_trace = w_mean_avg_total_trace - logger.info( - '{}# weights difference_avg={} avg_trace={}'.format( - i, w_diff_avg, w_avg_total_trace)) - - if not act_vhp: - act_vhp = [np.random.randn(*p.shape) for p in a_v] - for vhp_curr in act_vhp_list: - act_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ - for a, b in zip(act_vhp, vhp_curr)] - act_vhp = [a / float(num_all_data) for a in act_vhp] - avg_traces_per_act = [np.sum(a * b) / a.size for (a, b) in zip(act_vhp, a_v)] - a_avg_traces_per_iter.append(avg_traces_per_act) - mean_avg_traces_per_act = np.mean(a_avg_traces_per_iter, axis=0) - a_mean_avg_total_trace = np.sum(mean_avg_traces_per_act) - - a_diff_avg = abs(a_mean_avg_total_trace - a_avg_total_trace) / \ - (a_avg_total_trace + diff_eps) - a_avg_total_trace = a_mean_avg_total_trace - logger.info( - '{}# activation difference_avg={} avg_trace={}'.format( - i, a_diff_avg, a_avg_total_trace)) - - if w_diff_avg < tolerance and a_diff_avg < tolerance: - return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad - - return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad + + tmp = 1 + # + # weight_vhp = [] + # w_avg_total_trace = 0. + # w_avg_traces_per_iter = [] + # mean_avg_traces_per_param = None + # act_vhp = [] + # a_avg_total_trace = 0. + # a_avg_traces_per_iter = [] + # mean_avg_traces_per_act = None + # + # for i in range(self.max_iter): + # weight_vhp_list, w_v, \ + # act_vhp_list, a_v, op_act_grad = self.adaptor.get_2order_grad(self.model, + # criterion, + # self.dataloader, + # num_data_iter, + # qop_list) + # if not weight_vhp: + # weight_vhp = [np.random.randn(*p.shape) for p in w_v] + # for vhp_curr in weight_vhp_list: + # weight_vhp = [a + b * float(self.dataloader.batch_size) + 0. 
\ + # for a, b in zip(weight_vhp, vhp_curr)] + # weight_vhp = [a / float(num_all_data) for a in weight_vhp] + # avg_traces_per_param = [np.sum(a * b) / a.size for (a, b) in zip(weight_vhp, w_v)] + # w_avg_traces_per_iter.append(avg_traces_per_param) + # mean_avg_traces_per_param = np.mean(w_avg_traces_per_iter, axis=0) + # w_mean_avg_total_trace = np.sum(mean_avg_traces_per_param) + # + # w_diff_avg = abs(w_mean_avg_total_trace - w_avg_total_trace) / \ + # (w_avg_total_trace + diff_eps) + # w_avg_total_trace = w_mean_avg_total_trace + # logger.info( + # '{}# weights difference_avg={} avg_trace={}'.format( + # i, w_diff_avg, w_avg_total_trace)) + # + # if not act_vhp: + # act_vhp = [np.random.randn(*p.shape) for p in a_v] + # for vhp_curr in act_vhp_list: + # act_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ + # for a, b in zip(act_vhp, vhp_curr)] + # act_vhp = [a / float(num_all_data) for a in act_vhp] + # avg_traces_per_act = [np.sum(a * b) / a.size for (a, b) in zip(act_vhp, a_v)] + # a_avg_traces_per_iter.append(avg_traces_per_act) + # mean_avg_traces_per_act = np.mean(a_avg_traces_per_iter, axis=0) + # a_mean_avg_total_trace = np.sum(mean_avg_traces_per_act) + # + # a_diff_avg = abs(a_mean_avg_total_trace - a_avg_total_trace) / \ + # (a_avg_total_trace + diff_eps) + # a_avg_total_trace = a_mean_avg_total_trace + # logger.info( + # '{}# activation difference_avg={} avg_trace={}'.format( + # i, a_diff_avg, a_avg_total_trace)) + # + # if w_diff_avg < tolerance and a_diff_avg < tolerance: + # return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad + # + # return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad def _cal_trace(self): """ @@ -241,6 +275,46 @@ def __init__(self, model, conf, q_dataloader, q_func=None, dicts, q_hooks) + def is_fused_module(self, module): + """This is a helper function for `_propagate_qconfig_helper` to detecte + if this module is fused. + Args: + module (object): input module + Returns: + (bool): is fused or not + """ + op_type = str(type(module)) + if 'fused' in op_type: + return True + else: + return False + + def get_fused_mapping(self): + # tmp = self.model + # if isinstance(self._fp32_model, torch.nn.Module): + # fx_model = self._fp32_model + # + # model = copy.deepcopy(self._fp32_model) ##orig model + # model.eval() + # fx_model = fuse_fx(model) + model = self._fp32_model + weights_info = dict(model.named_parameters()) + weight_to_op = {} + + module_dict = dict(model.named_modules()) + for op_name, child in model.named_modules(): + if self.is_fused_module(child): + for name, _ in child.named_children(): + if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right + weight_to_op[op_name + "." + name + ".weight"] = op_name + # module_prefix = op_name + '.' 
+ name + # if module_prefix in module_dict: + # module_dict.pop(module_prefix) # remove sub-modules of fused modules + else: + if op_name + ".weight" in weights_info: + weight_to_op[op_name + ".weight"] = op_name + return weight_to_op + def next_tune_cfg(self): from copy import deepcopy tuning_space = self.tuning_space @@ -254,16 +328,21 @@ def next_tune_cfg(self): target_dtype = "fp32" ##TODO support bf16 target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) - fp_op_list = [item for item in quant_ops if item in target_type_lst] + fp_op_list = [item.name for item in quant_ops if item in target_type_lst] + # for n, p in self._fp32_model.named_modules(): + # print(n) + # for n, p in self._fp32_model.named_parameters(): + # print(n) + weight_to_op = self.get_fused_mapping() orig_eval = True if self._fp32_model.training: orig_eval = False self._fp32_model.train() - ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, fp_op_list, self.calib_dataloader) + ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, weight_to_op.keys(), self.calib_dataloader) ht.get_avg_trace() - # if orig_eval: - # self._fp32_model.eval() - # ht.get_avg_trace() + if orig_eval: + self._fp32_model.eval() + # tmp = 1 # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up # ops_sensitivity = self.adaptor.get_hessian_trace(self._fp32_model, @@ -338,6 +417,7 @@ def next_tune_cfg_bk(self): fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], initial_op_tuning_cfg=initial_op_tuning_cfg, op_dtypes=op_dtypes, accumulate=False) + op_fallback_acc_impact = OrderedDict() for op_index, op_tuning_cfg in enumerate(fallback_sampler): op_tuning_cfg['calib_sampling_size'] = calib_sampling_size From 6c683f43ced5a22845f9da3d03505f562cbb0c8b Mon Sep 17 00:00:00 2001 From: wenhuach
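The estimator behind the class above is Hutchinson's identity, tr(H) = E[v^T H v] with Rademacher-distributed v, where the Hessian-vector product H v is obtained by differentiating the gradient a second time instead of forming H explicitly. A self-contained sketch of one probe on a toy model (the model, data, and variable names are illustrative only, not the strategy's API):

import torch

model = torch.nn.Linear(4, 2)
criterion = torch.nn.CrossEntropyLoss()
x, y = torch.randn(8, 4), torch.randint(0, 2, (8,))
params = [p for p in model.parameters() if p.requires_grad]

# First-order gradients kept differentiable via create_graph=True.
loss = criterion(model(x), y)
grads = torch.autograd.grad(loss, params, create_graph=True)

# Rademacher probe vector: entries are +1/-1 with equal probability.
v = [torch.randint_like(p, high=2) * 2 - 1 for p in params]

# Hessian-vector product: differentiate (grads . v) w.r.t. the parameters.
Hv = torch.autograd.grad(grads, params, grad_outputs=v, retain_graph=False)

# One-sample per-parameter trace estimate; the patch averages the elementwise
# product (mean), a sum would give the unnormalized trace instead.
layer_traces = torch.stack([(h * vi).mean() for h, vi in zip(Hv, v)])
print(layer_traces)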
Date: Fri, 18 Nov 2022 11:16:25 +0800 Subject: [PATCH 056/128] bascially finished weight trace --- neural_compressor/strategy/hawq.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 4d3b9489b8f..39fd93fd3ff 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -96,19 +96,20 @@ def hutchinson_one_step(self, params, num_batches): cnt += batch_size gradients = self.get_gradients(self.model, batch, self.criterion, create_graph=True) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) - H_v = [pre + cur * float(batch_size) + 0.0 for cur, pre in zip(H_v_one, H_v)] + H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if step == num_batches - 1: break if cnt > 0: H_v = [item / cnt for item in H_v] - v_t_H_v = [torch.sum(h_v * v_t) / h_v.size().numel() for (h_v, v_t) in zip(H_v, v)] + v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)])##maybe sum is better return v_t_H_v - def get_avg_trace(self, num_batches=2): + + def get_avg_traces(self, num_batches=2): + """ + Estimates average hessian trace for each parameter """ - Estimates average hessian trace for each parameter - """ assert num_batches > 0 ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = num_data_iter * self.dataloader.batch_size @@ -119,8 +120,21 @@ def get_avg_trace(self, num_batches=2): params = [p for p in self.model.parameters() if p.requires_grad] + layer_traces_per_iter = [] + prev_avg_model_trace = 0 for i in range(self.max_iter): - trace_estimated = self.hutchinson_one_step(params, num_batches) + layer_traces = self.hutchinson_one_step(params, num_batches) + layer_traces_per_iter.append(layer_traces) + layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) + model_trace = torch.sum(layer_traces_estimate) + diff_ratio = abs(model_trace-prev_avg_model_trace)/(prev_avg_model_trace+self.eps) + if diff_ratio < self.tolerance and i > 10:##TODO magic number + break + prev_avg_model_trace = model_trace + + layer_traces = layer_traces_estimate + return layer_traces + tmp = 1 @@ -339,7 +353,7 @@ def next_tune_cfg(self): orig_eval = False self._fp32_model.train() ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, weight_to_op.keys(), self.calib_dataloader) - ht.get_avg_trace() + ht.get_avg_traces() if orig_eval: self._fp32_model.eval() From 03993e600f428efe430cd4a66c592c3d03a964ec Mon Sep 17 00:00:00 2001 From: wenhuach
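In plain terms, the loop above draws one v^T H v sample per iteration, keeps a running mean per layer, and stops once the relative change of the total trace, |T_i - T_{i-1}| / (T_{i-1} + eps), drops below the tolerance after a short warm-up. A compact sketch of just that stopping logic, with a random stand-in for the per-sample estimate:

import torch

def one_sample(n_layers=3):
    # stand-in for a single Hutchinson draw of v^T H v per layer
    return torch.randn(n_layers).abs()

tolerance, eps, max_iter = 1e-5, 1e-6, 500
per_iter, prev_total = [], 0.0
for i in range(max_iter):
    per_iter.append(one_sample())
    estimate = torch.mean(torch.stack(per_iter), dim=0)  # running mean per layer
    total = torch.sum(estimate)
    diff_ratio = abs(total - prev_total) / (prev_total + eps)
    if diff_ratio < tolerance and i > 10:  # same warm-up guard as the patch
        break
    prev_total = total
print(estimate)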
Date: Fri, 18 Nov 2022 15:45:52 +0800 Subject: [PATCH 057/128] enable activation gradient hook, activation trace is not finished --- neural_compressor/strategy/hawq.py | 266 +++++++++++------------------ 1 file changed, 101 insertions(+), 165 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 39fd93fd3ff..34d94901167 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -28,42 +28,73 @@ from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST from torch.quantization.quantize_fx import fuse_fx -import torchvision +import torchvision + class HessianTrace: - def __init__(self, model, conf, adaptor, weight_list, dataloader): - self.model = model - self.conf = conf ##config - self.weight_list = weight_list ##op to get + """ + please refer to + Yao, Zhewei, et al. "Pyhessian: Neural networks through the lens of the hessian." 2020 IEEE international conference on big data (Big data). IEEE, 2020. + Dong, Zhen, et al. "Hawq-v2: Hessian aware trace-weighted quantization of neural networks." Advances in neural information processing systems 33 (2020): 18518-18529. + https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/hessian_trace.py + """ + + def __init__(self, model, dataloader, criterion=None): + self.model = model ##TODO need to check fused or not self.dataloader = dataloader - self.adaptor = adaptor self.max_iter = 500 self.tolerance = 1e-5 self.eps = 1e-6 self.index = 0 self.device = self.get_device(self.model) - self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config - - # def apply_init(self): - # trace_per_op = self._cal_trace() - # if not trace_per_op: - # raise RuntimeError('Failed to calculate hessian traces!') - # - # perturbations = self._calc_quantization_noise() - # configuration_metric = self._calc_hawq_metric_per_configuration( - # perturbations, trace_per_op) - # config_index = self.choose_configuration(configuration_metric) - # chosen_config = self.op_cfgs_list[config_index] - # return chosen_config, trace_per_op + self.criterion = criterion + if self.criterion == None: + self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config + self.criterion = self.criterion.to(self.device) + self.weight_to_op, self.op_list = self.get_fused_mapping() + + def is_fused_module(self, module): + """This is a helper function for `_propagate_qconfig_helper` to detecte + if this module is fused. + Args: + module (object): input module + Returns: + (bool): is fused or not + """ + op_type = str(type(module)) + if 'fused' in op_type: + return True + else: + return False + + def get_fused_mapping(self): + model = self.model + weights_info = dict(model.named_parameters()) + weight_to_op = {} + for op_name, child in model.named_modules(): + if self.is_fused_module(child): + for name, _ in child.named_children(): + if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right + weight_to_op[op_name + "." 
+ name + ".weight"] = op_name + break + else: + if op_name + ".weight" in weights_info: + weight_to_op[op_name + ".weight"] = op_name + op_list = [] + for key in weight_to_op.keys(): + op_list.append(weight_to_op[key]) + return weight_to_op, op_list def get_device(self, model: torch.nn.Module): for n, p in model.named_parameters(): return p.data.device - def get_gradients(self, model, data, criterion, create_graph=False): + def get_gradients(self, model, data, criterion, create_graph=False, enable_act=False): model.zero_grad() input = data[0].to(self.device) target = data[1].to(self.device) + if enable_act: + input.requires_grad = True output = model(input) loss = criterion(output, target) loss.backward(create_graph=create_graph) @@ -71,7 +102,7 @@ def get_gradients(self, model, data, criterion, create_graph=False): for n, p in model.named_parameters(): if p.grad != None: gradient = p.grad - gradients.append(gradient+0.0) ## add 0 to create a copy + gradients.append(gradient + 0.0) ## add 0 to create a copy model.zero_grad() return gradients @@ -87,143 +118,88 @@ def sample_rademacher(self, params): samples.append(r) return samples - def hutchinson_one_step(self, params, num_batches): + def hutchinson_one_step(self, params, enable_act, num_batches): v = self.sample_rademacher(params) H_v = [0] * len(v) cnt = 0 - for step, batch in enumerate(self.dataloader): - batch_size = batch[0].shape[0] + for step, data in enumerate(self.dataloader): + batch_size = data[0].shape[0] cnt += batch_size - gradients = self.get_gradients(self.model, batch, self.criterion, create_graph=True) + gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True,enable_act=enable_act) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) - H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] + H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if step == num_batches - 1: break if cnt > 0: H_v = [item / cnt for item in H_v] - v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)])##maybe sum is better + v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v + def backward_hook(self, name): + def grad_hook(model, grad_input, grad_output): + self.layer_acts_grads[name] = [grad_input, grad_output] + return grad_hook + + def forward_hook(self, name): + def enable_input_grad_hook(model, inputs, outputs): + try: + input = inputs[0]##TODO check whether this is right + except: + input = inputs - def get_avg_traces(self, num_batches=2): + if input.is_leaf == False: + if input.requires_grad is False: + input.requires_grad = True + self.layer_acts[name] = input + + return enable_input_grad_hook + + def register_hook(self): + for name, module in self.model.named_modules(): + if name in self.op_list: + forward_handle = module.register_forward_hook(self.forward_hook(name)) + backward_handle = module.register_backward_hook(self.backward_hook(name)) + self.hook_handlers.append(forward_handle) + self.hook_handlers.append(backward_handle) + + def unregister_hook(self): + for handel in self.hook_handlers: + handel.remove() + + def get_avg_traces(self, enable_act=True, num_batches=2): """ Estimates average hessian trace for each parameter """ assert num_batches > 0 + if enable_act: + self.hook_handlers = [] + self.layer_acts = {} + self.layer_acts_grads = {} + self.register_hook() ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = 
num_data_iter * self.dataloader.batch_size - op_list = self.weight_list - + ##op_list = self.op_list ##TODO setting this in config - - params = [p for p in self.model.parameters() if p.requires_grad] layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.hutchinson_one_step(params, num_batches) + layer_traces = self.hutchinson_one_step(params, enable_act, num_batches ) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = torch.sum(layer_traces_estimate) - diff_ratio = abs(model_trace-prev_avg_model_trace)/(prev_avg_model_trace+self.eps) - if diff_ratio < self.tolerance and i > 10:##TODO magic number + diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) + if diff_ratio < self.tolerance and i > 10: ##TODO magic number break prev_avg_model_trace = model_trace layer_traces = layer_traces_estimate + self.unregister_hook() return layer_traces - - tmp = 1 - # - # weight_vhp = [] - # w_avg_total_trace = 0. - # w_avg_traces_per_iter = [] - # mean_avg_traces_per_param = None - # act_vhp = [] - # a_avg_total_trace = 0. - # a_avg_traces_per_iter = [] - # mean_avg_traces_per_act = None - # - # for i in range(self.max_iter): - # weight_vhp_list, w_v, \ - # act_vhp_list, a_v, op_act_grad = self.adaptor.get_2order_grad(self.model, - # criterion, - # self.dataloader, - # num_data_iter, - # qop_list) - # if not weight_vhp: - # weight_vhp = [np.random.randn(*p.shape) for p in w_v] - # for vhp_curr in weight_vhp_list: - # weight_vhp = [a + b * float(self.dataloader.batch_size) + 0. \ - # for a, b in zip(weight_vhp, vhp_curr)] - # weight_vhp = [a / float(num_all_data) for a in weight_vhp] - # avg_traces_per_param = [np.sum(a * b) / a.size for (a, b) in zip(weight_vhp, w_v)] - # w_avg_traces_per_iter.append(avg_traces_per_param) - # mean_avg_traces_per_param = np.mean(w_avg_traces_per_iter, axis=0) - # w_mean_avg_total_trace = np.sum(mean_avg_traces_per_param) - # - # w_diff_avg = abs(w_mean_avg_total_trace - w_avg_total_trace) / \ - # (w_avg_total_trace + diff_eps) - # w_avg_total_trace = w_mean_avg_total_trace - # logger.info( - # '{}# weights difference_avg={} avg_trace={}'.format( - # i, w_diff_avg, w_avg_total_trace)) - # - # if not act_vhp: - # act_vhp = [np.random.randn(*p.shape) for p in a_v] - # for vhp_curr in act_vhp_list: - # act_vhp = [a + b * float(self.dataloader.batch_size) + 0. 
\ - # for a, b in zip(act_vhp, vhp_curr)] - # act_vhp = [a / float(num_all_data) for a in act_vhp] - # avg_traces_per_act = [np.sum(a * b) / a.size for (a, b) in zip(act_vhp, a_v)] - # a_avg_traces_per_iter.append(avg_traces_per_act) - # mean_avg_traces_per_act = np.mean(a_avg_traces_per_iter, axis=0) - # a_mean_avg_total_trace = np.sum(mean_avg_traces_per_act) - # - # a_diff_avg = abs(a_mean_avg_total_trace - a_avg_total_trace) / \ - # (a_avg_total_trace + diff_eps) - # a_avg_total_trace = a_mean_avg_total_trace - # logger.info( - # '{}# activation difference_avg={} avg_trace={}'.format( - # i, a_diff_avg, a_avg_total_trace)) - # - # if w_diff_avg < tolerance and a_diff_avg < tolerance: - # return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad - # - # return mean_avg_traces_per_param, mean_avg_traces_per_act, op_act_grad - - def _cal_trace(self): - """ - Calculate the trace for both weight and activation per layer - """ - pass - # trace_estimator = HessianTraceEstimator(self.model, - # self.conf, - # self.adaptor, - # self.op_cfgs_list, - # self.dataloader) - # w_avg_trace, a_avg_trace, op_act_grad = trace_estimator.get_avg_trace() - # - # # mapping trace to op per op_weight_mapping - # weights_name = self.adaptor.get_all_weight_names(self.model) - # op_weight_mapping = self.get_op_weight_mapping() - # trace_per_op = OrderedDict() - # w_op_trace_info = np.zeros(len(op_weight_mapping)) - # for i, (op_name, w_name) in enumerate(op_weight_mapping.items()): - # index = weights_name.index(w_name) - # w_op_trace_info[i] = w_avg_trace[index] - # act_trace = 0.0 - # if op_name in op_act_grad: - # a_index = op_act_grad.index(op_name) - # act_trace = a_avg_trace[a_index] - # trace_per_op[op_name] = (w_avg_trace[index], act_trace) - # return trace_per_op - - @strategy_registry class HawqTuneStrategy(TuneStrategy): """The basic tuning strategy which tunes the low precision model with below order. @@ -289,46 +265,6 @@ def __init__(self, model, conf, q_dataloader, q_func=None, dicts, q_hooks) - def is_fused_module(self, module): - """This is a helper function for `_propagate_qconfig_helper` to detecte - if this module is fused. - Args: - module (object): input module - Returns: - (bool): is fused or not - """ - op_type = str(type(module)) - if 'fused' in op_type: - return True - else: - return False - - def get_fused_mapping(self): - # tmp = self.model - # if isinstance(self._fp32_model, torch.nn.Module): - # fx_model = self._fp32_model - # - # model = copy.deepcopy(self._fp32_model) ##orig model - # model.eval() - # fx_model = fuse_fx(model) - model = self._fp32_model - weights_info = dict(model.named_parameters()) - weight_to_op = {} - - module_dict = dict(model.named_modules()) - for op_name, child in model.named_modules(): - if self.is_fused_module(child): - for name, _ in child.named_children(): - if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right - weight_to_op[op_name + "." + name + ".weight"] = op_name - # module_prefix = op_name + '.' 
+ name - # if module_prefix in module_dict: - # module_dict.pop(module_prefix) # remove sub-modules of fused modules - else: - if op_name + ".weight" in weights_info: - weight_to_op[op_name + ".weight"] = op_name - return weight_to_op - def next_tune_cfg(self): from copy import deepcopy tuning_space = self.tuning_space @@ -347,12 +283,12 @@ def next_tune_cfg(self): # print(n) # for n, p in self._fp32_model.named_parameters(): # print(n) - weight_to_op = self.get_fused_mapping() + orig_eval = True if self._fp32_model.training: orig_eval = False self._fp32_model.train() - ht = HessianTrace(self._fp32_model, self.cfg, self.adaptor, weight_to_op.keys(), self.calib_dataloader) + ht = HessianTrace(self._fp32_model, self.calib_dataloader) ht.get_avg_traces() if orig_eval: self._fp32_model.eval() From 20bed968470a144ceeda15142fadf0c96779389a Mon Sep 17 00:00:00 2001 From: wenhuach
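To extend the estimate from weights to activations, the patch enables gradients on each tracked op's input inside a forward hook and records the gradients flowing through the op in a backward hook. Below is a small self-contained illustration of that hook pattern; it uses register_full_backward_hook and retain_grad() as safe stand-ins for the deprecated register_backward_hook and the in-place requires_grad flip used above, and the toy model and dict names are illustrative only.

import torch

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
acts, act_grads, handles = {}, {}, []

def forward_hook(name):
    def hook(module, inputs, outputs):
        inp = inputs[0]
        if inp.requires_grad and not inp.is_leaf:
            inp.retain_grad()          # keep .grad on this non-leaf activation
        acts[name] = inp
    return hook

def backward_hook(name):
    def hook(module, grad_input, grad_output):
        act_grads[name] = (grad_input, grad_output)
    return hook

for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        handles.append(module.register_forward_hook(forward_hook(name)))
        handles.append(module.register_full_backward_hook(backward_hook(name)))

model(torch.randn(2, 4)).sum().backward()
for h in handles:                      # always remove hooks when done
    h.remove()
print({n: tuple(t.shape) for n, t in acts.items()}, list(act_grads.keys()))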
Date: Fri, 18 Nov 2022 15:47:25 +0800 Subject: [PATCH 058/128] reformat code --- neural_compressor/strategy/hawq.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 34d94901167..8ec728337b9 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -102,7 +102,7 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F for n, p in model.named_parameters(): if p.grad != None: gradient = p.grad - gradients.append(gradient + 0.0) ## add 0 to create a copy + gradients.append(gradient + 0.0) ## add 0 to create a copy model.zero_grad() return gradients @@ -125,7 +125,7 @@ def hutchinson_one_step(self, params, enable_act, num_batches): for step, data in enumerate(self.dataloader): batch_size = data[0].shape[0] cnt += batch_size - gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True,enable_act=enable_act) + gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True, enable_act=enable_act) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if step == num_batches - 1: @@ -135,16 +135,16 @@ def hutchinson_one_step(self, params, enable_act, num_batches): v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v - def backward_hook(self, name): def grad_hook(model, grad_input, grad_output): self.layer_acts_grads[name] = [grad_input, grad_output] + return grad_hook def forward_hook(self, name): def enable_input_grad_hook(model, inputs, outputs): try: - input = inputs[0]##TODO check whether this is right + input = inputs[0] ##TODO check whether this is right except: input = inputs @@ -167,7 +167,7 @@ def unregister_hook(self): for handel in self.hook_handlers: handel.remove() - def get_avg_traces(self, enable_act=True, num_batches=2): + def get_avg_traces(self, enable_act=False, num_batches=2): """ Estimates average hessian trace for each parameter """ @@ -186,7 +186,7 @@ def get_avg_traces(self, enable_act=True, num_batches=2): layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.hutchinson_one_step(params, enable_act, num_batches ) + layer_traces = self.hutchinson_one_step(params, enable_act, num_batches) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = torch.sum(layer_traces_estimate) From 806290a679ca0882be29fdd65eb945b49f736fae Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Fri, 18 Nov 2022 15:54:48 +0800 Subject: [PATCH 059/128] fix a bug --- neural_compressor/strategy/hawq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 8ec728337b9..7d2331af345 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -196,7 +196,8 @@ def get_avg_traces(self, enable_act=False, num_batches=2): prev_avg_model_trace = model_trace layer_traces = layer_traces_estimate - self.unregister_hook() + if enable_act: + self.unregister_hook() return layer_traces From 4efc18cd5a013f96849c7027a3cac3be9aeb4401 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Mon, 21 Nov 2022 11:06:43 +0800 Subject: [PATCH 060/128] when reset the required grad, something goes wrong --- neural_compressor/strategy/hawq.py | 67 ++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 7d2331af345..228bb249e2a 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -92,6 +92,7 @@ def get_device(self, model: torch.nn.Module): def get_gradients(self, model, data, criterion, create_graph=False, enable_act=False): model.zero_grad() input = data[0].to(self.device) + ##self._input_shape = input.shape ## for resetting input activation target = data[1].to(self.device) if enable_act: input.requires_grad = True @@ -102,7 +103,7 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F for n, p in model.named_parameters(): if p.grad != None: gradient = p.grad - gradients.append(gradient + 0.0) ## add 0 to create a copy + gradients.append(gradient + 0.0) ## add 0 to create a copy model.zero_grad() return gradients @@ -118,7 +119,7 @@ def sample_rademacher(self, params): samples.append(r) return samples - def hutchinson_one_step(self, params, enable_act, num_batches): + def get_hv_one_sample(self, params, enable_act, num_batches): v = self.sample_rademacher(params) H_v = [0] * len(v) cnt = 0 @@ -135,19 +136,17 @@ def hutchinson_one_step(self, params, enable_act, num_batches): v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v - def backward_hook(self, name): - def grad_hook(model, grad_input, grad_output): + def _get_input_grad_hook(self, name): + def input_grad_hook(model, grad_input, grad_output): self.layer_acts_grads[name] = [grad_input, grad_output] + return input_grad_hook - return grad_hook - - def forward_hook(self, name): + def _get_enable_input_grad_hook(self, name): def enable_input_grad_hook(model, inputs, outputs): try: input = inputs[0] ##TODO check whether this is right except: input = inputs - if input.is_leaf == False: if input.requires_grad is False: input.requires_grad = True @@ -155,28 +154,54 @@ def enable_input_grad_hook(model, inputs, outputs): return enable_input_grad_hook - def register_hook(self): + # def _get_disable_input_grad_hook(self, name): + # def disable_input_grad_hook(model, inputs, outputs): + # try: + # input = inputs[0] ##TODO check whether this is right + # except: + # input = inputs + # if input.is_leaf == False:## you can only change requires_grad flags of leaf variables + # if input.requires_grad is True: + # input.requires_grad = False + # + # + # return disable_input_grad_hook + + + def _unregister_hook(self): + for handel in self.hook_handles: + handel.remove() + + def register_input_grad_hooks(self): for name, module in self.model.named_modules(): if name in self.op_list: - forward_handle = module.register_forward_hook(self.forward_hook(name)) - backward_handle = module.register_backward_hook(self.backward_hook(name)) - self.hook_handlers.append(forward_handle) - self.hook_handlers.append(backward_handle) + hook_handle = module.register_forward_hook(self._get_enable_input_grad_hook(name)) + self.hook_handles.append(hook_handle) + hook_handle = module.register_forward_hook(self._get_input_grad_hook(name)) + self.hook_handles.append(hook_handle) + + + def reset_input_gradient_and_hooks(self): + # tmp_input = torch.zeros(self._input_shape, device=self.device) + # for name, module in 
self.model.named_modules(): + # if name in self.op_list: + # hook_handle = module.register_forward_hook(self._get_disable_input_grad_hook(name)) + # self.hook_handles.append(hook_handle) + # self.model(tmp_input) + self._unregister_hook() + - def unregister_hook(self): - for handel in self.hook_handlers: - handel.remove() - def get_avg_traces(self, enable_act=False, num_batches=2): + def get_avg_traces(self, enable_act=True, num_batches=2): """ Estimates average hessian trace for each parameter """ assert num_batches > 0 if enable_act: - self.hook_handlers = [] + self.hook_handles = [] self.layer_acts = {} self.layer_acts_grads = {} - self.register_hook() + self.register_input_grad_hooks() ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = num_data_iter * self.dataloader.batch_size ##op_list = self.op_list @@ -186,7 +211,7 @@ def get_avg_traces(self, enable_act=False, num_batches=2): layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.hutchinson_one_step(params, enable_act, num_batches) + layer_traces = self.get_hv_one_sample(params, enable_act, num_batches) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = torch.sum(layer_traces_estimate) @@ -197,7 +222,7 @@ def get_avg_traces(self, enable_act=False, num_batches=2): layer_traces = layer_traces_estimate if enable_act: - self.unregister_hook() + self.reset_input_gradient_and_hooks() return layer_traces From 62dddf766c0917fe06848e1c7ee74d05893a1258 Mon Sep 17 00:00:00 2001 From: wenhuach
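One further detail of the estimator above: each calibration batch's Hessian-vector product is weighted by its batch size, only num_batches batches are consumed, and the accumulated product is normalized by the total sample count before the elementwise v^T H v reduction. A sketch of just that accumulation, with hv_for_batch() standing in for the real gradient-of-gradient call:

import torch

def hv_for_batch(batch, v):
    # stand-in for the real per-batch Hessian-vector product
    return [torch.randn_like(t) for t in v]

loader = [(torch.randn(8, 4), torch.randint(0, 2, (8,))) for _ in range(4)]
v = [torch.randn(4, 4), torch.randn(4)]        # one probe tensor per parameter
num_batches = 2

H_v, cnt = [torch.zeros_like(t) for t in v], 0
for step, batch in enumerate(loader):
    bs = batch[0].shape[0]
    cnt += bs
    H_v = [acc + cur * float(bs) for acc, cur in zip(H_v, hv_for_batch(batch, v))]
    if step == num_batches - 1:
        break
H_v = [t / cnt for t in H_v]
trace_per_param = torch.stack([(h * vi).mean() for h, vi in zip(H_v, v)])
print(trace_per_param)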
Date: Mon, 21 Nov 2022 15:13:01 +0800 Subject: [PATCH 061/128] add trick imagenet dataset fix one issue --- .../experimental/quantization.py | 32 +++++++++++++++++++ neural_compressor/strategy/hawq.py | 3 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index 3d7b7811ea2..77dfc51d465 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -146,6 +146,38 @@ def pre_process(self): with open(self.resume_file, 'rb') as f: _resume = pickle.load(f).__dict__ + + import torchvision.datasets as datasets + import torchvision.transforms as transforms + data_path = "/mnt/data2/dataset/dataset/imagenet/img_raw" + traindir = os.path.join(data_path, 'train') + valdir = os.path.join(data_path, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + from torch.utils.data import DataLoader + + self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) + self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) + self.strategy = STRATEGIES[strategy]( self._model, self.conf, diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 228bb249e2a..2beef8668b4 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -98,6 +98,7 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F input.requires_grad = True output = model(input) loss = criterion(output, target) + # torch.autograd.backward(loss, create_graph=create_graph) loss.backward(create_graph=create_graph) gradients = [] for n, p in model.named_parameters(): @@ -177,7 +178,7 @@ def register_input_grad_hooks(self): if name in self.op_list: hook_handle = module.register_forward_hook(self._get_enable_input_grad_hook(name)) self.hook_handles.append(hook_handle) - hook_handle = module.register_forward_hook(self._get_input_grad_hook(name)) + hook_handle = module.register_backward_hook(self._get_input_grad_hook(name)) self.hook_handles.append(hook_handle) From 755c38cc34218f5e6ceff8b4b9bb95c3e6fb7e14 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 30 Nov 2022 14:30:37 +0800 Subject: [PATCH 062/128] resolve conflicts Signed-off-by: yiliu30 --- neural_compressor/adaptor/pytorch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index e89f687f81a..3421828a8ab 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -3155,7 +3155,6 @@ def _get_quantizable_ops_recursively(self, model, prefix, quantizable_ops): Returns: None """ - module_dict = dict(model.named_modules()) for op_name, child in model.named_modules(): if self.is_fused_module(child): From 87793cf19718f7b26e00e191aaa60c70b07cabfc Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Mon, 21 Nov 2022 18:51:51 +0800 Subject: [PATCH 063/128] change to eval model, remove bias --- .../quantization/ptq/cpu/fx/conf.yaml | 12 +++-- .../experimental/quantization.py | 2 +- .../strategy/auto_mixed_precision.py | 1 + neural_compressor/strategy/hawq.py | 46 ++++++++++++++----- .../strategy/st_utils/hawq_wenhuach.py | 2 +- neural_compressor/strategy/strategy.py | 3 +- test/strategy/test_hawq_wenhuach.py | 2 +- 7 files changed, 47 insertions(+), 21 deletions(-) diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml index d1dab0d2f43..064656e872b 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml @@ -21,10 +21,10 @@ quantization: # optional. tuning constrai calibration: sampling_size: 300 # optional. default value is 100. used to set how many samples should be used in calibration. dataloader: - batch_size: 30 + batch_size: 1 dataset: ImageFolder: - root: /path/to/calibration/dataset # NOTE: modify to calibration dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to calibration dataset location if needed transform: Resize: size: 256 @@ -40,10 +40,10 @@ evaluation: # optional. required if use metric: topk: 1 # built-in metrics are topk, map, f1, allow user to register new metric. dataloader: - batch_size: 30 + batch_size: 1 dataset: ImageFolder: - root: /path/to/evaluation/dataset # NOTE: modify to evaluation dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -61,7 +61,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /path/to/evaluation/dataset # NOTE: modify to evaluation dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -73,6 +73,8 @@ evaluation: # optional. required if use std: [0.229, 0.224, 0.225] tuning: + strategy: + name: hawq accuracy_criterion: relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%. 
exit_policy: diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index 77dfc51d465..dae0f8611c5 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -149,7 +149,7 @@ def pre_process(self): import torchvision.datasets as datasets import torchvision.transforms as transforms - data_path = "/mnt/data2/dataset/dataset/imagenet/img_raw" + data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" traindir = os.path.join(data_path, 'train') valdir = os.path.join(data_path, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], diff --git a/neural_compressor/strategy/auto_mixed_precision.py b/neural_compressor/strategy/auto_mixed_precision.py index 4b59cf2cced..7fbd759a87e 100644 --- a/neural_compressor/strategy/auto_mixed_precision.py +++ b/neural_compressor/strategy/auto_mixed_precision.py @@ -145,6 +145,7 @@ def traverse(self): if self.baseline is None and (self.eval_dataloader or self.eval_func): logger.info("Get FP32 model baseline.") self.baseline = self._evaluate(self.model) + self.baseline=[0.698,[700]] # record the FP32 baseline self._add_tuning_history() diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 2beef8668b4..09f0b1ef175 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -102,15 +102,15 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F loss.backward(create_graph=create_graph) gradients = [] for n, p in model.named_parameters(): - if p.grad != None: + if p.grad != None and n in self.weight_names: gradient = p.grad gradients.append(gradient + 0.0) ## add 0 to create a copy model.zero_grad() return gradients - def get_params(self, model): - parameters = [p for p in model.parameters() if p.requires_grad] - return parameters + # def get_params(self, model): + # parameters = [p for p in model.parameters() if p.requires_grad] + # return parameters def sample_rademacher(self, params): samples = [] @@ -191,9 +191,13 @@ def reset_input_gradient_and_hooks(self): # self.model(tmp_input) self._unregister_hook() + def get_params(self): + weight_names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias + params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias + self.weight_names = weight_names + self.params = params - - def get_avg_traces(self, enable_act=True, num_batches=2): + def get_avg_traces(self, enable_act=False, num_batches=2): """ Estimates average hessian trace for each parameter """ @@ -207,18 +211,22 @@ def get_avg_traces(self, enable_act=True, num_batches=2): ##num_all_data = num_data_iter * self.dataloader.batch_size ##op_list = self.op_list ##TODO setting this in config - params = [p for p in self.model.parameters() if p.requires_grad] + self.get_params() + # names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + # params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.get_hv_one_sample(params, enable_act, num_batches) + layer_traces = self.get_hv_one_sample(self.params, enable_act, num_batches) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = 
torch.sum(layer_traces_estimate) diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) if diff_ratio < self.tolerance and i > 10: ##TODO magic number break + if i==50:##TODO for debug + break prev_avg_model_trace = model_trace layer_traces = layer_traces_estimate @@ -314,11 +322,25 @@ def next_tune_cfg(self): orig_eval = True if self._fp32_model.training: orig_eval = False - self._fp32_model.train() + self._fp32_model.eval() ht = HessianTrace(self._fp32_model, self.calib_dataloader) - ht.get_avg_traces() - if orig_eval: - self._fp32_model.eval() + traces = ht.get_avg_traces() + if orig_eval==False: + self._fp32_model.train() + + ordered_ops = sorted(op_fallback_acc_impact.keys(), + key=lambda key: op_fallback_acc_impact[key], + reverse=self.higher_is_better) + op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) + logger.info(f"Start to accumulate fallback to {target_dtype}.") + initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=initial_op_tuning_cfg, + op_dtypes=op_dtypes, accumulate=True) + for op_tuning_cfg in fallback_sampler: + op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield op_tuning_cfg + # tmp = 1 # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up diff --git a/neural_compressor/strategy/st_utils/hawq_wenhuach.py b/neural_compressor/strategy/st_utils/hawq_wenhuach.py index 6c74401c5fc..c0ced2af3f4 100644 --- a/neural_compressor/strategy/st_utils/hawq_wenhuach.py +++ b/neural_compressor/strategy/st_utils/hawq_wenhuach.py @@ -10,7 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -from ..utils import logger +from ...utils import logger import torch import numpy as np from torch.autograd import Variable diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py index 63710b43264..58faa5d919a 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -219,7 +219,8 @@ def traverse(self): if self.baseline is None: logger.info("Get FP32 model baseline.") self._fp32_model = self.model - self.baseline = self._evaluate(self.model) + ##self.baseline = self._evaluate(self.model) + self.baseline = [0.698,[700]] # record the FP32 baseline self._add_tuning_history() self.show_baseline_info() diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index a6ee28b9d4a..236d8219e71 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -11,7 +11,7 @@ from neural_compressor.adaptor.pytorch import TemplateAdaptor from neural_compressor.adaptor import FRAMEWORKS import shutil -from neural_compressor.strategy.st_utils.hawq_wenhuach import Hawq_top, fix_seed +from neural_compressor.strategy.st_utils.hawq_wenhuach import fix_seed from torch.quantization.quantize_fx import fuse_fx fix_seed(1) From 7a7520bedcaaf7d1af9787885fd7d93d7e7f164d Mon Sep 17 00:00:00 2001 From: wenhuach
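The patch above also puts the model into eval mode for the trace estimation (restoring the original mode afterwards) and drops bias tensors from the parameter list, so only weight tensors contribute to the per-op traces. A minimal sketch of that selection step, with torchvision's resnet18 used purely as a convenient example model:

import torchvision

model = torchvision.models.resnet18()
was_training = model.training
model.eval()

# Only trainable weight tensors; biases are skipped as in the patch.
weight_names = [n for n, p in model.named_parameters()
                if p.requires_grad and "bias" not in n]
params = [p for n, p in model.named_parameters()
          if p.requires_grad and "bias" not in n]

# ... the Hutchinson loop would differentiate against `params` here ...

if was_training:
    model.train()
print(len(weight_names), weight_names[:2])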
Date: Mon, 21 Nov 2022 19:16:10 +0800 Subject: [PATCH 064/128] fixed weight to op bug --- neural_compressor/strategy/hawq.py | 41 ++++++++++++++++++----------- test/strategy/test_hawq_wenhuach.py | 2 +- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 09f0b1ef175..015d9e678c1 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -75,11 +75,12 @@ def get_fused_mapping(self): if self.is_fused_module(child): for name, _ in child.named_children(): if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right - weight_to_op[op_name + "." + name + ".weight"] = op_name + weight_to_op[op_name + "." + name + ".weight"] = op_name[7:] break else: - if op_name + ".weight" in weights_info: - weight_to_op[op_name + ".weight"] = op_name + name = op_name + ".weight" + if name in weights_info and name not in weight_to_op.keys(): + weight_to_op[op_name + ".weight"] = op_name[7:] op_list = [] for key in weight_to_op.keys(): op_list.append(weight_to_op[key]) @@ -232,7 +233,15 @@ def get_avg_traces(self, enable_act=False, num_batches=2): layer_traces = layer_traces_estimate if enable_act: self.reset_input_gradient_and_hooks() - return layer_traces + weight_name_to_traces={} + + for weigth_name,trace in zip(self.weight_names, layer_traces): + weight_name_to_traces[weigth_name] = trace + op_name_to_trace={} + for weigth_name in self.weight_names: + op_name = self.weight_to_op[weigth_name] + op_name_to_trace[op_name] = weight_name_to_traces[weigth_name] + return op_name_to_trace @strategy_registry @@ -328,18 +337,18 @@ def next_tune_cfg(self): if orig_eval==False: self._fp32_model.train() - ordered_ops = sorted(op_fallback_acc_impact.keys(), - key=lambda key: op_fallback_acc_impact[key], - reverse=self.higher_is_better) - op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) - logger.info(f"Start to accumulate fallback to {target_dtype}.") - initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=True) - for op_tuning_cfg in fallback_sampler: - op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield op_tuning_cfg + # ordered_ops = sorted(op_fallback_acc_impact.keys(), + # key=lambda key: op_fallback_acc_impact[key], + # reverse=self.higher_is_better) + # op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) + # logger.info(f"Start to accumulate fallback to {target_dtype}.") + # initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) + # fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + # initial_op_tuning_cfg=initial_op_tuning_cfg, + # op_dtypes=op_dtypes, accumulate=True) + # for op_tuning_cfg in fallback_sampler: + # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + # yield op_tuning_cfg # tmp = 1 diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index 236d8219e71..a09c83c3452 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -13,7 +13,7 @@ import shutil from neural_compressor.strategy.st_utils.hawq_wenhuach import fix_seed from torch.quantization.quantize_fx import fuse_fx -fix_seed(1) +# fix_seed(1) def build_ptq_yaml(): fake_yaml = ''' From 6cc95b047b3c754f9500d0b8967ea79a91459de1 Mon Sep 17 
00:00:00 2001
From: wenhuach
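Patch 064 above re-keys the per-weight trace estimates by the op that owns each weight, so the strategy can sort ops rather than parameter tensors. A reduced sketch of that bookkeeping (helper names are illustrative, and the fused-module special case is omitted):

import torch

def build_weight_to_op(model):
    # Map '<module>.weight' parameter names to the owning module (op) name.
    weights_info = dict(model.named_parameters())
    return {op_name + ".weight": op_name
            for op_name, _ in model.named_modules()
            if op_name + ".weight" in weights_info}

def traces_by_op(weight_names, layer_traces, weight_to_op):
    # Re-key per-weight trace estimates by op name.
    return {weight_to_op[w]: t for w, t in zip(weight_names, layer_traces)
            if w in weight_to_op}

m = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU(), torch.nn.Conv2d(8, 8, 3))
w2op = build_weight_to_op(m)
print(traces_by_op(list(w2op), [0.12, 0.34], w2op))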
Date: Mon, 21 Nov 2022 19:44:42 +0800 Subject: [PATCH 065/128] still have issues --- neural_compressor/strategy/hawq.py | 16 +++++++++++++++- .../strategy/st_utils/tuning_sampler.py | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 015d9e678c1..dbbaa98e931 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -333,10 +333,24 @@ def next_tune_cfg(self): orig_eval = False self._fp32_model.eval() ht = HessianTrace(self._fp32_model, self.calib_dataloader) - traces = ht.get_avg_traces() + op_to_traces = ht.get_avg_traces() if orig_eval==False: self._fp32_model.train() + ordered_ops = sorted(op_to_traces.keys(), + key=lambda key: op_to_traces[key], + reverse=self.higher_is_better) + op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(ordered_ops))) + logger.info(f"Start to accumulate fallback to {target_dtype}.") + + fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], + initial_op_tuning_cfg=None, + op_dtypes=op_dtypes, accumulate=True) + for op_tuning_cfg in fallback_sampler: + op_tuning_cfg['calib_sampling_size'] = calib_size + yield op_tuning_cfg + + # ordered_ops = sorted(op_fallback_acc_impact.keys(), # key=lambda key: op_fallback_acc_impact[key], # reverse=self.higher_is_better) diff --git a/neural_compressor/strategy/st_utils/tuning_sampler.py b/neural_compressor/strategy/st_utils/tuning_sampler.py index fea140a9e4d..c583f1c2764 100644 --- a/neural_compressor/strategy/st_utils/tuning_sampler.py +++ b/neural_compressor/strategy/st_utils/tuning_sampler.py @@ -263,7 +263,7 @@ def __init__(self, def __iter__(self): new_tune_cfg = copy.deepcopy(self.initial_op_tuning_cfg) - skip_first = True + skip_first = False for op_name_type, target_dtype in self.op_dtypes.items(): if not self.accumulate: new_tune_cfg = copy.deepcopy(self.initial_op_tuning_cfg) From 72a238575b23e6b515030a79c18a754a34c91d93 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 22 Nov 2022 11:01:22 +0800 Subject: [PATCH 066/128] WA for align the op name --- neural_compressor/strategy/hawq.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index dbbaa98e931..bc042f06b2c 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -316,7 +316,22 @@ def next_tune_cfg(self): # Initialize the tuning config for each op according to the quantization approach op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() - + # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) + early_stop_tuning = True + stage1_cnt = 0 + quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] + quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] + stage1_max = 2 # TODO set a more appropriate value + op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], + op_item_dtype_dict, initial_op_tuning_cfg) + for op_tuning_cfg in op_wise_tuning_sampler: + stage1_cnt += 1 + if early_stop_tuning and stage1_cnt > stage1_max: + logger.info("Early stopping the stage 1.") + break + op_tuning_cfg['calib_sampling_size'] = calib_size + yield op_tuning_cfg + # Fallback the ops supported both static and dynamic from static to dynamic quant_ops = quant_mode_wise_items['static'] if 'static' in 
quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] @@ -340,11 +355,16 @@ def next_tune_cfg(self): ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], reverse=self.higher_is_better) - op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(ordered_ops))) + # WA for add op type + op_info_map = {} + for op_info in list(initial_op_tuning_cfg.keys()): + op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) + tmp_ordered_ops = [op_info_map[op_name] for op_name in ordered_ops] + op_dtypes = OrderedDict(zip(tmp_ordered_ops, [target_dtype] * len(ordered_ops))) logger.info(f"Start to accumulate fallback to {target_dtype}.") fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=None, + initial_op_tuning_cfg=op_tuning_cfg, op_dtypes=op_dtypes, accumulate=True) for op_tuning_cfg in fallback_sampler: op_tuning_cfg['calib_sampling_size'] = calib_size From 71a4832e3be2edc0cdb8c95f8fddd2fb99390b08 Mon Sep 17 00:00:00 2001 From: wenhuach
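The workaround in patch 066 exists because the tracer returns bare op names while the tuning config is keyed by (op_name, op_type) tuples, so the trace ordering has to be translated before it can feed the fallback sampler. A hedged sketch of that translation with made-up inputs:

from collections import OrderedDict

def fallback_order(op_to_traces, tuning_cfg_op_keys, target_dtype="fp32", reverse=True):
    # Sort ops by trace, then re-key them onto the (op_name, op_type) tuples
    # used by the tuning space; ops the config does not know about are skipped.
    info_by_name = {name: (name, op_type) for name, op_type in tuning_cfg_op_keys}
    ordered = sorted(op_to_traces, key=op_to_traces.get, reverse=reverse)
    return OrderedDict((info_by_name[name], target_dtype)
                       for name in ordered if name in info_by_name)

op_to_traces = {"conv1": 0.42, "fc": 1.70, "layer1.0.conv1": 0.05}       # illustrative values
cfg_keys = [("conv1", "Conv2d"), ("layer1.0.conv1", "Conv2d"), ("fc", "Linear")]
print(fallback_order(op_to_traces, cfg_keys))

One detail worth double-checking in the patch: the sort uses reverse=self.higher_is_better, but for trace-based sensitivity the natural order is most sensitive first regardless of which direction the accuracy metric improves.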
Date: Tue, 22 Nov 2022 15:36:21 +0800 Subject: [PATCH 067/128] change entry point to main function fx model before entering into quantization --- .../quantization/ptq/cpu/fx/conf.yaml | 6 +- .../quantization/ptq/cpu/fx/main.py | 2 + .../experimental/quantization.py | 62 +++++++++---------- neural_compressor/strategy/hawq.py | 2 +- .../strategy/st_utils/tuning_sampler.py | 2 +- test/strategy/test_hawq_wenhuach.py | 8 +-- 6 files changed, 42 insertions(+), 40 deletions(-) diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml index 064656e872b..4b50b559e6a 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml @@ -24,7 +24,7 @@ quantization: # optional. tuning constrai batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to calibration dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to calibration dataset location if needed transform: Resize: size: 256 @@ -43,7 +43,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to evaluation dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -61,7 +61,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw # NOTE: modify to evaluation dataset location if needed + root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py index 8646048ccf4..30008bfa3db 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py @@ -169,6 +169,8 @@ def main(): if args.tune: from neural_compressor.experimental import Quantization, common model.eval() + from torch.quantization.quantize_fx import fuse_fx + model = fuse_fx(model) quantizer = Quantization("./conf.yaml") quantizer.model = common.Model(model) q_model = quantizer.fit() diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index dae0f8611c5..bdcba064e6e 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -146,37 +146,37 @@ def pre_process(self): with open(self.resume_file, 'rb') as f: _resume = pickle.load(f).__dict__ - - import torchvision.datasets as datasets - import torchvision.transforms as transforms - data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" - traindir = os.path.join(data_path, 'train') - valdir = os.path.join(data_path, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - train_dataset = datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - val_dataset = 
datasets.ImageFolder( - valdir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - from torch.utils.data import DataLoader - - self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) - self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) + # + # import torchvision.datasets as datasets + # import torchvision.transforms as transforms + # data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" + # traindir = os.path.join(data_path, 'train') + # valdir = os.path.join(data_path, 'val') + # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + # std=[0.229, 0.224, 0.225]) + # + # train_dataset = datasets.ImageFolder( + # traindir, + # transforms.Compose([ + # transforms.RandomResizedCrop(224), + # transforms.RandomHorizontalFlip(), + # transforms.ToTensor(), + # normalize, + # ])) + # + # val_dataset = datasets.ImageFolder( + # valdir, + # transforms.Compose([ + # transforms.RandomResizedCrop(224), + # transforms.RandomHorizontalFlip(), + # transforms.ToTensor(), + # normalize, + # ])) + # + # from torch.utils.data import DataLoader + # + # self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) + # self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) self.strategy = STRATEGIES[strategy]( self._model, diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index bc042f06b2c..6db4757aa0c 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -321,7 +321,7 @@ def next_tune_cfg(self): stage1_cnt = 0 quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - stage1_max = 2 # TODO set a more appropriate value + stage1_max = -1 # TODO set a more appropriate value op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], op_item_dtype_dict, initial_op_tuning_cfg) for op_tuning_cfg in op_wise_tuning_sampler: diff --git a/neural_compressor/strategy/st_utils/tuning_sampler.py b/neural_compressor/strategy/st_utils/tuning_sampler.py index c583f1c2764..f311d7c16a4 100644 --- a/neural_compressor/strategy/st_utils/tuning_sampler.py +++ b/neural_compressor/strategy/st_utils/tuning_sampler.py @@ -272,7 +272,7 @@ def __iter__(self): if self.accumulate and skip_first: # skip the first one skip_first = False continue - logger.debug(f"fallback {op_name_type} to {target_dtype}") + logger.info(f"fallback {op_name_type} to {target_dtype}") yield new_tune_cfg # need to skip the first one diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index a09c83c3452..2adcd5a5812 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -74,15 +74,15 @@ def tearDownClass(self): def test_run_hawq_one_trial(self): - def eval_func(model): - self.i -= 1 - return self.i + # def eval_func(model): + # self.i -= 1 + # return self.i from neural_compressor.experimental import Quantization, common model = copy.deepcopy(self.model) model.eval() model = fuse_fx(model) quantizer = Quantization('ptq_yaml.yaml') - quantizer.eval_func = eval_func + ##quantizer.eval_func = eval_func dataset = quantizer.dataset('dummy', (32, 3, 224, 224), label=True) quantizer.calib_dataloader = common.DataLoader(dataset) quantizer.eval_dataloader = common.DataLoader(dataset) From 
d9378c1af0a981a4c75063623e543c534847235a Mon Sep 17 00:00:00 2001
From: wenhuach
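Patch 067 above fuses the FX graph in the example entry point before the model reaches the quantizer. A minimal, hedged illustration of that call on a torchvision model (any conv-bn network works; fuse_fx expects eval mode):

import torch
import torchvision.models as models
from torch.quantization.quantize_fx import fuse_fx

model = models.resnet18()
model.eval()                               # fuse_fx requires eval mode
fused = fuse_fx(model)                     # folds conv+bn(+relu) patterns into single modules
with torch.no_grad():
    print(fused(torch.randn(1, 3, 224, 224)).shape)

Where the fusion happens matters for the tracer: fused and unfused graphs expose different parameter names, which is exactly the mismatch the weight_to_op bookkeeping has to absorb.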
Date: Wed, 23 Nov 2022 10:54:34 +0800 Subject: [PATCH 068/128] get activations and the corresponding gradients --- neural_compressor/strategy/hawq.py | 72 ++++++++++++++++-------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 6db4757aa0c..604008ac2d4 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -67,6 +67,13 @@ def is_fused_module(self, module): else: return False + def mapping_module_to_op(self, name): + length = len("_model.") + if len(name) < length: + return name + else: + return name[length:] + def get_fused_mapping(self): model = self.model weights_info = dict(model.named_parameters()) @@ -75,7 +82,8 @@ def get_fused_mapping(self): if self.is_fused_module(child): for name, _ in child.named_children(): if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right - weight_to_op[op_name + "." + name + ".weight"] = op_name[7:] + + weight_to_op[op_name + "." + name + ".weight"] = self.mapping_module_to_op(op_name) break else: name = op_name + ".weight" @@ -95,8 +103,8 @@ def get_gradients(self, model, data, criterion, create_graph=False, enable_act=F input = data[0].to(self.device) ##self._input_shape = input.shape ## for resetting input activation target = data[1].to(self.device) - if enable_act: - input.requires_grad = True + # if enable_act: + # input.requires_grad = True output = model(input) loss = criterion(output, target) # torch.autograd.backward(loss, create_graph=create_graph) @@ -138,23 +146,24 @@ def get_hv_one_sample(self, params, enable_act, num_batches): v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v - def _get_input_grad_hook(self, name): - def input_grad_hook(model, grad_input, grad_output): + def _get_act_grad_hook(self, name): + def act_grad_hook(model, grad_input, grad_output): self.layer_acts_grads[name] = [grad_input, grad_output] - return input_grad_hook - def _get_enable_input_grad_hook(self, name): - def enable_input_grad_hook(model, inputs, outputs): + return act_grad_hook + + def _get_enable_act_grad_hook(self, name): + def enable_act_grad_hook(model, inputs, outputs): try: input = inputs[0] ##TODO check whether this is right except: input = inputs - if input.is_leaf == False: - if input.requires_grad is False: - input.requires_grad = True - self.layer_acts[name] = input - return enable_input_grad_hook + if input.requires_grad is False: + input.requires_grad = True + self.layer_acts[name] = input + + return enable_act_grad_hook # def _get_disable_input_grad_hook(self, name): # def disable_input_grad_hook(model, inputs, outputs): @@ -169,21 +178,19 @@ def enable_input_grad_hook(model, inputs, outputs): # # return disable_input_grad_hook - def _unregister_hook(self): for handel in self.hook_handles: handel.remove() - def register_input_grad_hooks(self): + def register_act_grad_hooks(self): for name, module in self.model.named_modules(): - if name in self.op_list: - hook_handle = module.register_forward_hook(self._get_enable_input_grad_hook(name)) + if self.mapping_module_to_op(name) in self.op_list: + hook_handle = module.register_forward_hook(self._get_enable_act_grad_hook(name)) self.hook_handles.append(hook_handle) - hook_handle = module.register_backward_hook(self._get_input_grad_hook(name)) + hook_handle = module.register_backward_hook(self._get_act_grad_hook(name)) self.hook_handles.append(hook_handle) - - def 
reset_input_gradient_and_hooks(self): + def reset_act_gradient_and_hooks(self): # tmp_input = torch.zeros(self._input_shape, device=self.device) # for name, module in self.model.named_modules(): # if name in self.op_list: @@ -193,12 +200,13 @@ def reset_input_gradient_and_hooks(self): self._unregister_hook() def get_params(self): - weight_names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias + weight_names = [n for n, p in self.model.named_parameters() if + p.requires_grad and "bias" not in n] ##remove bias params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias self.weight_names = weight_names self.params = params - def get_avg_traces(self, enable_act=False, num_batches=2): + def get_avg_traces(self, enable_act=True, num_batches=2): """ Estimates average hessian trace for each parameter """ @@ -207,7 +215,7 @@ def get_avg_traces(self, enable_act=False, num_batches=2): self.hook_handles = [] self.layer_acts = {} self.layer_acts_grads = {} - self.register_input_grad_hooks() + self.register_act_grad_hooks() ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] ##num_all_data = num_data_iter * self.dataloader.batch_size ##op_list = self.op_list @@ -226,18 +234,18 @@ def get_avg_traces(self, enable_act=False, num_batches=2): diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) if diff_ratio < self.tolerance and i > 10: ##TODO magic number break - if i==50:##TODO for debug + if i == 50: ##TODO for debug break prev_avg_model_trace = model_trace layer_traces = layer_traces_estimate if enable_act: self.reset_input_gradient_and_hooks() - weight_name_to_traces={} + weight_name_to_traces = {} - for weigth_name,trace in zip(self.weight_names, layer_traces): + for weigth_name, trace in zip(self.weight_names, layer_traces): weight_name_to_traces[weigth_name] = trace - op_name_to_trace={} + op_name_to_trace = {} for weigth_name in self.weight_names: op_name = self.weight_to_op[weigth_name] op_name_to_trace[op_name] = weight_name_to_traces[weigth_name] @@ -322,8 +330,8 @@ def next_tune_cfg(self): quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] stage1_max = -1 # TODO set a more appropriate value - op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], - op_item_dtype_dict, initial_op_tuning_cfg) + op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], + op_item_dtype_dict, initial_op_tuning_cfg) for op_tuning_cfg in op_wise_tuning_sampler: stage1_cnt += 1 if early_stop_tuning and stage1_cnt > stage1_max: @@ -349,7 +357,7 @@ def next_tune_cfg(self): self._fp32_model.eval() ht = HessianTrace(self._fp32_model, self.calib_dataloader) op_to_traces = ht.get_avg_traces() - if orig_eval==False: + if orig_eval == False: self._fp32_model.train() ordered_ops = sorted(op_to_traces.keys(), @@ -358,7 +366,7 @@ def next_tune_cfg(self): # WA for add op type op_info_map = {} for op_info in list(initial_op_tuning_cfg.keys()): - op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) + op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) tmp_ordered_ops = [op_info_map[op_name] for op_name in ordered_ops] op_dtypes = OrderedDict(zip(tmp_ordered_ops, [target_dtype] * len(ordered_ops))) logger.info(f"Start to accumulate fallback to {target_dtype}.") @@ -370,7 +378,6 @@ def next_tune_cfg(self): 
op_tuning_cfg['calib_sampling_size'] = calib_size yield op_tuning_cfg - # ordered_ops = sorted(op_fallback_acc_impact.keys(), # key=lambda key: op_fallback_acc_impact[key], # reverse=self.higher_is_better) @@ -384,7 +391,6 @@ def next_tune_cfg(self): # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size # yield op_tuning_cfg - # tmp = 1 # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up # ops_sensitivity = self.adaptor.get_hessian_trace(self._fp32_model, From 17d381f7611b1e004778ac68c9fd53a3f791ea0f Mon Sep 17 00:00:00 2001 From: wenhuach
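The hooks added in patch 068 record each traced op's input activation on the forward pass and that activation's gradient on the backward pass. The sketch below shows the same pattern on a toy model; the names are illustrative, and it uses register_full_backward_hook, the non-deprecated variant of the register_backward_hook call in the patch.

import torch
import torch.nn.functional as F

acts, act_grads = {}, {}

def record_act(name):
    def hook(module, inputs, output):
        acts[name] = inputs[0]                        # the op's input activation
    return hook

def record_act_grad(name):
    def hook(module, grad_input, grad_output):
        act_grads[name] = grad_input[0]               # gradient w.r.t. that activation
    return hook

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU(),
                            torch.nn.Flatten(), torch.nn.Linear(8 * 30 * 30, 10))
handles = []
for name, m in model.named_modules():
    if isinstance(m, (torch.nn.Conv2d, torch.nn.Linear)):
        handles.append(m.register_forward_hook(record_act(name)))
        handles.append(m.register_full_backward_hook(record_act_grad(name)))

x = torch.randn(2, 3, 32, 32, requires_grad=True)     # keeps activations in the autograd graph
y = torch.randint(0, 10, (2,))
F.cross_entropy(model(x), y).backward()
for h in handles:                                      # same cleanup as _unregister_hook above
    h.remove()
print({k: tuple(v.shape) for k, v in act_grads.items()})

Note that flipping requires_grad inside a forward hook only happens after that module's forward has already run, which is presumably why a later revision also enables gradients on the model input before the forward pass.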
Date: Wed, 23 Nov 2022 11:54:27 +0800 Subject: [PATCH 069/128] change fusefx position --- .../quantization/ptq/cpu/fx/main.py | 2 -- neural_compressor/strategy/hawq.py | 18 ++++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py index 30008bfa3db..8646048ccf4 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/main.py @@ -169,8 +169,6 @@ def main(): if args.tune: from neural_compressor.experimental import Quantization, common model.eval() - from torch.quantization.quantize_fx import fuse_fx - model = fuse_fx(model) quantizer = Quantization("./conf.yaml") quantizer.model = common.Model(model) q_model = quantizer.fit() diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 604008ac2d4..2cfac2b5815 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -40,7 +40,9 @@ class HessianTrace: """ def __init__(self, model, dataloader, criterion=None): - self.model = model ##TODO need to check fused or not + from torch.quantization.quantize_fx import fuse_fx + self.model = fuse_fx(model.model) + self.dataloader = dataloader self.max_iter = 500 self.tolerance = 1e-5 @@ -68,11 +70,11 @@ def is_fused_module(self, module): return False def mapping_module_to_op(self, name): - length = len("_model.") - if len(name) < length: - return name - else: - return name[length:] + # length = len("_model.") + # if len(name) < length: + # return name + # else: + return name def get_fused_mapping(self): model = self.model @@ -88,7 +90,7 @@ def get_fused_mapping(self): else: name = op_name + ".weight" if name in weights_info and name not in weight_to_op.keys(): - weight_to_op[op_name + ".weight"] = op_name[7:] + weight_to_op[op_name + ".weight"] = op_name op_list = [] for key in weight_to_op.keys(): op_list.append(weight_to_op[key]) @@ -240,7 +242,7 @@ def get_avg_traces(self, enable_act=True, num_batches=2): layer_traces = layer_traces_estimate if enable_act: - self.reset_input_gradient_and_hooks() + self.reset_act_gradient_and_hooks() weight_name_to_traces = {} for weigth_name, trace in zip(self.weight_names, layer_traces): From d0a3fc7c199a2d165695e1b34190b991921bca15 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Wed, 23 Nov 2022 17:20:16 +0800 Subject: [PATCH 070/128] add weight quant loss, the current key is from quant model --- neural_compressor/strategy/hawq.py | 342 +++++++++++++++++------------ 1 file changed, 201 insertions(+), 141 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 2cfac2b5815..2f6a2e7e074 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -29,6 +29,7 @@ from .st_utils.tuning_space import TUNING_ITEMS_LST from torch.quantization.quantize_fx import fuse_fx import torchvision +from typing import Dict, List, Optional, Any, Union, Callable, Set class HessianTrace: @@ -55,6 +56,22 @@ def __init__(self, model, dataloader, criterion=None): self.criterion = self.criterion.to(self.device) self.weight_to_op, self.op_list = self.get_fused_mapping() + def get_qnt_weight_loss(self, weights_name): + + fp32_model = self.fp32model + + qnt_model = self.q_model + + # print(self.model.state_dict()) + for n, p in self.model.named_parameters(): + print(n) + + print("*" * 20) + + for n, p in self.q_model._model.named_parameters(): + print(n) + pass + def is_fused_module(self, module): """This is a helper function for `_propagate_qconfig_helper` to detecte if this module is fused. @@ -100,7 +117,7 @@ def get_device(self, model: torch.nn.Module): for n, p in model.named_parameters(): return p.data.device - def get_gradients(self, model, data, criterion, create_graph=False, enable_act=False): + def get_gradients(self, model, data, criterion, create_graph=False): model.zero_grad() input = data[0].to(self.device) ##self._input_shape = input.shape ## for resetting input activation @@ -131,14 +148,15 @@ def sample_rademacher(self, params): samples.append(r) return samples - def get_hv_one_sample(self, params, enable_act, num_batches): + def get_vtHv_weight(self, params, num_samples): + num_batches = (num_samples + self.dataloader.batchsize - 1) // self.dataloader v = self.sample_rademacher(params) H_v = [0] * len(v) cnt = 0 for step, data in enumerate(self.dataloader): batch_size = data[0].shape[0] cnt += batch_size - gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True, enable_act=enable_act) + gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if step == num_batches - 1: @@ -148,6 +166,25 @@ def get_hv_one_sample(self, params, enable_act, num_batches): v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v + def get_vtHv_act(self, params, num_samples): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, data in enumerate(self.dataloader): + if cnt >= num_samples: + break + for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 + input = data[0][i:i + 1] + target = data[1][i:i + 1] + + self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) + layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] + layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] + hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) + cnt += 1 + if cnt >= num_samples: + break + def _get_act_grad_hook(self, name): def act_grad_hook(model, grad_input, grad_output): 
self.layer_acts_grads[name] = [grad_input, grad_output] @@ -208,28 +245,12 @@ def get_params(self): self.weight_names = weight_names self.params = params - def get_avg_traces(self, enable_act=True, num_batches=2): - """ - Estimates average hessian trace for each parameter - """ - assert num_batches > 0 - if enable_act: - self.hook_handles = [] - self.layer_acts = {} - self.layer_acts_grads = {} - self.register_act_grad_hooks() - ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] - ##num_all_data = num_data_iter * self.dataloader.batch_size - ##op_list = self.op_list - ##TODO setting this in config - self.get_params() - # names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias - # params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + def get_weight_traces(self, num_samples): layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): - layer_traces = self.get_hv_one_sample(self.params, enable_act, num_batches) + layer_traces = self.get_vtHv_weight(self.params, num_samples) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = torch.sum(layer_traces_estimate) @@ -239,19 +260,152 @@ def get_avg_traces(self, enable_act=True, num_batches=2): if i == 50: ##TODO for debug break prev_avg_model_trace = model_trace - - layer_traces = layer_traces_estimate - if enable_act: - self.reset_act_gradient_and_hooks() weight_name_to_traces = {} - for weigth_name, trace in zip(self.weight_names, layer_traces): - weight_name_to_traces[weigth_name] = trace + for weight_name, trace in zip(self.weight_names, layer_traces): + weight_name_to_traces[weight_name] = trace op_name_to_trace = {} - for weigth_name in self.weight_names: - op_name = self.weight_to_op[weigth_name] - op_name_to_trace[op_name] = weight_name_to_traces[weigth_name] + for weight_name in self.weight_names: + op_name = self.weight_to_op[weight_name] + op_name_to_trace[op_name] = weight_name_to_traces[weight_name] return op_name_to_trace + return layer_traces_estimate + + def get_act_traces(self, num_samples): + self.hook_handles = [] + self.layer_acts = {} + self.layer_acts_grads = {} + self.register_act_grad_hooks() + for i in range(self.max_iter): + pass + + def get_avg_traces(self, enable_act=True, num_samples=100): + """ + Estimates average hessian trace for each parameter + """ + + assert num_samples > 0 + + ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] + ##num_all_data = num_data_iter * self.dataloader.batch_size + ##op_list = self.op_list + ##TODO setting this in config + self.get_params() + # names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + # params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + + ## handle activation + if enable_act: + self.get_act_traces(num_samples) + ##change batchsize to 1 + + # + # layer_traces = layer_traces_estimate + # if enable_act: + # self.reset_act_gradient_and_hooks() + + +##copy from torch.quantization._numeric_suite +def _find_match( + str_list: Union[Dict[str, Any], List[str]], key_str: str, + postfix: str, +) -> Optional[str]: + split_str = key_str.split(".") + if split_str[-1] == postfix: + match_string = "".join(key_str.split(".")[0:-1]) + for s2 in str_list: + pattern1 = "".join(s2.split(".")[0:-1]) + pattern2 = "".join(s2.split(".")[0:-2]) + if match_string == 
pattern1: + return s2 + if match_string == pattern2: + return s2 + + # For matching "fc.weight" and "fc._packed_params._packed_params" + if postfix == "_packed_params": + match_string = "".join(key_str.split(".")[0:-2]) + if len(match_string) == 0: + return None + for s2 in str_list: + pattern1 = "".join(s2.split(".")[0:-1]) + pattern2 = "".join(s2.split(".")[0:-2]) + if match_string == pattern1: + return s2 + if match_string == pattern2: + return s2 + return None + else: + return None + + +##copy form torch.quantization._numeric_suite +def compare_weights( + float_dict: Dict[str, Any], quantized_dict: Dict[str, Any] +) -> Dict[str, Dict[str, torch.Tensor]]: + r"""Compare the weights of the float module with its corresponding quantized + module. Return a dict with key corresponding to module names and each entry being + a dictionary with two keys 'float' and 'quantized', containing the float and + quantized weights. This dict can be used to compare and compute the quantization + error of the weights of float and quantized models. + + Example usage:: + + wt_compare_dict = compare_weights( + float_model.state_dict(), qmodel.state_dict()) + for key in wt_compare_dict: + print( + key, + compute_error( + wt_compare_dict[key]['float'], + wt_compare_dict[key]['quantized'].dequantize() + ) + ) + + Args: + float_dict: state dict of the float model + quantized_dict: state dict of the quantized model + + Return: + weight_dict: dict with key corresponding to module names and each entry being + a dictionary with two keys 'float' and 'quantized', containing the float and + quantized weights + """ + + weight_dict: Dict[str, Dict] = {} + for key in quantized_dict: + match_key = _find_match(float_dict, key, "weight") + if match_key is not None: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[match_key] + weight_dict[key]["quantized"] = quantized_dict[key] + continue + + # For matching "fc.weight" and "fc._packed_params._packed_params" + match_key = _find_match(float_dict, key, "_packed_params") + if match_key is not None: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[match_key] + weight_dict[key]["quantized"] = quantized_dict[key][0] + + # For LSTM + split_str = key.split(".") + if split_str[-1] == "param" and split_str[-3] == "_all_weight_values": + layer = split_str[-2] + module_name = ".".join(split_str[:-3]) + float_weight_ih_key = module_name + ".weight_ih_l" + layer + float_weight_hh_key = module_name + ".weight_hh_l" + layer + if float_weight_ih_key in float_dict and float_weight_hh_key in float_dict: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[float_weight_ih_key] + weight_dict[key]["quantized"] = ( + quantized_dict[key].__getstate__()[0][4][0].__getstate__()[0][0] + ) + weight_dict[key]["float"] = float_dict[float_weight_hh_key] + weight_dict[key]["quantized"] = ( + quantized_dict[key].__getstate__()[0][4][1].__getstate__()[0][0] + ) + + return weight_dict @strategy_registry @@ -331,7 +485,7 @@ def next_tune_cfg(self): stage1_cnt = 0 quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - stage1_max = -1 # TODO set a more appropriate value + stage1_max = 1 # TODO set a more appropriate value op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], op_item_dtype_dict, initial_op_tuning_cfg) for op_tuning_cfg in op_wise_tuning_sampler: @@ -341,6 +495,12 @@ def next_tune_cfg(self): break 
op_tuning_cfg['calib_sampling_size'] = calib_size yield op_tuning_cfg + + # import torch.quantization._numeric_suite as ns + # self.model.eval() + # fused_model = fuse_fx(self.model.model) + # res = compare_weights(fused_model.state_dict(), self.q_model.state_dict()) + # Fallback the ops supported both static and dynamic from static to dynamic quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] @@ -358,6 +518,16 @@ def next_tune_cfg(self): orig_eval = False self._fp32_model.eval() ht = HessianTrace(self._fp32_model, self.calib_dataloader) + + q_model_state_dict = { + } + for key in self.q_model.state_dict().keys(): + length = len("_model.") + new_key = key[length:] + q_model_state_dict[new_key] = self.q_model.state_dict()[key] + + weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) + op_to_traces = ht.get_avg_traces() if orig_eval == False: self._fp32_model.train() @@ -380,116 +550,6 @@ def next_tune_cfg(self): op_tuning_cfg['calib_sampling_size'] = calib_size yield op_tuning_cfg - # ordered_ops = sorted(op_fallback_acc_impact.keys(), - # key=lambda key: op_fallback_acc_impact[key], - # reverse=self.higher_is_better) - # op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) - # logger.info(f"Start to accumulate fallback to {target_dtype}.") - # initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - # fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - # initial_op_tuning_cfg=initial_op_tuning_cfg, - # op_dtypes=op_dtypes, accumulate=True) - # for op_tuning_cfg in fallback_sampler: - # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - # yield op_tuning_cfg - - # tmp = 1 - # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up - # ops_sensitivity = self.adaptor.get_hessian_trace(self._fp32_model, - # self.calib_dataloader, - # self. - # method_args={'name': 'hessian_trace'}) - # tmp = 1 - - def next_tune_cfg_bk(self): - """The generator of yielding next tuning config to traverse by concrete strategies - according to last tuning result. - - Yields: - tune_config (dict): It's a dict containing the tuning configuration to run. 
- """ - from copy import deepcopy - tuning_space = self.tuning_space - calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options - - calib_sampling_size = calib_sampling_size_lst[0] - # Initialize the tuning config for each op according to the quantization approach - op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() - # Optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) - early_stop_tuning = False - stage1_cnt = 0 - quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] - quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - stage1_max = 1e9 # TODO set a more appropriate value - op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], - op_item_dtype_dict, initial_op_tuning_cfg) - # for op_tuning_cfg in op_wise_tuning_sampler: - # stage1_cnt += 1 - # if early_stop_tuning and stage1_cnt > stage1_max: - # logger.info("Early stopping the stage 1.") - # break - # op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - # yield op_tuning_cfg - # Fallback the ops supported both static and dynamic from static to dynamic - # Tuning items: None - # if self.cfg.quantization.approach == 'post_training_auto_quant': - # static_dynamic_items = [item for item in tuning_space.query_items_by_quant_mode('static') if - # item in tuning_space.query_items_by_quant_mode('dynamic')] - # if static_dynamic_items: - # logger.info("Fallback all ops that support both dynamic and static to dynamic.") - # else: - # logger.info("Non ops that support both dynamic") - # - # new_op_tuning_cfg = deepcopy(self.cur_best_tuning_cfg) - # for item in static_dynamic_items: - # new_op_tuning_cfg[item.name] = self.initial_dynamic_cfg_based_on_static_cfg( - # new_op_tuning_cfg[item.name]) - # new_op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - # yield new_op_tuning_cfg - best_op_tuning_cfg_stage1 = deepcopy(self.cur_best_tuning_cfg) - - # Fallback - for target_dtype in ['bf16', 'fp32']: - target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) - fallback_items_lst = [item for item in quant_ops if item in target_type_lst] - if fallback_items_lst: - logger.info(f"Start to fallback op to {target_dtype} one by one.") - self._fallback_started() - # fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up - ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, - self.calib_dataloader, - method_args={'name': 'hessian_trace'}) - - fallback_items_name_lst = sorted(ops_sensitivity, key=lambda items: items[1], reverse=True) - - op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) - initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=False) - - op_fallback_acc_impact = OrderedDict() - for op_index, op_tuning_cfg in enumerate(fallback_sampler): - op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield op_tuning_cfg - acc, _ = self.last_tune_result - op_fallback_acc_impact[fallback_items_name_lst[op_index]] = acc - - # do accumulated fallback according to the order in the previous stage - if len(op_fallback_acc_impact) > 0: - ordered_ops = sorted(op_fallback_acc_impact.keys(), - key=lambda key: op_fallback_acc_impact[key], - 
reverse=self.higher_is_better) - op_dtypes = OrderedDict(zip(ordered_ops, [target_dtype] * len(fallback_items_name_lst))) - logger.info(f"Start to accumulate fallback to {target_dtype}.") - initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) - fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], - initial_op_tuning_cfg=initial_op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=True) - for op_tuning_cfg in fallback_sampler: - op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield op_tuning_cfg - def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig): op_state = op_static_cfg.get_state() op_name = op_static_cfg.op_name From c46653971bceb06040635d1deef3982d897ee480 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Wed, 23 Nov 2022 19:06:28 +0800 Subject: [PATCH 071/128] add weights_quant loss eval still bugs for get avg traces --- neural_compressor/strategy/hawq.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 2f6a2e7e074..897dfcffea2 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -527,8 +527,16 @@ def next_tune_cfg(self): q_model_state_dict[new_key] = self.q_model.state_dict()[key] weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) - + pertur_lst={} + for key in weight_quant_loss: + op_float_tensor=weight_quant_loss[key]['float'] + op_qnt_tensor=weight_quant_loss[key]['quantized'].dequantize() + diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) #Formula: L2=||Q(w)-w||p^2 + pertur_lst[key]=diff_l2 + # for i in pertur_lst: + # print(pertur_lst[i]) op_to_traces = ht.get_avg_traces() + print(op_to_traces) if orig_eval == False: self._fp32_model.train() From c4c00cad00f9dd364d88ff5e9e95dc0b44a612e9 Mon Sep 17 00:00:00 2001 From: wenhuach
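The perturbation added in patch 071 is the squared L2 distance between each FP32 weight and its dequantized counterpart, taken from the float/quantized pairs that compare_weights returns. A self-contained sketch using the upstream numeric-suite helper on a dynamically quantized toy model (illustrative only; the strategy compares the fused FP32 model against the tuned q_model instead):

import torch
import torch.quantization._numeric_suite as ns

float_model = torch.nn.Sequential(torch.nn.Linear(16, 8), torch.nn.ReLU(),
                                  torch.nn.Linear(8, 4)).eval()
q_model = torch.quantization.quantize_dynamic(float_model, {torch.nn.Linear},
                                              dtype=torch.qint8)

wt_cmp = ns.compare_weights(float_model.state_dict(), q_model.state_dict())
perturbation = {}
for key, pair in wt_cmp.items():
    w_fp32 = pair["float"]
    w_int8 = pair["quantized"].dequantize()
    perturbation[key] = torch.norm(w_fp32 - w_int8, p=2) ** 2     # ||Q(w) - w||_2^2
print(perturbation)

In the strategy itself the quantized state dict keys carry a leading "_model." prefix, which is why they are stripped before the comparison.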
Date: Thu, 24 Nov 2022 14:12:48 +0800 Subject: [PATCH 072/128] fixed weight trace issue --- neural_compressor/strategy/hawq.py | 50 ++++++++---------------------- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 897dfcffea2..65c7ab72d82 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -55,22 +55,7 @@ def __init__(self, model, dataloader, criterion=None): self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config self.criterion = self.criterion.to(self.device) self.weight_to_op, self.op_list = self.get_fused_mapping() - - def get_qnt_weight_loss(self, weights_name): - - fp32_model = self.fp32model - - qnt_model = self.q_model - - # print(self.model.state_dict()) - for n, p in self.model.named_parameters(): - print(n) - - print("*" * 20) - - for n, p in self.q_model._model.named_parameters(): - print(n) - pass + self.get_params() def is_fused_module(self, module): """This is a helper function for `_propagate_qconfig_helper` to detecte @@ -149,7 +134,6 @@ def sample_rademacher(self, params): return samples def get_vtHv_weight(self, params, num_samples): - num_batches = (num_samples + self.dataloader.batchsize - 1) // self.dataloader v = self.sample_rademacher(params) H_v = [0] * len(v) cnt = 0 @@ -159,7 +143,7 @@ def get_vtHv_weight(self, params, num_samples): gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] - if step == num_batches - 1: + if cnt >=num_samples: break if cnt > 0: H_v = [item / cnt for item in H_v] @@ -246,7 +230,6 @@ def get_params(self): self.params = params def get_weight_traces(self, num_samples): - layer_traces_per_iter = [] prev_avg_model_trace = 0 for i in range(self.max_iter): @@ -261,7 +244,7 @@ def get_weight_traces(self, num_samples): break prev_avg_model_trace = model_trace weight_name_to_traces = {} - + layer_traces = layer_traces_estimate for weight_name, trace in zip(self.weight_names, layer_traces): weight_name_to_traces[weight_name] = trace op_name_to_trace = {} @@ -269,7 +252,6 @@ def get_weight_traces(self, num_samples): op_name = self.weight_to_op[weight_name] op_name_to_trace[op_name] = weight_name_to_traces[weight_name] return op_name_to_trace - return layer_traces_estimate def get_act_traces(self, num_samples): self.hook_handles = [] @@ -279,24 +261,18 @@ def get_act_traces(self, num_samples): for i in range(self.max_iter): pass - def get_avg_traces(self, enable_act=True, num_samples=100): + def get_avg_traces(self, enable_act=True, num_samples=32): """ Estimates average hessian trace for each parameter """ assert num_samples > 0 - - ##num_data_iter = self.op_cfgs_list[0]['calib_iteration'] - ##num_all_data = num_data_iter * self.dataloader.batch_size - ##op_list = self.op_list - ##TODO setting this in config - self.get_params() - # names = [n for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias - # params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n]##remove bias + weight_traces = self.get_weight_traces(num_samples) + return weight_traces ## handle activation - if enable_act: - self.get_act_traces(num_samples) + # if enable_act: + # self.get_act_traces(num_samples) ##change batchsize to 1 # @@ -527,12 +503,12 @@ def 
next_tune_cfg(self): q_model_state_dict[new_key] = self.q_model.state_dict()[key] weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) - pertur_lst={} + pertur_lst = {} for key in weight_quant_loss: - op_float_tensor=weight_quant_loss[key]['float'] - op_qnt_tensor=weight_quant_loss[key]['quantized'].dequantize() - diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) #Formula: L2=||Q(w)-w||p^2 - pertur_lst[key]=diff_l2 + op_float_tensor = weight_quant_loss[key]['float'] + op_qnt_tensor = weight_quant_loss[key]['quantized'].dequantize() + diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 + pertur_lst[key] = diff_l2 # for i in pertur_lst: # print(pertur_lst[i]) op_to_traces = ht.get_avg_traces() From 85fac870c000efff48ec9b801a9a8b6a3d3fc736 Mon Sep 17 00:00:00 2001 From: wenhuach
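With a per-op average trace and a per-op weight perturbation both available, a HAWQ-V2-style sensitivity score is simply their product. The patches above compute the two ingredients but do not yet combine them, so the following is only a sketch of one plausible combination (op names and values are illustrative):

from collections import OrderedDict

def rank_ops_by_sensitivity(op_to_trace, op_to_perturbation, fallback_dtype="fp32"):
    # Omega_op = avg_trace(H_op) * ||Q(W_op) - W_op||_2^2 ;
    # a larger Omega means quantizing this op hurts more, so fall it back first.
    omega = {op: float(op_to_trace[op]) * float(op_to_perturbation[op])
             for op in op_to_trace if op in op_to_perturbation}
    ordered = sorted(omega, key=omega.get, reverse=True)
    return OrderedDict((op, fallback_dtype) for op in ordered)

print(rank_ops_by_sensitivity({"conv1": 2.1, "fc": 0.3},            # illustrative traces
                              {"conv1": 4.0e-3, "fc": 9.0e-2}))     # illustrative ||dW||^2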
Date: Thu, 24 Nov 2022 14:15:11 +0800 Subject: [PATCH 073/128] fixed weight trace issue --- .../experimental/quantization.py | 62 +++++++++---------- test/strategy/test_hawq_wenhuach.py | 10 +-- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index bdcba064e6e..dae0f8611c5 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -146,37 +146,37 @@ def pre_process(self): with open(self.resume_file, 'rb') as f: _resume = pickle.load(f).__dict__ - # - # import torchvision.datasets as datasets - # import torchvision.transforms as transforms - # data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" - # traindir = os.path.join(data_path, 'train') - # valdir = os.path.join(data_path, 'val') - # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - # std=[0.229, 0.224, 0.225]) - # - # train_dataset = datasets.ImageFolder( - # traindir, - # transforms.Compose([ - # transforms.RandomResizedCrop(224), - # transforms.RandomHorizontalFlip(), - # transforms.ToTensor(), - # normalize, - # ])) - # - # val_dataset = datasets.ImageFolder( - # valdir, - # transforms.Compose([ - # transforms.RandomResizedCrop(224), - # transforms.RandomHorizontalFlip(), - # transforms.ToTensor(), - # normalize, - # ])) - # - # from torch.utils.data import DataLoader - # - # self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) - # self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) + + import torchvision.datasets as datasets + import torchvision.transforms as transforms + data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" + traindir = os.path.join(data_path, 'train') + valdir = os.path.join(data_path, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + from torch.utils.data import DataLoader + + self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) + self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) self.strategy = STRATEGIES[strategy]( self._model, diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py index 2adcd5a5812..df70e32cd9e 100644 --- a/test/strategy/test_hawq_wenhuach.py +++ b/test/strategy/test_hawq_wenhuach.py @@ -74,15 +74,15 @@ def tearDownClass(self): def test_run_hawq_one_trial(self): - # def eval_func(model): - # self.i -= 1 - # return self.i + def eval_func(model): + self.i -= 1 + return self.i from neural_compressor.experimental import Quantization, common model = copy.deepcopy(self.model) model.eval() - model = fuse_fx(model) + # model = fuse_fx(model) quantizer = Quantization('ptq_yaml.yaml') - ##quantizer.eval_func = eval_func + quantizer.eval_func = eval_func dataset = quantizer.dataset('dummy', (32, 3, 224, 224), label=True) quantizer.calib_dataloader = common.DataLoader(dataset) quantizer.eval_dataloader = common.DataLoader(dataset) From dc28247c6f21814657f6bee0bec82852b2f1979d Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 24 Nov 2022 17:12:30 +0800 Subject: [PATCH 074/128] act traces have some issues --- neural_compressor/strategy/hawq.py | 216 +++++++++++++++++------------ 1 file changed, 124 insertions(+), 92 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 65c7ab72d82..c9f8c4488da 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -28,7 +28,7 @@ from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST from torch.quantization.quantize_fx import fuse_fx -import torchvision + from typing import Dict, List, Optional, Any, Union, Callable, Set @@ -41,8 +41,7 @@ class HessianTrace: """ def __init__(self, model, dataloader, criterion=None): - from torch.quantization.quantize_fx import fuse_fx - self.model = fuse_fx(model.model) + self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused self.dataloader = dataloader self.max_iter = 500 @@ -102,89 +101,19 @@ def get_device(self, model: torch.nn.Module): for n, p in model.named_parameters(): return p.data.device - def get_gradients(self, model, data, criterion, create_graph=False): - model.zero_grad() - input = data[0].to(self.device) - ##self._input_shape = input.shape ## for resetting input activation - target = data[1].to(self.device) - # if enable_act: - # input.requires_grad = True - output = model(input) - loss = criterion(output, target) - # torch.autograd.backward(loss, create_graph=create_graph) - loss.backward(create_graph=create_graph) - gradients = [] - for n, p in model.named_parameters(): - if p.grad != None and n in self.weight_names: - gradient = p.grad - gradients.append(gradient + 0.0) ## add 0 to create a copy - model.zero_grad() - return gradients - - # def get_params(self, model): - # parameters = [p for p in model.parameters() if p.requires_grad] - # return parameters - - def sample_rademacher(self, params): - samples = [] - for param in params: - r = torch.randint_like(param, high=2, device=self.device) - r.masked_fill_(r == 0, -1) - samples.append(r) - return samples - - def get_vtHv_weight(self, params, num_samples): - v = self.sample_rademacher(params) - H_v = [0] * len(v) - cnt = 0 - for step, data in enumerate(self.dataloader): - batch_size = data[0].shape[0] - cnt += batch_size - gradients = self.get_gradients(self.model, data, self.criterion, create_graph=True) - H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) - H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] - if cnt >=num_samples: - break - if cnt > 0: - H_v = [item / cnt for item in H_v] - v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better - return v_t_H_v - - def get_vtHv_act(self, params, num_samples): - v = self.sample_rademacher(params) - H_v = [0] * len(v) - cnt = 0 - for step, data in enumerate(self.dataloader): - if cnt >= num_samples: - break - for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 - input = data[0][i:i + 1] - target = data[1][i:i + 1] - - self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) - layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] - layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] - hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) - cnt += 1 - if cnt >= num_samples: - break - def 
_get_act_grad_hook(self, name): def act_grad_hook(model, grad_input, grad_output): + ##print(name, grad_input[0].shape, grad_output[0].shape) self.layer_acts_grads[name] = [grad_input, grad_output] return act_grad_hook def _get_enable_act_grad_hook(self, name): def enable_act_grad_hook(model, inputs, outputs): - try: - input = inputs[0] ##TODO check whether this is right - except: - input = inputs - - if input.requires_grad is False: - input.requires_grad = True - self.layer_acts[name] = input + for input in inputs: + if input.requires_grad is False: + input.requires_grad = True + self.layer_acts[name] = inputs return enable_act_grad_hook @@ -229,18 +158,87 @@ def get_params(self): self.weight_names = weight_names self.params = params + def forward_backward(self, data, create_graph=False, return_w_grad=True): + self.model.zero_grad() + input = data[0].to(self.device) + ##self._input_shape = input.shape ## for resetting input activation + target = data[1].to(self.device) + ##input.requires_grad = True + output = self.model(input) + loss = self.criterion(output, target) + torch.autograd.backward(loss, create_graph=create_graph) + ##loss.backward(create_graph=create_graph) + if return_w_grad: + gradients = [] + for n, p in self.model.named_parameters(): + if p.grad != None and n in self.weight_names: + gradient = p.grad + gradients.append(gradient + 0.0) ## add 0 to create a copy + self.model.zero_grad() + return gradients + else: + self.model.zero_grad() + + # def get_params(self, model): + # parameters = [p for p in model.parameters() if p.requires_grad] + # return parameters + + def sample_rademacher(self, params): + samples = [] + for param in params: + r = torch.randint_like(param, high=2, device=self.device) + r.masked_fill_(r == 0, -1) + samples.append(r) + return samples + + def get_vtHv_weight(self, params, num_samples): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, data in enumerate(self.dataloader): + batch_size = data[0].shape[0] + cnt += batch_size + gradients = self.forward_backward(data, create_graph=True) + H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) + H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] + if cnt >= num_samples: + break + if cnt > 0: + H_v = [item / cnt for item in H_v] + v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better + return v_t_H_v + + def get_vtHv_act(self, params, num_samples): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, data in enumerate(self.dataloader): + if cnt >= num_samples: + break + for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 + input = data[0][i:i + 1] + target = data[1][i:i + 1] + + self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) + layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] + layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] + hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) + cnt += 1 + if cnt >= num_samples: + break + def get_weight_traces(self, num_samples): layer_traces_per_iter = [] prev_avg_model_trace = 0 - for i in range(self.max_iter): + for iter in range(self.max_iter): layer_traces = self.get_vtHv_weight(self.params, num_samples) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) model_trace = 
torch.sum(layer_traces_estimate) diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) - if diff_ratio < self.tolerance and i > 10: ##TODO magic number + if diff_ratio < self.tolerance and iter > 10: ##TODO magic number break - if i == 50: ##TODO for debug + if iter == 50: ##TODO for debug break prev_avg_model_trace = model_trace weight_name_to_traces = {} @@ -258,28 +256,62 @@ def get_act_traces(self, num_samples): self.layer_acts = {} self.layer_acts_grads = {} self.register_act_grad_hooks() - for i in range(self.max_iter): + cnt = 0 + for step, data in enumerate(self.dataloader): + if cnt >= num_samples: + break + bs = data[0].shape[0] + act_traces_sum = 0 + act_traces_per_iter = [] + prev_avg_model_trace = 0 + act_traces_sums = None + for i in range(bs): ##force the bs to be one + input = data[0][i:i + 1] + target = data[1][i:i + 1] + self.forward_backward((input, target), create_graph=True, return_w_grad=False) + acts = [self.layer_acts[key] for key in self.layer_acts.keys()] + if act_traces_sums == None: + act_traces_sums = [0] * len(acts) + acts_grad = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] ##same order with acts + # vt_H_v_sum_per_act = [0] * len(acts) + # + # prev_model_act_trace = 0 + # for iter in range(self.max_iter): + # v = self.sample_rademacher(acts) + # H_v = torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=False) + # vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] + # + # vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in + # enumerate(vt_H_v_sum_per_act)] + # vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] + # current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) + # + # diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( + # prev_model_act_trace + self.eps) + # if diff_ratio < self.tolerance and iter > 10: ##TODO magic number + # break + # if iter == 50: ##TODO for debug + # break + # + # prev_model_act_trace = current_vt_H_v_mean_per_model + # + # cnt += 1 + # if cnt >= num_samples: + # break pass + self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False + def get_avg_traces(self, enable_act=True, num_samples=32): """ Estimates average hessian trace for each parameter """ assert num_samples > 0 + ##self.get_act_traces(num_samples) weight_traces = self.get_weight_traces(num_samples) return weight_traces - ## handle activation - # if enable_act: - # self.get_act_traces(num_samples) - ##change batchsize to 1 - - # - # layer_traces = layer_traces_estimate - # if enable_act: - # self.reset_act_gradient_and_hooks() - ##copy from torch.quantization._numeric_suite def _find_match( From deb413e9396563206d9d187a1a46976ba215e48d Mon Sep 17 00:00:00 2001 From: wenhuach
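The activation branch needs the same v^T H v contraction as the weight branch, but with torch.autograd.grad taken against the captured activations instead of the parameters, one sample at a time. A minimal hedged sketch on a toy model, showing a single Hutchinson draw per activation (the patch averages many draws until they converge):

import torch
import torch.nn.functional as F

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU(), torch.nn.Linear(16, 4))
acts = {}

def keep_input(name):
    def hook(module, inputs, output):
        acts[name] = inputs[0]
    return hook

handles = [m.register_forward_hook(keep_input(n))
           for n, m in model.named_modules() if isinstance(m, torch.nn.Linear)]

x = torch.randn(1, 16, requires_grad=True)             # batch of one, as in get_act_traces
y = torch.randint(0, 4, (1,))
loss = F.cross_entropy(model(x), y)

act_list = list(acts.values())
act_grads = torch.autograd.grad(loss, act_list, create_graph=True)
v = [torch.randint_like(a, high=2) * 2.0 - 1.0 for a in act_list]   # Rademacher draw
hv = torch.autograd.grad(act_grads, act_list, grad_outputs=v)
vthv = [(h * r).mean() for h, r in zip(hv, v)]          # one v^T H v estimate per activation
for h in handles:
    h.remove()
print(vthv)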
Date: Thu, 24 Nov 2022 20:10:25 +0800 Subject: [PATCH 075/128] support activation traces --- neural_compressor/strategy/hawq.py | 110 +++++++++++++++++------------ 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index c9f8c4488da..94745270ac1 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -41,6 +41,8 @@ class HessianTrace: """ def __init__(self, model, dataloader, criterion=None): + self.unfused_model = model.model + self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused self.dataloader = dataloader @@ -104,16 +106,19 @@ def get_device(self, model: torch.nn.Module): def _get_act_grad_hook(self, name): def act_grad_hook(model, grad_input, grad_output): ##print(name, grad_input[0].shape, grad_output[0].shape) - self.layer_acts_grads[name] = [grad_input, grad_output] + if type(model) == torch.nn.Linear: ##TODO very tricky + self.layer_acts_grads[name] = grad_input[1] + else: + self.layer_acts_grads[name] = grad_input[0] return act_grad_hook def _get_enable_act_grad_hook(self, name): def enable_act_grad_hook(model, inputs, outputs): - for input in inputs: - if input.requires_grad is False: - input.requires_grad = True - self.layer_acts[name] = inputs + input = inputs[0] + if input.requires_grad is False: + input.requires_grad = True + self.layer_acts[name] = input return enable_act_grad_hook @@ -134,8 +139,8 @@ def _unregister_hook(self): for handel in self.hook_handles: handel.remove() - def register_act_grad_hooks(self): - for name, module in self.model.named_modules(): + def register_act_grad_hooks(self, model): + for name, module in model.named_modules(): if self.mapping_module_to_op(name) in self.op_list: hook_handle = module.register_forward_hook(self._get_enable_act_grad_hook(name)) self.hook_handles.append(hook_handle) @@ -158,13 +163,13 @@ def get_params(self): self.weight_names = weight_names self.params = params - def forward_backward(self, data, create_graph=False, return_w_grad=True): - self.model.zero_grad() + def forward_backward(self, model, data, create_graph=False, return_w_grad=True): + model.zero_grad() input = data[0].to(self.device) ##self._input_shape = input.shape ## for resetting input activation target = data[1].to(self.device) - ##input.requires_grad = True - output = self.model(input) + input.requires_grad = True + output = model(input) loss = self.criterion(output, target) torch.autograd.backward(loss, create_graph=create_graph) ##loss.backward(create_graph=create_graph) @@ -174,10 +179,10 @@ def forward_backward(self, data, create_graph=False, return_w_grad=True): if p.grad != None and n in self.weight_names: gradient = p.grad gradients.append(gradient + 0.0) ## add 0 to create a copy - self.model.zero_grad() + model.zero_grad() return gradients else: - self.model.zero_grad() + model.zero_grad() # def get_params(self, model): # parameters = [p for p in model.parameters() if p.requires_grad] @@ -198,7 +203,7 @@ def get_vtHv_weight(self, params, num_samples): for step, data in enumerate(self.dataloader): batch_size = data[0].shape[0] cnt += batch_size - gradients = self.forward_backward(data, create_graph=True) + gradients = self.forward_backward(self.model, data, create_graph=True) H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] if cnt >= num_samples: @@ -252,11 +257,14 @@ def 
get_weight_traces(self, num_samples): return op_name_to_trace def get_act_traces(self, num_samples): + unfused_training = self.unfused_model.training + self.unfused_model.eval() self.hook_handles = [] self.layer_acts = {} self.layer_acts_grads = {} - self.register_act_grad_hooks() + self.register_act_grad_hooks(self.unfused_model) cnt = 0 + act_traces_per_sample = [] for step, data in enumerate(self.dataloader): if cnt >= num_samples: break @@ -268,39 +276,49 @@ def get_act_traces(self, num_samples): for i in range(bs): ##force the bs to be one input = data[0][i:i + 1] target = data[1][i:i + 1] - self.forward_backward((input, target), create_graph=True, return_w_grad=False) + self.forward_backward(self.unfused_model, (input, target), create_graph=True, return_w_grad=False) acts = [self.layer_acts[key] for key in self.layer_acts.keys()] if act_traces_sums == None: act_traces_sums = [0] * len(acts) acts_grad = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] ##same order with acts - # vt_H_v_sum_per_act = [0] * len(acts) - # - # prev_model_act_trace = 0 - # for iter in range(self.max_iter): - # v = self.sample_rademacher(acts) - # H_v = torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=False) - # vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] - # - # vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in - # enumerate(vt_H_v_sum_per_act)] - # vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] - # current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) - # - # diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( - # prev_model_act_trace + self.eps) - # if diff_ratio < self.tolerance and iter > 10: ##TODO magic number - # break - # if iter == 50: ##TODO for debug - # break - # - # prev_model_act_trace = current_vt_H_v_mean_per_model - # - # cnt += 1 - # if cnt >= num_samples: - # break - pass + vt_H_v_sum_per_act = [0] * len(acts) + + prev_model_act_trace = 0 + for iter in range(self.max_iter): + v = self.sample_rademacher(acts) + H_v = torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=True) + vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] + + vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in + enumerate(vt_H_v_sum_per_act)] + vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] + current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) + + diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( + prev_model_act_trace + self.eps) + if diff_ratio < self.tolerance and iter > 10: ##TODO magic number + break + if iter == 50: ##TODO for debug + break + prev_model_act_trace = current_model_act_trace + act_traces_per_sample.append(vt_H_v_mean_per_act) + cnt += 1 + if cnt >= num_samples: + break + + if unfused_training: + self.unfused_model.train() self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False + act_traces_stack = torch.stack([torch.stack(item) for item in act_traces_per_sample]) + act_traces = torch.mean(act_traces_stack, dim=0) + res_dict={} + for index, key in enumerate(self.layer_acts.keys()): + res_dict[key]=act_traces[index] + + self.layer_acts=[] + self.layer_acts_grads=[] + return act_traces def get_avg_traces(self, enable_act=True, num_samples=32): """ @@ -308,9 +326,13 @@ def get_avg_traces(self, enable_act=True, num_samples=32): """ assert num_samples > 0 - ##self.get_act_traces(num_samples) + traces = {} 
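        # --- Editor's note (illustrative, not part of the original patch) ---
        # Both get_weight_traces and get_act_traces estimate tr(H) per layer with
        # Hutchinson's method: tr(H) ~= E_v[v^T H v], where v is a Rademacher
        # (+/-1) vector and H v is obtained by differentiating the gradients a
        # second time, roughly:
        #     grads = torch.autograd.grad(loss, params, create_graph=True)
        #     Hv = torch.autograd.grad(grads, params, grad_outputs=v, retain_graph=True)
        # For activations, the forward hooks registered earlier set
        # requires_grad=True on each captured layer input so the same
        # gradient-of-gradient trick applies to activation tensors.
        # ---------------------------------------------------------------------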
weight_traces = self.get_weight_traces(num_samples) - return weight_traces + traces['weight'] = weight_traces + if enable_act: + act_traces = self.get_act_traces(num_samples) + traces['activation']= act_traces + return traces ##copy from torch.quantization._numeric_suite From 7c508d51c2ebe364c5142e5bbfcdd45ce722e1f2 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Thu, 24 Nov 2022 18:24:20 +0800 Subject: [PATCH 076/128] correct the qnt_weigths does't machted issue --- neural_compressor/strategy/hawq.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 94745270ac1..6575c21fccb 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -413,9 +413,10 @@ def compare_weights( # For matching "fc.weight" and "fc._packed_params._packed_params" match_key = _find_match(float_dict, key, "_packed_params") if match_key is not None: - weight_dict[key] = {} - weight_dict[key]["float"] = float_dict[match_key] - weight_dict[key]["quantized"] = quantized_dict[key][0] + weight_dict[match_key] = {} + weight_dict[match_key]["float"] = float_dict[match_key] + weight_dict[match_key]["quantized"] = quantized_dict[key][0] + ##TODO:should consider more models in further work # For LSTM split_str = key.split(".") @@ -608,4 +609,4 @@ def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig) quant_mode_item = tuning_space.query_quant_mode_item((op_name, op_type), op_quant_mode) tuning_item = quant_mode_item.get_option_by_name(att_item) dynamic_state[att_item] = tuning_item.options[0] if tuning_item else None - return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) + return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) \ No newline at end of file From 2520925c8f3ad071b0df6820c80993f722c0fd54 Mon Sep 17 00:00:00 2001 From: wenhuach
Date: Thu, 24 Nov 2022 20:43:03 +0800 Subject: [PATCH 077/128] only enable weight traces currently --- neural_compressor/strategy/hawq.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 6575c21fccb..2e590c3f34b 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -312,12 +312,12 @@ def get_act_traces(self, num_samples): self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False act_traces_stack = torch.stack([torch.stack(item) for item in act_traces_per_sample]) act_traces = torch.mean(act_traces_stack, dim=0) - res_dict={} + res_dict = {} for index, key in enumerate(self.layer_acts.keys()): - res_dict[key]=act_traces[index] + res_dict[key] = act_traces[index] - self.layer_acts=[] - self.layer_acts_grads=[] + self.layer_acts = [] + self.layer_acts_grads = [] return act_traces def get_avg_traces(self, enable_act=True, num_samples=32): @@ -331,7 +331,7 @@ def get_avg_traces(self, enable_act=True, num_samples=32): traces['weight'] = weight_traces if enable_act: act_traces = self.get_act_traces(num_samples) - traces['activation']= act_traces + traces['activation'] = act_traces return traces @@ -566,7 +566,8 @@ def next_tune_cfg(self): pertur_lst[key] = diff_l2 # for i in pertur_lst: # print(pertur_lst[i]) - op_to_traces = ht.get_avg_traces() + traces = ht.get_avg_traces(enable_act=False) + op_to_traces = traces['weight'] print(op_to_traces) if orig_eval == False: self._fp32_model.train() @@ -609,4 +610,4 @@ def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig) quant_mode_item = tuning_space.query_quant_mode_item((op_name, op_type), op_quant_mode) tuning_item = quant_mode_item.get_option_by_name(att_item) dynamic_state[att_item] = tuning_item.options[0] if tuning_item else None - return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) \ No newline at end of file + return OpTuningConfig(op_name, op_type, op_quant_mode, tuning_space, kwargs=dynamic_state) From 1530c94b33e3fc5fed7e95b92335bdd65dc1148e Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Fri, 25 Nov 2022 15:30:44 +0800 Subject: [PATCH 078/128] merge weights quantization loss and trace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Formula:pertubation=trace*weights_qnt_loss --- neural_compressor/strategy/hawq.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 2e590c3f34b..c000def9440 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -564,18 +564,17 @@ def next_tune_cfg(self): op_qnt_tensor = weight_quant_loss[key]['quantized'].dequantize() diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 pertur_lst[key] = diff_l2 - # for i in pertur_lst: - # print(pertur_lst[i]) traces = ht.get_avg_traces(enable_act=False) op_to_traces = traces['weight'] - print(op_to_traces) + for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): + op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 if orig_eval == False: self._fp32_model.train() - ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], reverse=self.higher_is_better) # WA for add op type + 
print("ordered_ops:",ordered_ops) op_info_map = {} for op_info in list(initial_op_tuning_cfg.keys()): op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) From 6edf3854f45fa4590655e4397462f2eba89c8169 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Mon, 28 Nov 2022 22:44:38 +0800 Subject: [PATCH 079/128] Update conf.yaml change root path to default config --- .../torchvision_models/quantization/ptq/cpu/fx/conf.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml index 4b50b559e6a..ef61c6c3e0b 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml @@ -24,7 +24,7 @@ quantization: # optional. tuning constrai batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to calibration dataset location if needed + root: /path/to/calibration/dataset # NOTE: modify to calibration dataset location if needed transform: Resize: size: 256 @@ -43,7 +43,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to evaluation dataset location if needed + root: /path/to/calibration/dataset # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -61,7 +61,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /home/bfang1/Projects/HAWQ_INC/datasets/raw/val # NOTE: modify to evaluation dataset location if needed + root: /path/to/calibration/dataset # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 From 80299f52573b70dbd2a2fbd1ed33803c181c46d9 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 28 Nov 2022 22:28:42 +0800 Subject: [PATCH 080/128] WA add loss for strategy Signed-off-by: yiliu30 --- neural_compressor/conf/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index aef8f695291..f83dbf7ceaf 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -847,7 +847,7 @@ def percent_to_float(data): Optional('model_conversion'): model_conversion_schema, Optional('tuning', default={ - 'strategy': {'name': 'basic'}, + 'strategy': {'name': 'basic', 'loss': 'CrossEntropyLoss'}, # TODO move loss to appropriate position 'accuracy_criterion': {'relative': 0.01, 'higher_is_better': True}, 'objective': 'performance', 'exit_policy': {'timeout': 0, 'max_trials': 100, 'performance_only': False}, From 4b96aa5093bc172d5d36584ae906f434be6f80ef Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 30 Nov 2022 14:49:50 +0800 Subject: [PATCH 081/128] WA for hawq strategy loss Signed-off-by: yiliu30 --- neural_compressor/conf/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index f83dbf7ceaf..dae5524c9ef 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -860,7 +860,8 @@ def percent_to_float(data): Optional('sigopt_project_id'): str, Optional('sigopt_experiment_name', default='nc-tune'): str, Optional('accuracy_weight', default=1.0): float, - 
Optional('latency_weight', default=1.0): float + Optional('latency_weight', default=1.0): float, + Optional('loss', default='CrossEntropyLoss'): str # TODO only for test, remove it before merge } , Hook('accuracy_criterion', handler=_valid_accuracy_field): object, Optional('accuracy_criterion', default={'relative': 0.01}): { From 26061f21fdfbd3c9aed34074fed1dd112ae43dff Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Wed, 30 Nov 2022 15:02:12 +0800 Subject: [PATCH 082/128] change to default path --- neural_compressor/experimental/quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index 7e8e8cfbbac..4fa143fc5c8 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -146,7 +146,7 @@ def pre_process(self): import torchvision.datasets as datasets import torchvision.transforms as transforms - data_path = "/home/bfang1/Projects/HAWQ_INC/datasets/raw" + data_path = "/mnt/data2/dataset/dataset/imagenet/img_raw" traindir = os.path.join(data_path, 'train') valdir = os.path.join(data_path, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], From 31b11ff1c03df87194559dbe3fbcc99688763ce6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 30 Nov 2022 15:59:50 +0800 Subject: [PATCH 083/128] remove useless code Signed-off-by: yiliu30 --- neural_compressor/adaptor/pytorch.py | 18 - .../experimental/quantization.py | 32 -- .../strategy/st_utils/hawq_metric.py | 339 ------------------ .../strategy/st_utils/hawq_wenhuach.py | 313 ---------------- test/strategy/test_hawq_wenhuach.py | 152 -------- test/strategy/test_hessian_trace_inc.py | 77 ---- 6 files changed, 931 deletions(-) delete mode 100644 neural_compressor/strategy/st_utils/hawq_metric.py delete mode 100644 neural_compressor/strategy/st_utils/hawq_wenhuach.py delete mode 100644 test/strategy/test_hawq_wenhuach.py delete mode 100644 test/strategy/test_hessian_trace_inc.py diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 3421828a8ab..06245b4fb0d 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -29,7 +29,6 @@ from ..utils import logger from .query import QueryBackendCapability from ..experimental.data.dataloaders.base_dataloader import BaseDataLoader -from neural_compressor.strategy.hawq_metric import Hawq_top torch = LazyImport("torch") @@ -1095,23 +1094,6 @@ def is_fused_module(self, module): else: return False - def calculate_op_sensitivity(self, model, dataloader, method_args): - """Compute the op sensitivity by the specific method. - - Args: - model(INC model): The fp32 model. - dataloader: The calibration dataloader. - method_args(Dict): The parameters for specifying the method. 
- - Returns: - ops_sensitivity(Dict[tuple, float]): The key is (op_name, op_type), - the value is the sensitivity under the specified method - """ - if method_args['name']=='hessian_trace': - Hawq_top(model=model,yaml_cpu=None,yaml_trace=None,dataloader=dataloader) - hessian_cmp=Hawq_top.get_init_config() - return hessian_cmp - pass unify_op_type_mapping = { "ConvReLU2d": "Conv2d", diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index 77dfc51d465..3d7b7811ea2 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -146,38 +146,6 @@ def pre_process(self): with open(self.resume_file, 'rb') as f: _resume = pickle.load(f).__dict__ - - import torchvision.datasets as datasets - import torchvision.transforms as transforms - data_path = "/mnt/data2/dataset/dataset/imagenet/img_raw" - traindir = os.path.join(data_path, 'train') - valdir = os.path.join(data_path, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - train_dataset = datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - val_dataset = datasets.ImageFolder( - valdir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - from torch.utils.data import DataLoader - - self._calib_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) - self._eval_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) - self.strategy = STRATEGIES[strategy]( self._model, self.conf, diff --git a/neural_compressor/strategy/st_utils/hawq_metric.py b/neural_compressor/strategy/st_utils/hawq_metric.py deleted file mode 100644 index 63db277ab14..00000000000 --- a/neural_compressor/strategy/st_utils/hawq_metric.py +++ /dev/null @@ -1,339 +0,0 @@ -""" - Copyright (c) 2022 Intel Corporation - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" -import logging -import torch -import numpy as np -from torch.autograd import Variable -import yaml -import torchvision.transforms as transforms -import torchvision -import random -import copy -from torch.quantization import get_default_qat_qconfig, quantize_jit,get_default_qconfig -from torch.quantization.quantize_fx import prepare_fx, convert_fx,fuse_fx -from torch.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig -import torch.quantization._numeric_suite as ns - - -def fixed_seed(seed): - """Fixed rand seed to make sure results are same in different times on different devices.Eg CPU/GPU - Args: - seed: an integer number - return: None - """ - np.random.seed(seed) #random - random.seed(seed) - torch.manual_seed(seed) #cpu - torch.cuda.manual_seed_all(seed) #parallel cpu - torch.backends.cudnn.deterministic = True #make sure results are same on cpu/gpu - torch.backends.cudnn.benchmark = True #accelerator -def cal_params_grad(model): - """ - get the gradients and parameters from given model - Args: - model: FP32 model specificed - return: - params: paratmeters of model - grads: gradients of model - """ - params=[] - grads=[] - for indx,(name, parm) in zip(enumerate(model.parameters()), model.named_parameters()): - logging.info('->tensor_index:', indx[0],'-->name:', name, '-->grad_requirs:',parm.requires_grad, '-->current tensor len:',parm.shape) - if not parm.requires_grad: - continue - params.append(parm) - grads.append(0. if parm.grad is None else parm.grad+0.) - return params, grads -def cal_vector_product(gradsH, params, v): - """compute the hessian vector product by torch.autograd.grad. - Agrs: - gradsH: gradient at current point - params: corresponding variables - v: vector - return: - hv: hessian vector product - """ - hv=torch.autograd.grad( - gradsH, - params, - grad_outputs=v, - only_inputs=True, - retain_graph=True) - return hv -def ptq_calibrate(model, data_loader,num_cal): - """Calibrate model in post train quantization model - Args: - model: a pre_quantization model to calibrate - data_laoder: datasets - num_cal: maximization number of calibrated samples, such as images - return: - model: a calibrated model - """ - #Generate some samples to calibrate from data_loader - calibrate_samples=[] - i=0 - for inputs, targets in data_loader: - calibrate_samples.append(inputs) - i=i+1 - if i>=num_cal: - break - # model.cpu() - model.eval() - #calibration - with torch.no_grad(): - for sample in calibrate_samples: - model(sample) - return model -def cal_weights_pertubation(model_qnt,model_fp32)->dict: - """calculate weights quantized perturbation using L2 normal - Args: - model_qnt: quantized model - model_fp32: float model - return: - pertur_lst: dict,which contains layer_name and value - - """ - - wq_cmp_dict=ns.compare_weights(model_fp32.state_dict(), model_qnt.state_dict()) - pertur_lst=[] - for key in wq_cmp_dict: - pertur_pair={"layer_name":'',"value":0} - op_float_tensor=wq_cmp_dict[key]['float'] - op_qnt_tensor=wq_cmp_dict[key]['quantized'].dequantize() - diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) #Formula: L2=||Q(w)-w||p^2 - pertur_pair['layer_name']=key - pertur_pair['value']=diff_l2 - pertur_lst.append(pertur_pair) - return pertur_lst -def cal_act_pertubation(model_fp32,model_qnt,data_loader,num_cal=100)->dict: - """calculate weights quantized perturbation using L2 normal - Args: - model_qunt: quantized model - model_fp32: float model - data_loader: path to datasets - return: - pretur_lst: dict - - """ - 
ns.prepare_model_outputs(model_fp32, model_qnt) - model_fp32.cpu() - model_fp32.eval() - model_qnt.cpu() - model_qnt.eval() - obv_samples=[] - i=0 - for inputs, targets in data_loader: - obv_samples.append(inputs) - i=i+1 - if i>=num_cal: - break - with torch.no_grad(): - for image in obv_samples: - model_fp32(image) - model_qnt(image) - act_qnt_pairs=[] - act_compare_dict = ns.get_matching_activations(model_fp32, q_module=model_qnt) - for key in act_compare_dict: - op_float_tensor=(act_compare_dict[key]['float'][0]) - op_qnt_tensor=act_compare_dict[key]['quantized'][0].dequantize() - diff_l2=(torch.norm(op_float_tensor-op_qnt_tensor,p=2)**2) - pertur_pair={"layer_name":'',"value":0} - pertur_pair['layer_name']=key - pertur_pair['value']=diff_l2 - act_qnt_pairs.append(pertur_pair) - return act_qnt_pairs - -class Hessian(): - """This class used to compute each layer hessian trace from given FP32 model - """ - def __init__(self,model,criterion, data=None, dataloader=None,device='cpu') -> None: - """Initial parameters - Args: - model: FP32 model specificed - criterion: loss function - data: a single batch of data, including inputs and its corresponding labels - dataloader: the data loader including bunch of batches of data - device: currently only supports cpu device - """ - #make sure we either pass a single batch or a dataloader - assert (data!=None and dataloader==None ) or (data==None and dataloader!=None) - #make mode is evaluation model - self.model=model.eval() - self.criterion=criterion - self.device=device - - if data!=None: - self.data=data - self.full_dataset=False - if not self.full_dataset: - self.inputs, self.targets=self.data - outputs=self.model(self.inputs) - loss=self.criterion(outputs,self.targets) - loss.backward(create_graph=True) - params, gradSH=cal_params_grad(self.model) - - self.params=params - self.gradSH=gradSH - def calculate_trace(self,max_Iter=100, tolerance=1e-3): - """Compute the hessian trace based on Hutchinson algorithm - Args: - max_Inter: number of maximization iteration - tolerance: minimum relative tolerance for stopping the algorithm. - return: - avg_traces_lst: return hessian trace per layer for given model - """ - avg_traces_lst=[] - for (i_grad, i_param,(module_name, _)) in zip(self.gradSH, self.params, self.model.named_parameters()): - v=[torch.randint_like(i_param,high=2, device=self.device)] - for v_i in v: - v_i[v_i==0]=-1 - i_v=v - trace_vhv=[] - trace=0. 
- trace_pair={"layer_name":" ", "trace":0} - self.model.zero_grad() - for i in range(max_Iter): - hv=cal_vector_product(i_grad,i_param,i_v) # hessian vector - trace_vhv_cur=sum([torch.sum(x * y) for (x, y) in zip(hv, v)]) - trace_vhv.append(trace_vhv_cur) - difference=(np.mean(trace_vhv)-trace)/(abs(trace)+1e-6) - if abs(difference) None: - self.dataloader=dataloader - if yaml_trace and yaml_cpu is not None: - with open(yaml_trace) as file: - params_config=yaml.load(file) - if params_config['loss']=='CrossEntropyLoss': - self.criterion=torch.nn.CrossEntropyLoss() - self.random_seed=params_config['random_seed'] - self.max_Iteration=params_config['max_Iteration'] - self.enable_op_fuse=params_config['enable_op_fuse'] - self.tolerance=float(params_config['tolerance']) - self.max_cal_sample=float(params_config['max_cal_smaple']) - self.quantize_mode=params_config['quantize_mode'] - with open(yaml_cpu,'r') as file: - yaml_config=yaml.load(file) - str_dtype=(yaml_config[0]['precisions']['names']) - self.list_dtype = str_dtype.split(",") - else: - self.criterion=torch.nn.CrossEntropyLoss() - self.random_seed=100 - self.max_Iteration=100 - self.enable_op_fuse=True - self.tolerance=1e-6 - self.max_cal_sample=1 - self.quantize_mode='ptq' - self.list_dtype=['int8','fp32'] - logging.info("Current parameters config for Hutchinson’s algorithm as below:") - logging.info("criterion:",self.criterion,"| random_seed:",self.random_seed,"| max_Iteration:", self.max_Iteration, \ - "| tolerance:", self.tolerance,"| en_op_fuse", self.enable_op_fuse,"| max_cal_sample:", self.max_cal_sample) - fixed_seed(self.random_seed) - self.model=model - self.model.eval() - model_tmp=copy.deepcopy(model) - model_tmp.eval() - self.model_fused= fuse_fx(model_tmp) - self.model_fused.eval() - self.hawq_level='L3' #L1:top engievalue L2:avg_trace L3:avg_trace+pertubation - - def get_init_config(self)->dict: - """ - """ - #Load a sample from dataloader to compute graident - for inputs, targets in self.dataloader: - break - #Hessian average trace computation - fixed_seed(self.random_seed) - with torch.enable_grad(): - if self.enable_op_fuse: - hawq_cmp=Hessian(self.model_fused,criterion=self.criterion,data=(inputs,targets)) - else: - hawq_cmp=Hessian(self.model,criterion=self.criterion,data=(inputs,targets)) - avg_traces_lst=hawq_cmp.calculate_trace(max_Iter=self.max_Iteration,tolerance=self.tolerance) - - #fiter none weight layer and save weight layer to match perturbation computation - if self.hawq_level=='L2': - avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) - logging.info("avg_traces desending sorted is:") - for i in avg_traces_lst_sorted: - logging.info(i) - list_sorted=avg_traces_lst_sorted - if self.hawq_level=='L3': - if self.quantize_mode=='ptq': - #PTQ quantization - qconfig = get_default_qconfig("fbgemm") - qconfig_dict={"":qconfig} #enable all layers/tensor to quantize - #calibrate - model_prepared=prepare_fx(self.model, qconfig_dict) - model_prepared=ptq_calibrate(model_prepared,data_loader=self.dataloader,num_cal=self.max_cal_sample) - model_prepared.cpu() - model_all_qnt=convert_fx(model_prepared) - #calculate weights quantized perturbation - weights_pertu_lst=cal_weights_pertubation(model_fp32=self.model,model_qnt=model_all_qnt) - #merge weights quantized perturbation - #generally, fused ops=quantized weights+quantized activation - avg_trace_i=0 - omigs=[] - for wct_i in weights_pertu_lst: - omig_pair={"layer_name":" ", "trace":0} - 
tmp_value=avg_traces_lst[avg_trace_i]['trace']*wct_i['value'] - omig_pair['layer_name']=avg_traces_lst[avg_trace_i]['layer_name'] - omig_pair['trace']=tmp_value - avg_trace_i=avg_trace_i+2 - omigs.append(omig_pair) - act_pertu_lst=cal_act_pertubation(model_fp32=self.model, model_qnt=model_all_qnt,data_loader=self.dataloader,num_cal=self.max_cal_sample) - avg_trace_i=1 - for act_i in act_pertu_lst: - omig_pair={"layer_name":" ", "trace":0} - tmp_value=avg_traces_lst[avg_trace_i]['trace']+act_i['value'] - omig_pair['layer_name']=avg_traces_lst[avg_trace_i]['layer_name'] - omig_pair['trace']=tmp_value - avg_trace_i=avg_trace_i+2 - omigs.append(omig_pair) - - # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): - # omig_pair={"layer_name":" ", "value":0} - # omig_val=avg_trace_i['trace']*omiga_i['value'] - # omig_pair['layer_name']=avg_trace_i['layer_name'] - # omig_pair['value']=omig_val - # omig_list.append(omig_pair) - # omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) - omig_list_sorted=sorted(omigs,key=lambda x:x['trace'],reverse=True) - list_sorted=omig_list_sorted - tune_init_config_pairs=[] - for i in list_sorted: - tune_init_config_pair={"op_name":'',"op_type":'','trace':0} - if i['layer_name']==list_sorted[0]['layer_name']: - tune_init_config_pair['op_name']=i['layer_name'] - tune_init_config_pair['op_type']=self.list_dtype[-1] #setup as float op - tune_init_config_pair['trace']=float(i['trace']) - else: - tune_init_config_pair['op_name']=i['layer_name'] - tune_init_config_pair['op_type']=self.list_dtype[0] - tune_init_config_pair['trace']=float(i['trace']) - tune_init_config_pairs.append(tune_init_config_pair) - return tune_init_config_pairs diff --git a/neural_compressor/strategy/st_utils/hawq_wenhuach.py b/neural_compressor/strategy/st_utils/hawq_wenhuach.py deleted file mode 100644 index c0ced2af3f4..00000000000 --- a/neural_compressor/strategy/st_utils/hawq_wenhuach.py +++ /dev/null @@ -1,313 +0,0 @@ -""" - Copyright (c) 2022 Intel Corporation - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" -from ...utils import logger -import torch -import numpy as np -from torch.autograd import Variable -import yaml -import torchvision.transforms as transforms -import torchvision -import random -import copy -from torch.quantization import get_default_qat_qconfig, quantize_jit, get_default_qconfig -from torch.quantization.quantize_fx import prepare_fx, convert_fx, fuse_fx -from torch.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig -import torch.quantization._numeric_suite as ns - - -def fix_seed(seed): - """Fixed rand seed to make sure results are same in different times on different devices.Eg CPU/GPU - Args: - seed: an integer number - return: None - """ - np.random.seed(seed) # random - random.seed(seed) - torch.manual_seed(seed) # cpu - torch.cuda.manual_seed_all(seed) # parallel cpu - torch.backends.cudnn.deterministic = True # make sure results are same on cpu/gpu - torch.backends.cudnn.benchmark = True # accelerator - - -def calculate_params_gradients(model): - """ - get the gradients and parameters from given model - Args: - model: FP32 model specificed - return: - params: paratmeters of model - grads: gradients of model - """ - params = [] - grads = [] - for indx, (name, parm) in zip(enumerate(model.parameters()), model.named_parameters()): - logger.info( - f'index:{indx[0]}-->name:{name}:{parm.shape}') - - if not parm.requires_grad: - continue - params.append(parm) - grads.append(0. if parm.grad is None else parm.grad + 0.) - return params, grads - - -def calculate_inner_product(list_x, list_y): - """Compute the inner product of two lists of variables list_x,list_y - Args: - list_x: input list variables - list_y: input list variables - return: - sum of inner product - """ - return sum([torch.sum(x * y) for (x, y) in zip(list_x, list_y)]) - - -def calculate_vector_product(gradsH, params, v): - """compute the hessian vector product by torch.autograd.grad. 
- Agrs: - gradsH: gradient at current point - params: corresponding variables - v: vector - return: - hv: hessian vector product - """ - hv = torch.autograd.grad( - gradsH, - params, - grad_outputs=v, - only_inputs=True, - retain_graph=True) - return hv - - -def ptq_calibrate(model, data_loader, num_cal): - """Calibrate model in post train quantization model - Args: - model: a pre_quantization model to calibrate - data_laoder: datasets - num_cal: maximization number of calibrated samples, such as images - return: - model: a calibrated model - """ - # Generate some samples to calibrate from data_loader - calibrate_samples = [] - i = 0 - for inputs, targets in data_loader: - calibrate_samples.append(inputs) - i = i + 1 - if i >= num_cal: - break - # model.cpu() - model.eval() - # calibration - with torch.no_grad(): - for sample in calibrate_samples: - model(sample) - return model - - -def calculate_perturbation(model_qnt, model_fp32) -> dict: - """calculate weights quantized perturbation using L2 normal - Args: - model_qnt: quantized model - model_fp32: float model - return: - pertur_lst: dict,which contains layer_name and value - - """ - - wq_cmp_dict = ns.compare_weights(model_fp32.state_dict(), model_qnt.state_dict()) - pertur_lst = [] - for key in wq_cmp_dict: - pertur_pair = {"layer_name": '', "value": 0} - op_float_tensor = wq_cmp_dict[key]['float'] - op_qnt_tensor = wq_cmp_dict[key]['quantized'].dequantize() - diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 - pertur_pair['layer_name'] = key - pertur_pair['value'] = diff_l2 - pertur_lst.append(pertur_pair) - return pertur_lst - - -class Hessian(): - """This class used to compute each layer hessian trace from given FP32 model - """ - - def __init__(self, model, criterion, data=None, dataloader=None, device='cpu') -> None: - """Initial parameters - Args: - model: FP32 model specificed - criterion: loss function - data: a single batch of data, including inputs and its corresponding labels - dataloader: the data loader including bunch of batches of data - device: currently only supports cpu device - """ - # make sure we either pass a single batch or a dataloader - assert (data != None and dataloader == None) or (data == None and dataloader != None) - # make mode is evaluation model - self.model = model.eval() - self.criterion = criterion - self.device = device - - if data != None: - self.data = data - self.full_dataset = False - if not self.full_dataset: - self.inputs, self.targets = self.data - outputs = self.model(self.inputs) - loss = self.criterion(outputs, self.targets) - loss.backward(create_graph=True) - params, gradSH = calculate_params_gradients(self.model) - - self.params = params - self.gradSH = gradSH - - def calculate_trace(self, max_Iter=100, tolerance=1e-3): - """Compute the hessian trace based on Hutchinson algorithm - Args: - max_Inter: number of maximization iteration - tolerance: minimum relative tolerance for stopping the algorithm. - return: - avg_traces_lst: return hessian trace per layer for given model - """ - avg_traces_lst = [] - for (i_grad, i_param, (module_name, _)) in zip(self.gradSH, self.params, self.model.named_parameters()): - v = [torch.randint_like(i_param, high=2, device=self.device)] - for v_i in v: - v_i[v_i == 0] = -1 - i_v = v - trace_vhv = [] - trace = 0. 
- trace_pair = {"layer_name": " ", "trace": 0} - self.model.zero_grad() - for i in range(max_Iter): - hv = calculate_vector_product(i_grad, i_param, i_v) # hessian vector - trace_vhv_cur = calculate_inner_product(hv, v).cpu().item() # current point - trace_vhv.append(trace_vhv_cur) - difference = (np.mean(trace_vhv) - trace) / (abs(trace) + 1e-6) - if abs(difference) < tolerance: - avg_trace_vhv = np.mean(trace_vhv) - trace_pair["layer_name"] = module_name - trace_pair["trace"] = avg_trace_vhv - avg_traces_lst.append(trace_pair) - break - else: - trace = np.mean(trace_vhv) - return avg_traces_lst - - -class Hawq_top(): - """This class is a interface of hessian - """ - - def __init__(self, model, yaml_trace=None, yaml_cpu=None, dataloader=None) -> None: - self.dataloader = dataloader - if yaml_trace and yaml_cpu is not None: - with open(yaml_trace) as file: - params_config = yaml.load(file) - if params_config['loss'] == 'CrossEntropyLoss': - self.criterion = torch.nn.CrossEntropyLoss() - self.random_seed = params_config['random_seed'] - self.max_Iteration = params_config['max_Iteration'] - self.enable_op_fuse = params_config['enable_op_fuse'] - self.tolerance = float(params_config['tolerance']) - self.max_cal_sample = float(params_config['max_cal_smaple']) - self.quantize_mode = params_config['quantize_mode'] - with open(yaml_cpu, 'r') as file: - yaml_config = yaml.load(file) - str_dtype = (yaml_config[0]['precisions']['names']) - self.list_dtype = str_dtype.split(",") - else: - self.criterion = torch.nn.CrossEntropyLoss() - self.random_seed = 100 - self.max_Iteration = 100 - self.enable_op_fuse = True - self.tolerance = 1e-6 - self.max_cal_sample = 100 - self.quantize_mode = 'ptq' - self.list_dtype = ['int8', 'fp32'] - # logger.info("Current parameters config for Hutchinson’s algorithm as below:") - logger.info( - f"criterion:{self.criterion}| random_seed:{self.random_seed}| max_Iteration:self.max_Iteration| tolerance:{self.tolerance}") - # logger.info("criterion:", self.criterion, "| random_seed:", self.random_seed, "| max_Iteration:", - # self.max_Iteration, \ - # "| tolerance:", self.tolerance, "| en_op_fuse", self.enable_op_fuse, "| max_cal_sample:", - # self.max_cal_sample) - fix_seed(self.random_seed) - self.model = model - self.model.eval() - if self.enable_op_fuse: - self.model = fuse_fx(self.model) - - # model_tmp = copy.deepcopy(model) - # model_tmp.eval() - # self.model_fused = fuse_fx(model_tmp) - # self.model_fused.eval() - - def get_init_config(self) -> dict: - """ - """ - # Load a sample from dataloader to compute graident - inputs, targets = next(iter(self.dataloader)) - - with torch.enable_grad(): - # if self.enable_op_fuse: - # hawq_cmp = Hessian(self.model_fused, criterion=self.criterion, data=(inputs, targets)) - # else: - hawq_cmp = Hessian(self.model, criterion=self.criterion, data=(inputs, targets)) - avg_traces_lst = hawq_cmp.calculate_trace(max_Iter=self.max_Iteration, tolerance=self.tolerance) - - # fiter none weight layer and save weight layer to match perturbation computation - avg_traces_lst_weight = [] - for avg_trace_i in avg_traces_lst: - if 'weight' in avg_trace_i['layer_name']: - avg_traces_lst_weight.append(avg_trace_i) - # avg_traces_lst_sorted=sorted(avg_traces_lst,key=lambda x:x["trace"], reverse=True) - if self.quantize_mode == 'ptq': - # PTQ quantization - qconfig = get_default_qconfig("fbgemm") - qconfig_dict = {"": qconfig} # enable all layers/tensor to quantize - # calibrate - model_prepared = prepare_fx(self.model, qconfig_dict) - model_prepared 
= ptq_calibrate(model_prepared, data_loader=self.dataloader, num_cal=self.max_cal_sample) - model_prepared.cpu() - model_all_qnt = convert_fx(model_prepared) - # calculate perturbation - pertu_list = calculate_perturbation(model_fp32=self.model, model_qnt=model_all_qnt) - # calculate omiga - for omiga_i in pertu_list: - for avg_trace_i in avg_traces_lst: - if avg_trace_i['layer_name'] == omiga_i['layer_name']: - avg_trace_i['trace'] = avg_trace_i['trace'] * omiga_i['value'] - # for avg_trace_i, omiga_i in zip(avg_traces_lst_weight,pertu_list): - # omig_pair={"layer_name":" ", "value":0} - # omig_val=avg_trace_i['trace']*omiga_i['value'] - # omig_pair['layer_name']=avg_trace_i['layer_name'] - # omig_pair['value']=omig_val - # omig_list.append(omig_pair) - # omig_list_sorted=sorted(omig_list,key=lambda x:x['value'],reverse=True) - omig_list_sorted = sorted(avg_traces_lst, key=lambda x: x['trace'], reverse=True) - tune_init_config_pairs = [] - # - for i in omig_list_sorted: - tune_init_config_pair = {"op_name": '', "op_type": '', 'trace': 0} - if i['layer_name'] == omig_list_sorted[0]['layer_name']: - tune_init_config_pair['op_name'] = i['layer_name'] - tune_init_config_pair['op_type'] = self.list_dtype[-1] # setup as float op - tune_init_config_pair['trace'] = float(i['trace']) - else: - tune_init_config_pair['op_name'] = i['layer_name'] - tune_init_config_pair['op_type'] = self.list_dtype[0] - tune_init_config_pair['trace'] = float(i['trace']) - tune_init_config_pairs.append(tune_init_config_pair) - return tune_init_config_pairs diff --git a/test/strategy/test_hawq_wenhuach.py b/test/strategy/test_hawq_wenhuach.py deleted file mode 100644 index df70e32cd9e..00000000000 --- a/test/strategy/test_hawq_wenhuach.py +++ /dev/null @@ -1,152 +0,0 @@ -import torch -import unittest -import os -import sys -import copy -import torchvision -import torchvision.transforms as transforms -from torch.utils.data import DataLoader -from neural_compressor.data import DATASETS -from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader -from neural_compressor.adaptor.pytorch import TemplateAdaptor -from neural_compressor.adaptor import FRAMEWORKS -import shutil -from neural_compressor.strategy.st_utils.hawq_wenhuach import fix_seed -from torch.quantization.quantize_fx import fuse_fx -# fix_seed(1) - -def build_ptq_yaml(): - fake_yaml = ''' - model: - name: imagenet - framework: pytorch_fx - quantization: - calibration: - evaluation: - accuracy: - metric: - topk: 1 - tuning: - strategy: - name: hawq - accuracy_criterion: - relative: -0.1 - random_seed: 9527 - exit_policy: - max_trials: 3 - workspace: - path: saved - ''' - with open('ptq_yaml.yaml', 'w', encoding="utf-8") as f: - f.write(fake_yaml) - -class TestPytorchAdaptor(unittest.TestCase): - framework_specific_info = {"device": "gpu", - "approach": "post_training_static_quant", - "random_seed": 1234, - "q_dataloader": None, - "workspace_path": None} - framework = "pytorch" - adaptor = FRAMEWORKS[framework](framework_specific_info) - model = torchvision.models.resnet18() - - - # from collections import OrderedDict - # model = torch.nn.Sequential(OrderedDict([ - # ('conv1', torch.nn.Conv2d(3, 2, 1, 1)), - # ('conv2', torch.nn.Conv2d(2, 1, 1, 1)), - # ('flat', torch.nn.Flatten()), - # ])) - # model = torch.quantization.QuantWrapper(model) - - @classmethod - def setUpClass(self): - self.i = 0 - build_ptq_yaml() - - - @classmethod - def tearDownClass(self): - os.remove('ptq_yaml.yaml') - shutil.rmtree('./saved', 
ignore_errors=True) - shutil.rmtree('runs', ignore_errors=True) - - - - def test_run_hawq_one_trial(self): - def eval_func(model): - self.i -= 1 - return self.i - from neural_compressor.experimental import Quantization, common - model = copy.deepcopy(self.model) - model.eval() - # model = fuse_fx(model) - quantizer = Quantization('ptq_yaml.yaml') - quantizer.eval_func = eval_func - dataset = quantizer.dataset('dummy', (32, 3, 224, 224), label=True) - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_dataloader = common.DataLoader(dataset) - quantizer.model = model - quantizer() - -if __name__ == "__main__": - - unittest.main() - -# def build_hessian_trace(): -# hessian_trace_config_yaml = ''' -# loss: -# CrossEntropyLoss -# random_seed: -# 1 -# max_Iteration: -# 100 -# tolerance: -# 1e-3 -# enable_op_fuse: -# True -# max_cal_smaple: -# 100 -# quantize_mode: -# ptq -# ''' -# with open('./hessian_trace_config_yaml', 'w+', encoding="utf-8") as f: -# f.write(hessian_trace_config_yaml) -# -# -# class Test_hessian_trace(unittest.TestCase): -# # boot up test -# @classmethod -# def setUpClass(cls) -> None: -# build_hessian_trace() -# cls.model = torchvision.models.resnet18() -# -# # shotdown test -# @classmethod -# def tearDownClass(cls) -> None: -# os.remove('./hessian_trace_config_yaml') -# -# # one test case -# def test_run_hessian_trace(cls): -# """ -# hessian_trace_top -# Inputs: -# model: FP32 model -# dataloader: imagenet -# """ -# -# model = cls.model -# datasets = DATASETS('pytorch') -# dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) -# dummy_dataloader = PyTorchDataLoader(dummy_dataset) -# # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' -# # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) -# hessian_cmp = Hawq_top(model, yaml_cpu=None, yaml_trace=None, dataloader=dummy_dataloader) -# tuning_init_config = hessian_cmp.get_init_config() -# # print tuning init_config -# for i in tuning_init_config: -# print(i) - - -# if __name__ == "__main__": -# unittest.main() diff --git a/test/strategy/test_hessian_trace_inc.py b/test/strategy/test_hessian_trace_inc.py deleted file mode 100644 index 5285bc619c7..00000000000 --- a/test/strategy/test_hessian_trace_inc.py +++ /dev/null @@ -1,77 +0,0 @@ -import torch -import unittest -import os -import sys -import copy -import torchvision -import torchvision.transforms as transforms -from torch.utils.data import DataLoader -from neural_compressor.data import DATASETS -from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader -from neural_compressor.adaptor.pytorch import TemplateAdaptor -from neural_compressor.strategy.hawq_metric import Hawq_top -import random -import numpy as np -def fixed_seed(seed): - """Fixed rand seed to make sure results are same in different times on different devices.Eg CPU/GPU - Args: - seed: an integer number - return: None - """ - np.random.seed(seed) #random - random.seed(seed) - torch.manual_seed(seed) #cpu - torch.cuda.manual_seed_all(seed) #parallel cpu - torch.backends.cudnn.deterministic = True #make sure results are same on cpu/gpu - torch.backends.cudnn.benchmark = True #accelerator -fixed_seed(100) -def build_hessian_trace(): - hessian_trace_config_yaml=''' - loss: - CrossEntropyLoss - random_seed: - 1 - max_Iteration: - 100 - tolerance: - 1e-3 - enable_op_fuse: - True - max_cal_smaple: - 100 - 
quantize_mode: - ptq - ''' - with open('./hessian_trace_config_yaml','w+',encoding="utf-8") as f: - f.write(hessian_trace_config_yaml) -class Test_hessian_trace(unittest.TestCase): - #boot up test - @classmethod - def setUpClass(cls) -> None: - build_hessian_trace() - cls.model=torchvision.models.resnet18() - #shotdown test - @classmethod - def tearDownClass(cls) -> None: - os.remove('./hessian_trace_config_yaml') - #one test case - def test_run_hessian_trace(cls): - """ - hessian_trace_top - Inputs: - model: FP32 model - dataloader: imagenet - """ - model=cls.model - datasets = DATASETS('pytorch') - dummy_dataset = datasets['dummy'](shape=(200, 3, 224, 224), low=0., high=1., label=True) - dummy_dataloader = PyTorchDataLoader(dummy_dataset) - # yaml_cpu='/home/bfang1/Projects/HAWQ_INC/frameworks.ai.lpot.intel-lpot/neural_compressor/adaptor/pytorch_cpu.yaml' - # hessian_cmp=hawq_metric.Hawq_top(model,'./hessian_trace_config_yaml',yaml_cpu,dummy_dataloader) - hessian_cmp=Hawq_top(model,yaml_cpu=None,yaml_trace=None,dataloader=dummy_dataloader) - tuning_init_config=hessian_cmp.get_init_config() - #print tuning init_config - for i in tuning_init_config: - print(i) -if __name__ == "__main__": - unittest.main() \ No newline at end of file From 5b813ea52e4f5439a5d42ee62c77f63c5c8af185 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 30 Nov 2022 16:03:06 +0800 Subject: [PATCH 084/128] update ut Signed-off-by: yiliu30 --- test/strategy/test_basic_fallback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strategy/test_basic_fallback.py b/test/strategy/test_basic_fallback.py index 352c81850c4..fef994a4f1b 100644 --- a/test/strategy/test_basic_fallback.py +++ b/test/strategy/test_basic_fallback.py @@ -20,7 +20,7 @@ def build_ptq_yaml(): framework: pytorch_fx tuning: strategy: - name: basic + name: hawq accuracy_criterion: absolute: -1 exit_policy: From 152774f8ceb87674248decbcf2bc13b5919a3428 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 30 Nov 2022 17:46:06 +0800 Subject: [PATCH 085/128] remove WA for hawq loss Signed-off-by: yiliu30 --- neural_compressor/conf/config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index dae5524c9ef..627d91e0d96 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -847,7 +847,7 @@ def percent_to_float(data): Optional('model_conversion'): model_conversion_schema, Optional('tuning', default={ - 'strategy': {'name': 'basic', 'loss': 'CrossEntropyLoss'}, # TODO move loss to appropriate position + 'strategy': {'name': 'basic'}, 'accuracy_criterion': {'relative': 0.01, 'higher_is_better': True}, 'objective': 'performance', 'exit_policy': {'timeout': 0, 'max_trials': 100, 'performance_only': False}, @@ -860,8 +860,7 @@ def percent_to_float(data): Optional('sigopt_project_id'): str, Optional('sigopt_experiment_name', default='nc-tune'): str, Optional('accuracy_weight', default=1.0): float, - Optional('latency_weight', default=1.0): float, - Optional('loss', default='CrossEntropyLoss'): str # TODO only for test, remove it before merge + Optional('latency_weight', default=1.0): float } , Hook('accuracy_criterion', handler=_valid_accuracy_field): object, Optional('accuracy_criterion', default={'relative': 0.01}): { From 5174c8027bbd79b98fdf8ac9889fe6e7eb6e921e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 1 Dec 2022 09:04:25 +0800 Subject: [PATCH 086/128] remove hard code for baseline Signed-off-by: yiliu30 --- 
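[Editor's note: illustrative sketch, not part of the patch series.] The preceding patches (075-078) rank ops for fallback by an op-wise sensitivity score Omega_i = Tr(H_i) * ||Q(w_i) - w_i||_2^2, i.e. the Hutchinson estimate of a layer's Hessian trace scaled by its weight quantization error. A minimal standalone sketch of that scoring is shown below; the helper names (hutchinson_layer_traces, hawq_scores) and the single-batch usage are assumptions made for illustration, not Neural Compressor APIs.

import torch

def hutchinson_layer_traces(model, loss_fn, batch, n_iter=30):
    """Estimate tr(H) per parameter tensor: tr(H) ~= E_v[v^T H v], v ~ Rademacher."""
    x, y = batch
    params = [p for p in model.parameters() if p.requires_grad]
    loss = loss_fn(model(x), y)
    grads = torch.autograd.grad(loss, params, create_graph=True)
    traces = [0.0] * len(params)
    for _ in range(n_iter):
        vs = [torch.randint_like(p, high=2) * 2 - 1 for p in params]  # +/-1 entries
        Hvs = torch.autograd.grad(grads, params, grad_outputs=vs, retain_graph=True)
        traces = [t + (hv * v).sum().item() for t, hv, v in zip(traces, Hvs, vs)]
    return [t / n_iter for t in traces]

def hawq_scores(traces, fp32_weights, dequantized_weights):
    """Omega_i = trace_i * ||Q(w_i) - w_i||_2^2; fall back the largest scores first."""
    return [t * torch.norm(w - qw).pow(2).item()
            for t, w, qw in zip(traces, fp32_weights, dequantized_weights)]

Multiplying the trace by the quantization perturbation follows the HAWQ-V2 intuition: a layer hurts accuracy most when it is both curvature-sensitive and poorly approximated by its quantized weights, so the ops with the largest Omega are the first candidates to keep in FP32.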
neural_compressor/strategy/auto_mixed_precision.py | 1 - neural_compressor/strategy/strategy.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/neural_compressor/strategy/auto_mixed_precision.py b/neural_compressor/strategy/auto_mixed_precision.py index 7fbd759a87e..4b59cf2cced 100644 --- a/neural_compressor/strategy/auto_mixed_precision.py +++ b/neural_compressor/strategy/auto_mixed_precision.py @@ -145,7 +145,6 @@ def traverse(self): if self.baseline is None and (self.eval_dataloader or self.eval_func): logger.info("Get FP32 model baseline.") self.baseline = self._evaluate(self.model) - self.baseline=[0.698,[700]] # record the FP32 baseline self._add_tuning_history() diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py index 58faa5d919a..63710b43264 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -219,8 +219,7 @@ def traverse(self): if self.baseline is None: logger.info("Get FP32 model baseline.") self._fp32_model = self.model - ##self.baseline = self._evaluate(self.model) - self.baseline = [0.698,[700]] + self.baseline = self._evaluate(self.model) # record the FP32 baseline self._add_tuning_history() self.show_baseline_info() From c9a16ae9247ddf99faa2ba5a7bd7e1e743aeae24 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Thu, 1 Dec 2022 15:38:37 +0800 Subject: [PATCH 087/128] add efficientnet_b0_fx model --- examples/.config/model_params_pytorch.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/.config/model_params_pytorch.json b/examples/.config/model_params_pytorch.json index 848c1e9f0c6..d6e5e4f92ab 100644 --- a/examples/.config/model_params_pytorch.json +++ b/examples/.config/model_params_pytorch.json @@ -8,6 +8,15 @@ "strategy": "basic", "batch_size": 100, "new_benchmark": false + }, + "efficientnet_b0_fx": { + "model_src_dir": "image_recognition/torchvision_models/quantization/ptq/cpu/fx/", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "", + "yaml": "conf.yaml", + "strategy": "basic", + "batch_size": 100, + "new_benchmark": false }, "resnet18_fx": { "model_src_dir": "image_recognition/torchvision_models/quantization/ptq/cpu/fx/", From a64c5707349e19f47634b1bbc126a1c8dbe3c4d5 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Thu, 1 Dec 2022 20:05:12 +0800 Subject: [PATCH 088/128] add act_qnt loss analysis --- neural_compressor/strategy/hawq.py | 164 +++++++++++++++++++++++++---- 1 file changed, 142 insertions(+), 22 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index c000def9440..241716bf293 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -28,10 +28,22 @@ from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST from torch.quantization.quantize_fx import fuse_fx - +import torch.nn.intrinsic.quantized as nniq +from torch.fx import symbolic_trace, graph_module +import torch.nn as nn +import logging +logger = logging.getLogger(__name__) from typing import Dict, List, Optional, Any, Union, Callable, Set - - +# Define Collector based on hook, which is used to record the intermediate result +class Node_collector: + def __init__(self, m): + self.handle = m.register_forward_hook(self.hook_fn_act) + def hook_fn_act(self, m, inp, outp): + self.out_features = outp.clone() + self.in_features = inp + 
self.m = m + def remove(self): + self.handle.remove() class HessianTrace: """ please refer to @@ -40,11 +52,10 @@ class HessianTrace: https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/hessian_trace.py """ - def __init__(self, model, dataloader, criterion=None): + def __init__(self, model, dataloader,q_model,criterion=None): self.unfused_model = model.model - + self.q_model=q_model self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused - self.dataloader = dataloader self.max_iter = 500 self.tolerance = 1e-5 @@ -78,7 +89,22 @@ def mapping_module_to_op(self, name): # return name # else: return name - + def mse_metric_gap(self,fp32_tensor, dequantize_tensor): + """Calculate the euclidean distance between fp32 tensor and int8 dequantize tensor + Args: + fp32_tensor (tensor): The FP32 tensor. + dequantize_tensor (tensor): The INT8 dequantize tensor. + """ + fp32_max = np.max(fp32_tensor) + fp32_min = np.min(fp32_tensor) + dequantize_max = np.max(dequantize_tensor) + dequantize_min = np.min(dequantize_tensor) + fp32_tensor = (fp32_tensor - fp32_min) / (fp32_max - fp32_min) + dequantize_tensor = (dequantize_tensor - dequantize_min) / \ + (dequantize_max - dequantize_min) + diff_tensor = fp32_tensor - dequantize_tensor + euclidean_dist = np.sum(diff_tensor ** 2) + return euclidean_dist / fp32_tensor.size def get_fused_mapping(self): model = self.model weights_info = dict(model.named_parameters()) @@ -255,7 +281,6 @@ def get_weight_traces(self, num_samples): op_name = self.weight_to_op[weight_name] op_name_to_trace[op_name] = weight_name_to_traces[weight_name] return op_name_to_trace - def get_act_traces(self, num_samples): unfused_training = self.unfused_model.training self.unfused_model.eval() @@ -318,19 +343,100 @@ def get_act_traces(self, num_samples): self.layer_acts = [] self.layer_acts_grads = [] - return act_traces - + return res_dict + def insert_hook(self, model, target_module_list): + intern_outputs = [] + for layer,module in model.named_modules(): + for target_module in target_module_list: + # print("layer:",layer) + # print("target_model:",target_module) + if layer == target_module: + logging.debug("Collect: %s" % (module)) + # print("Collect: %s" % (module)) + intern_outputs.append(Node_collector(module)) + + logging.info("Total %d hook inserted" % (len(intern_outputs))) + # print("Total %d hook inserted" % (len(intern_outputs))) + return model, intern_outputs + def insert_hook_quantize(self,model, target_module_list): + intern_outputs = [] + for layer,module in model.named_modules(): + for target_module in target_module_list: + # print("layer:",layer) + length = len("_model.") + new_key = layer[length:] + # print("target_model:",target_module) + if new_key == target_module: + logging.debug("Collect: %s" % (module)) + # print("Collect: %s" % (module)) + intern_outputs.append(Node_collector(module)) + logging.info("Total %d hook inserted" % (len(intern_outputs))) + # print("Total %d hook inserted" % (len(intern_outputs))) + return model, intern_outputs + def get_act_gap(self,fp32_model,q_model): + """ + Estimates each activation gap between quantized model and float model + """ + self.handle_acts=[] + fp32_model.eval() + # temp_model = fuse_fx(fp32_model.model) + temp_model=fp32_model + # target_module_list = [nn.ReLU] # Insert hook for FP32 model + target_module_list = self.op_list + temp_model, intern_outputs =self.insert_hook(temp_model, target_module_list) + # intern_outputs={} + for input, target in self.dataloader: + 
temp_model(input) + break + + fp32_act_out={} + for i, intern_output in enumerate(intern_outputs): + stat_features = intern_output.out_features.view(-1) + # print ("No.", i, " ", intern_output.out_features.shape) + # print ("Numpy No.", i, " ", intern_output.out_features.cpu().data.numpy().shape) + # print ("No.", i, " ", stat_features.cpu().data.numpy().shape) + # print ("Numpy No.", i, " ", stat_features.cpu().data.numpy()) + fp32_act_out[target_module_list[i]]=stat_features.cpu().data.numpy() + # break + for i in intern_outputs: + # print(i) + i.remove() + target_module_list = self.op_list + q_model, intern_outputs=self.insert_hook_quantize(q_model, target_module_list) + for input, target in self.dataloader: #only one sample + q_model(input) + break + qnt_act_out={} + intern_outputs={} + for i, intern_output in enumerate(intern_outputs): + stat_features = intern_output.out_features.view(-1) + qnt_act_out[target_module_list[i]]=stat_features.dequantize().cpu().data.numpy() + # break + for i in intern_outputs: + # print(i) + i.remove() + act_gap={} + mse_gap={} + for fp_i,int_i in zip(fp32_act_out,qnt_act_out): + activation_qnt_error=fp32_act_out[fp_i]-qnt_act_out[int_i] + mse_gap[fp_i]=self.mse_metric_gap(fp32_act_out[fp_i],qnt_act_out[int_i]) + act_gap[fp_i]=np.sum(activation_qnt_error)/activation_qnt_error.size + return act_gap,mse_gap def get_avg_traces(self, enable_act=True, num_samples=32): """ Estimates average hessian trace for each parameter """ - assert num_samples > 0 traces = {} weight_traces = self.get_weight_traces(num_samples) traces['weight'] = weight_traces + act_trace={} if enable_act: + act_gap,mse_gap=self.get_act_gap(self.model,self.q_model) act_traces = self.get_act_traces(num_samples) + for i,j in zip(act_traces,mse_gap): + #currently use mse to analysis + act_trace[i]=act_traces[i]+mse_gap[j] traces['activation'] = act_traces return traces @@ -536,22 +642,21 @@ def next_tune_cfg(self): quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - target_dtype = "fp32" ##TODO support bf16 + target_dtype = "int8" ##TODO support bf16 target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) fp_op_list = [item.name for item in quant_ops if item in target_type_lst] # for n, p in self._fp32_model.named_modules(): # print(n) # for n, p in self._fp32_model.named_parameters(): # print(n) - orig_eval = True if self._fp32_model.training: orig_eval = False self._fp32_model.eval() - ht = HessianTrace(self._fp32_model, self.calib_dataloader) - - q_model_state_dict = { - } + import copy + temp_q_model=copy.deepcopy(self.q_model) + ht = HessianTrace(self._fp32_model, self.calib_dataloader,temp_q_model) + q_model_state_dict = {} for key in self.q_model.state_dict().keys(): length = len("_model.") new_key = key[length:] @@ -564,24 +669,39 @@ def next_tune_cfg(self): op_qnt_tensor = weight_quant_loss[key]['quantized'].dequantize() diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 pertur_lst[key] = diff_l2 - traces = ht.get_avg_traces(enable_act=False) + traces = ht.get_avg_traces(enable_act=True) op_to_traces = traces['weight'] - for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): - op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 + act_to_traces=traces['activation'] + # print("act_to_traces:",act_to_traces) + #TODO() optimize relationship of 
weights quantized loss and activation quantized loss, to find best conbine + #TODO() do double check why layer1's output is not 0 for activation quantized + for trace_i, pertur_i,act_i in zip(op_to_traces.keys(),pertur_lst.keys(),act_to_traces.keys()): + op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i]+act_to_traces[act_i] #Formula:Omig=Trace*L2+act_trace + # for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): + # op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 if orig_eval == False: self._fp32_model.train() ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], reverse=self.higher_is_better) # WA for add op type - print("ordered_ops:",ordered_ops) + # print("ordered_ops:",ordered_ops) op_info_map = {} for op_info in list(initial_op_tuning_cfg.keys()): op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) tmp_ordered_ops = [op_info_map[op_name] for op_name in ordered_ops] op_dtypes = OrderedDict(zip(tmp_ordered_ops, [target_dtype] * len(ordered_ops))) + indx=0 + #defautly fallback 5 ops + for i in op_dtypes.keys(): + op_dtypes[i]="fp32" + indx=indx+1 + if indx>4: + break + print(op_dtypes) + logger.info("hawq op_config:"+str(op_dtypes)) logger.info(f"Start to accumulate fallback to {target_dtype}.") - + initial_op_tuning_cfg = deepcopy(op_tuning_cfg) fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], initial_op_tuning_cfg=op_tuning_cfg, op_dtypes=op_dtypes, accumulate=True) From 81e04d5cf001dc1c7e00923f429a555fe55d4a5c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 2 Dec 2022 09:19:36 +0800 Subject: [PATCH 089/128] comment some hard code for acc --- .../efficientnet/quantization/ptq/eager/run_tuning.sh | 3 ++- .../quantization/ptq/cpu/eager/run_tuning.sh | 7 ++++--- .../quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh | 7 ++++--- .../quantization/ptq/cpu/fx/run_tuning.sh | 7 ++++--- .../quantization/ptq/gpu/eager/run_tuning.sh | 7 ++++--- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/examples/pytorch/image_recognition/efficientnet/quantization/ptq/eager/run_tuning.sh b/examples/pytorch/image_recognition/efficientnet/quantization/ptq/eager/run_tuning.sh index c5c764b7155..588ec872406 100644 --- a/examples/pytorch/image_recognition/efficientnet/quantization/ptq/eager/run_tuning.sh +++ b/examples/pytorch/image_recognition/efficientnet/quantization/ptq/eager/run_tuning.sh @@ -41,7 +41,8 @@ function run_tuning { conf_yaml=conf_efficientnet_b0.yaml elif [ "${topology}" = "mobilenetv3_rw" ]; then conf_yaml=conf_mobilenetv3_rw.yaml - sed -i "/relative:/s|relative:.*|relative: 0.02|g" $conf_yaml + # TODO only for test, uncomment it before merge + # sed -i "/relative:/s|relative:.*|relative: 0.02|g" $conf_yaml fi sed -i "/\/path\/to\/calibration\/dataset/s|root:.*|root: $dataset_location/train|g" $conf_yaml sed -i "/\/path\/to\/evaluation\/dataset/s|root:.*|root: $dataset_location/val|g" $conf_yaml diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning.sh b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning.sh index 2f930ad1470..7752585ddb5 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning.sh +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning.sh @@ -37,9 +37,10 @@ function init_params { # run_tuning function run_tuning { - if [ "mobilenet_v2" = "$topology" 
];then - sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml - fi + # TODO only for test, uncomment it before merge + # if [ "mobilenet_v2" = "$topology" ];then + # sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml + # fi extra_cmd="" if [ -n "$output_model" ];then extra_cmd = $extra_cmd"--tuned_checkpoint ${output_model}" diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh index 02f968d7d23..3c45fe25a32 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh @@ -39,9 +39,10 @@ function init_params { function run_tuning { sed -i "/\/path\/to\/calibration\/dataset/s|root:.*|root: $dataset_location/train|g" conf_dump_tensors.yaml sed -i "/\/path\/to\/evaluation\/dataset/s|root:.*|root: $dataset_location/val|g" conf_dump_tensors.yaml - if [ "mobilenet_v2" = "$topology" ];then - sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf_dump_tensors.yaml - fi + # TODO only for test, uncomment it before merge + # if [ "mobilenet_v2" = "$topology" ];then + # sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf_dump_tensors.yaml + # fi extra_cmd="" if [ -n "$output_model" ];then diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/run_tuning.sh b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/run_tuning.sh index 054d4389d9c..eaa81d6e85c 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/run_tuning.sh +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/run_tuning.sh @@ -37,9 +37,10 @@ function init_params { # run_tuning function run_tuning { - if [ "mobilenet_v2" = "$topology" ];then - sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml - fi + # TODO only for test, uncomment it before merge + # if [ "mobilenet_v2" = "$topology" ];then + # sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml + # fi extra_cmd="" if [ -n "$output_model" ];then extra_cmd = $extra_cmd"--tuned_checkpoint ${output_model}" diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/gpu/eager/run_tuning.sh b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/gpu/eager/run_tuning.sh index 3a272f7e8eb..a4460264ee2 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/gpu/eager/run_tuning.sh +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/gpu/eager/run_tuning.sh @@ -39,9 +39,10 @@ function init_params { function run_tuning { sed -i "/\/path\/to\/calibration\/dataset/s|root:.*|root: $dataset_location/train|g" conf.yaml sed -i "/\/path\/to\/evaluation\/dataset/s|root:.*|root: $dataset_location/val|g" conf.yaml - if [ "mobilenet_v2" = "$topology" ];then - sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml - fi + # TODO only for test, uncomment it before merge + # if [ "mobilenet_v2" = "$topology" ];then + # sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml + # fi extra_cmd="${dataset_location}" From d7f051178590abc9a3f358a66e8899ccbaa86643 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA <108742533+BiaoFangAIA@users.noreply.github.com> Date: Fri, 2 Dec 2022 14:30:46 +0800 Subject: [PATCH 
090/128] setting as disable act qnt loss analysis add check fused model feature --- neural_compressor/strategy/hawq.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 241716bf293..3397cacdfcf 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -55,7 +55,13 @@ class HessianTrace: def __init__(self, model, dataloader,q_model,criterion=None): self.unfused_model = model.model self.q_model=q_model - self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused + tmp_model=model.model + if 'graph' in (str(dir(tmp_model))): #check the attribute and it's length + logger.info("This is aready fused model") + self.model=model.model + else: + logger.info("fusing model") + self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused self.dataloader = dataloader self.max_iter = 500 self.tolerance = 1e-5 @@ -654,14 +660,12 @@ def next_tune_cfg(self): orig_eval = False self._fp32_model.eval() import copy - temp_q_model=copy.deepcopy(self.q_model) - ht = HessianTrace(self._fp32_model, self.calib_dataloader,temp_q_model) + ht = HessianTrace(self._fp32_model, self.calib_dataloader,self.q_model) q_model_state_dict = {} for key in self.q_model.state_dict().keys(): length = len("_model.") new_key = key[length:] q_model_state_dict[new_key] = self.q_model.state_dict()[key] - weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) pertur_lst = {} for key in weight_quant_loss: @@ -669,16 +673,18 @@ def next_tune_cfg(self): op_qnt_tensor = weight_quant_loss[key]['quantized'].dequantize() diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 pertur_lst[key] = diff_l2 - traces = ht.get_avg_traces(enable_act=True) + self.enable_act=False #enable activation trace and quantization loss analysis feature + traces = ht.get_avg_traces(self.enable_act) op_to_traces = traces['weight'] - act_to_traces=traces['activation'] - # print("act_to_traces:",act_to_traces) + if self.enable_act: + act_to_traces=traces['activation'] #TODO() optimize relationship of weights quantized loss and activation quantized loss, to find best conbine #TODO() do double check why layer1's output is not 0 for activation quantized - for trace_i, pertur_i,act_i in zip(op_to_traces.keys(),pertur_lst.keys(),act_to_traces.keys()): - op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i]+act_to_traces[act_i] #Formula:Omig=Trace*L2+act_trace - # for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): - # op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 + for trace_i, pertur_i,act_i in zip(op_to_traces.keys(),pertur_lst.keys(),act_to_traces.keys()): + op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i]+act_to_traces[act_i] #Formula:Omig=Trace*L2+act_trace + else: + for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): + op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 if orig_eval == False: self._fp32_model.train() ordered_ops = sorted(op_to_traces.keys(), From 8a48f849f9b8dda0eb44a342f3887948ef43ef01 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 6 Dec 2022 09:12:36 +0800 Subject: [PATCH 091/128] aligned the interface between adaptor and strategy Signed-off-by: yiliu30 --- neural_compressor/adaptor/pytorch.py | 19 +++++++++++++++++++ 
neural_compressor/strategy/hawq.py | 10 ++++++++++ 2 files changed, 29 insertions(+) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 06245b4fb0d..3589b65aca1 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -1094,6 +1094,25 @@ def is_fused_module(self, module): else: return False + def calculate_hessian_trace(fp32_model, + dataloader, + q_model, + criterion = torch.nn.CrossEntropyLoss(), + enable_act = False): + """Calculate hessian trace. + + Args: + fp32_model: The original fp32 model. + criterion: The loss function for calculate the hessian trace. # loss = criterion(output, target) + dataloader: The dataloader for calculate the gradient. + q_model: The INT8 AMAP model. + enable_act: Enabling quantization error or not. + + Return: + hessian_trace(Dict[Tuple, float]), key: (op_name, op_type); value: hessian trace. + """ + pass + unify_op_type_mapping = { "ConvReLU2d": "Conv2d", diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 3397cacdfcf..013e45ece32 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -638,6 +638,8 @@ def next_tune_cfg(self): break op_tuning_cfg['calib_sampling_size'] = calib_size yield op_tuning_cfg + + # Start compute the hessian trace # import torch.quantization._numeric_suite as ns # self.model.eval() @@ -687,6 +689,14 @@ def next_tune_cfg(self): op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 if orig_eval == False: self._fp32_model.train() + + # End compute the hessian trace + # # TODO uncomment it when algo ready. + # op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, + # dataloader = self.calib_dataloader, + # q_model = self.q_model, + # criterion = torch.nn.CrossEntropyLoss(), # TODO replace it with user specify loss + # enable_act = False) ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], reverse=self.higher_is_better) From 895cc207ef0741667e4695ba326c96d2c40a71da Mon Sep 17 00:00:00 2001 From: BiaoFangAIA Date: Tue, 6 Dec 2022 15:22:04 +0800 Subject: [PATCH 092/128] add hawq metric logical --- .../adaptor/torch_utils/hawq_metric.py | 579 ++++++++++++++++++ 1 file changed, 579 insertions(+) create mode 100644 neural_compressor/adaptor/torch_utils/hawq_metric.py diff --git a/neural_compressor/adaptor/torch_utils/hawq_metric.py b/neural_compressor/adaptor/torch_utils/hawq_metric.py new file mode 100644 index 00000000000..465b7f9ca88 --- /dev/null +++ b/neural_compressor/adaptor/torch_utils/hawq_metric.py @@ -0,0 +1,579 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
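# Before the body of hawq_metric.py below, a minimal sketch (not part of this patch) of
# how a strategy is expected to consume the `calculate_hessian_trace` interface declared
# in the pytorch.py hunk of PATCH 091 just above: the adaptor returns a per-op
# sensitivity dict, which the strategy sorts to decide which ops to fall back to FP32
# first.  The argument names follow that docstring and the commented-out strategy call
# in the same patch; the helper itself and its defaults are illustrative assumptions,
# not code from this series.
import torch

def order_ops_by_sensitivity(adaptor, fp32_model, calib_dataloader, q_model):
    """Illustrative helper: rank ops from most to least quantization-sensitive."""
    op_to_traces = adaptor.calculate_hessian_trace(
        fp32_model=fp32_model,
        dataloader=calib_dataloader,
        q_model=q_model,
        criterion=torch.nn.CrossEntropyLoss(),  # assumed default, mirroring the docstring
        enable_act=False,                        # activation-trace analysis is optional
    )
    # The most sensitive ops (largest trace-weighted perturbation) are the first
    # candidates to keep at, or fall back to, FP32.
    return sorted(op_to_traces, key=op_to_traces.get, reverse=True)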
+ +import copy +import numpy as np +from collections import OrderedDict +import torch.nn +from torch.quantization.quantize_fx import fuse_fx +import torch.nn.intrinsic.quantized as nniq +from torch.fx import symbolic_trace, graph_module +import torch.nn as nn +import logging +logger = logging.getLogger(__name__) +from typing import Dict, List, Optional, Any, Union, Callable, Set +# Define Collector based on hook, which is used to record the intermediate result +class Node_collector: + def __init__(self, m): + self.handle = m.register_forward_hook(self.hook_fn_act) + def hook_fn_act(self, m, inp, outp): + self.out_features = outp.clone() + self.in_features = inp + self.m = m + def remove(self): + self.handle.remove() +class HessianTrace: + """ + please refer to + Yao, Zhewei, et al. "Pyhessian: Neural networks through the lens of the hessian." 2020 IEEE international conference on big data (Big data). IEEE, 2020. + Dong, Zhen, et al. "Hawq-v2: Hessian aware trace-weighted quantization of neural networks." Advances in neural information processing systems 33 (2020): 18518-18529. + https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/hessian_trace.py + """ + + def __init__(self, model, dataloader,q_model,criterion=None): + self.unfused_model = model.model + self.q_model=q_model + tmp_model=model.model + if 'graph' in (str(dir(tmp_model))): #check the attribute and it's length + logger.info("This is aready fused model") + self.model=model.model + else: + logger.info("fusing model") + self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused + self.dataloader = dataloader + self.max_iter = 500 + self.tolerance = 1e-5 + self.eps = 1e-6 + self.index = 0 + self.device = self.get_device(self.model) + self.criterion = criterion + if self.criterion == None: + self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config + self.criterion = self.criterion.to(self.device) + self.weight_to_op, self.op_list = self.get_fused_mapping() + self.get_params() + + def is_fused_module(self, module): + """This is a helper function for `_propagate_qconfig_helper` to detecte + if this module is fused. + Args: + module (object): input module + Returns: + (bool): is fused or not + """ + op_type = str(type(module)) + if 'fused' in op_type: + return True + else: + return False + + def mapping_module_to_op(self, name): + # length = len("_model.") + # if len(name) < length: + # return name + # else: + return name + def mse_metric_gap(self,fp32_tensor, dequantize_tensor): + """Calculate the euclidean distance between fp32 tensor and int8 dequantize tensor + Args: + fp32_tensor (tensor): The FP32 tensor. + dequantize_tensor (tensor): The INT8 dequantize tensor. + """ + fp32_max = np.max(fp32_tensor) + fp32_min = np.min(fp32_tensor) + dequantize_max = np.max(dequantize_tensor) + dequantize_min = np.min(dequantize_tensor) + fp32_tensor = (fp32_tensor - fp32_min) / (fp32_max - fp32_min) + dequantize_tensor = (dequantize_tensor - dequantize_min) / \ + (dequantize_max - dequantize_min) + diff_tensor = fp32_tensor - dequantize_tensor + euclidean_dist = np.sum(diff_tensor ** 2) + return euclidean_dist / fp32_tensor.size + def get_fused_mapping(self): + model = self.model + weights_info = dict(model.named_parameters()) + weight_to_op = {} + for op_name, child in model.named_modules(): + if self.is_fused_module(child): + for name, _ in child.named_children(): + if op_name + "." 
+ name + ".weight" in weights_info: ##TODO check if this is right + + weight_to_op[op_name + "." + name + ".weight"] = self.mapping_module_to_op(op_name) + break + else: + name = op_name + ".weight" + if name in weights_info and name not in weight_to_op.keys(): + weight_to_op[op_name + ".weight"] = op_name + op_list = [] + for key in weight_to_op.keys(): + op_list.append(weight_to_op[key]) + return weight_to_op, op_list + + def get_device(self, model: torch.nn.Module): + for n, p in model.named_parameters(): + return p.data.device + + def _get_act_grad_hook(self, name): + def act_grad_hook(model, grad_input, grad_output): + ##print(name, grad_input[0].shape, grad_output[0].shape) + if type(model) == torch.nn.Linear: ##TODO very tricky + self.layer_acts_grads[name] = grad_input[1] + else: + self.layer_acts_grads[name] = grad_input[0] + + return act_grad_hook + + def _get_enable_act_grad_hook(self, name): + def enable_act_grad_hook(model, inputs, outputs): + input = inputs[0] + if input.requires_grad is False: + input.requires_grad = True + self.layer_acts[name] = input + + return enable_act_grad_hook + + # def _get_disable_input_grad_hook(self, name): + # def disable_input_grad_hook(model, inputs, outputs): + # try: + # input = inputs[0] ##TODO check whether this is right + # except: + # input = inputs + # if input.is_leaf == False:## you can only change requires_grad flags of leaf variables + # if input.requires_grad is True: + # input.requires_grad = False + # + # + # return disable_input_grad_hook + + def _unregister_hook(self): + for handel in self.hook_handles: + handel.remove() + + def register_act_grad_hooks(self, model): + for name, module in model.named_modules(): + if self.mapping_module_to_op(name) in self.op_list: + hook_handle = module.register_forward_hook(self._get_enable_act_grad_hook(name)) + self.hook_handles.append(hook_handle) + hook_handle = module.register_backward_hook(self._get_act_grad_hook(name)) + self.hook_handles.append(hook_handle) + + def reset_act_gradient_and_hooks(self): + # tmp_input = torch.zeros(self._input_shape, device=self.device) + # for name, module in self.model.named_modules(): + # if name in self.op_list: + # hook_handle = module.register_forward_hook(self._get_disable_input_grad_hook(name)) + # self.hook_handles.append(hook_handle) + # self.model(tmp_input) + self._unregister_hook() + + def get_params(self): + weight_names = [n for n, p in self.model.named_parameters() if + p.requires_grad and "bias" not in n] ##remove bias + params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias + self.weight_names = weight_names + self.params = params + + def forward_backward(self, model, data, create_graph=False, return_w_grad=True): + model.zero_grad() + input = data[0].to(self.device) + ##self._input_shape = input.shape ## for resetting input activation + target = data[1].to(self.device) + input.requires_grad = True + output = model(input) + loss = self.criterion(output, target) + torch.autograd.backward(loss, create_graph=create_graph) + ##loss.backward(create_graph=create_graph) + if return_w_grad: + gradients = [] + for n, p in self.model.named_parameters(): + if p.grad != None and n in self.weight_names: + gradient = p.grad + gradients.append(gradient + 0.0) ## add 0 to create a copy + model.zero_grad() + return gradients + else: + model.zero_grad() + + # def get_params(self, model): + # parameters = [p for p in model.parameters() if p.requires_grad] + # return parameters + + def 
sample_rademacher(self, params): + samples = [] + for param in params: + r = torch.randint_like(param, high=2, device=self.device) + r.masked_fill_(r == 0, -1) + samples.append(r) + return samples + + def get_vtHv_weight(self, params, num_samples): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, data in enumerate(self.dataloader): + batch_size = data[0].shape[0] + cnt += batch_size + gradients = self.forward_backward(self.model, data, create_graph=True) + H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) + H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] + if cnt >= num_samples: + break + if cnt > 0: + H_v = [item / cnt for item in H_v] + v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better + return v_t_H_v + + def get_vtHv_act(self, params, num_samples): + v = self.sample_rademacher(params) + H_v = [0] * len(v) + cnt = 0 + for step, data in enumerate(self.dataloader): + if cnt >= num_samples: + break + for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 + input = data[0][i:i + 1] + target = data[1][i:i + 1] + + self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) + layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] + layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] + hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) + cnt += 1 + if cnt >= num_samples: + break + + def get_weight_traces(self, num_samples): + layer_traces_per_iter = [] + prev_avg_model_trace = 0 + for iter in range(self.max_iter): + layer_traces = self.get_vtHv_weight(self.params, num_samples) + layer_traces_per_iter.append(layer_traces) + layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) + model_trace = torch.sum(layer_traces_estimate) + diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) + if diff_ratio < self.tolerance and iter > 10: ##TODO magic number + break + if iter == 50: ##TODO for debug + break + prev_avg_model_trace = model_trace + weight_name_to_traces = {} + layer_traces = layer_traces_estimate + for weight_name, trace in zip(self.weight_names, layer_traces): + weight_name_to_traces[weight_name] = trace + op_name_to_trace = {} + for weight_name in self.weight_names: + op_name = self.weight_to_op[weight_name] + op_name_to_trace[op_name] = weight_name_to_traces[weight_name] + return op_name_to_trace + def get_act_traces(self, num_samples): + unfused_training = self.unfused_model.training + self.unfused_model.eval() + self.hook_handles = [] + self.layer_acts = {} + self.layer_acts_grads = {} + self.register_act_grad_hooks(self.unfused_model) + cnt = 0 + act_traces_per_sample = [] + for step, data in enumerate(self.dataloader): + if cnt >= num_samples: + break + bs = data[0].shape[0] + act_traces_sum = 0 + act_traces_per_iter = [] + prev_avg_model_trace = 0 + act_traces_sums = None + for i in range(bs): ##force the bs to be one + input = data[0][i:i + 1] + target = data[1][i:i + 1] + self.forward_backward(self.unfused_model, (input, target), create_graph=True, return_w_grad=False) + acts = [self.layer_acts[key] for key in self.layer_acts.keys()] + if act_traces_sums == None: + act_traces_sums = [0] * len(acts) + acts_grad = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] ##same order with acts + vt_H_v_sum_per_act = [0] * len(acts) + + 
prev_model_act_trace = 0 + for iter in range(self.max_iter): + v = self.sample_rademacher(acts) + H_v = torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=True) + vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] + + vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in + enumerate(vt_H_v_sum_per_act)] + vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] + current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) + + diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( + prev_model_act_trace + self.eps) + if diff_ratio < self.tolerance and iter > 10: ##TODO magic number + break + if iter == 50: ##TODO for debug + break + + prev_model_act_trace = current_model_act_trace + act_traces_per_sample.append(vt_H_v_mean_per_act) + cnt += 1 + if cnt >= num_samples: + break + + if unfused_training: + self.unfused_model.train() + self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False + act_traces_stack = torch.stack([torch.stack(item) for item in act_traces_per_sample]) + act_traces = torch.mean(act_traces_stack, dim=0) + res_dict = {} + for index, key in enumerate(self.layer_acts.keys()): + res_dict[key] = act_traces[index] + + self.layer_acts = [] + self.layer_acts_grads = [] + return res_dict + def insert_hook(self, model, target_module_list): + intern_outputs = [] + for layer,module in model.named_modules(): + for target_module in target_module_list: + # print("layer:",layer) + # print("target_model:",target_module) + if layer == target_module: + logging.debug("Collect: %s" % (module)) + # print("Collect: %s" % (module)) + intern_outputs.append(Node_collector(module)) + + logging.info("Total %d hook inserted" % (len(intern_outputs))) + # print("Total %d hook inserted" % (len(intern_outputs))) + return model, intern_outputs + def insert_hook_quantize(self,model, target_module_list): + intern_outputs = [] + for layer,module in model.named_modules(): + for target_module in target_module_list: + # print("layer:",layer) + length = len("_model.") + new_key = layer[length:] + # print("target_model:",target_module) + if new_key == target_module: + logging.debug("Collect: %s" % (module)) + # print("Collect: %s" % (module)) + intern_outputs.append(Node_collector(module)) + logging.info("Total %d hook inserted" % (len(intern_outputs))) + # print("Total %d hook inserted" % (len(intern_outputs))) + return model, intern_outputs + def get_act_gap(self,fp32_model,q_model): + """ + Estimates each activation gap between quantized model and float model + """ + self.handle_acts=[] + fp32_model.eval() + # temp_model = fuse_fx(fp32_model.model) + temp_model=fp32_model + # target_module_list = [nn.ReLU] # Insert hook for FP32 model + target_module_list = self.op_list + temp_model, intern_outputs =self.insert_hook(temp_model, target_module_list) + # intern_outputs={} + for input, target in self.dataloader: + temp_model(input) + break + + fp32_act_out={} + for i, intern_output in enumerate(intern_outputs): + stat_features = intern_output.out_features.view(-1) + # print ("No.", i, " ", intern_output.out_features.shape) + # print ("Numpy No.", i, " ", intern_output.out_features.cpu().data.numpy().shape) + # print ("No.", i, " ", stat_features.cpu().data.numpy().shape) + # print ("Numpy No.", i, " ", stat_features.cpu().data.numpy()) + fp32_act_out[target_module_list[i]]=stat_features.cpu().data.numpy() + # break + for i in intern_outputs: + # print(i) + i.remove() + 
target_module_list = self.op_list + q_model, intern_outputs=self.insert_hook_quantize(q_model, target_module_list) + for input, target in self.dataloader: #only one sample + q_model(input) + break + qnt_act_out={} + intern_outputs={} + for i, intern_output in enumerate(intern_outputs): + stat_features = intern_output.out_features.view(-1) + qnt_act_out[target_module_list[i]]=stat_features.dequantize().cpu().data.numpy() + # break + for i in intern_outputs: + # print(i) + i.remove() + act_gap={} + mse_gap={} + for fp_i,int_i in zip(fp32_act_out,qnt_act_out): + activation_qnt_error=fp32_act_out[fp_i]-qnt_act_out[int_i] + mse_gap[fp_i]=self.mse_metric_gap(fp32_act_out[fp_i],qnt_act_out[int_i]) + act_gap[fp_i]=np.sum(activation_qnt_error)/activation_qnt_error.size + return act_gap,mse_gap + def get_avg_traces(self, enable_act=True, num_samples=32): + """ + Estimates average hessian trace for each parameter + """ + assert num_samples > 0 + traces = {} + weight_traces = self.get_weight_traces(num_samples) + traces['weight'] = weight_traces + act_trace={} + if enable_act: + act_gap,mse_gap=self.get_act_gap(self.model,self.q_model) + act_traces = self.get_act_traces(num_samples) + for i,j in zip(act_traces,mse_gap): + #currently use mse to analysis + act_trace[i]=act_traces[i]+mse_gap[j] + traces['activation'] = act_traces + return traces + + +##copy from torch.quantization._numeric_suite +def _find_match( + str_list: Union[Dict[str, Any], List[str]], key_str: str, + postfix: str, +) -> Optional[str]: + split_str = key_str.split(".") + if split_str[-1] == postfix: + match_string = "".join(key_str.split(".")[0:-1]) + for s2 in str_list: + pattern1 = "".join(s2.split(".")[0:-1]) + pattern2 = "".join(s2.split(".")[0:-2]) + if match_string == pattern1: + return s2 + if match_string == pattern2: + return s2 + + # For matching "fc.weight" and "fc._packed_params._packed_params" + if postfix == "_packed_params": + match_string = "".join(key_str.split(".")[0:-2]) + if len(match_string) == 0: + return None + for s2 in str_list: + pattern1 = "".join(s2.split(".")[0:-1]) + pattern2 = "".join(s2.split(".")[0:-2]) + if match_string == pattern1: + return s2 + if match_string == pattern2: + return s2 + return None + else: + return None + + +##copy form torch.quantization._numeric_suite +def compare_weights( + float_dict: Dict[str, Any], quantized_dict: Dict[str, Any] +) -> Dict[str, Dict[str, torch.Tensor]]: + r"""Compare the weights of the float module with its corresponding quantized + module. Return a dict with key corresponding to module names and each entry being + a dictionary with two keys 'float' and 'quantized', containing the float and + quantized weights. This dict can be used to compare and compute the quantization + error of the weights of float and quantized models. 
+ + Example usage:: + + wt_compare_dict = compare_weights( + float_model.state_dict(), qmodel.state_dict()) + for key in wt_compare_dict: + print( + key, + compute_error( + wt_compare_dict[key]['float'], + wt_compare_dict[key]['quantized'].dequantize() + ) + ) + + Args: + float_dict: state dict of the float model + quantized_dict: state dict of the quantized model + + Return: + weight_dict: dict with key corresponding to module names and each entry being + a dictionary with two keys 'float' and 'quantized', containing the float and + quantized weights + """ + + weight_dict: Dict[str, Dict] = {} + for key in quantized_dict: + match_key = _find_match(float_dict, key, "weight") + if match_key is not None: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[match_key] + weight_dict[key]["quantized"] = quantized_dict[key] + continue + + # For matching "fc.weight" and "fc._packed_params._packed_params" + match_key = _find_match(float_dict, key, "_packed_params") + if match_key is not None: + weight_dict[match_key] = {} + weight_dict[match_key]["float"] = float_dict[match_key] + weight_dict[match_key]["quantized"] = quantized_dict[key][0] + ##TODO:should consider more models in further work + + # For LSTM + split_str = key.split(".") + if split_str[-1] == "param" and split_str[-3] == "_all_weight_values": + layer = split_str[-2] + module_name = ".".join(split_str[:-3]) + float_weight_ih_key = module_name + ".weight_ih_l" + layer + float_weight_hh_key = module_name + ".weight_hh_l" + layer + if float_weight_ih_key in float_dict and float_weight_hh_key in float_dict: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[float_weight_ih_key] + weight_dict[key]["quantized"] = ( + quantized_dict[key].__getstate__()[0][4][0].__getstate__()[0][0] + ) + weight_dict[key]["float"] = float_dict[float_weight_hh_key] + weight_dict[key]["quantized"] = ( + quantized_dict[key].__getstate__()[0][4][1].__getstate__()[0][0] + ) + + return weight_dict +# op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, + # dataloader = self.calib_dataloader, + # q_model = self.q_model, + # criterion = torch.nn.CrossEntropyLoss(), # TODO replace it with user specify loss + # enable_act = False) +def hawq_top(fp32_model,q_model,dataloader,criterion,enable_act): + orig_eval=True + if fp32_model.training: + orig_eval=False + ht=HessianTrace(fp32_model,dataloader=dataloader,q_model=q_model) + q_model_state_dict={} + for key in q_model.state_dict().keys(): + length=len("_model.") + new_key=key[length:] + q_model_state_dict[new_key]=q_model.state_dict()[key] + weight_quant_loss=compare_weights(ht.model.state_dict(),q_model_state_dict) + pertur_lst={} + for key in weight_quant_loss: + op_float_tensor=weight_quant_loss[key]['float'] + op_qnt_tensor=weight_quant_loss[key]['quantized'].dequantize() + diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) + pertur_lst[key]=diff_l2 + traces=ht.get_act_traces(enable_act) + op_to_traces=traces['weight'] + if enable_act: + act_to_traces=traces['activation'] + for trace_i, pertur_i,act_i in zip(op_to_traces.keys(),pertur_lst.keys(),act_to_traces.keys()): + op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i]+act_to_traces[act_i] #Formula:Omig=Trace*L2+act_trace + else: + for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): + op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 + if orig_eval==False: + fp32_model.train() + return op_to_traces + + \ No newline at end of 
file From cb8fd30160588faaadbfb19566f1715573123c84 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA Date: Tue, 6 Dec 2022 15:25:24 +0800 Subject: [PATCH 093/128] add call hawq function --- neural_compressor/adaptor/pytorch.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 3589b65aca1..44392313d80 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -29,7 +29,7 @@ from ..utils import logger from .query import QueryBackendCapability from ..experimental.data.dataloaders.base_dataloader import BaseDataLoader - +from .torch_utils.hawq_metric import hawq_top torch = LazyImport("torch") json = LazyImport("json") @@ -1094,11 +1094,13 @@ def is_fused_module(self, module): else: return False - def calculate_hessian_trace(fp32_model, + def calculate_hessian_trace( + fp32_model, dataloader, q_model, - criterion = torch.nn.CrossEntropyLoss(), - enable_act = False): + criterion, + enable_act = False + ): """Calculate hessian trace. Args: @@ -1111,6 +1113,8 @@ def calculate_hessian_trace(fp32_model, Return: hessian_trace(Dict[Tuple, float]), key: (op_name, op_type); value: hessian trace. """ + op_to_traces=hawq_top(fp32_model=fp32_model,dataloader=dataloader,q_model=q_model,criterion=criterion,enable_act=enable_act) + return op_to_traces pass From a5503985e5d83e681e1caa466438335b445e1ec3 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA Date: Tue, 6 Dec 2022 15:27:37 +0800 Subject: [PATCH 094/128] enable hawq interface --- neural_compressor/strategy/hawq.py | 1103 ++++++++++++++-------------- 1 file changed, 552 insertions(+), 551 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 013e45ece32..c1ce91e0ca0 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -34,521 +34,521 @@ import logging logger = logging.getLogger(__name__) from typing import Dict, List, Optional, Any, Union, Callable, Set -# Define Collector based on hook, which is used to record the intermediate result -class Node_collector: - def __init__(self, m): - self.handle = m.register_forward_hook(self.hook_fn_act) - def hook_fn_act(self, m, inp, outp): - self.out_features = outp.clone() - self.in_features = inp - self.m = m - def remove(self): - self.handle.remove() -class HessianTrace: - """ - please refer to - Yao, Zhewei, et al. "Pyhessian: Neural networks through the lens of the hessian." 2020 IEEE international conference on big data (Big data). IEEE, 2020. - Dong, Zhen, et al. "Hawq-v2: Hessian aware trace-weighted quantization of neural networks." Advances in neural information processing systems 33 (2020): 18518-18529. 
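# The HessianTrace class cited above (a copy of which now lives in
# neural_compressor/adaptor/torch_utils/hawq_metric.py, added in PATCH 092) estimates
# each layer's Hessian trace with Hutchinson's method: tr(H) = E[v^T H v] for
# Rademacher probe vectors v, where H v is obtained by differentiating the gradients a
# second time (double backprop).  Below is a self-contained, simplified sketch of that
# estimator for a single batch; the function and variable names are placeholders, and
# the implementation in this series additionally averages over calibration batches and
# iterates until the running estimate converges.
import torch

def hutchinson_weight_traces(model, criterion, data, target, n_probes=8):
    """Estimate {weight name: Hessian trace} from one (data, target) batch."""
    named = [(n, p) for n, p in model.named_parameters()
             if p.requires_grad and "bias" not in n]
    names = [n for n, _ in named]
    params = [p for _, p in named]
    loss = criterion(model(data), target)
    # create_graph=True keeps the graph so the gradient can be differentiated again.
    grads = torch.autograd.grad(loss, params, create_graph=True)
    traces = [0.0] * len(params)
    for _ in range(n_probes):
        # Rademacher probes: entries are +1 or -1 with equal probability.
        vs = [torch.randint_like(p, high=2) * 2 - 1 for p in params]
        # Hessian-vector products via double backprop: d(sum_i g_i * v_i) / d(params).
        Hvs = torch.autograd.grad(grads, params, grad_outputs=vs, retain_graph=True)
        for i, (hv, v) in enumerate(zip(Hvs, vs)):
            traces[i] += (hv * v).sum().item() / n_probes  # running average of v^T H v
    return dict(zip(names, traces))

# In the surrounding patches these per-op traces are then weighted by the measured
# weight quantization perturbation, roughly Omega_op = trace_op * ||Q(W_op) - W_op||_2^2
# (optionally plus an activation-gap term when enable_act is set), and the ops with the
# largest Omega are ordered first for FP32 fallback.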
- https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/hessian_trace.py - """ - - def __init__(self, model, dataloader,q_model,criterion=None): - self.unfused_model = model.model - self.q_model=q_model - tmp_model=model.model - if 'graph' in (str(dir(tmp_model))): #check the attribute and it's length - logger.info("This is aready fused model") - self.model=model.model - else: - logger.info("fusing model") - self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused - self.dataloader = dataloader - self.max_iter = 500 - self.tolerance = 1e-5 - self.eps = 1e-6 - self.index = 0 - self.device = self.get_device(self.model) - self.criterion = criterion - if self.criterion == None: - self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config - self.criterion = self.criterion.to(self.device) - self.weight_to_op, self.op_list = self.get_fused_mapping() - self.get_params() - - def is_fused_module(self, module): - """This is a helper function for `_propagate_qconfig_helper` to detecte - if this module is fused. - Args: - module (object): input module - Returns: - (bool): is fused or not - """ - op_type = str(type(module)) - if 'fused' in op_type: - return True - else: - return False - - def mapping_module_to_op(self, name): - # length = len("_model.") - # if len(name) < length: - # return name - # else: - return name - def mse_metric_gap(self,fp32_tensor, dequantize_tensor): - """Calculate the euclidean distance between fp32 tensor and int8 dequantize tensor - Args: - fp32_tensor (tensor): The FP32 tensor. - dequantize_tensor (tensor): The INT8 dequantize tensor. - """ - fp32_max = np.max(fp32_tensor) - fp32_min = np.min(fp32_tensor) - dequantize_max = np.max(dequantize_tensor) - dequantize_min = np.min(dequantize_tensor) - fp32_tensor = (fp32_tensor - fp32_min) / (fp32_max - fp32_min) - dequantize_tensor = (dequantize_tensor - dequantize_min) / \ - (dequantize_max - dequantize_min) - diff_tensor = fp32_tensor - dequantize_tensor - euclidean_dist = np.sum(diff_tensor ** 2) - return euclidean_dist / fp32_tensor.size - def get_fused_mapping(self): - model = self.model - weights_info = dict(model.named_parameters()) - weight_to_op = {} - for op_name, child in model.named_modules(): - if self.is_fused_module(child): - for name, _ in child.named_children(): - if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right - - weight_to_op[op_name + "." 
+ name + ".weight"] = self.mapping_module_to_op(op_name) - break - else: - name = op_name + ".weight" - if name in weights_info and name not in weight_to_op.keys(): - weight_to_op[op_name + ".weight"] = op_name - op_list = [] - for key in weight_to_op.keys(): - op_list.append(weight_to_op[key]) - return weight_to_op, op_list - - def get_device(self, model: torch.nn.Module): - for n, p in model.named_parameters(): - return p.data.device - - def _get_act_grad_hook(self, name): - def act_grad_hook(model, grad_input, grad_output): - ##print(name, grad_input[0].shape, grad_output[0].shape) - if type(model) == torch.nn.Linear: ##TODO very tricky - self.layer_acts_grads[name] = grad_input[1] - else: - self.layer_acts_grads[name] = grad_input[0] - - return act_grad_hook - - def _get_enable_act_grad_hook(self, name): - def enable_act_grad_hook(model, inputs, outputs): - input = inputs[0] - if input.requires_grad is False: - input.requires_grad = True - self.layer_acts[name] = input - - return enable_act_grad_hook - - # def _get_disable_input_grad_hook(self, name): - # def disable_input_grad_hook(model, inputs, outputs): - # try: - # input = inputs[0] ##TODO check whether this is right - # except: - # input = inputs - # if input.is_leaf == False:## you can only change requires_grad flags of leaf variables - # if input.requires_grad is True: - # input.requires_grad = False - # - # - # return disable_input_grad_hook - - def _unregister_hook(self): - for handel in self.hook_handles: - handel.remove() - - def register_act_grad_hooks(self, model): - for name, module in model.named_modules(): - if self.mapping_module_to_op(name) in self.op_list: - hook_handle = module.register_forward_hook(self._get_enable_act_grad_hook(name)) - self.hook_handles.append(hook_handle) - hook_handle = module.register_backward_hook(self._get_act_grad_hook(name)) - self.hook_handles.append(hook_handle) - - def reset_act_gradient_and_hooks(self): - # tmp_input = torch.zeros(self._input_shape, device=self.device) - # for name, module in self.model.named_modules(): - # if name in self.op_list: - # hook_handle = module.register_forward_hook(self._get_disable_input_grad_hook(name)) - # self.hook_handles.append(hook_handle) - # self.model(tmp_input) - self._unregister_hook() - - def get_params(self): - weight_names = [n for n, p in self.model.named_parameters() if - p.requires_grad and "bias" not in n] ##remove bias - params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias - self.weight_names = weight_names - self.params = params - - def forward_backward(self, model, data, create_graph=False, return_w_grad=True): - model.zero_grad() - input = data[0].to(self.device) - ##self._input_shape = input.shape ## for resetting input activation - target = data[1].to(self.device) - input.requires_grad = True - output = model(input) - loss = self.criterion(output, target) - torch.autograd.backward(loss, create_graph=create_graph) - ##loss.backward(create_graph=create_graph) - if return_w_grad: - gradients = [] - for n, p in self.model.named_parameters(): - if p.grad != None and n in self.weight_names: - gradient = p.grad - gradients.append(gradient + 0.0) ## add 0 to create a copy - model.zero_grad() - return gradients - else: - model.zero_grad() - - # def get_params(self, model): - # parameters = [p for p in model.parameters() if p.requires_grad] - # return parameters - - def sample_rademacher(self, params): - samples = [] - for param in params: - r = torch.randint_like(param, high=2, 
device=self.device) - r.masked_fill_(r == 0, -1) - samples.append(r) - return samples - - def get_vtHv_weight(self, params, num_samples): - v = self.sample_rademacher(params) - H_v = [0] * len(v) - cnt = 0 - for step, data in enumerate(self.dataloader): - batch_size = data[0].shape[0] - cnt += batch_size - gradients = self.forward_backward(self.model, data, create_graph=True) - H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) - H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] - if cnt >= num_samples: - break - if cnt > 0: - H_v = [item / cnt for item in H_v] - v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better - return v_t_H_v - - def get_vtHv_act(self, params, num_samples): - v = self.sample_rademacher(params) - H_v = [0] * len(v) - cnt = 0 - for step, data in enumerate(self.dataloader): - if cnt >= num_samples: - break - for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 - input = data[0][i:i + 1] - target = data[1][i:i + 1] - - self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) - layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] - layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] - hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) - cnt += 1 - if cnt >= num_samples: - break - - def get_weight_traces(self, num_samples): - layer_traces_per_iter = [] - prev_avg_model_trace = 0 - for iter in range(self.max_iter): - layer_traces = self.get_vtHv_weight(self.params, num_samples) - layer_traces_per_iter.append(layer_traces) - layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) - model_trace = torch.sum(layer_traces_estimate) - diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) - if diff_ratio < self.tolerance and iter > 10: ##TODO magic number - break - if iter == 50: ##TODO for debug - break - prev_avg_model_trace = model_trace - weight_name_to_traces = {} - layer_traces = layer_traces_estimate - for weight_name, trace in zip(self.weight_names, layer_traces): - weight_name_to_traces[weight_name] = trace - op_name_to_trace = {} - for weight_name in self.weight_names: - op_name = self.weight_to_op[weight_name] - op_name_to_trace[op_name] = weight_name_to_traces[weight_name] - return op_name_to_trace - def get_act_traces(self, num_samples): - unfused_training = self.unfused_model.training - self.unfused_model.eval() - self.hook_handles = [] - self.layer_acts = {} - self.layer_acts_grads = {} - self.register_act_grad_hooks(self.unfused_model) - cnt = 0 - act_traces_per_sample = [] - for step, data in enumerate(self.dataloader): - if cnt >= num_samples: - break - bs = data[0].shape[0] - act_traces_sum = 0 - act_traces_per_iter = [] - prev_avg_model_trace = 0 - act_traces_sums = None - for i in range(bs): ##force the bs to be one - input = data[0][i:i + 1] - target = data[1][i:i + 1] - self.forward_backward(self.unfused_model, (input, target), create_graph=True, return_w_grad=False) - acts = [self.layer_acts[key] for key in self.layer_acts.keys()] - if act_traces_sums == None: - act_traces_sums = [0] * len(acts) - acts_grad = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] ##same order with acts - vt_H_v_sum_per_act = [0] * len(acts) - - prev_model_act_trace = 0 - for iter in range(self.max_iter): - v = self.sample_rademacher(acts) - H_v = 
torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=True) - vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] - - vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in - enumerate(vt_H_v_sum_per_act)] - vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] - current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) - - diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( - prev_model_act_trace + self.eps) - if diff_ratio < self.tolerance and iter > 10: ##TODO magic number - break - if iter == 50: ##TODO for debug - break - - prev_model_act_trace = current_model_act_trace - act_traces_per_sample.append(vt_H_v_mean_per_act) - cnt += 1 - if cnt >= num_samples: - break - - if unfused_training: - self.unfused_model.train() - self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False - act_traces_stack = torch.stack([torch.stack(item) for item in act_traces_per_sample]) - act_traces = torch.mean(act_traces_stack, dim=0) - res_dict = {} - for index, key in enumerate(self.layer_acts.keys()): - res_dict[key] = act_traces[index] - - self.layer_acts = [] - self.layer_acts_grads = [] - return res_dict - def insert_hook(self, model, target_module_list): - intern_outputs = [] - for layer,module in model.named_modules(): - for target_module in target_module_list: - # print("layer:",layer) - # print("target_model:",target_module) - if layer == target_module: - logging.debug("Collect: %s" % (module)) - # print("Collect: %s" % (module)) - intern_outputs.append(Node_collector(module)) +# # Define Collector based on hook, which is used to record the intermediate result +# class Node_collector: +# def __init__(self, m): +# self.handle = m.register_forward_hook(self.hook_fn_act) +# def hook_fn_act(self, m, inp, outp): +# self.out_features = outp.clone() +# self.in_features = inp +# self.m = m +# def remove(self): +# self.handle.remove() +# class HessianTrace: +# """ +# please refer to +# Yao, Zhewei, et al. "Pyhessian: Neural networks through the lens of the hessian." 2020 IEEE international conference on big data (Big data). IEEE, 2020. +# Dong, Zhen, et al. "Hawq-v2: Hessian aware trace-weighted quantization of neural networks." Advances in neural information processing systems 33 (2020): 18518-18529. +# https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/hessian_trace.py +# """ + +# def __init__(self, model, dataloader,q_model,criterion=None): +# self.unfused_model = model.model +# self.q_model=q_model +# tmp_model=model.model +# if 'graph' in (str(dir(tmp_model))): #check the attribute and it's length +# logger.info("This is aready fused model") +# self.model=model.model +# else: +# logger.info("fusing model") +# self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused +# self.dataloader = dataloader +# self.max_iter = 500 +# self.tolerance = 1e-5 +# self.eps = 1e-6 +# self.index = 0 +# self.device = self.get_device(self.model) +# self.criterion = criterion +# if self.criterion == None: +# self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config +# self.criterion = self.criterion.to(self.device) +# self.weight_to_op, self.op_list = self.get_fused_mapping() +# self.get_params() + +# def is_fused_module(self, module): +# """This is a helper function for `_propagate_qconfig_helper` to detecte +# if this module is fused. 
+# Args: +# module (object): input module +# Returns: +# (bool): is fused or not +# """ +# op_type = str(type(module)) +# if 'fused' in op_type: +# return True +# else: +# return False + +# def mapping_module_to_op(self, name): +# # length = len("_model.") +# # if len(name) < length: +# # return name +# # else: +# return name +# def mse_metric_gap(self,fp32_tensor, dequantize_tensor): +# """Calculate the euclidean distance between fp32 tensor and int8 dequantize tensor +# Args: +# fp32_tensor (tensor): The FP32 tensor. +# dequantize_tensor (tensor): The INT8 dequantize tensor. +# """ +# fp32_max = np.max(fp32_tensor) +# fp32_min = np.min(fp32_tensor) +# dequantize_max = np.max(dequantize_tensor) +# dequantize_min = np.min(dequantize_tensor) +# fp32_tensor = (fp32_tensor - fp32_min) / (fp32_max - fp32_min) +# dequantize_tensor = (dequantize_tensor - dequantize_min) / \ +# (dequantize_max - dequantize_min) +# diff_tensor = fp32_tensor - dequantize_tensor +# euclidean_dist = np.sum(diff_tensor ** 2) +# return euclidean_dist / fp32_tensor.size +# def get_fused_mapping(self): +# model = self.model +# weights_info = dict(model.named_parameters()) +# weight_to_op = {} +# for op_name, child in model.named_modules(): +# if self.is_fused_module(child): +# for name, _ in child.named_children(): +# if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right + +# weight_to_op[op_name + "." + name + ".weight"] = self.mapping_module_to_op(op_name) +# break +# else: +# name = op_name + ".weight" +# if name in weights_info and name not in weight_to_op.keys(): +# weight_to_op[op_name + ".weight"] = op_name +# op_list = [] +# for key in weight_to_op.keys(): +# op_list.append(weight_to_op[key]) +# return weight_to_op, op_list + +# def get_device(self, model: torch.nn.Module): +# for n, p in model.named_parameters(): +# return p.data.device + +# def _get_act_grad_hook(self, name): +# def act_grad_hook(model, grad_input, grad_output): +# ##print(name, grad_input[0].shape, grad_output[0].shape) +# if type(model) == torch.nn.Linear: ##TODO very tricky +# self.layer_acts_grads[name] = grad_input[1] +# else: +# self.layer_acts_grads[name] = grad_input[0] + +# return act_grad_hook + +# def _get_enable_act_grad_hook(self, name): +# def enable_act_grad_hook(model, inputs, outputs): +# input = inputs[0] +# if input.requires_grad is False: +# input.requires_grad = True +# self.layer_acts[name] = input + +# return enable_act_grad_hook + +# # def _get_disable_input_grad_hook(self, name): +# # def disable_input_grad_hook(model, inputs, outputs): +# # try: +# # input = inputs[0] ##TODO check whether this is right +# # except: +# # input = inputs +# # if input.is_leaf == False:## you can only change requires_grad flags of leaf variables +# # if input.requires_grad is True: +# # input.requires_grad = False +# # +# # +# # return disable_input_grad_hook + +# def _unregister_hook(self): +# for handel in self.hook_handles: +# handel.remove() + +# def register_act_grad_hooks(self, model): +# for name, module in model.named_modules(): +# if self.mapping_module_to_op(name) in self.op_list: +# hook_handle = module.register_forward_hook(self._get_enable_act_grad_hook(name)) +# self.hook_handles.append(hook_handle) +# hook_handle = module.register_backward_hook(self._get_act_grad_hook(name)) +# self.hook_handles.append(hook_handle) + +# def reset_act_gradient_and_hooks(self): +# # tmp_input = torch.zeros(self._input_shape, device=self.device) +# # for name, module in self.model.named_modules(): +# # if name in 
self.op_list: +# # hook_handle = module.register_forward_hook(self._get_disable_input_grad_hook(name)) +# # self.hook_handles.append(hook_handle) +# # self.model(tmp_input) +# self._unregister_hook() + +# def get_params(self): +# weight_names = [n for n, p in self.model.named_parameters() if +# p.requires_grad and "bias" not in n] ##remove bias +# params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias +# self.weight_names = weight_names +# self.params = params + +# def forward_backward(self, model, data, create_graph=False, return_w_grad=True): +# model.zero_grad() +# input = data[0].to(self.device) +# ##self._input_shape = input.shape ## for resetting input activation +# target = data[1].to(self.device) +# input.requires_grad = True +# output = model(input) +# loss = self.criterion(output, target) +# torch.autograd.backward(loss, create_graph=create_graph) +# ##loss.backward(create_graph=create_graph) +# if return_w_grad: +# gradients = [] +# for n, p in self.model.named_parameters(): +# if p.grad != None and n in self.weight_names: +# gradient = p.grad +# gradients.append(gradient + 0.0) ## add 0 to create a copy +# model.zero_grad() +# return gradients +# else: +# model.zero_grad() + +# # def get_params(self, model): +# # parameters = [p for p in model.parameters() if p.requires_grad] +# # return parameters + +# def sample_rademacher(self, params): +# samples = [] +# for param in params: +# r = torch.randint_like(param, high=2, device=self.device) +# r.masked_fill_(r == 0, -1) +# samples.append(r) +# return samples + +# def get_vtHv_weight(self, params, num_samples): +# v = self.sample_rademacher(params) +# H_v = [0] * len(v) +# cnt = 0 +# for step, data in enumerate(self.dataloader): +# batch_size = data[0].shape[0] +# cnt += batch_size +# gradients = self.forward_backward(self.model, data, create_graph=True) +# H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) +# H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] +# if cnt >= num_samples: +# break +# if cnt > 0: +# H_v = [item / cnt for item in H_v] +# v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better +# return v_t_H_v + +# def get_vtHv_act(self, params, num_samples): +# v = self.sample_rademacher(params) +# H_v = [0] * len(v) +# cnt = 0 +# for step, data in enumerate(self.dataloader): +# if cnt >= num_samples: +# break +# for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 +# input = data[0][i:i + 1] +# target = data[1][i:i + 1] + +# self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) +# layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] +# layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] +# hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) +# cnt += 1 +# if cnt >= num_samples: +# break + +# def get_weight_traces(self, num_samples): +# layer_traces_per_iter = [] +# prev_avg_model_trace = 0 +# for iter in range(self.max_iter): +# layer_traces = self.get_vtHv_weight(self.params, num_samples) +# layer_traces_per_iter.append(layer_traces) +# layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) +# model_trace = torch.sum(layer_traces_estimate) +# diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) +# if diff_ratio < self.tolerance and iter > 10: ##TODO magic 
number +# break +# if iter == 50: ##TODO for debug +# break +# prev_avg_model_trace = model_trace +# weight_name_to_traces = {} +# layer_traces = layer_traces_estimate +# for weight_name, trace in zip(self.weight_names, layer_traces): +# weight_name_to_traces[weight_name] = trace +# op_name_to_trace = {} +# for weight_name in self.weight_names: +# op_name = self.weight_to_op[weight_name] +# op_name_to_trace[op_name] = weight_name_to_traces[weight_name] +# return op_name_to_trace +# def get_act_traces(self, num_samples): +# unfused_training = self.unfused_model.training +# self.unfused_model.eval() +# self.hook_handles = [] +# self.layer_acts = {} +# self.layer_acts_grads = {} +# self.register_act_grad_hooks(self.unfused_model) +# cnt = 0 +# act_traces_per_sample = [] +# for step, data in enumerate(self.dataloader): +# if cnt >= num_samples: +# break +# bs = data[0].shape[0] +# act_traces_sum = 0 +# act_traces_per_iter = [] +# prev_avg_model_trace = 0 +# act_traces_sums = None +# for i in range(bs): ##force the bs to be one +# input = data[0][i:i + 1] +# target = data[1][i:i + 1] +# self.forward_backward(self.unfused_model, (input, target), create_graph=True, return_w_grad=False) +# acts = [self.layer_acts[key] for key in self.layer_acts.keys()] +# if act_traces_sums == None: +# act_traces_sums = [0] * len(acts) +# acts_grad = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] ##same order with acts +# vt_H_v_sum_per_act = [0] * len(acts) + +# prev_model_act_trace = 0 +# for iter in range(self.max_iter): +# v = self.sample_rademacher(acts) +# H_v = torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=True) +# vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] + +# vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in +# enumerate(vt_H_v_sum_per_act)] +# vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] +# current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) + +# diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( +# prev_model_act_trace + self.eps) +# if diff_ratio < self.tolerance and iter > 10: ##TODO magic number +# break +# if iter == 50: ##TODO for debug +# break + +# prev_model_act_trace = current_model_act_trace +# act_traces_per_sample.append(vt_H_v_mean_per_act) +# cnt += 1 +# if cnt >= num_samples: +# break + +# if unfused_training: +# self.unfused_model.train() +# self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False +# act_traces_stack = torch.stack([torch.stack(item) for item in act_traces_per_sample]) +# act_traces = torch.mean(act_traces_stack, dim=0) +# res_dict = {} +# for index, key in enumerate(self.layer_acts.keys()): +# res_dict[key] = act_traces[index] + +# self.layer_acts = [] +# self.layer_acts_grads = [] +# return res_dict +# def insert_hook(self, model, target_module_list): +# intern_outputs = [] +# for layer,module in model.named_modules(): +# for target_module in target_module_list: +# # print("layer:",layer) +# # print("target_model:",target_module) +# if layer == target_module: +# logging.debug("Collect: %s" % (module)) +# # print("Collect: %s" % (module)) +# intern_outputs.append(Node_collector(module)) - logging.info("Total %d hook inserted" % (len(intern_outputs))) - # print("Total %d hook inserted" % (len(intern_outputs))) - return model, intern_outputs - def insert_hook_quantize(self,model, target_module_list): - intern_outputs = [] - for layer,module in model.named_modules(): - for target_module 
in target_module_list: - # print("layer:",layer) - length = len("_model.") - new_key = layer[length:] - # print("target_model:",target_module) - if new_key == target_module: - logging.debug("Collect: %s" % (module)) - # print("Collect: %s" % (module)) - intern_outputs.append(Node_collector(module)) - logging.info("Total %d hook inserted" % (len(intern_outputs))) - # print("Total %d hook inserted" % (len(intern_outputs))) - return model, intern_outputs - def get_act_gap(self,fp32_model,q_model): - """ - Estimates each activation gap between quantized model and float model - """ - self.handle_acts=[] - fp32_model.eval() - # temp_model = fuse_fx(fp32_model.model) - temp_model=fp32_model - # target_module_list = [nn.ReLU] # Insert hook for FP32 model - target_module_list = self.op_list - temp_model, intern_outputs =self.insert_hook(temp_model, target_module_list) - # intern_outputs={} - for input, target in self.dataloader: - temp_model(input) - break - - fp32_act_out={} - for i, intern_output in enumerate(intern_outputs): - stat_features = intern_output.out_features.view(-1) - # print ("No.", i, " ", intern_output.out_features.shape) - # print ("Numpy No.", i, " ", intern_output.out_features.cpu().data.numpy().shape) - # print ("No.", i, " ", stat_features.cpu().data.numpy().shape) - # print ("Numpy No.", i, " ", stat_features.cpu().data.numpy()) - fp32_act_out[target_module_list[i]]=stat_features.cpu().data.numpy() - # break - for i in intern_outputs: - # print(i) - i.remove() - target_module_list = self.op_list - q_model, intern_outputs=self.insert_hook_quantize(q_model, target_module_list) - for input, target in self.dataloader: #only one sample - q_model(input) - break - qnt_act_out={} - intern_outputs={} - for i, intern_output in enumerate(intern_outputs): - stat_features = intern_output.out_features.view(-1) - qnt_act_out[target_module_list[i]]=stat_features.dequantize().cpu().data.numpy() - # break - for i in intern_outputs: - # print(i) - i.remove() - act_gap={} - mse_gap={} - for fp_i,int_i in zip(fp32_act_out,qnt_act_out): - activation_qnt_error=fp32_act_out[fp_i]-qnt_act_out[int_i] - mse_gap[fp_i]=self.mse_metric_gap(fp32_act_out[fp_i],qnt_act_out[int_i]) - act_gap[fp_i]=np.sum(activation_qnt_error)/activation_qnt_error.size - return act_gap,mse_gap - def get_avg_traces(self, enable_act=True, num_samples=32): - """ - Estimates average hessian trace for each parameter - """ - assert num_samples > 0 - traces = {} - weight_traces = self.get_weight_traces(num_samples) - traces['weight'] = weight_traces - act_trace={} - if enable_act: - act_gap,mse_gap=self.get_act_gap(self.model,self.q_model) - act_traces = self.get_act_traces(num_samples) - for i,j in zip(act_traces,mse_gap): - #currently use mse to analysis - act_trace[i]=act_traces[i]+mse_gap[j] - traces['activation'] = act_traces - return traces - - -##copy from torch.quantization._numeric_suite -def _find_match( - str_list: Union[Dict[str, Any], List[str]], key_str: str, - postfix: str, -) -> Optional[str]: - split_str = key_str.split(".") - if split_str[-1] == postfix: - match_string = "".join(key_str.split(".")[0:-1]) - for s2 in str_list: - pattern1 = "".join(s2.split(".")[0:-1]) - pattern2 = "".join(s2.split(".")[0:-2]) - if match_string == pattern1: - return s2 - if match_string == pattern2: - return s2 - - # For matching "fc.weight" and "fc._packed_params._packed_params" - if postfix == "_packed_params": - match_string = "".join(key_str.split(".")[0:-2]) - if len(match_string) == 0: - return None - for s2 in str_list: - 
pattern1 = "".join(s2.split(".")[0:-1]) - pattern2 = "".join(s2.split(".")[0:-2]) - if match_string == pattern1: - return s2 - if match_string == pattern2: - return s2 - return None - else: - return None - - -##copy form torch.quantization._numeric_suite -def compare_weights( - float_dict: Dict[str, Any], quantized_dict: Dict[str, Any] -) -> Dict[str, Dict[str, torch.Tensor]]: - r"""Compare the weights of the float module with its corresponding quantized - module. Return a dict with key corresponding to module names and each entry being - a dictionary with two keys 'float' and 'quantized', containing the float and - quantized weights. This dict can be used to compare and compute the quantization - error of the weights of float and quantized models. - - Example usage:: - - wt_compare_dict = compare_weights( - float_model.state_dict(), qmodel.state_dict()) - for key in wt_compare_dict: - print( - key, - compute_error( - wt_compare_dict[key]['float'], - wt_compare_dict[key]['quantized'].dequantize() - ) - ) - - Args: - float_dict: state dict of the float model - quantized_dict: state dict of the quantized model - - Return: - weight_dict: dict with key corresponding to module names and each entry being - a dictionary with two keys 'float' and 'quantized', containing the float and - quantized weights - """ - - weight_dict: Dict[str, Dict] = {} - for key in quantized_dict: - match_key = _find_match(float_dict, key, "weight") - if match_key is not None: - weight_dict[key] = {} - weight_dict[key]["float"] = float_dict[match_key] - weight_dict[key]["quantized"] = quantized_dict[key] - continue - - # For matching "fc.weight" and "fc._packed_params._packed_params" - match_key = _find_match(float_dict, key, "_packed_params") - if match_key is not None: - weight_dict[match_key] = {} - weight_dict[match_key]["float"] = float_dict[match_key] - weight_dict[match_key]["quantized"] = quantized_dict[key][0] - ##TODO:should consider more models in further work - - # For LSTM - split_str = key.split(".") - if split_str[-1] == "param" and split_str[-3] == "_all_weight_values": - layer = split_str[-2] - module_name = ".".join(split_str[:-3]) - float_weight_ih_key = module_name + ".weight_ih_l" + layer - float_weight_hh_key = module_name + ".weight_hh_l" + layer - if float_weight_ih_key in float_dict and float_weight_hh_key in float_dict: - weight_dict[key] = {} - weight_dict[key]["float"] = float_dict[float_weight_ih_key] - weight_dict[key]["quantized"] = ( - quantized_dict[key].__getstate__()[0][4][0].__getstate__()[0][0] - ) - weight_dict[key]["float"] = float_dict[float_weight_hh_key] - weight_dict[key]["quantized"] = ( - quantized_dict[key].__getstate__()[0][4][1].__getstate__()[0][0] - ) - - return weight_dict +# logging.info("Total %d hook inserted" % (len(intern_outputs))) +# # print("Total %d hook inserted" % (len(intern_outputs))) +# return model, intern_outputs +# def insert_hook_quantize(self,model, target_module_list): +# intern_outputs = [] +# for layer,module in model.named_modules(): +# for target_module in target_module_list: +# # print("layer:",layer) +# length = len("_model.") +# new_key = layer[length:] +# # print("target_model:",target_module) +# if new_key == target_module: +# logging.debug("Collect: %s" % (module)) +# # print("Collect: %s" % (module)) +# intern_outputs.append(Node_collector(module)) +# logging.info("Total %d hook inserted" % (len(intern_outputs))) +# # print("Total %d hook inserted" % (len(intern_outputs))) +# return model, intern_outputs +# def 
get_act_gap(self,fp32_model,q_model): +# """ +# Estimates each activation gap between quantized model and float model +# """ +# self.handle_acts=[] +# fp32_model.eval() +# # temp_model = fuse_fx(fp32_model.model) +# temp_model=fp32_model +# # target_module_list = [nn.ReLU] # Insert hook for FP32 model +# target_module_list = self.op_list +# temp_model, intern_outputs =self.insert_hook(temp_model, target_module_list) +# # intern_outputs={} +# for input, target in self.dataloader: +# temp_model(input) +# break + +# fp32_act_out={} +# for i, intern_output in enumerate(intern_outputs): +# stat_features = intern_output.out_features.view(-1) +# # print ("No.", i, " ", intern_output.out_features.shape) +# # print ("Numpy No.", i, " ", intern_output.out_features.cpu().data.numpy().shape) +# # print ("No.", i, " ", stat_features.cpu().data.numpy().shape) +# # print ("Numpy No.", i, " ", stat_features.cpu().data.numpy()) +# fp32_act_out[target_module_list[i]]=stat_features.cpu().data.numpy() +# # break +# for i in intern_outputs: +# # print(i) +# i.remove() +# target_module_list = self.op_list +# q_model, intern_outputs=self.insert_hook_quantize(q_model, target_module_list) +# for input, target in self.dataloader: #only one sample +# q_model(input) +# break +# qnt_act_out={} +# intern_outputs={} +# for i, intern_output in enumerate(intern_outputs): +# stat_features = intern_output.out_features.view(-1) +# qnt_act_out[target_module_list[i]]=stat_features.dequantize().cpu().data.numpy() +# # break +# for i in intern_outputs: +# # print(i) +# i.remove() +# act_gap={} +# mse_gap={} +# for fp_i,int_i in zip(fp32_act_out,qnt_act_out): +# activation_qnt_error=fp32_act_out[fp_i]-qnt_act_out[int_i] +# mse_gap[fp_i]=self.mse_metric_gap(fp32_act_out[fp_i],qnt_act_out[int_i]) +# act_gap[fp_i]=np.sum(activation_qnt_error)/activation_qnt_error.size +# return act_gap,mse_gap +# def get_avg_traces(self, enable_act=True, num_samples=32): +# """ +# Estimates average hessian trace for each parameter +# """ +# assert num_samples > 0 +# traces = {} +# weight_traces = self.get_weight_traces(num_samples) +# traces['weight'] = weight_traces +# act_trace={} +# if enable_act: +# act_gap,mse_gap=self.get_act_gap(self.model,self.q_model) +# act_traces = self.get_act_traces(num_samples) +# for i,j in zip(act_traces,mse_gap): +# #currently use mse to analysis +# act_trace[i]=act_traces[i]+mse_gap[j] +# traces['activation'] = act_traces +# return traces + + +# ##copy from torch.quantization._numeric_suite +# def _find_match( +# str_list: Union[Dict[str, Any], List[str]], key_str: str, +# postfix: str, +# ) -> Optional[str]: +# split_str = key_str.split(".") +# if split_str[-1] == postfix: +# match_string = "".join(key_str.split(".")[0:-1]) +# for s2 in str_list: +# pattern1 = "".join(s2.split(".")[0:-1]) +# pattern2 = "".join(s2.split(".")[0:-2]) +# if match_string == pattern1: +# return s2 +# if match_string == pattern2: +# return s2 + +# # For matching "fc.weight" and "fc._packed_params._packed_params" +# if postfix == "_packed_params": +# match_string = "".join(key_str.split(".")[0:-2]) +# if len(match_string) == 0: +# return None +# for s2 in str_list: +# pattern1 = "".join(s2.split(".")[0:-1]) +# pattern2 = "".join(s2.split(".")[0:-2]) +# if match_string == pattern1: +# return s2 +# if match_string == pattern2: +# return s2 +# return None +# else: +# return None + + +# ##copy form torch.quantization._numeric_suite +# def compare_weights( +# float_dict: Dict[str, Any], quantized_dict: Dict[str, Any] +# ) -> Dict[str, 
Dict[str, torch.Tensor]]: +# r"""Compare the weights of the float module with its corresponding quantized +# module. Return a dict with key corresponding to module names and each entry being +# a dictionary with two keys 'float' and 'quantized', containing the float and +# quantized weights. This dict can be used to compare and compute the quantization +# error of the weights of float and quantized models. + +# Example usage:: + +# wt_compare_dict = compare_weights( +# float_model.state_dict(), qmodel.state_dict()) +# for key in wt_compare_dict: +# print( +# key, +# compute_error( +# wt_compare_dict[key]['float'], +# wt_compare_dict[key]['quantized'].dequantize() +# ) +# ) + +# Args: +# float_dict: state dict of the float model +# quantized_dict: state dict of the quantized model + +# Return: +# weight_dict: dict with key corresponding to module names and each entry being +# a dictionary with two keys 'float' and 'quantized', containing the float and +# quantized weights +# """ + +# weight_dict: Dict[str, Dict] = {} +# for key in quantized_dict: +# match_key = _find_match(float_dict, key, "weight") +# if match_key is not None: +# weight_dict[key] = {} +# weight_dict[key]["float"] = float_dict[match_key] +# weight_dict[key]["quantized"] = quantized_dict[key] +# continue + +# # For matching "fc.weight" and "fc._packed_params._packed_params" +# match_key = _find_match(float_dict, key, "_packed_params") +# if match_key is not None: +# weight_dict[match_key] = {} +# weight_dict[match_key]["float"] = float_dict[match_key] +# weight_dict[match_key]["quantized"] = quantized_dict[key][0] +# ##TODO:should consider more models in further work + +# # For LSTM +# split_str = key.split(".") +# if split_str[-1] == "param" and split_str[-3] == "_all_weight_values": +# layer = split_str[-2] +# module_name = ".".join(split_str[:-3]) +# float_weight_ih_key = module_name + ".weight_ih_l" + layer +# float_weight_hh_key = module_name + ".weight_hh_l" + layer +# if float_weight_ih_key in float_dict and float_weight_hh_key in float_dict: +# weight_dict[key] = {} +# weight_dict[key]["float"] = float_dict[float_weight_ih_key] +# weight_dict[key]["quantized"] = ( +# quantized_dict[key].__getstate__()[0][4][0].__getstate__()[0][0] +# ) +# weight_dict[key]["float"] = float_dict[float_weight_hh_key] +# weight_dict[key]["quantized"] = ( +# quantized_dict[key].__getstate__()[0][4][1].__getstate__()[0][0] +# ) + +# return weight_dict @strategy_registry @@ -657,46 +657,47 @@ def next_tune_cfg(self): # print(n) # for n, p in self._fp32_model.named_parameters(): # print(n) - orig_eval = True - if self._fp32_model.training: - orig_eval = False - self._fp32_model.eval() - import copy - ht = HessianTrace(self._fp32_model, self.calib_dataloader,self.q_model) - q_model_state_dict = {} - for key in self.q_model.state_dict().keys(): - length = len("_model.") - new_key = key[length:] - q_model_state_dict[new_key] = self.q_model.state_dict()[key] - weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) - pertur_lst = {} - for key in weight_quant_loss: - op_float_tensor = weight_quant_loss[key]['float'] - op_qnt_tensor = weight_quant_loss[key]['quantized'].dequantize() - diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 - pertur_lst[key] = diff_l2 - self.enable_act=False #enable activation trace and quantization loss analysis feature - traces = ht.get_avg_traces(self.enable_act) - op_to_traces = traces['weight'] - if self.enable_act: - 
act_to_traces=traces['activation'] - #TODO() optimize relationship of weights quantized loss and activation quantized loss, to find best conbine - #TODO() do double check why layer1's output is not 0 for activation quantized - for trace_i, pertur_i,act_i in zip(op_to_traces.keys(),pertur_lst.keys(),act_to_traces.keys()): - op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i]+act_to_traces[act_i] #Formula:Omig=Trace*L2+act_trace - else: - for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): - op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 - if orig_eval == False: - self._fp32_model.train() + # orig_eval = True + # if self._fp32_model.training: + # orig_eval = False + # self._fp32_model.eval() + # ht = HessianTrace(self._fp32_model, self.calib_dataloader,self.q_model) + # q_model_state_dict = {} + # for key in self.q_model.state_dict().keys(): + # length = len("_model.") + # new_key = key[length:] + # q_model_state_dict[new_key] = self.q_model.state_dict()[key] + # weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) + # pertur_lst = {} + # for key in weight_quant_loss: + # op_float_tensor = weight_quant_loss[key]['float'] + # op_qnt_tensor = weight_quant_loss[key]['quantized'].dequantize() + # diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 + # pertur_lst[key] = diff_l2 + # self.enable_act=False #enable activation trace and quantization loss analysis feature + # traces = ht.get_avg_traces(self.enable_act) + # op_to_traces = traces['weight'] + # if self.enable_act: + # act_to_traces=traces['activation'] + # #TODO() optimize relationship of weights quantized loss and activation quantized loss, to find best conbine + # #TODO() do double check why layer1's output is not 0 for activation quantized + # for trace_i, pertur_i,act_i in zip(op_to_traces.keys(),pertur_lst.keys(),act_to_traces.keys()): + # op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i]+act_to_traces[act_i] #Formula:Omig=Trace*L2+act_trace + # else: + # for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): + # op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 + # if orig_eval == False: + # self._fp32_model.train() # End compute the hessian trace # # TODO uncomment it when algo ready. 
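The block commented out above keeps the scoring rule that the adaptor-side code reuses later: per the inline comments, each op is scored as Omega = Trace * L2, where L2 = ||Q(w) - w||^2, with an extra activation term when activation analysis is enabled. The snippet below is a minimal standalone restatement of only that scoring step; the op names, trace values, and weight tensors are invented for illustration and are not values produced by this patch.

import torch

def hawq_score(op_to_traces, weight_l2_gap, act_gap=None):
    # Sketch: Omega_op = trace_op * ||Q(w_op) - w_op||^2 (+ activation gap when enabled).
    scores = {}
    for op, trace in op_to_traces.items():
        score = trace * weight_l2_gap[op]
        if act_gap is not None:
            score = score + act_gap[op]
        scores[op] = score
    return scores

# Invented example: fp32 weights vs. a slightly perturbed stand-in for dequantized weights.
w_fp32 = {"conv1": torch.randn(8), "fc": torch.randn(8)}
w_deq = {k: v + 0.01 * torch.randn(8) for k, v in w_fp32.items()}
l2_gap = {k: torch.norm(w_fp32[k] - w_deq[k], p=2) ** 2 for k in w_fp32}   # ||Q(w) - w||^2
print(hawq_score({"conv1": 0.9, "fc": 0.1}, l2_gap))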
- # op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, - # dataloader = self.calib_dataloader, - # q_model = self.q_model, - # criterion = torch.nn.CrossEntropyLoss(), # TODO replace it with user specify loss - # enable_act = False) + criterion=torch.nn.CrossEntropyLoss()# TODO replace it with user specify loss + op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, + dataloader = self.calib_dataloader, + q_model = self.q_model, + criterion =criterion, + enable_act = False) + # op_to_traces = self.adaptor.calculate_hessian_trace() ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], reverse=self.higher_is_better) From eb05a1fb3a9362aa6a43cbed38811fa5d33402be Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 6 Dec 2022 15:44:15 +0800 Subject: [PATCH 095/128] add strategy kwargs for new api Signed-off-by: yiliu30 --- neural_compressor/conf/config.py | 9 +++++++++ neural_compressor/config.py | 25 +++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index 640b2fd36df..59b421f1e37 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -1346,8 +1346,17 @@ def map_pyconfig_to_cfg(self, pythonic_config): 'tuning.exit_policy.max_trials': pythonic_config.quantization.max_trials, 'tuning.exit_policy.performance_only': pythonic_config.quantization.performance_only, 'use_bf16': pythonic_config.quantization.use_bf16, + 'quantization.optimization_level': pythonic_config.quantization.optimization_level, 'reduce_range': pythonic_config.quantization.reduce_range }) + if pythonic_config.quantization.strategy_kwargs: + st_kwargs = pythonic_config.quantization.strategy_kwargs + for st_key in ['sigopt_api_token', 'sigopt_experiment_name', 'accuracy_weight', 'latency_weight']: + if st_key in st_kwargs: + st_val = st_kwargs[st_key] + print(st_key) + mapping.update({'tuning.strategy.' 
+ st_key: st_val}) + if pythonic_config.distillation is not None: mapping.update({ 'distillation.train.criterion': pythonic_config.distillation.criterion, diff --git a/neural_compressor/config.py b/neural_compressor/config.py index f7337b546f0..0f6adf34c23 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -307,6 +307,7 @@ def __init__(self, op_type_list=None, op_name_list=None, strategy="basic", + strategy_kwargs=None, objective="performance", timeout=0, max_trials=100, @@ -321,6 +322,7 @@ def __init__(self, self._op_type_list = op_type_list self._op_name_list = op_name_list self._strategy = strategy + self._strategy_kwargs = strategy_kwargs self._objective = objective self._timeout = timeout self._max_trials = max_trials @@ -402,6 +404,14 @@ def strategy(self, strategy): ['basic', 'mse', 'bayesian', 'random', 'exhaustive']): self._strategy = strategy + @property + def strategy_kwargs(self): + return self._strategy_kwargs + + @strategy_kwargs.setter + def strategy_kwargs(self, strategy_kwargs): + self._strategy_kwargs = strategy_kwargs + @property def op_name_list(self): return self._op_name_list @@ -480,11 +490,12 @@ def inputs(self, inputs): class TuningCriterion: - def __init__(self, strategy="basic", timeout=0, max_trials=100, objective="performance"): + def __init__(self, strategy="basic", strategy_kwargs=None, timeout=0, max_trials=100, objective="performance"): self._strategy = strategy self._timeout = timeout self._max_trials = max_trials self._objective = objective + self._strategy_kwargs = strategy_kwargs @property def max_trials(self): @@ -523,7 +534,14 @@ def strategy(self, strategy): if check_value('strategy', strategy, str, ['basic', 'mse', 'bayesian', 'random', 'exhaustive']): self._strategy = strategy - + + @property + def strategy_kwargs(self): + return self._strategy_kwargs + + @strategy_kwargs.setter + def strategy_kwargs(self, strategy_kwargs): + self._strategy_kwargs = strategy_kwargs tuning_criterion = TuningCriterion() @@ -540,6 +558,7 @@ def __init__(self, op_name_list=None, reduce_range=None, extra_precisions = ["bf16"], + optimization_level=1, tuning_criterion=tuning_criterion, accuracy_criterion=accuracy_criterion, ): @@ -551,6 +570,7 @@ def __init__(self, op_type_list=op_type_list, op_name_list=op_name_list, strategy=tuning_criterion.strategy, + strategy_kwargs=tuning_criterion.strategy_kwargs, objective=tuning_criterion.objective, timeout=tuning_criterion.timeout, max_trials=tuning_criterion.max_trials, @@ -558,6 +578,7 @@ def __init__(self, extra_precisions=extra_precisions, accuracy_criterion=accuracy_criterion) self.approach = approach + self.optimization_level = optimization_level @property def approach(self): From 0afc168ed9a1de1d85bc00c921bb290d6b15477f Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 6 Dec 2022 15:47:26 +0800 Subject: [PATCH 096/128] fixed some bugs Signed-off-by: yiliu30 --- neural_compressor/adaptor/pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 44392313d80..c2535e4fdba 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -1094,7 +1094,7 @@ def is_fused_module(self, module): else: return False - def calculate_hessian_trace( + def calculate_hessian_trace(self, fp32_model, dataloader, q_model, From b154e0cfe091d869f4dfb1cfdc8d04b79baa1b50 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 6 Dec 2022 16:28:27 +0800 Subject: [PATCH 097/128] add uts 
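The new tests exercise the strategy_kwargs plumbing added in PATCH 095: map_pyconfig_to_cfg in neural_compressor/conf/config.py flattens each recognized key into a 'tuning.strategy.<key>' entry of the internal config. A rough standalone sketch of that flattening is shown below; the token and project values are placeholders taken from the tests, and the real code additionally guards on strategy_kwargs being set on the quantization config.

# Sketch of the expected flattening (placeholder values, not a real SigOpt token).
strategy_kwargs = {
    "sigopt_api_token": "sigopt_api_token_test",
    "sigopt_project_id": "sigopt_project_id_test",
    "sigopt_experiment_name": "nc-tune",
}
mapping = {}
for st_key in ("sigopt_api_token", "sigopt_project_id", "sigopt_experiment_name",
               "accuracy_weight", "latency_weight"):
    if st_key in strategy_kwargs:
        mapping["tuning.strategy." + st_key] = strategy_kwargs[st_key]
print(mapping)   # e.g. {'tuning.strategy.sigopt_api_token': 'sigopt_api_token_test', ...}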
Signed-off-by: yiliu30 --- neural_compressor/conf/config.py | 3 ++- neural_compressor/config.py | 4 ++-- test/strategy/test_basic.py | 22 +++++++++++++++++++++- test/strategy/test_sigopt.py | 26 +++++++++++++++++++++++++- 4 files changed, 50 insertions(+), 5 deletions(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index 59b421f1e37..79f50237051 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -1351,7 +1351,8 @@ def map_pyconfig_to_cfg(self, pythonic_config): }) if pythonic_config.quantization.strategy_kwargs: st_kwargs = pythonic_config.quantization.strategy_kwargs - for st_key in ['sigopt_api_token', 'sigopt_experiment_name', 'accuracy_weight', 'latency_weight']: + for st_key in ['sigopt_api_token', 'sigopt_project_id', 'sigopt_experiment_name', \ + 'accuracy_weight', 'latency_weight']: if st_key in st_kwargs: st_val = st_kwargs[st_key] print(st_key) diff --git a/neural_compressor/config.py b/neural_compressor/config.py index 0f6adf34c23..4accfce4bd0 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -401,7 +401,7 @@ def strategy(self): @strategy.setter def strategy(self, strategy): if check_value('strategy', strategy, str, - ['basic', 'mse', 'bayesian', 'random', 'exhaustive']): + ['basic', 'mse', 'bayesian', 'random', 'exhaustive', 'sigopt', 'tpe']): self._strategy = strategy @property @@ -532,7 +532,7 @@ def strategy(self): @strategy.setter def strategy(self, strategy): if check_value('strategy', strategy, str, - ['basic', 'mse', 'bayesian', 'random', 'exhaustive']): + ['basic', 'mse', 'bayesian', 'random', 'exhaustive', 'sigopt', 'tpe']): self._strategy = strategy @property diff --git a/test/strategy/test_basic.py b/test/strategy/test_basic.py index 845e9b0ccae..239f26a071a 100644 --- a/test/strategy/test_basic.py +++ b/test/strategy/test_basic.py @@ -155,7 +155,7 @@ def build_fake_model(): tf.import_graph_def(graph_def, name='') return graph -class TestQuantization(unittest.TestCase): +class TestBasicTuningStrategy(unittest.TestCase): @classmethod def setUpClass(self): @@ -217,6 +217,26 @@ def test_run_basic_max_trials_multimetric_weight(self): quantizer.model = self.constant_graph quantizer.fit() + + def test_run_basic_one_trial_new_api(self): + from neural_compressor.quantization import fit + from neural_compressor.config import AccuracyCriterion, AccuracyLoss, PostTrainingQuantConfig, TuningCriterion + from neural_compressor.experimental.common import DataLoader + from neural_compressor.experimental.data.datasets.dummy_dataset import DummyDataset + + # dataset and dataloader + dataset = DummyDataset(shape=(100, 3, 3, 1), label=True) + dataloader = DataLoader(dataset) + + # tuning and accuracy criterion + tolerable_loss = AccuracyLoss(0.01) + accuracy_criterion = AccuracyCriterion(criterion='relative', tolerable_loss=tolerable_loss) + tuning_criterion = TuningCriterion(strategy='basic') + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion) + q_model = fit(model=self.constant_graph, conf=conf, calib_dataloader= dataloader, eval_dataloader=dataloader) + self.assertIsNotNone(q_model) if __name__ == "__main__": unittest.main() diff --git a/test/strategy/test_sigopt.py b/test/strategy/test_sigopt.py index ce7a7669862..062c96b638c 100644 --- a/test/strategy/test_sigopt.py +++ b/test/strategy/test_sigopt.py @@ -104,7 +104,7 @@ def build_fake_model(): return graph @unittest.skipIf(CONDITION , 
"missing the env variables 'SIGOPT_API_TOKEN' or 'SIGOPT_PROJECT_ID'") -class TestQuantization(unittest.TestCase): +class TestSigoptTuningStrategy(unittest.TestCase): @classmethod def setUpClass(self): @@ -140,6 +140,30 @@ def test_run_basic_max_trials(self): quantizer.eval_dataloader = common.DataLoader(dataset) quantizer.model = self.constant_graph quantizer.fit() + + def test_run_sigopt_one_trial_new_api(self): + from neural_compressor.quantization import fit + from neural_compressor.config import AccuracyCriterion, AccuracyLoss, PostTrainingQuantConfig, TuningCriterion + from neural_compressor.experimental.common import DataLoader + from neural_compressor.experimental.data.datasets.dummy_dataset import DummyDataset + + # dataset and dataloader + dataset = DummyDataset(shape=(100, 3, 3, 1), label=True) + dataloader = DataLoader(dataset) + + # tuning and accuracy criterion + tolerable_loss = AccuracyLoss(0.01) + accuracy_criterion = AccuracyCriterion(criterion='relative', tolerable_loss=tolerable_loss) + strategy_kwargs = {'sigopt_api_token': 'sigopt_api_token_test', + 'sigopt_project_id': 'sigopt_project_id_test', + 'sigopt_experiment_name': 'nc-tune'} + tuning_criterion = TuningCriterion(strategy='sigopt', strategy_kwargs=strategy_kwargs, max_trials=3) + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion) + q_model = fit(model=self.constant_graph, conf=conf, calib_dataloader= dataloader, eval_dataloader=dataloader) + self.assertIsNotNone(q_model) + if __name__ == "__main__": unittest.main() From 1f5c859f5e933395a050143c385a95ae1143a7e9 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 6 Dec 2022 16:42:40 +0800 Subject: [PATCH 098/128] remove the line for debug Signed-off-by: yiliu30 --- neural_compressor/conf/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index 79f50237051..8b227697086 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -1355,7 +1355,6 @@ def map_pyconfig_to_cfg(self, pythonic_config): 'accuracy_weight', 'latency_weight']: if st_key in st_kwargs: st_val = st_kwargs[st_key] - print(st_key) mapping.update({'tuning.strategy.' + st_key: st_val}) if pythonic_config.distillation is not None: From fe03b257c38b0b902333e2271056873e415f9c19 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA Date: Tue, 6 Dec 2022 16:57:38 +0800 Subject: [PATCH 099/128] delete some unused code --- neural_compressor/strategy/hawq.py | 561 +---------------------------- 1 file changed, 2 insertions(+), 559 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index c1ce91e0ca0..4ba5ed7db8f 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -34,523 +34,6 @@ import logging logger = logging.getLogger(__name__) from typing import Dict, List, Optional, Any, Union, Callable, Set -# # Define Collector based on hook, which is used to record the intermediate result -# class Node_collector: -# def __init__(self, m): -# self.handle = m.register_forward_hook(self.hook_fn_act) -# def hook_fn_act(self, m, inp, outp): -# self.out_features = outp.clone() -# self.in_features = inp -# self.m = m -# def remove(self): -# self.handle.remove() -# class HessianTrace: -# """ -# please refer to -# Yao, Zhewei, et al. "Pyhessian: Neural networks through the lens of the hessian." 2020 IEEE international conference on big data (Big data). 
IEEE, 2020. -# Dong, Zhen, et al. "Hawq-v2: Hessian aware trace-weighted quantization of neural networks." Advances in neural information processing systems 33 (2020): 18518-18529. -# https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/hessian_trace.py -# """ - -# def __init__(self, model, dataloader,q_model,criterion=None): -# self.unfused_model = model.model -# self.q_model=q_model -# tmp_model=model.model -# if 'graph' in (str(dir(tmp_model))): #check the attribute and it's length -# logger.info("This is aready fused model") -# self.model=model.model -# else: -# logger.info("fusing model") -# self.model = fuse_fx(model.model) ##TODO need to check whether model has been already fused -# self.dataloader = dataloader -# self.max_iter = 500 -# self.tolerance = 1e-5 -# self.eps = 1e-6 -# self.index = 0 -# self.device = self.get_device(self.model) -# self.criterion = criterion -# if self.criterion == None: -# self.criterion = torch.nn.CrossEntropyLoss().to(self.device) ##TODO need to set in config -# self.criterion = self.criterion.to(self.device) -# self.weight_to_op, self.op_list = self.get_fused_mapping() -# self.get_params() - -# def is_fused_module(self, module): -# """This is a helper function for `_propagate_qconfig_helper` to detecte -# if this module is fused. -# Args: -# module (object): input module -# Returns: -# (bool): is fused or not -# """ -# op_type = str(type(module)) -# if 'fused' in op_type: -# return True -# else: -# return False - -# def mapping_module_to_op(self, name): -# # length = len("_model.") -# # if len(name) < length: -# # return name -# # else: -# return name -# def mse_metric_gap(self,fp32_tensor, dequantize_tensor): -# """Calculate the euclidean distance between fp32 tensor and int8 dequantize tensor -# Args: -# fp32_tensor (tensor): The FP32 tensor. -# dequantize_tensor (tensor): The INT8 dequantize tensor. -# """ -# fp32_max = np.max(fp32_tensor) -# fp32_min = np.min(fp32_tensor) -# dequantize_max = np.max(dequantize_tensor) -# dequantize_min = np.min(dequantize_tensor) -# fp32_tensor = (fp32_tensor - fp32_min) / (fp32_max - fp32_min) -# dequantize_tensor = (dequantize_tensor - dequantize_min) / \ -# (dequantize_max - dequantize_min) -# diff_tensor = fp32_tensor - dequantize_tensor -# euclidean_dist = np.sum(diff_tensor ** 2) -# return euclidean_dist / fp32_tensor.size -# def get_fused_mapping(self): -# model = self.model -# weights_info = dict(model.named_parameters()) -# weight_to_op = {} -# for op_name, child in model.named_modules(): -# if self.is_fused_module(child): -# for name, _ in child.named_children(): -# if op_name + "." + name + ".weight" in weights_info: ##TODO check if this is right - -# weight_to_op[op_name + "." 
+ name + ".weight"] = self.mapping_module_to_op(op_name) -# break -# else: -# name = op_name + ".weight" -# if name in weights_info and name not in weight_to_op.keys(): -# weight_to_op[op_name + ".weight"] = op_name -# op_list = [] -# for key in weight_to_op.keys(): -# op_list.append(weight_to_op[key]) -# return weight_to_op, op_list - -# def get_device(self, model: torch.nn.Module): -# for n, p in model.named_parameters(): -# return p.data.device - -# def _get_act_grad_hook(self, name): -# def act_grad_hook(model, grad_input, grad_output): -# ##print(name, grad_input[0].shape, grad_output[0].shape) -# if type(model) == torch.nn.Linear: ##TODO very tricky -# self.layer_acts_grads[name] = grad_input[1] -# else: -# self.layer_acts_grads[name] = grad_input[0] - -# return act_grad_hook - -# def _get_enable_act_grad_hook(self, name): -# def enable_act_grad_hook(model, inputs, outputs): -# input = inputs[0] -# if input.requires_grad is False: -# input.requires_grad = True -# self.layer_acts[name] = input - -# return enable_act_grad_hook - -# # def _get_disable_input_grad_hook(self, name): -# # def disable_input_grad_hook(model, inputs, outputs): -# # try: -# # input = inputs[0] ##TODO check whether this is right -# # except: -# # input = inputs -# # if input.is_leaf == False:## you can only change requires_grad flags of leaf variables -# # if input.requires_grad is True: -# # input.requires_grad = False -# # -# # -# # return disable_input_grad_hook - -# def _unregister_hook(self): -# for handel in self.hook_handles: -# handel.remove() - -# def register_act_grad_hooks(self, model): -# for name, module in model.named_modules(): -# if self.mapping_module_to_op(name) in self.op_list: -# hook_handle = module.register_forward_hook(self._get_enable_act_grad_hook(name)) -# self.hook_handles.append(hook_handle) -# hook_handle = module.register_backward_hook(self._get_act_grad_hook(name)) -# self.hook_handles.append(hook_handle) - -# def reset_act_gradient_and_hooks(self): -# # tmp_input = torch.zeros(self._input_shape, device=self.device) -# # for name, module in self.model.named_modules(): -# # if name in self.op_list: -# # hook_handle = module.register_forward_hook(self._get_disable_input_grad_hook(name)) -# # self.hook_handles.append(hook_handle) -# # self.model(tmp_input) -# self._unregister_hook() - -# def get_params(self): -# weight_names = [n for n, p in self.model.named_parameters() if -# p.requires_grad and "bias" not in n] ##remove bias -# params = [p for n, p in self.model.named_parameters() if p.requires_grad and "bias" not in n] ##remove bias -# self.weight_names = weight_names -# self.params = params - -# def forward_backward(self, model, data, create_graph=False, return_w_grad=True): -# model.zero_grad() -# input = data[0].to(self.device) -# ##self._input_shape = input.shape ## for resetting input activation -# target = data[1].to(self.device) -# input.requires_grad = True -# output = model(input) -# loss = self.criterion(output, target) -# torch.autograd.backward(loss, create_graph=create_graph) -# ##loss.backward(create_graph=create_graph) -# if return_w_grad: -# gradients = [] -# for n, p in self.model.named_parameters(): -# if p.grad != None and n in self.weight_names: -# gradient = p.grad -# gradients.append(gradient + 0.0) ## add 0 to create a copy -# model.zero_grad() -# return gradients -# else: -# model.zero_grad() - -# # def get_params(self, model): -# # parameters = [p for p in model.parameters() if p.requires_grad] -# # return parameters - -# def sample_rademacher(self, 
params): -# samples = [] -# for param in params: -# r = torch.randint_like(param, high=2, device=self.device) -# r.masked_fill_(r == 0, -1) -# samples.append(r) -# return samples - -# def get_vtHv_weight(self, params, num_samples): -# v = self.sample_rademacher(params) -# H_v = [0] * len(v) -# cnt = 0 -# for step, data in enumerate(self.dataloader): -# batch_size = data[0].shape[0] -# cnt += batch_size -# gradients = self.forward_backward(self.model, data, create_graph=True) -# H_v_one = torch.autograd.grad(gradients, params, v, only_inputs=True, retain_graph=False) -# H_v = [pre + cur * float(batch_size) for cur, pre in zip(H_v_one, H_v)] -# if cnt >= num_samples: -# break -# if cnt > 0: -# H_v = [item / cnt for item in H_v] -# v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better -# return v_t_H_v - -# def get_vtHv_act(self, params, num_samples): -# v = self.sample_rademacher(params) -# H_v = [0] * len(v) -# cnt = 0 -# for step, data in enumerate(self.dataloader): -# if cnt >= num_samples: -# break -# for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 -# input = data[0][i:i + 1] -# target = data[1][i:i + 1] - -# self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) -# layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] -# layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] -# hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) -# cnt += 1 -# if cnt >= num_samples: -# break - -# def get_weight_traces(self, num_samples): -# layer_traces_per_iter = [] -# prev_avg_model_trace = 0 -# for iter in range(self.max_iter): -# layer_traces = self.get_vtHv_weight(self.params, num_samples) -# layer_traces_per_iter.append(layer_traces) -# layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) -# model_trace = torch.sum(layer_traces_estimate) -# diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) -# if diff_ratio < self.tolerance and iter > 10: ##TODO magic number -# break -# if iter == 50: ##TODO for debug -# break -# prev_avg_model_trace = model_trace -# weight_name_to_traces = {} -# layer_traces = layer_traces_estimate -# for weight_name, trace in zip(self.weight_names, layer_traces): -# weight_name_to_traces[weight_name] = trace -# op_name_to_trace = {} -# for weight_name in self.weight_names: -# op_name = self.weight_to_op[weight_name] -# op_name_to_trace[op_name] = weight_name_to_traces[weight_name] -# return op_name_to_trace -# def get_act_traces(self, num_samples): -# unfused_training = self.unfused_model.training -# self.unfused_model.eval() -# self.hook_handles = [] -# self.layer_acts = {} -# self.layer_acts_grads = {} -# self.register_act_grad_hooks(self.unfused_model) -# cnt = 0 -# act_traces_per_sample = [] -# for step, data in enumerate(self.dataloader): -# if cnt >= num_samples: -# break -# bs = data[0].shape[0] -# act_traces_sum = 0 -# act_traces_per_iter = [] -# prev_avg_model_trace = 0 -# act_traces_sums = None -# for i in range(bs): ##force the bs to be one -# input = data[0][i:i + 1] -# target = data[1][i:i + 1] -# self.forward_backward(self.unfused_model, (input, target), create_graph=True, return_w_grad=False) -# acts = [self.layer_acts[key] for key in self.layer_acts.keys()] -# if act_traces_sums == None: -# act_traces_sums = [0] * len(acts) -# acts_grad = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] ##same 
order with acts -# vt_H_v_sum_per_act = [0] * len(acts) - -# prev_model_act_trace = 0 -# for iter in range(self.max_iter): -# v = self.sample_rademacher(acts) -# H_v = torch.autograd.grad(acts_grad, acts, v, only_inputs=True, retain_graph=True) -# vt_H_v = [torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)] - -# vt_H_v_sum_per_act = [vt_H_v_sum_per_act[index] + vt_H_v[index] for index, item in -# enumerate(vt_H_v_sum_per_act)] -# vt_H_v_mean_per_act = [item / (iter + 1) for item in vt_H_v_sum_per_act] -# current_model_act_trace = torch.mean(torch.stack(vt_H_v_mean_per_act)) - -# diff_ratio = abs(current_model_act_trace - prev_model_act_trace) / ( -# prev_model_act_trace + self.eps) -# if diff_ratio < self.tolerance and iter > 10: ##TODO magic number -# break -# if iter == 50: ##TODO for debug -# break - -# prev_model_act_trace = current_model_act_trace -# act_traces_per_sample.append(vt_H_v_mean_per_act) -# cnt += 1 -# if cnt >= num_samples: -# break - -# if unfused_training: -# self.unfused_model.train() -# self.reset_act_gradient_and_hooks() ##TODO have issues to reset the input grad to False -# act_traces_stack = torch.stack([torch.stack(item) for item in act_traces_per_sample]) -# act_traces = torch.mean(act_traces_stack, dim=0) -# res_dict = {} -# for index, key in enumerate(self.layer_acts.keys()): -# res_dict[key] = act_traces[index] - -# self.layer_acts = [] -# self.layer_acts_grads = [] -# return res_dict -# def insert_hook(self, model, target_module_list): -# intern_outputs = [] -# for layer,module in model.named_modules(): -# for target_module in target_module_list: -# # print("layer:",layer) -# # print("target_model:",target_module) -# if layer == target_module: -# logging.debug("Collect: %s" % (module)) -# # print("Collect: %s" % (module)) -# intern_outputs.append(Node_collector(module)) - -# logging.info("Total %d hook inserted" % (len(intern_outputs))) -# # print("Total %d hook inserted" % (len(intern_outputs))) -# return model, intern_outputs -# def insert_hook_quantize(self,model, target_module_list): -# intern_outputs = [] -# for layer,module in model.named_modules(): -# for target_module in target_module_list: -# # print("layer:",layer) -# length = len("_model.") -# new_key = layer[length:] -# # print("target_model:",target_module) -# if new_key == target_module: -# logging.debug("Collect: %s" % (module)) -# # print("Collect: %s" % (module)) -# intern_outputs.append(Node_collector(module)) -# logging.info("Total %d hook inserted" % (len(intern_outputs))) -# # print("Total %d hook inserted" % (len(intern_outputs))) -# return model, intern_outputs -# def get_act_gap(self,fp32_model,q_model): -# """ -# Estimates each activation gap between quantized model and float model -# """ -# self.handle_acts=[] -# fp32_model.eval() -# # temp_model = fuse_fx(fp32_model.model) -# temp_model=fp32_model -# # target_module_list = [nn.ReLU] # Insert hook for FP32 model -# target_module_list = self.op_list -# temp_model, intern_outputs =self.insert_hook(temp_model, target_module_list) -# # intern_outputs={} -# for input, target in self.dataloader: -# temp_model(input) -# break - -# fp32_act_out={} -# for i, intern_output in enumerate(intern_outputs): -# stat_features = intern_output.out_features.view(-1) -# # print ("No.", i, " ", intern_output.out_features.shape) -# # print ("Numpy No.", i, " ", intern_output.out_features.cpu().data.numpy().shape) -# # print ("No.", i, " ", stat_features.cpu().data.numpy().shape) -# # print ("Numpy No.", i, " ", stat_features.cpu().data.numpy()) -# 
fp32_act_out[target_module_list[i]]=stat_features.cpu().data.numpy() -# # break -# for i in intern_outputs: -# # print(i) -# i.remove() -# target_module_list = self.op_list -# q_model, intern_outputs=self.insert_hook_quantize(q_model, target_module_list) -# for input, target in self.dataloader: #only one sample -# q_model(input) -# break -# qnt_act_out={} -# intern_outputs={} -# for i, intern_output in enumerate(intern_outputs): -# stat_features = intern_output.out_features.view(-1) -# qnt_act_out[target_module_list[i]]=stat_features.dequantize().cpu().data.numpy() -# # break -# for i in intern_outputs: -# # print(i) -# i.remove() -# act_gap={} -# mse_gap={} -# for fp_i,int_i in zip(fp32_act_out,qnt_act_out): -# activation_qnt_error=fp32_act_out[fp_i]-qnt_act_out[int_i] -# mse_gap[fp_i]=self.mse_metric_gap(fp32_act_out[fp_i],qnt_act_out[int_i]) -# act_gap[fp_i]=np.sum(activation_qnt_error)/activation_qnt_error.size -# return act_gap,mse_gap -# def get_avg_traces(self, enable_act=True, num_samples=32): -# """ -# Estimates average hessian trace for each parameter -# """ -# assert num_samples > 0 -# traces = {} -# weight_traces = self.get_weight_traces(num_samples) -# traces['weight'] = weight_traces -# act_trace={} -# if enable_act: -# act_gap,mse_gap=self.get_act_gap(self.model,self.q_model) -# act_traces = self.get_act_traces(num_samples) -# for i,j in zip(act_traces,mse_gap): -# #currently use mse to analysis -# act_trace[i]=act_traces[i]+mse_gap[j] -# traces['activation'] = act_traces -# return traces - - -# ##copy from torch.quantization._numeric_suite -# def _find_match( -# str_list: Union[Dict[str, Any], List[str]], key_str: str, -# postfix: str, -# ) -> Optional[str]: -# split_str = key_str.split(".") -# if split_str[-1] == postfix: -# match_string = "".join(key_str.split(".")[0:-1]) -# for s2 in str_list: -# pattern1 = "".join(s2.split(".")[0:-1]) -# pattern2 = "".join(s2.split(".")[0:-2]) -# if match_string == pattern1: -# return s2 -# if match_string == pattern2: -# return s2 - -# # For matching "fc.weight" and "fc._packed_params._packed_params" -# if postfix == "_packed_params": -# match_string = "".join(key_str.split(".")[0:-2]) -# if len(match_string) == 0: -# return None -# for s2 in str_list: -# pattern1 = "".join(s2.split(".")[0:-1]) -# pattern2 = "".join(s2.split(".")[0:-2]) -# if match_string == pattern1: -# return s2 -# if match_string == pattern2: -# return s2 -# return None -# else: -# return None - - -# ##copy form torch.quantization._numeric_suite -# def compare_weights( -# float_dict: Dict[str, Any], quantized_dict: Dict[str, Any] -# ) -> Dict[str, Dict[str, torch.Tensor]]: -# r"""Compare the weights of the float module with its corresponding quantized -# module. Return a dict with key corresponding to module names and each entry being -# a dictionary with two keys 'float' and 'quantized', containing the float and -# quantized weights. This dict can be used to compare and compute the quantization -# error of the weights of float and quantized models. 
- -# Example usage:: - -# wt_compare_dict = compare_weights( -# float_model.state_dict(), qmodel.state_dict()) -# for key in wt_compare_dict: -# print( -# key, -# compute_error( -# wt_compare_dict[key]['float'], -# wt_compare_dict[key]['quantized'].dequantize() -# ) -# ) - -# Args: -# float_dict: state dict of the float model -# quantized_dict: state dict of the quantized model - -# Return: -# weight_dict: dict with key corresponding to module names and each entry being -# a dictionary with two keys 'float' and 'quantized', containing the float and -# quantized weights -# """ - -# weight_dict: Dict[str, Dict] = {} -# for key in quantized_dict: -# match_key = _find_match(float_dict, key, "weight") -# if match_key is not None: -# weight_dict[key] = {} -# weight_dict[key]["float"] = float_dict[match_key] -# weight_dict[key]["quantized"] = quantized_dict[key] -# continue - -# # For matching "fc.weight" and "fc._packed_params._packed_params" -# match_key = _find_match(float_dict, key, "_packed_params") -# if match_key is not None: -# weight_dict[match_key] = {} -# weight_dict[match_key]["float"] = float_dict[match_key] -# weight_dict[match_key]["quantized"] = quantized_dict[key][0] -# ##TODO:should consider more models in further work - -# # For LSTM -# split_str = key.split(".") -# if split_str[-1] == "param" and split_str[-3] == "_all_weight_values": -# layer = split_str[-2] -# module_name = ".".join(split_str[:-3]) -# float_weight_ih_key = module_name + ".weight_ih_l" + layer -# float_weight_hh_key = module_name + ".weight_hh_l" + layer -# if float_weight_ih_key in float_dict and float_weight_hh_key in float_dict: -# weight_dict[key] = {} -# weight_dict[key]["float"] = float_dict[float_weight_ih_key] -# weight_dict[key]["quantized"] = ( -# quantized_dict[key].__getstate__()[0][4][0].__getstate__()[0][0] -# ) -# weight_dict[key]["float"] = float_dict[float_weight_hh_key] -# weight_dict[key]["quantized"] = ( -# quantized_dict[key].__getstate__()[0][4][1].__getstate__()[0][0] -# ) - -# return weight_dict - - @strategy_registry class HawqTuneStrategy(TuneStrategy): """The basic tuning strategy which tunes the low precision model with below order. 
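The class deleted here estimates per-layer Hessian traces with Hutchinson's method, as the version kept under neural_compressor/adaptor/torch_utils/hawq_metric.py still does: draw Rademacher vectors v, form Hessian-vector products by differentiating the gradients a second time, and average v^T H v. The sketch below illustrates only that estimator on a toy quadratic loss with a single parameter tensor; it is not the project's API, and the deleted code additionally accumulates over calibration batches and uses a per-element mean rather than a sum.

import torch

def hutchinson_trace(loss, param, n_samples=16):
    # Estimate tr(H) of `loss` w.r.t. `param` as E_v[v^T H v] with Rademacher v.
    grad = torch.autograd.grad(loss, param, create_graph=True)[0]
    estimate = 0.0
    for _ in range(n_samples):
        v = torch.randint_like(param, high=2)
        v.masked_fill_(v == 0, -1)                  # Rademacher entries in {-1, +1}
        hv = torch.autograd.grad(grad, param, grad_outputs=v, retain_graph=True)[0]
        estimate += torch.sum(hv * v).item()        # v^T H v
    return estimate / n_samples

# Toy check: loss = w^T A w has Hessian 2A, so the estimate should be 2 * tr(A) = 12.0.
A = torch.diag(torch.tensor([1.0, 2.0, 3.0]))
w = torch.randn(3, requires_grad=True)
loss = w @ A @ w
print(hutchinson_trace(loss, w))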
@@ -640,12 +123,6 @@ def next_tune_cfg(self): yield op_tuning_cfg # Start compute the hessian trace - - # import torch.quantization._numeric_suite as ns - # self.model.eval() - # fused_model = fuse_fx(self.model.model) - # res = compare_weights(fused_model.state_dict(), self.q_model.state_dict()) - # Fallback the ops supported both static and dynamic from static to dynamic quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] @@ -657,47 +134,13 @@ def next_tune_cfg(self): # print(n) # for n, p in self._fp32_model.named_parameters(): # print(n) - # orig_eval = True - # if self._fp32_model.training: - # orig_eval = False - # self._fp32_model.eval() - # ht = HessianTrace(self._fp32_model, self.calib_dataloader,self.q_model) - # q_model_state_dict = {} - # for key in self.q_model.state_dict().keys(): - # length = len("_model.") - # new_key = key[length:] - # q_model_state_dict[new_key] = self.q_model.state_dict()[key] - # weight_quant_loss = compare_weights(ht.model.state_dict(), q_model_state_dict) - # pertur_lst = {} - # for key in weight_quant_loss: - # op_float_tensor = weight_quant_loss[key]['float'] - # op_qnt_tensor = weight_quant_loss[key]['quantized'].dequantize() - # diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) # Formula: L2=||Q(w)-w||p^2 - # pertur_lst[key] = diff_l2 - # self.enable_act=False #enable activation trace and quantization loss analysis feature - # traces = ht.get_avg_traces(self.enable_act) - # op_to_traces = traces['weight'] - # if self.enable_act: - # act_to_traces=traces['activation'] - # #TODO() optimize relationship of weights quantized loss and activation quantized loss, to find best conbine - # #TODO() do double check why layer1's output is not 0 for activation quantized - # for trace_i, pertur_i,act_i in zip(op_to_traces.keys(),pertur_lst.keys(),act_to_traces.keys()): - # op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i]+act_to_traces[act_i] #Formula:Omig=Trace*L2+act_trace - # else: - # for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): - # op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 - # if orig_eval == False: - # self._fp32_model.train() - - # End compute the hessian trace # # TODO uncomment it when algo ready. 
- criterion=torch.nn.CrossEntropyLoss()# TODO replace it with user specify loss + criterion=torch.nn.CrossEntropyLoss() op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, dataloader = self.calib_dataloader, q_model = self.q_model, - criterion =criterion, + criterion =criterion, # TODO replace it with user specify loss enable_act = False) - # op_to_traces = self.adaptor.calculate_hessian_trace() ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], reverse=self.higher_is_better) From be4f5a284fb540788dd0ecac4695d2b838bed547 Mon Sep 17 00:00:00 2001 From: BiaoFangAIA Date: Tue, 6 Dec 2022 16:59:29 +0800 Subject: [PATCH 100/128] enable model.eval() first --- neural_compressor/adaptor/torch_utils/hawq_metric.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/hawq_metric.py b/neural_compressor/adaptor/torch_utils/hawq_metric.py index 465b7f9ca88..2b6a7790eb0 100644 --- a/neural_compressor/adaptor/torch_utils/hawq_metric.py +++ b/neural_compressor/adaptor/torch_utils/hawq_metric.py @@ -541,15 +541,11 @@ def compare_weights( ) return weight_dict -# op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, - # dataloader = self.calib_dataloader, - # q_model = self.q_model, - # criterion = torch.nn.CrossEntropyLoss(), # TODO replace it with user specify loss - # enable_act = False) def hawq_top(fp32_model,q_model,dataloader,criterion,enable_act): orig_eval=True if fp32_model.training: orig_eval=False + fp32_model.eval() ht=HessianTrace(fp32_model,dataloader=dataloader,q_model=q_model) q_model_state_dict={} for key in q_model.state_dict().keys(): @@ -563,7 +559,7 @@ def hawq_top(fp32_model,q_model,dataloader,criterion,enable_act): op_qnt_tensor=weight_quant_loss[key]['quantized'].dequantize() diff_l2 = (torch.norm(op_float_tensor - op_qnt_tensor, p=2) ** 2) pertur_lst[key]=diff_l2 - traces=ht.get_act_traces(enable_act) + traces=ht.get_avg_traces(enable_act) op_to_traces=traces['weight'] if enable_act: act_to_traces=traces['activation'] From b0b697c2edab56ed4a978796385b5ccc43fd7da6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 6 Dec 2022 17:08:23 +0800 Subject: [PATCH 101/128] remove some useless lines Signed-off-by: yiliu30 --- neural_compressor/strategy/hawq.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq.py index 4ba5ed7db8f..c6fc912fd3b 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq.py @@ -123,29 +123,17 @@ def next_tune_cfg(self): yield op_tuning_cfg # Start compute the hessian trace - # Fallback the ops supported both static and dynamic from static to dynamic - quant_ops = quant_mode_wise_items['static'] if 'static' in quant_mode_wise_items else [] - quant_ops += quant_mode_wise_items['dynamic'] if 'dynamic' in quant_mode_wise_items else [] - - target_dtype = "int8" ##TODO support bf16 - target_type_lst = set(tuning_space.query_items_by_quant_mode(target_dtype)) - fp_op_list = [item.name for item in quant_ops if item in target_type_lst] - # for n, p in self._fp32_model.named_modules(): - # print(n) - # for n, p in self._fp32_model.named_parameters(): - # print(n) - # # TODO uncomment it when algo ready. 
+ target_dtype = "int8" # TODO support bf16 criterion=torch.nn.CrossEntropyLoss() op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, dataloader = self.calib_dataloader, q_model = self.q_model, - criterion =criterion, # TODO replace it with user specify loss + criterion =criterion, # TODO using user specify loss enable_act = False) ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], reverse=self.higher_is_better) # WA for add op type - # print("ordered_ops:",ordered_ops) op_info_map = {} for op_info in list(initial_op_tuning_cfg.keys()): op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) @@ -158,7 +146,7 @@ def next_tune_cfg(self): indx=indx+1 if indx>4: break - print(op_dtypes) + logger.info("hawq op_config:"+str(op_dtypes)) logger.info(f"Start to accumulate fallback to {target_dtype}.") initial_op_tuning_cfg = deepcopy(op_tuning_cfg) From 9633ebd08604573e4de80a59ded14bb8c006e7d2 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 6 Dec 2022 21:07:01 +0800 Subject: [PATCH 102/128] fixed some uts Signed-off-by: yiliu30 --- test/strategy/test_basic.py | 9 ++++----- test/strategy/test_sigopt.py | 7 +++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/test/strategy/test_basic.py b/test/strategy/test_basic.py index 239f26a071a..0a2812b5f79 100644 --- a/test/strategy/test_basic.py +++ b/test/strategy/test_basic.py @@ -221,12 +221,11 @@ def test_run_basic_max_trials_multimetric_weight(self): def test_run_basic_one_trial_new_api(self): from neural_compressor.quantization import fit from neural_compressor.config import AccuracyCriterion, AccuracyLoss, PostTrainingQuantConfig, TuningCriterion - from neural_compressor.experimental.common import DataLoader - from neural_compressor.experimental.data.datasets.dummy_dataset import DummyDataset - + from neural_compressor.data import DATASETS, DATALOADERS + # dataset and dataloader - dataset = DummyDataset(shape=(100, 3, 3, 1), label=True) - dataloader = DataLoader(dataset) + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) # tuning and accuracy criterion tolerable_loss = AccuracyLoss(0.01) diff --git a/test/strategy/test_sigopt.py b/test/strategy/test_sigopt.py index 062c96b638c..5d443e3dba2 100644 --- a/test/strategy/test_sigopt.py +++ b/test/strategy/test_sigopt.py @@ -144,12 +144,11 @@ def test_run_basic_max_trials(self): def test_run_sigopt_one_trial_new_api(self): from neural_compressor.quantization import fit from neural_compressor.config import AccuracyCriterion, AccuracyLoss, PostTrainingQuantConfig, TuningCriterion - from neural_compressor.experimental.common import DataLoader - from neural_compressor.experimental.data.datasets.dummy_dataset import DummyDataset + from neural_compressor.data import DATASETS, DATALOADERS # dataset and dataloader - dataset = DummyDataset(shape=(100, 3, 3, 1), label=True) - dataloader = DataLoader(dataset) + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) # tuning and accuracy criterion tolerable_loss = AccuracyLoss(0.01) From 09931956937c8e7b4b4b2951b21711f27ee0e1fc Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 6 Dec 2022 21:29:33 +0800 Subject: [PATCH 103/128] add optimization_level in BaseQuantizationConfig Signed-off-by: yiliu30 --- neural_compressor/config.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/neural_compressor/config.py b/neural_compressor/config.py 
index 4accfce4bd0..b3a9fd4352e 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -314,6 +314,7 @@ def __init__(self, performance_only=False, reduce_range=None, extra_precisions=["bf16"], + optimization_level=1, accuracy_criterion=accuracy_criterion): self._inputs = inputs self._outputs = outputs @@ -330,6 +331,7 @@ def __init__(self, self._reduce_range = reduce_range self._extra_precisions = extra_precisions \ if isinstance(extra_precisions, List) else [extra_precisions] + self._optimization_level = optimization_level self.use_bf16 = "bf16" in self._extra_precisions self._accuracy_criterion = accuracy_criterion self._calibration_sampling_size = calibration_sampling_size @@ -348,6 +350,14 @@ def extra_precisions(self, extra_precisions): self._extra_precisions = extra_precisions self._use_bf16 = "bf16" in extra_precisions + @property + def optimization_level(self): + return self._optimization_level + + @optimization_level.setter + def optimization_level(self, optimization_level): + self._optimization_level = optimization_level + @property def reduce_range(self): return self._reduce_range @@ -576,9 +586,9 @@ def __init__(self, max_trials=tuning_criterion.max_trials, reduce_range=reduce_range, extra_precisions=extra_precisions, + optimization_level=optimization_level, accuracy_criterion=accuracy_criterion) self.approach = approach - self.optimization_level = optimization_level @property def approach(self): @@ -599,10 +609,12 @@ def __init__(self, op_type_list=None, op_name_list=None, reduce_range=None, - extra_precisions=["bf16"]): + extra_precisions=["bf16"], + optimization_level=1): super().__init__(inputs=inputs, outputs=outputs, device=device, backend=backend, op_type_list=op_type_list, op_name_list=op_name_list, - reduce_range=reduce_range, extra_precisions=extra_precisions) + reduce_range=reduce_range, extra_precisions=extra_precisions, + optimization_level=optimization_level) self._approach = 'quant_aware_training' @property From 087bdc624c16293e7366909c68d131971ac24eac Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 6 Dec 2022 23:08:17 +0800 Subject: [PATCH 104/128] add optimization_level to conf and pythonic_conf Signed-off-by: yiliu30 --- neural_compressor/conf/config.py | 2 ++ neural_compressor/conf/pythonic_config.py | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index 8b227697086..f9039be5a5b 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -841,6 +841,7 @@ def percent_to_float(data): }, }, Optional('use_bf16', default=True): bool, + Optional('optimization_level', default=1): And(int, lambda level: level in [0, 1]), Optional('graph_optimization'): graph_optimization_schema, Optional('mixed_precision'): mixed_precision_schema, @@ -1111,6 +1112,7 @@ def percent_to_float(data): 'activation': {}}, }): dict, Optional('use_bf16', default=False): bool, + Optional('optimization_level', default=1): int, Optional('tuning', default={ 'strategy': {'name': 'basic'}, 'accuracy_criterion': {'relative': 0.01, 'higher_is_better': True}, diff --git a/neural_compressor/conf/pythonic_config.py b/neural_compressor/conf/pythonic_config.py index dbf1d3dc6aa..c9975a9ebc6 100644 --- a/neural_compressor/conf/pythonic_config.py +++ b/neural_compressor/conf/pythonic_config.py @@ -34,17 +34,33 @@ def __init__(self, op_type_list=None, op_name_list=None, strategy='basic', + strategy_kwargs=None, objective='performance', timeout=0, 
max_trials=100, performance_only=False, reduce_range=None, use_bf16=True, + optimization_level=1, accuracy_criterion=accuracy_criterion): extra_precisions = ["bf16"] if use_bf16 else [] - super().__init__(inputs, outputs, backend, device, calibration_sampling_size, op_type_list, - op_name_list, strategy, objective, timeout, max_trials, performance_only, - reduce_range, extra_precisions, accuracy_criterion) + super().__init__(inputs=inputs, + outputs=outputs, + backend=backend, + device=device, + calibration_sampling_size=calibration_sampling_size, + op_type_list=op_type_list, + op_name_list=op_name_list, + strategy=strategy, + strategy_kwargs=strategy_kwargs, + objective=objective, + timeout=timeout, + max_trials=max_trials, + performance_only=performance_only, + reduce_range=reduce_range, + extra_precisions=extra_precisions, + optimization_level=optimization_level, + accuracy_criterion=accuracy_criterion) self._approach = approach @property From 75bd44c59418d2e8ba6d7bc2778302362dcb33a1 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 7 Dec 2022 13:00:36 +0800 Subject: [PATCH 105/128] rename test filename Signed-off-by: yiliu30 --- test/strategy/{test_basic_fallback.py => test_hawq_v2.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/strategy/{test_basic_fallback.py => test_hawq_v2.py} (100%) diff --git a/test/strategy/test_basic_fallback.py b/test/strategy/test_hawq_v2.py similarity index 100% rename from test/strategy/test_basic_fallback.py rename to test/strategy/test_hawq_v2.py From 1cc224e109d9f6eb17615e974ae0ef73f432daa3 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 7 Dec 2022 13:41:47 +0800 Subject: [PATCH 106/128] remove some incorrect comments Signed-off-by: yiliu30 --- neural_compressor/config.py | 2 +- .../strategy/{hawq.py => hawq_v2.py} | 11 +--- test/strategy/test_hawq_v2_2.x.py | 56 +++++++++++++++++++ 3 files changed, 60 insertions(+), 9 deletions(-) rename neural_compressor/strategy/{hawq.py => hawq_v2.py} (95%) create mode 100644 test/strategy/test_hawq_v2_2.x.py diff --git a/neural_compressor/config.py b/neural_compressor/config.py index ff3f3aa1772..1e4ce97504d 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -542,7 +542,7 @@ def strategy(self): @strategy.setter def strategy(self, strategy): if check_value('strategy', strategy, str, - ['basic', 'mse', 'bayesian', 'random', 'exhaustive', 'sigopt', 'tpe']): + ['basic', 'mse', 'bayesian', 'random', 'exhaustive', 'sigopt', 'tpe', 'hawq_v2']): self._strategy = strategy @property diff --git a/neural_compressor/strategy/hawq.py b/neural_compressor/strategy/hawq_v2.py similarity index 95% rename from neural_compressor/strategy/hawq.py rename to neural_compressor/strategy/hawq_v2.py index c6fc912fd3b..31c4cfa1b30 100644 --- a/neural_compressor/strategy/hawq.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -35,13 +35,8 @@ logger = logging.getLogger(__name__) from typing import Dict, List, Optional, Any, Union, Callable, Set @strategy_registry -class HawqTuneStrategy(TuneStrategy): - """The basic tuning strategy which tunes the low precision model with below order. - - 1. modelwise tuning for all quantizable ops. - 2. fallback tuning from bottom to top to decide the priority of which op has biggest impact - on accuracy. - 3. incremental fallback tuning by fallbacking multiple ops with the order got from #2. +class HAWQ_V2TuneStrategy(TuneStrategy): + """The hawq v2 tuning strategy. Args: model (object): The FP32 model specified for low precision tuning. 
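Concretely, hawq_top above scores every op as Omega = Trace * L2, where L2 = ||Q(w) - w||_2^2 is the weight quantization perturbation (the activation trace is added when enable_act is set), and the strategy falls back the highest-scoring ops first. A small sketch of that scoring and ordering step, with made-up trace values and random tensors standing in for the real fp32 and dequantized weights:

    import torch

    # Hypothetical per-op traces and weights; the real values come from HessianTrace
    # and from compare_weights on the fp32/quantized state dicts.
    op_to_traces = {"conv1": 2.3, "fc": 0.7}
    float_w = {"conv1": torch.randn(8, 8), "fc": torch.randn(4, 4)}
    deq_w = {"conv1": torch.randn(8, 8), "fc": torch.randn(4, 4)}  # stands in for Q(w).dequantize()

    sensitivity = {}
    for op, trace in op_to_traces.items():
        l2 = (torch.norm(float_w[op] - deq_w[op], p=2) ** 2).item()  # L2 = ||Q(w) - w||_2^2
        sensitivity[op] = trace * l2                                 # Omega = Trace * L2

    # The most sensitive ops become the first fallback candidates.
    ordered_ops = sorted(sensitivity, key=sensitivity.get, reverse=True)
    print(ordered_ops)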
@@ -88,7 +83,7 @@ def eval_func(model): def __init__(self, model, conf, q_dataloader, q_func=None, eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None): super( - HawqTuneStrategy, + HAWQ_V2TuneStrategy, self).__init__( model, conf, diff --git a/test/strategy/test_hawq_v2_2.x.py b/test/strategy/test_hawq_v2_2.x.py new file mode 100644 index 00000000000..0ec055c26c3 --- /dev/null +++ b/test/strategy/test_hawq_v2_2.x.py @@ -0,0 +1,56 @@ +"""Tests for HAWQ v2 strategy""" + +import copy +import shutil +import unittest + +import numpy as np + +from neural_compressor.utils import logger + +class TestHAWQV2TuningStrategy(unittest.TestCase): + + @classmethod + def setUpClass(self): + import torchvision + self.model = torchvision.models.resnet18() + + @classmethod + def tearDownClass(self): + shutil.rmtree('saved', ignore_errors=True) + shutil.rmtree('nc_workspace', ignore_errors=True) + + + def test_hawq_v2_pipeline(self): + logger.info("*** Test: HAWQ v2 with pytorch model.") + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion + from neural_compressor.data import DATASETS, DATALOADERS + + # model + model = copy.deepcopy(self.model) + + # fake evaluation function + self.test_hawq_v2_pipeline_fake_acc = 0 + def _fake_eval(model): + self.test_hawq_v2_pipeline_fake_acc -= 1 + return self.test_hawq_v2_pipeline_fake_acc + + # dataset and dataloader + dataset = DATASETS("pytorch")["dummy"](((1, 3, 224, 224))) + dataloader = DATALOADERS["pytorch"](dataset) + + # tuning and accuracy criterion + tuning_criterion = TuningCriterion(strategy='hawq_v2', max_trials=5) + conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion) + + # fit + q_model = fit(model=model, + conf=conf, + calib_dataloader=dataloader, + eval_dataloader=dataloader, + eval_func=_fake_eval) + self.assertIsNone(q_model) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 8390d3afd0e2b564d24562e1bd5bc4c1b85d2637 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 7 Dec 2022 13:43:21 +0800 Subject: [PATCH 107/128] remove UTs based on old API(YAML) Signed-off-by: yiliu30 --- test/strategy/test_hawq_v2.py | 73 ----------------------------------- 1 file changed, 73 deletions(-) delete mode 100644 test/strategy/test_hawq_v2.py diff --git a/test/strategy/test_hawq_v2.py b/test/strategy/test_hawq_v2.py deleted file mode 100644 index fef994a4f1b..00000000000 --- a/test/strategy/test_hawq_v2.py +++ /dev/null @@ -1,73 +0,0 @@ -import torch -import unittest -import os -import sys -import copy -import torchvision -import torchvision.transforms as transforms -from torch.utils.data import DataLoader -from neural_compressor.data import DATASETS -from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader -from neural_compressor.adaptor.pytorch import TemplateAdaptor -from neural_compressor.adaptor import FRAMEWORKS -import shutil - - -def build_ptq_yaml(): - fake_yaml = ''' - model: - name: resnet18 - framework: pytorch_fx - tuning: - strategy: - name: hawq - accuracy_criterion: - absolute: -1 - exit_policy: - timeout: 0 - ''' - with open('ptq_yaml.yaml', 'w', encoding="utf-8") as f: - f.write(fake_yaml) - -class TestPytorchAdaptor(unittest.TestCase): - framework_specific_info = {"device": "cpu", - "approach": "post_training_static_quant", - "random_seed": 1234, - "q_dataloader": None, - "workspace_path": None} - framework = "pytorch" - adaptor = 
FRAMEWORKS[framework](framework_specific_info) - model = torchvision.models.resnet18() - - # model = torch.quantization.QuantWrapper(model) - - @classmethod - def setUpClass(self): - self.i = 0 - build_ptq_yaml() - - - @classmethod - def tearDownClass(self): - os.remove('ptq_yaml.yaml') - shutil.rmtree('./saved', ignore_errors=True) - shutil.rmtree('runs', ignore_errors=True) - - def test_basic_fallback(self): - def eval_func(model): - self.i -= 1 - return self.i - - from neural_compressor.experimental import Quantization, common - model = copy.deepcopy(self.model) - quantizer = Quantization('ptq_yaml.yaml') - quantizer.eval_func = eval_func - dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_dataloader = common.DataLoader(dataset) - quantizer.model = model - q_model = quantizer() - self.assertTrue(q_model is None) - -if __name__ == "__main__": - unittest.main() From 73c634f74bbc24c9e82c881e5a39837204be5589 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 7 Dec 2022 13:48:51 +0800 Subject: [PATCH 108/128] remove some unused code Signed-off-by: yiliu30 --- neural_compressor/strategy/hawq_v2.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/neural_compressor/strategy/hawq_v2.py b/neural_compressor/strategy/hawq_v2.py index 31c4cfa1b30..500f1727e69 100644 --- a/neural_compressor/strategy/hawq_v2.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -15,28 +15,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import numpy as np from collections import OrderedDict - -import torch.nn +from copy import deepcopy from .strategy import strategy_registry, TuneStrategy -from ..utils import logger from .st_utils.tuning_sampler import OpTypeWiseTuningSampler, FallbackTuningSampler, ModelWiseTuningSampler from .st_utils.tuning_structs import OpTuningConfig from .st_utils.tuning_space import TUNING_ITEMS_LST -from torch.quantization.quantize_fx import fuse_fx -import torch.nn.intrinsic.quantized as nniq -from torch.fx import symbolic_trace, graph_module -import torch.nn as nn -import logging -logger = logging.getLogger(__name__) -from typing import Dict, List, Optional, Any, Union, Callable, Set +from ..utils import logger + @strategy_registry class HAWQ_V2TuneStrategy(TuneStrategy): - """The hawq v2 tuning strategy. + """The HAWQ v2 tuning strategy. Args: model (object): The FP32 model specified for low precision tuning. 
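One note on the op-mapping workaround introduced a few patches below ('WA for mapping op'): the trace dictionary can be keyed by parameter or tensor names, while the tuning config is keyed by (op_name, op_type) pairs, so the strategy matches the two by name prefix. A toy sketch, with hypothetical names, of what that matching does:

    # Hypothetical trace keys (parameter names) versus tuning-config op infos.
    op_to_traces = {"layer1.0.conv1.weight": 1.8, "fc.weight": 0.4}
    tuning_cfg_ops = [("layer1.0.conv1", "Conv2d"), ("fc", "Linear")]

    op_name_to_trace = {}
    for op_name, _op_type in tuning_cfg_ops:
        for trace_key, trace in op_to_traces.items():
            if trace_key.startswith(op_name):
                # Later matches overwrite earlier ones, mirroring the prefix-based workaround.
                op_name_to_trace[op_name] = trace

    print(op_name_to_trace)  # {'layer1.0.conv1': 1.8, 'fc': 0.4}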
@@ -95,7 +86,8 @@ def __init__(self, model, conf, q_dataloader, q_func=None, q_hooks) def next_tune_cfg(self): - from copy import deepcopy + # TODO remove it before merge + import torch tuning_space = self.tuning_space calib_size = tuning_space.root_item.get_option_by_name('calib_sampling_size').options[0] ##TODO suppoprt list From 2aabc2c60efe546852d563850e6302ea3f53ad8d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 7 Dec 2022 13:50:59 +0800 Subject: [PATCH 109/128] add some comments Signed-off-by: yiliu30 --- neural_compressor/strategy/hawq_v2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/strategy/hawq_v2.py b/neural_compressor/strategy/hawq_v2.py index 500f1727e69..5ac62641d44 100644 --- a/neural_compressor/strategy/hawq_v2.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -110,7 +110,8 @@ def next_tune_cfg(self): yield op_tuning_cfg # Start compute the hessian trace - target_dtype = "int8" # TODO support bf16 + target_dtype = "int8" + # TODO remove it before merge criterion=torch.nn.CrossEntropyLoss() op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, dataloader = self.calib_dataloader, From 4e7a4a809eb763de9abae3b4da1569cb0ad3d243 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 7 Dec 2022 19:07:24 +0800 Subject: [PATCH 110/128] WA for mapping op Signed-off-by: yiliu30 --- neural_compressor/strategy/hawq_v2.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/neural_compressor/strategy/hawq_v2.py b/neural_compressor/strategy/hawq_v2.py index 5ac62641d44..43f300ef488 100644 --- a/neural_compressor/strategy/hawq_v2.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -108,8 +108,8 @@ def next_tune_cfg(self): break op_tuning_cfg['calib_sampling_size'] = calib_size yield op_tuning_cfg - # Start compute the hessian trace + logger.info(f"************** Start compute the hessian trace *****************") target_dtype = "int8" # TODO remove it before merge criterion=torch.nn.CrossEntropyLoss() @@ -118,15 +118,31 @@ def next_tune_cfg(self): q_model = self.q_model, criterion =criterion, # TODO using user specify loss enable_act = False) - ordered_ops = sorted(op_to_traces.keys(), - key=lambda key: op_to_traces[key], + sorted_op_to_traces = dict(sorted(op_to_traces.items(), key=lambda item: item[1], reverse=True)) + logger.info(f"************** Hessian Trace *****************") + for op_name, trace in sorted_op_to_traces.items(): + logger.info(f"*** op: {op_name}, hessian trace : {trace}") + logger.info(f"************************************************") + # WA for op mapping + ordered_ops_tmp = {} + for op_info in list(initial_op_tuning_cfg.keys()): + op_name, op_type = op_info + for op_trace_name in op_to_traces.keys(): + if isinstance(op_trace_name, str) and op_trace_name.startswith(op_name): + if op_name in ordered_ops_tmp: + logger.info((f"*** Already assigned the hessian trace to {op_name}", + f"update it with the value of {op_trace_name}")) + ordered_ops_tmp[op_name] = op_to_traces[op_trace_name] + + ordered_ops_tmp = sorted(ordered_ops_tmp.keys(), + key=lambda key: ordered_ops_tmp[key], reverse=self.higher_is_better) # WA for add op type op_info_map = {} for op_info in list(initial_op_tuning_cfg.keys()): op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) - tmp_ordered_ops = [op_info_map[op_name] for op_name in ordered_ops] - op_dtypes = OrderedDict(zip(tmp_ordered_ops, [target_dtype] * len(ordered_ops))) + tmp_ordered_ops = [op_info_map[op_name] for 
op_name in ordered_ops_tmp] + op_dtypes = OrderedDict(zip(tmp_ordered_ops, [target_dtype] * len(ordered_ops_tmp))) indx=0 #defautly fallback 5 ops for i in op_dtypes.keys(): From a3255bde465be9a2fd147b3bca22d3ef9ac7e848 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 7 Dec 2022 19:22:08 +0800 Subject: [PATCH 111/128] add efficientnet_b3_fx for test Signed-off-by: yiliu30 --- examples/.config/model_params_pytorch.json | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/.config/model_params_pytorch.json b/examples/.config/model_params_pytorch.json index 16e03ea3bbb..184fdcefd50 100644 --- a/examples/.config/model_params_pytorch.json +++ b/examples/.config/model_params_pytorch.json @@ -14,10 +14,19 @@ "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", "input_model": "", "yaml": "conf.yaml", - "strategy": "basic", + "strategy": "hawq_v2", "batch_size": 100, "new_benchmark": false }, + "efficientnet_b3_fx": { + "model_src_dir": "image_recognition/torchvision_models/quantization/ptq/cpu/fx/", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "", + "yaml": "conf.yaml", + "strategy": "hawq_v2", + "batch_size": 100, + "new_benchmark": false + }, "resnet18_fx": { "model_src_dir": "image_recognition/torchvision_models/quantization/ptq/cpu/fx/", "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", From 5a36c596cef158e57126cd7a4512d745d68aab3d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 8 Dec 2022 10:07:43 +0800 Subject: [PATCH 112/128] support for adding hawq_v2 loss by new API Signed-off-by: yiliu30 --- neural_compressor/conf/config.py | 5 +++-- neural_compressor/strategy/hawq_v2.py | 17 ++++++++--------- test/strategy/test_hawq_v2_2.x.py | 14 +++++++++----- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index a72a50e782a..1ce7b0c8c11 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -861,7 +861,8 @@ def percent_to_float(data): Optional('sigopt_project_id'): str, Optional('sigopt_experiment_name', default='nc-tune'): str, Optional('accuracy_weight', default=1.0): float, - Optional('latency_weight', default=1.0): float + Optional('latency_weight', default=1.0): float, + Optional('hawq_v2_loss', default=None): object, } , Hook('accuracy_criterion', handler=_valid_accuracy_field): object, Optional('accuracy_criterion', default={'relative': 0.01}): { @@ -1354,7 +1355,7 @@ def map_pyconfig_to_cfg(self, pythonic_config): if pythonic_config.quantization.strategy_kwargs: st_kwargs = pythonic_config.quantization.strategy_kwargs for st_key in ['sigopt_api_token', 'sigopt_project_id', 'sigopt_experiment_name', \ - 'accuracy_weight', 'latency_weight']: + 'accuracy_weight', 'latency_weight', 'hawq_v2_loss']: if st_key in st_kwargs: st_val = st_kwargs[st_key] mapping.update({'tuning.strategy.' 
+ st_key: st_val}) diff --git a/neural_compressor/strategy/hawq_v2.py b/neural_compressor/strategy/hawq_v2.py index 5ac62641d44..2d467787501 100644 --- a/neural_compressor/strategy/hawq_v2.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -86,10 +86,8 @@ def __init__(self, model, conf, q_dataloader, q_func=None, q_hooks) def next_tune_cfg(self): - # TODO remove it before merge - import torch tuning_space = self.tuning_space - calib_size = tuning_space.root_item.get_option_by_name('calib_sampling_size').options[0] ##TODO suppoprt list + calib_size = tuning_space.root_item.get_option_by_name('calib_sampling_size').options[0] # Initialize the tuning config for each op according to the quantization approach op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() @@ -111,12 +109,13 @@ def next_tune_cfg(self): # Start compute the hessian trace target_dtype = "int8" - # TODO remove it before merge - criterion=torch.nn.CrossEntropyLoss() - op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, - dataloader = self.calib_dataloader, - q_model = self.q_model, - criterion =criterion, # TODO using user specify loss + hawq_v2_criterion =self.cfg.tuning.strategy.hawq_v2_loss + assert hawq_v2_criterion is not None, "HAWQ-V2 strategy needs model loss function to compute the gradient, \ + Please assign it by strategy_kwargs({'hawq_v2_loss': hawq_v2_loss})." + op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, + dataloader = self.calib_dataloader, + q_model = self.q_model, + criterion =hawq_v2_criterion, enable_act = False) ordered_ops = sorted(op_to_traces.keys(), key=lambda key: op_to_traces[key], diff --git a/test/strategy/test_hawq_v2_2.x.py b/test/strategy/test_hawq_v2_2.x.py index 0ec055c26c3..8442154acb5 100644 --- a/test/strategy/test_hawq_v2_2.x.py +++ b/test/strategy/test_hawq_v2_2.x.py @@ -4,10 +4,13 @@ import shutil import unittest -import numpy as np - from neural_compressor.utils import logger +# loss function for hawq-v2 +def hawq_v2_loss(output, target): + import torch + return torch.nn.CrossEntropyLoss()(output, target) + class TestHAWQV2TuningStrategy(unittest.TestCase): @classmethod @@ -39,9 +42,10 @@ def _fake_eval(model): # dataset and dataloader dataset = DATASETS("pytorch")["dummy"](((1, 3, 224, 224))) dataloader = DATALOADERS["pytorch"](dataset) - - # tuning and accuracy criterion - tuning_criterion = TuningCriterion(strategy='hawq_v2', max_trials=5) + + #tuning and accuracy criterion + strategy_kwargs = {'hawq_v2_loss': hawq_v2_loss} + tuning_criterion = TuningCriterion(strategy='hawq_v2', strategy_kwargs=strategy_kwargs, max_trials=5) conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion) # fit From 8c7aa58dafa8e722ff9f8f6bbb36de08021cc69d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 8 Dec 2022 10:08:12 +0800 Subject: [PATCH 113/128] remove some WA Signed-off-by: yiliu30 --- neural_compressor/strategy/hawq_v2.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/neural_compressor/strategy/hawq_v2.py b/neural_compressor/strategy/hawq_v2.py index 2d467787501..4f17ed131c2 100644 --- a/neural_compressor/strategy/hawq_v2.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -126,15 +126,7 @@ def next_tune_cfg(self): op_info_map[op_info[0]] = op_info # op_name: (op_name, op_type) tmp_ordered_ops = [op_info_map[op_name] for op_name in ordered_ops] op_dtypes = OrderedDict(zip(tmp_ordered_ops, [target_dtype] * len(ordered_ops))) - indx=0 
- #defautly fallback 5 ops - for i in op_dtypes.keys(): - op_dtypes[i]="fp32" - indx=indx+1 - if indx>4: - break - logger.info("hawq op_config:"+str(op_dtypes)) logger.info(f"Start to accumulate fallback to {target_dtype}.") initial_op_tuning_cfg = deepcopy(op_tuning_cfg) fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], From 971c723cca1adddc73849cfa426c517370fa518b Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Wed, 7 Dec 2022 13:39:00 +0800 Subject: [PATCH 114/128] Support 'Square', 'Sum', 'SparseSegmentSqrtN' BF16 ops in TensorFlow backend (#223) Signed-off-by: Lv, Liang1 --- neural_compressor/adaptor/tensorflow.yaml | 2 +- .../adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/neural_compressor/adaptor/tensorflow.yaml b/neural_compressor/adaptor/tensorflow.yaml index 62524f544db..256eb4a17bb 100644 --- a/neural_compressor/adaptor/tensorflow.yaml +++ b/neural_compressor/adaptor/tensorflow.yaml @@ -35,7 +35,7 @@ "Erf", "FusedBatchNormV2", "FusedBatchNormGradV2", "FusedBatchNormV3", "FusedBatchNormGradV3", "LeakyRelu", "LeakyReluGrad", "Mean", "Mul", "Sub", "Elu", "EluGrad", "FloorDiv", "_FusedBatchNormEx", "Log", "Log1p", "LogSoftmax", "Prod", "RealDiv", "Reciprocal", "Rsqrt", "Selu", "SeluGrad", "Sigmoid", "SigmoidGrad", "Softmax", "Softplus", "SoftplusGrad", "Softsign", - "SoftsignGrad", "Sqrt", "SquaredDifference", "Tanh", "TanhGrad", #infer_list + "SoftsignGrad", "Sqrt", "Square", "SquaredDifference", "Sum", "Tanh", "TanhGrad", "SparseSegmentSqrtN", # infer_list "Abs", "ArgMax","ArgMin","BatchToSpace","BatchToSpaceND","BroadcastTo","Ceil","CheckNumerics","ClipByValue","Concat","ConcatV2", "DepthToSpace","DynamicPartition","DynamicStitch","EnsureShape","Enter","Equal","Exit","ExpandDims","Fill","Floor","Gather", "GatherNd","GatherV2","Greater","GreaterEqual","Identity","IsFinite","IsInf","IsNan","Less","LessEqual","Max","Maximum","MaxPool", diff --git a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py index 1b95f743fc5..40183e427d2 100644 --- a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py +++ b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py @@ -963,12 +963,6 @@ def _is_match_matmul(self, patterns, qdq_inserted=False): self.exclude_matmul_nodes.append(cur_node.name) continue - for i in self.node_name_mapping: - if weight_node.input and not weight_node.input[0].startswith('^') \ - and weight_node.name in self.node_name_mapping[i].output: - self.exclude_matmul_nodes.append(cur_node.name) - continue - for sub_rule in patterns: if sub_rule[0] != "Dequantize": self.exclude_matmul_nodes.append(cur_node.name) From 4e7e7e2082d35a27e7c2c3e2c41a806a121dd382 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Wed, 7 Dec 2022 13:41:11 +0800 Subject: [PATCH 115/128] Support Conv2D + BiasAdd + Relu + Sum fusion (#221) Signed-off-by: Lv, Liang1 --- neural_compressor/adaptor/tensorflow.yaml | 4 ++++ .../tf_utils/quantize_graph/qdq/fuse_qdq_conv.py | 14 ++++++++++---- test/tfnewapi/test_tensorflow_graph_conv_fusion.py | 2 +- .../test_tensorflow_graph_qdq_conv_fusion.py | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/neural_compressor/adaptor/tensorflow.yaml b/neural_compressor/adaptor/tensorflow.yaml index 256eb4a17bb..188b5ce00e6 100644 --- a/neural_compressor/adaptor/tensorflow.yaml +++ 
b/neural_compressor/adaptor/tensorflow.yaml @@ -273,6 +273,10 @@ 'Dequantize + Conv2D + BiasAdd + LeakyRelu + Add + QuantizeV2', 'Dequantize + Conv2D + LeakyRelu + AddV2 + QuantizeV2', 'Dequantize + Conv2D + LeakyRelu + Add + QuantizeV2', + 'Dequantize + Conv2D + BiasAdd + Relu + AddV2 + QuantizeV2', + 'Dequantize + Conv2D + BiasAdd + Relu + Add + QuantizeV2', + 'Dequantize + Conv2D + Relu + AddV2 + QuantizeV2', + 'Dequantize + Conv2D + Relu + Add + QuantizeV2', 'Dequantize + Conv2D + Add + QuantizeV2', 'Dequantize + Conv2D + AddV2 + QuantizeV2', 'Dequantize + Conv2D + AddV2 + Add + QuantizeV2', diff --git a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_conv.py b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_conv.py index 3db96745ed6..0b1b712a627 100644 --- a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_conv.py +++ b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_conv.py @@ -58,12 +58,16 @@ def __init__(self, **kwargs): 'DequantizeConv2DSigmoidQuantizeV2': self.apply_newly_conv_biasadd_relu_fusion, 'DequantizeConv2DBiasAddLeakyReluAddV2QuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, 'DequantizeConv2DBiasAddLeakyReluAddQuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, + 'DequantizeConv2DBiasAddReluAddV2QuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, + 'DequantizeConv2DBiasAddReluAddQuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, 'DequantizeConv2DBiasAddAddLeakyReluQuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, 'DequantizeConv2DBiasAddAddV2LeakyReluQuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, 'DequantizeConv2DAddLeakyReluQuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, 'DequantizeConv2DAddV2LeakyReluQuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, 'DequantizeConv2DLeakyReluAddV2QuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, 'DequantizeConv2DLeakyReluAddQuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, + 'DequantizeConv2DReluAddV2QuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, + 'DequantizeConv2DReluAddQuantizeV2': self.apply_newly_conv_biasadd_addn_relu_fusion, 'DequantizeConv2DAddRelu6QuantizeV2': self.apply_newly_conv_biasadd_relu_fusion, 'DequantizeConv2DAddReluQuantizeV2': self.apply_newly_conv_biasadd_relu_fusion, 'DequantizeConv2DBiasAddAddRelu6MulMulQuantizeV2': self.apply_conv_biasadd_hardswish_fusion, @@ -1194,7 +1198,9 @@ def apply_newly_conv_biasadd_addn_relu_fusion(self, match_node_name): # Dequantize + Conv2D + BiasAdd + AddV2 + Relu6 + QuantizeV2 # Dequantize + Conv2D + BiasAdd + Add + Relu + QuantizeV2 # Dequantize + Conv2D + BiasAdd + LeakyRelu + AddV2 + QuantizeV2 + # Dequantize + Conv2D + BiasAdd + Relu + AddV2(Add) + QuantizeV2 # Dequantize + Conv2D + LeakyRelu + AddV2 + QuantizeV2 + # Dequantize + Conv2D + Relu + AddV2(Add) + QuantizeV2 # Dequantize + Conv2D + Add + Add + Relu + QuantizeV2 # Dequantize + Conv2D + BiasAdd + Add + Relu + QuantizeV2 skip_node_name = match_node_name[2:] @@ -1236,8 +1242,8 @@ def apply_newly_conv_biasadd_addn_relu_fusion(self, match_node_name): return self.apply_newly_conv_biasadd_fusion(match_node_name[:3] + [match_node_name[-1]]) forth_node = self.node_name_mapping[match_node_name[4]].node - if forth_node.op != 'LeakyRelu': - if third_node.op != 'LeakyRelu' and not self._find_relu_node(matched_node.node): + if forth_node.op not in ('LeakyRelu', 'Relu'): + if third_node.op not in ('LeakyRelu', 'Relu') and not 
self._find_relu_node(matched_node.node): return self.apply_newly_conv_biasadd_fusion(match_node_name[:3] + [match_node_name[-1]]) is_leakyrelu_add_fusion = third_node.op == 'LeakyRelu' and forth_node.op.find('Add') != -1 @@ -1251,7 +1257,7 @@ def apply_newly_conv_biasadd_addn_relu_fusion(self, match_node_name): sum_node_name = self.node_name_mapping[match_node_name[3 + relu_offset]].node.input[sum_index] deq_node = self.node_name_mapping[sum_node_name].node - if (deq_node.op != 'LeakyRelu' and deq_node.op != 'Dequantize') or \ + if (deq_node.op != 'LeakyRelu' and deq_node.op != 'Dequantize' and deq_node.op != 'BiasAdd') or \ deq_node.op.find("Quantize") != -1: return self.apply_newly_conv_biasadd_fusion(match_node_name[:3]+[match_node_name[-1]]) @@ -1350,7 +1356,7 @@ def apply_newly_conv_biasadd_addn_relu_fusion(self, match_node_name): self.add_output_graph_node(quantized_conv_node) - if is_leakyrelu_add_fusion or is_leakyrelu: + if is_leakyrelu_add_fusion or is_leakyrelu or is_relu_add_fusion: quantize_down_name = self._add_quantize_down_nodes( node, quantized_node_name, dtypes.qint8, False) self._intel_cpu_add_dequantize_result_node( diff --git a/test/tfnewapi/test_tensorflow_graph_conv_fusion.py b/test/tfnewapi/test_tensorflow_graph_conv_fusion.py index 09a595be4a9..e5402c910fa 100644 --- a/test/tfnewapi/test_tensorflow_graph_conv_fusion.py +++ b/test/tfnewapi/test_tensorflow_graph_conv_fusion.py @@ -348,7 +348,7 @@ def test_conv_biasadd_addv2_relu_fallback_fusion_1(self): for i in output_graph.graph_def.node: if i.op == '_FusedQuantizedConv2D' and \ - i.attr['fused_ops'].list.s == [b'BiasAdd', b'Dequantize']: + i.attr['fused_ops'].list.s == [b'BiasAdd', b'Sum', b'Relu', b'Requantize']: found_conv_fusion = True break self.assertEqual(found_conv_fusion, True) diff --git a/test/tfnewapi/test_tensorflow_graph_qdq_conv_fusion.py b/test/tfnewapi/test_tensorflow_graph_qdq_conv_fusion.py index 981bdbee29a..cb25dffd52b 100644 --- a/test/tfnewapi/test_tensorflow_graph_qdq_conv_fusion.py +++ b/test/tfnewapi/test_tensorflow_graph_qdq_conv_fusion.py @@ -317,7 +317,7 @@ def test_conv_biasadd_addv2_relu_fallback_fusion_1(self): for i in output_graph.graph_def.node: if i.op == '_FusedQuantizedConv2D' and \ - i.attr['fused_ops'].list.s == [b'BiasAdd', b'Dequantize']: + i.attr['fused_ops'].list.s == [b'BiasAdd', b'Sum', b'Relu', b'Requantize']: found_conv_fusion = True break self.assertEqual(found_conv_fusion, True) From 620c5f1aa25c663a580d648acf10c094679122a0 Mon Sep 17 00:00:00 2001 From: "chen, suyue" Date: Wed, 7 Dec 2022 15:17:44 +0800 Subject: [PATCH 116/128] update azure pipeline (#229) Signed-off-by: chensuyue --- .azure-pipelines/model-test.yml | 2 +- .azure-pipelines/scripts/ut/run_basic_adaptor.sh | 11 ++++++----- .../scripts/ut/run_basic_adaptor_tfnewapi.sh | 11 ++++++----- .azure-pipelines/scripts/ut/run_basic_ipex.sh | 11 ++++++----- .azure-pipelines/scripts/ut/run_basic_itex.sh | 11 ++++++----- .azure-pipelines/scripts/ut/run_basic_others.sh | 11 ++++++----- .azure-pipelines/scripts/ut/run_ncoder.sh | 8 +++++--- .azure-pipelines/scripts/ut/run_ux.sh | 8 +++++--- 8 files changed, 41 insertions(+), 32 deletions(-) diff --git a/.azure-pipelines/model-test.yml b/.azure-pipelines/model-test.yml index 512fbe158cb..270a245bb65 100644 --- a/.azure-pipelines/model-test.yml +++ b/.azure-pipelines/model-test.yml @@ -45,7 +45,7 @@ parameters: - ssd_mobilenet_v1_ckpt # - ssd_resnet50_v1_ckpt - inception_v1 - - resnet50_fashion + # - resnet50_fashion - darknet19 - densenet-121 - resnet-101 diff --git 
a/.azure-pipelines/scripts/ut/run_basic_adaptor.sh b/.azure-pipelines/scripts/ut/run_basic_adaptor.sh index d9a9fd2d990..d5510bbd177 100644 --- a/.azure-pipelines/scripts/ut/run_basic_adaptor.sh +++ b/.azure-pipelines/scripts/ut/run_basic_adaptor.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x python -c "import neural_compressor as nc;print(nc.version.__version__)" echo "run basic adaptor" @@ -23,11 +22,13 @@ ut_log_name=${LOG_DIR}/ut_tf_${tensorflow_version}_pt_${pytorch_version}.log echo "cat run.sh..." cat run.sh | tee ${ut_log_name} -echo "-------------" +echo "------UT start-------" bash run.sh 2>&1 | tee -a ${ut_log_name} cp .coverage ${LOG_DIR}/.coverage.adaptor -echo "list all in ${LOG_DIR}" -ls -a ${LOG_DIR} +echo "------UT end -------" + if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then + echo "Find errors in UT test, please check the output..." exit 1 -fi \ No newline at end of file +fi +echo "UT finished successfully! " \ No newline at end of file diff --git a/.azure-pipelines/scripts/ut/run_basic_adaptor_tfnewapi.sh b/.azure-pipelines/scripts/ut/run_basic_adaptor_tfnewapi.sh index 2b687e633d3..ebd861efeb2 100644 --- a/.azure-pipelines/scripts/ut/run_basic_adaptor_tfnewapi.sh +++ b/.azure-pipelines/scripts/ut/run_basic_adaptor_tfnewapi.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x python -c "import neural_compressor as nc;print(nc.version.__version__)" echo "run basic adaptor tfnewapi" @@ -19,11 +18,13 @@ ut_log_name=${LOG_DIR}/ut_tf_newapi.log echo "cat run.sh..." cat run.sh | tee ${ut_log_name} -echo "-------------" +echo "------UT start-------" bash run.sh 2>&1 | tee -a ${ut_log_name} cp .coverage ${LOG_DIR}/.coverage.tfnewapi -echo "list all in ${LOG_DIR}" -ls -a ${LOG_DIR} +echo "------UT end -------" + if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then + echo "Find errors in UT test, please check the output..." exit 1 -fi \ No newline at end of file +fi +echo "UT finished successfully! " \ No newline at end of file diff --git a/.azure-pipelines/scripts/ut/run_basic_ipex.sh b/.azure-pipelines/scripts/ut/run_basic_ipex.sh index 9e22bc01be3..edc2b5d3aeb 100644 --- a/.azure-pipelines/scripts/ut/run_basic_ipex.sh +++ b/.azure-pipelines/scripts/ut/run_basic_ipex.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x python -c "import neural_compressor as nc;print(nc.version.__version__)" echo "run basic ipex" @@ -20,11 +19,13 @@ ut_log_name=${LOG_DIR}/ut_ipex.log echo "cat run.sh..." cat run.sh | tee ${ut_log_name} -echo "-------------" +echo "------UT start-------" bash run.sh 2>&1 | tee -a ${ut_log_name} cp .coverage ${LOG_DIR}/.coverage.ipex -echo "list all in ${LOG_DIR}" -ls -a ${LOG_DIR} +echo "------UT end -------" + if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then + echo "Find errors in UT test, please check the output..." exit 1 -fi \ No newline at end of file +fi +echo "UT finished successfully! " diff --git a/.azure-pipelines/scripts/ut/run_basic_itex.sh b/.azure-pipelines/scripts/ut/run_basic_itex.sh index da9b9923ce9..45278216f8d 100644 --- a/.azure-pipelines/scripts/ut/run_basic_itex.sh +++ b/.azure-pipelines/scripts/ut/run_basic_itex.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x python -c "import neural_compressor as nc;print(nc.version.__version__)" echo "run basic itex" @@ -19,11 +18,13 @@ ut_log_name=${LOG_DIR}/ut_itex.log echo "cat run.sh..." 
cat run.sh | tee ${ut_log_name} -echo "-------------" +echo "------UT start-------" bash run.sh 2>&1 | tee -a ${ut_log_name} cp .coverage ${LOG_DIR}/.coverage.itex -echo "list all in ${LOG_DIR}" -ls -a ${LOG_DIR} +echo "------UT end -------" + if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then + echo "Find errors in UT test, please check the output..." exit 1 -fi \ No newline at end of file +fi +echo "UT finished successfully! " \ No newline at end of file diff --git a/.azure-pipelines/scripts/ut/run_basic_others.sh b/.azure-pipelines/scripts/ut/run_basic_others.sh index 9789802a75a..4781eb89468 100644 --- a/.azure-pipelines/scripts/ut/run_basic_others.sh +++ b/.azure-pipelines/scripts/ut/run_basic_others.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x python -c "import neural_compressor as nc;print(nc.version.__version__)" echo "run basic others" @@ -29,11 +28,13 @@ ut_log_name=${LOG_DIR}/ut_tf_${tensorflow_version}_pt_${pytorch_version}.log echo "cat run.sh..." cat run.sh | tee ${ut_log_name} -echo "-------------" +echo "------UT start-------" bash run.sh 2>&1 | tee -a ${ut_log_name} cp .coverage ${LOG_DIR}/.coverage.others -echo "list all in ${LOG_DIR}" -ls -a ${LOG_DIR} +echo "------UT end -------" + if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then + echo "Find errors in UT test, please check the output..." exit 1 -fi \ No newline at end of file +fi +echo "UT finished successfully! " \ No newline at end of file diff --git a/.azure-pipelines/scripts/ut/run_ncoder.sh b/.azure-pipelines/scripts/ut/run_ncoder.sh index aef05d13e3f..bb3e3212494 100644 --- a/.azure-pipelines/scripts/ut/run_ncoder.sh +++ b/.azure-pipelines/scripts/ut/run_ncoder.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x python -c "import neural_compressor as nc;print(nc.version.__version__)" echo "run coder" @@ -15,9 +14,12 @@ ut_log_name=${LOG_DIR}/ut_neural_coder.log echo "cat run.sh..." cat run.sh | tee ${ut_log_name} -echo "-------------" +echo "------UT start-------" bash run.sh 2>&1 | tee -a ${ut_log_name} +echo "------UT end -------" if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then + echo "Find errors in UT test, please check the output..." exit 1 -fi \ No newline at end of file +fi +echo "UT finished successfully! " \ No newline at end of file diff --git a/.azure-pipelines/scripts/ut/run_ux.sh b/.azure-pipelines/scripts/ut/run_ux.sh index ceb1c7fcefd..e7041cbacce 100644 --- a/.azure-pipelines/scripts/ut/run_ux.sh +++ b/.azure-pipelines/scripts/ut/run_ux.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x python -c "import neural_compressor as nc;print(nc.version.__version__)" echo "run ux" @@ -21,9 +20,12 @@ ut_log_name=${LOG_DIR}/ut_tf_${tensorflow_version}_pt_${pytorch_version}.log echo "cat run.sh..." cat run.sh | tee ${ut_log_name} -echo "-------------" +echo "------UT start-------" bash run.sh 2>&1 | tee -a ${ut_log_name} +echo "------UT end -------" if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then + echo "Find errors in UT test, please check the output..." exit 1 -fi \ No newline at end of file +fi +echo "UT finished successfully! 
" \ No newline at end of file From 7ffbbf18c6eb2039342e859c7d1de14560eb1db0 Mon Sep 17 00:00:00 2001 From: xinhe Date: Wed, 7 Dec 2022 17:26:51 +0800 Subject: [PATCH 117/128] Add export examples for new API (#225) Signed-off-by: Xin He --- .../quantization/ptq_dynamic/eager/README.md | 5 +++ .../ptq_dynamic/eager/requirements.txt | 4 +- .../ptq_dynamic/eager/run_glue_tune.py | 35 ++++++++++++--- .../ptq_dynamic/eager/run_tuning.sh | 1 + .../quantization/ptq_static/fx/README.md | 8 ++++ .../ptq_static/fx/requirements.txt | 2 + .../quantization/ptq_static/fx/run_glue.py | 44 ++++++++++++++++++- .../quantization/ptq_static/fx/run_tuning.sh | 1 + .../quantization/qat/fx/README.md | 8 ++++ .../quantization/qat/fx/requirements.txt | 2 + .../quantization/qat/fx/run_glue_tune.py | 40 +++++++++++++++++ .../quantization/qat/fx/run_tuning.sh | 1 + 12 files changed, 143 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md index ac449cdb781..016d8d99456 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/README.md @@ -198,5 +198,10 @@ Shapley values originate from cooperative game theory that come with desirable p > **Note** : run_glue_tune_with_shap.py is the example of "SST2" task. If you want to execute other glue task, you may take some slight change under "ShapleyMSE" class. +# Appendix +## Export to ONNX +Right now, we experimentally support exporting PyTorch model to ONNX model, includes FP32 and INT8 model. + +By enabling `--onnx` argument, Intel Neural Compressor will export fp32 ONNX model, INT8 QDQ ONNX model, and INT8 QLinear ONNX model. diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt index 7ad9dc04d0c..688b5217718 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/requirements.txt @@ -6,4 +6,6 @@ torch >= 1.3 transformers>=4.10.0 shap scipy -sacremoses \ No newline at end of file +sacremoses +onnx +onnxruntime \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py index 13812b30b4e..b41c077ac59 100755 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py @@ -144,18 +144,25 @@ class ModelArguments: tune: bool = field( default=False, metadata={ - "help": "tune quantized model with Intel Neural Compressor)." 
- }, + "help": "tune quantized model with Intel Neural Compressor)."}, ) benchmark: bool = field( default=False, - metadata={"help": "run benchmark."}) + metadata={"help": "run benchmark."}, + ) int8: bool = field( default=False, - metadata={"help":"run benchmark."}) + metadata={"help":"initialize int8 model."}, + ) accuracy_only: bool = field( default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) + metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}, + ) + onnx: bool = field( + default=False, metadata={"help": "convert PyTorch model to ONNX"} + ) + + def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. @@ -439,6 +446,24 @@ def eval_func_for_nc(model_tuned): q_model = fit(model, conf=conf, eval_func=eval_func_for_nc) from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir) + + if model_args.onnx: + eval_dataloader = trainer.get_eval_dataloader() + it = iter(eval_dataloader) + input = next(it) + input.pop('labels') + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + dynamic_axes = {k: symbolic_names for k in input.keys()} + from neural_compressor.config import Torch2ONNXConfig + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('int8-nlp-model.onnx', int8_onnx_config) exit(0) if model_args.accuracy_only: diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh index e01add178fb..edc07713079 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_tuning.sh @@ -86,6 +86,7 @@ function run_tuning { --no_cuda \ --output_dir ${tuned_checkpoint} \ --tune \ + --onnx \ ${extra_cmd} } diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md index 881332a1314..d9b82bf907b 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/README.md @@ -187,3 +187,11 @@ quantizer.model = common.Model(model) model = quantizer.fit() model.save(training_args.output_dir) ``` + +# Appendix + +## Export to ONNX + +Right now, we experimentally support exporting PyTorch model to ONNX model, includes FP32 and INT8 model. + +By enabling `--onnx` argument, Intel Neural Compressor will export fp32 ONNX model, INT8 QDQ ONNX model, and INT8 QLinear ONNX model. 
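As an optional sanity check outside the example scripts, the exported file can be loaded back with onnxruntime and run once on dummy inputs. A minimal sketch, assuming the int8-nlp-model.onnx name used by the eager example above (adjust for the QDQ/QLinear file names):

    import numpy as np
    import onnxruntime as ort

    # Hypothetical smoke test: run the exported INT8 model once with dummy BERT-style int64 inputs.
    sess = ort.InferenceSession("int8-nlp-model.onnx", providers=["CPUExecutionProvider"])
    feed = {inp.name: np.ones((1, 128), dtype=np.int64) for inp in sess.get_inputs()}
    logits = sess.run(None, feed)[0]
    print(logits.shape)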
diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt index fbbce5e4433..01afab8e2ae 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/requirements.txt @@ -4,6 +4,8 @@ protobuf scipy scikit-learn Keras-Preprocessing +onnx +onnxruntime transformers >= 4.16.0 --find-links https://download.pytorch.org/whl/torch_stable.html torch >= 1.8.0+cpu diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py index 717ae91d886..113bfa69341 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py @@ -195,6 +195,9 @@ class ModelArguments: accuracy_only: bool = field( default=False, metadata={"help": "get accuracy"} ) + onnx: bool = field( + default=False, metadata={"help": "convert PyTorch model to ONNX"} + ) def main(): @@ -502,9 +505,46 @@ def eval_func(model): from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion tuning_criterion = TuningCriterion(max_trials=600) conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion) - model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func) + q_model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func) from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream - save_for_huggingface_upstream(model, tokenizer, training_args.output_dir) + save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir) + + if model_args.onnx: + it = iter(eval_dataloader) + input = next(it) + input.pop('labels') + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + dynamic_axes = {k: symbolic_names for k in input.keys()} + from neural_compressor.config import Torch2ONNXConfig + fp32_onnx_config = Torch2ONNXConfig( + dtype="fp32", + opset_version=14, + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('fp32-model.onnx', fp32_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('int8-nlp-qdq-model.onnx', int8_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QLinear", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('int8-nlp-qlinear-model.onnx', int8_onnx_config) return if model_args.benchmark or model_args.accuracy_only: diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh index a3f5c6934c7..19712872786 100644 --- 
a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_tuning.sh @@ -92,6 +92,7 @@ function run_tuning { --no_cuda \ --output_dir ${tuned_checkpoint} \ --tune \ + --onnx \ --overwrite_output_dir \ ${extra_cmd} } diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md index e1c802c7ff2..fc6d1ccd4e1 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/README.md @@ -117,3 +117,11 @@ model = OptimizedModel.from_pretrained( ``` We also upstreamed several int8 models into HuggingFace [model hub](https://huggingface.co/models?other=Intel%C2%AE%20Neural%20Compressor) for users to ramp up. + +# Appendix + +## Export to ONNX + +Right now, we experimentally support exporting PyTorch model to ONNX model, includes FP32 and INT8 model. + +By enabling `--onnx` argument, Intel Neural Compressor will export fp32 ONNX model, INT8 QDQ ONNX model, and INT8 QLinear ONNX model. diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt index 5386769210e..2bb6fc03b2d 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/requirements.txt @@ -4,5 +4,7 @@ datasets == 1.18.0 sentencepiece != 0.1.92 protobuf scipy +onnx +onnxruntime --find-links https://download.pytorch.org/whl/torch_stable.html torch >= 1.8.0+cpu diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py index f5bc771e712..f9fe765dbc2 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py @@ -194,6 +194,9 @@ class ModelArguments: benchmark: bool = field( default=False, metadata={"help": "get benchmark instead of accuracy"} ) + onnx: bool = field( + default=False, metadata={"help": "convert PyTorch model to ONNX"} + ) def main(): @@ -533,6 +536,43 @@ def benchmark(model): from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream save_for_huggingface_upstream(model, tokenizer, training_args.output_dir) + + if model_args.onnx: + it = iter(eval_dataloader) + input = next(it) + input.pop('labels') + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + dynamic_axes = {k: symbolic_names for k in input.keys()} + from neural_compressor.config import Torch2ONNXConfig + fp32_onnx_config = Torch2ONNXConfig( + dtype="fp32", + opset_version=14, + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + model.export('fp32-model.onnx', fp32_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + 
dynamic_axes=dynamic_axes, + ) + model.export('int8-nlp-qdq-model.onnx', int8_onnx_config) + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QLinear", + example_inputs=tuple(input.values()), + input_names=list(input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + model.export('int8-nlp-qlinear-model.onnx', int8_onnx_config) return if model_args.benchmark: diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh index 888a8968d24..31d6f314e8b 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_tuning.sh @@ -60,6 +60,7 @@ function run_tuning { --save_strategy steps \ --metric_for_best_model f1 \ --save_total_limit 1 \ + --onnx \ --tune } From f9008e236a816a67da0f6a1683ccf24a98f9bf23 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 8 Dec 2022 10:07:43 +0800 Subject: [PATCH 118/128] support for adding hawq_v2 loss by new API Signed-off-by: yiliu30 --- neural_compressor/conf/config.py | 5 +++-- neural_compressor/strategy/hawq_v2.py | 17 ++++++++--------- test/strategy/test_hawq_v2_2.x.py | 14 +++++++++----- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index a72a50e782a..1ce7b0c8c11 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -861,7 +861,8 @@ def percent_to_float(data): Optional('sigopt_project_id'): str, Optional('sigopt_experiment_name', default='nc-tune'): str, Optional('accuracy_weight', default=1.0): float, - Optional('latency_weight', default=1.0): float + Optional('latency_weight', default=1.0): float, + Optional('hawq_v2_loss', default=None): object, } , Hook('accuracy_criterion', handler=_valid_accuracy_field): object, Optional('accuracy_criterion', default={'relative': 0.01}): { @@ -1354,7 +1355,7 @@ def map_pyconfig_to_cfg(self, pythonic_config): if pythonic_config.quantization.strategy_kwargs: st_kwargs = pythonic_config.quantization.strategy_kwargs for st_key in ['sigopt_api_token', 'sigopt_project_id', 'sigopt_experiment_name', \ - 'accuracy_weight', 'latency_weight']: + 'accuracy_weight', 'latency_weight', 'hawq_v2_loss']: if st_key in st_kwargs: st_val = st_kwargs[st_key] mapping.update({'tuning.strategy.' 
+ st_key: st_val}) diff --git a/neural_compressor/strategy/hawq_v2.py b/neural_compressor/strategy/hawq_v2.py index 43f300ef488..e6b5d7c619d 100644 --- a/neural_compressor/strategy/hawq_v2.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -86,10 +86,8 @@ def __init__(self, model, conf, q_dataloader, q_func=None, q_hooks) def next_tune_cfg(self): - # TODO remove it before merge - import torch tuning_space = self.tuning_space - calib_size = tuning_space.root_item.get_option_by_name('calib_sampling_size').options[0] ##TODO suppoprt list + calib_size = tuning_space.root_item.get_option_by_name('calib_sampling_size').options[0] # Initialize the tuning config for each op according to the quantization approach op_item_dtype_dict, quant_mode_wise_items, initial_op_tuning_cfg = self.initial_tuning_cfg() @@ -111,12 +109,13 @@ def next_tune_cfg(self): # Start compute the hessian trace logger.info(f"************** Start compute the hessian trace *****************") target_dtype = "int8" - # TODO remove it before merge - criterion=torch.nn.CrossEntropyLoss() - op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, - dataloader = self.calib_dataloader, - q_model = self.q_model, - criterion =criterion, # TODO using user specify loss + hawq_v2_criterion =self.cfg.tuning.strategy.hawq_v2_loss + assert hawq_v2_criterion is not None, "HAWQ-V2 strategy needs model loss function to compute the gradient, \ + Please assign it by strategy_kwargs({'hawq_v2_loss': hawq_v2_loss})." + op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, + dataloader = self.calib_dataloader, + q_model = self.q_model, + criterion =hawq_v2_criterion, enable_act = False) sorted_op_to_traces = dict(sorted(op_to_traces.items(), key=lambda item: item[1], reverse=True)) logger.info(f"************** Hessian Trace *****************") diff --git a/test/strategy/test_hawq_v2_2.x.py b/test/strategy/test_hawq_v2_2.x.py index 0ec055c26c3..8442154acb5 100644 --- a/test/strategy/test_hawq_v2_2.x.py +++ b/test/strategy/test_hawq_v2_2.x.py @@ -4,10 +4,13 @@ import shutil import unittest -import numpy as np - from neural_compressor.utils import logger +# loss function for hawq-v2 +def hawq_v2_loss(output, target): + import torch + return torch.nn.CrossEntropyLoss()(output, target) + class TestHAWQV2TuningStrategy(unittest.TestCase): @classmethod @@ -39,9 +42,10 @@ def _fake_eval(model): # dataset and dataloader dataset = DATASETS("pytorch")["dummy"](((1, 3, 224, 224))) dataloader = DATALOADERS["pytorch"](dataset) - - # tuning and accuracy criterion - tuning_criterion = TuningCriterion(strategy='hawq_v2', max_trials=5) + + #tuning and accuracy criterion + strategy_kwargs = {'hawq_v2_loss': hawq_v2_loss} + tuning_criterion = TuningCriterion(strategy='hawq_v2', strategy_kwargs=strategy_kwargs, max_trials=5) conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion) # fit From 0d8f0e830b8e462b19a8c64985d73e14ff9a1e18 Mon Sep 17 00:00:00 2001 From: "biao.fang" Date: Thu, 8 Dec 2022 14:31:55 +0800 Subject: [PATCH 119/128] enable trace type Tensor->float --- neural_compressor/adaptor/torch_utils/hawq_metric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/hawq_metric.py b/neural_compressor/adaptor/torch_utils/hawq_metric.py index 2b6a7790eb0..0e505848a85 100644 --- a/neural_compressor/adaptor/torch_utils/hawq_metric.py +++ b/neural_compressor/adaptor/torch_utils/hawq_metric.py @@ -267,13 
+267,13 @@ def get_weight_traces(self, num_samples): diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) if diff_ratio < self.tolerance and iter > 10: ##TODO magic number break - if iter == 50: ##TODO for debug + if iter == 20: ##TODO for debug break prev_avg_model_trace = model_trace weight_name_to_traces = {} layer_traces = layer_traces_estimate for weight_name, trace in zip(self.weight_names, layer_traces): - weight_name_to_traces[weight_name] = trace + weight_name_to_traces[weight_name] = float(trace)# tensor->float op_name_to_trace = {} for weight_name in self.weight_names: op_name = self.weight_to_op[weight_name] @@ -434,7 +434,7 @@ def get_avg_traces(self, enable_act=True, num_samples=32): act_traces = self.get_act_traces(num_samples) for i,j in zip(act_traces,mse_gap): #currently use mse to analysis - act_trace[i]=act_traces[i]+mse_gap[j] + act_trace[i]=float(act_traces[i])+float(mse_gap[j])# Tensor->float traces['activation'] = act_traces return traces From 8350241179af04cc0cb40fee9dcca73723ecc72f Mon Sep 17 00:00:00 2001 From: "biao.fang" Date: Thu, 8 Dec 2022 15:03:17 +0800 Subject: [PATCH 120/128] cancel Max iter times for debugging --- neural_compressor/adaptor/torch_utils/hawq_metric.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/hawq_metric.py b/neural_compressor/adaptor/torch_utils/hawq_metric.py index 0e505848a85..fd1428acc12 100644 --- a/neural_compressor/adaptor/torch_utils/hawq_metric.py +++ b/neural_compressor/adaptor/torch_utils/hawq_metric.py @@ -267,8 +267,8 @@ def get_weight_traces(self, num_samples): diff_ratio = abs(model_trace - prev_avg_model_trace) / (prev_avg_model_trace + self.eps) if diff_ratio < self.tolerance and iter > 10: ##TODO magic number break - if iter == 20: ##TODO for debug - break + # if iter == 20: ##TODO for debugging + # break prev_avg_model_trace = model_trace weight_name_to_traces = {} layer_traces = layer_traces_estimate @@ -321,8 +321,8 @@ def get_act_traces(self, num_samples): prev_model_act_trace + self.eps) if diff_ratio < self.tolerance and iter > 10: ##TODO magic number break - if iter == 50: ##TODO for debug - break + # if iter == 50: ##TODO for debug + # break prev_model_act_trace = current_model_act_trace act_traces_per_sample.append(vt_H_v_mean_per_act) From 8b7993819ed26edaa4c756f2a4b1a8fc95de4845 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 8 Dec 2022 18:51:25 +0800 Subject: [PATCH 121/128] revert change for test Signed-off-by: yiliu30 --- .../quantization/ptq/cpu/fx/conf.yaml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml index ef61c6c3e0b..f11483acd16 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/conf.yaml @@ -21,7 +21,7 @@ quantization: # optional. tuning constrai calibration: sampling_size: 300 # optional. default value is 100. used to set how many samples should be used in calibration. dataloader: - batch_size: 1 + batch_size: 30 dataset: ImageFolder: root: /path/to/calibration/dataset # NOTE: modify to calibration dataset location if needed @@ -40,10 +40,10 @@ evaluation: # optional. 
required if use metric: topk: 1 # built-in metrics are topk, map, f1, allow user to register new metric. dataloader: - batch_size: 1 + batch_size: 30 dataset: ImageFolder: - root: /path/to/calibration/dataset # NOTE: modify to evaluation dataset location if needed + root: /path/to/evaluation/dataset # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -61,7 +61,7 @@ evaluation: # optional. required if use batch_size: 1 dataset: ImageFolder: - root: /path/to/calibration/dataset # NOTE: modify to evaluation dataset location if needed + root: /path/to/evaluation/dataset # NOTE: modify to evaluation dataset location if needed transform: Resize: size: 256 @@ -73,10 +73,8 @@ evaluation: # optional. required if use std: [0.229, 0.224, 0.225] tuning: - strategy: - name: hawq accuracy_criterion: relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%. exit_policy: timeout: 0 # optional. tuning timeout (seconds). default value is 0 which means early stop. combine with max_trials field to decide when to exit. - random_seed: 9527 # optional. random seed for deterministic tuning. + random_seed: 9527 # optional. random seed for deterministic tuning. \ No newline at end of file From 04fc7aed3f696479067614bb1787248e11f6859e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 8 Dec 2022 18:55:43 +0800 Subject: [PATCH 122/128] fixed some bugs Signed-off-by: yiliu30 --- neural_compressor/adaptor/pytorch.py | 2 +- neural_compressor/adaptor/torch_utils/hawq_metric.py | 2 ++ neural_compressor/strategy/basic.py | 5 ----- neural_compressor/strategy/hawq_v2.py | 4 ++-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 79df22deeba..2ff206c392c 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -29,7 +29,6 @@ from ..utils import logger from .query import QueryBackendCapability from ..experimental.data.dataloaders.base_dataloader import BaseDataLoader -from .torch_utils.hawq_metric import hawq_top torch = LazyImport("torch") json = LazyImport("json") @@ -1113,6 +1112,7 @@ def calculate_hessian_trace(self, Return: hessian_trace(Dict[Tuple, float]), key: (op_name, op_type); value: hessian trace. """ + from .torch_utils.hawq_metric import hawq_top op_to_traces=hawq_top(fp32_model=fp32_model,dataloader=dataloader,q_model=q_model,criterion=criterion,enable_act=enable_act) return op_to_traces pass diff --git a/neural_compressor/adaptor/torch_utils/hawq_metric.py b/neural_compressor/adaptor/torch_utils/hawq_metric.py index fd1428acc12..c6973826516 100644 --- a/neural_compressor/adaptor/torch_utils/hawq_metric.py +++ b/neural_compressor/adaptor/torch_utils/hawq_metric.py @@ -14,6 +14,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from ...utils.utility import LazyImport +torch = LazyImport("torch") import copy import numpy as np diff --git a/neural_compressor/strategy/basic.py b/neural_compressor/strategy/basic.py index 184a15996f7..c35398dd4bb 100644 --- a/neural_compressor/strategy/basic.py +++ b/neural_compressor/strategy/basic.py @@ -144,11 +144,6 @@ def next_tune_cfg(self): logger.info(f"Start to fallback op to {target_dtype} one by one.") self._fallback_started() fallback_items_name_lst = [item.name for item in fallback_items_lst][::-1] # from bottom to up - # ops_sensitivity = self.adaptor.calculate_op_sensitivity(self._fp32_model, - # self.calib_dataloader, - # method_args = {'name': 'hessian_trace'}) - #fallback_items_name_lst = sorted(ops_sensitivity, key = lambda items: items[1], reverse=True) - op_dtypes = OrderedDict(zip(fallback_items_name_lst, [target_dtype] * len(fallback_items_name_lst))) initial_op_tuning_cfg = deepcopy(best_op_tuning_cfg_stage1) fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], diff --git a/neural_compressor/strategy/hawq_v2.py b/neural_compressor/strategy/hawq_v2.py index 0647b6a15d4..cb4d759bfcc 100644 --- a/neural_compressor/strategy/hawq_v2.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -110,8 +110,8 @@ def next_tune_cfg(self): logger.info(f"************** Start compute the hessian trace *****************") target_dtype = "int8" hawq_v2_criterion =self.cfg.tuning.strategy.hawq_v2_loss - assert hawq_v2_criterion is not None, "HAWQ-V2 strategy needs model loss function to compute the gradient, \ - Please assign it by strategy_kwargs({'hawq_v2_loss': hawq_v2_loss})." + # assert hawq_v2_criterion is not None, "HAWQ-V2 strategy needs model loss function to compute the gradient, \ + # Please assign it by strategy_kwargs({'hawq_v2_loss': hawq_v2_loss})." op_to_traces = self.adaptor.calculate_hessian_trace(fp32_model = self._fp32_model, dataloader = self.calib_dataloader, q_model = self.q_model, From 953d861d1cdb36f17e6ac40a1d0bd6799d433f9b Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 8 Dec 2022 19:00:12 +0800 Subject: [PATCH 123/128] revert change for test Signed-off-by: yiliu30 --- neural_compressor/strategy/mse.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/neural_compressor/strategy/mse.py b/neural_compressor/strategy/mse.py index 8dafa35759d..614984359ba 100644 --- a/neural_compressor/strategy/mse.py +++ b/neural_compressor/strategy/mse.py @@ -194,11 +194,10 @@ def initial_op_quant_mode(items_lst, target_quant_mode, op_item_dtype_dict): initial_op_quant_mode(quant_mode_items, quant_mode, op_item_dtype_dict) # step3. 
optype-wise tuning tuning items: the algorithm/scheme/granularity of activation(weight) - early_stop_tuning = True + early_stop_tuning = False stage1_cnt = 0 int8_ops = quant_mode_wise_items['dynamic'] + quant_mode_wise_items['static'] stage1_max = min(5, len(int8_ops)) # TODO set a more appropriate value - stage1_max=-1 op_wise_tuning_sampler = OpTypeWiseTuningSampler(tuning_space, [], [], op_item_dtype_dict, initial_op_tuning_cfg) for op_tuning_cfg in op_wise_tuning_sampler: From 2e14eb121ba10a5babe1599c1651d19cb4cdf6ea Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 8 Dec 2022 19:09:42 +0800 Subject: [PATCH 124/128] add more log info Signed-off-by: yiliu30 --- neural_compressor/adaptor/torch_utils/hawq_metric.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/torch_utils/hawq_metric.py b/neural_compressor/adaptor/torch_utils/hawq_metric.py index c6973826516..a63b2ef4c85 100644 --- a/neural_compressor/adaptor/torch_utils/hawq_metric.py +++ b/neural_compressor/adaptor/torch_utils/hawq_metric.py @@ -259,9 +259,10 @@ def get_vtHv_act(self, params, num_samples): break def get_weight_traces(self, num_samples): + import tqdm layer_traces_per_iter = [] prev_avg_model_trace = 0 - for iter in range(self.max_iter): + for iter in tqdm.tqdm(range(self.max_iter)): layer_traces = self.get_vtHv_weight(self.params, num_samples) layer_traces_per_iter.append(layer_traces) layer_traces_estimate = torch.mean(torch.stack(layer_traces_per_iter), dim=0) From 52ee89d57218e20b6029a9a519f6d4caa1b78bbe Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 8 Dec 2022 19:18:14 +0800 Subject: [PATCH 125/128] add skip first as arg Signed-off-by: yiliu30 --- neural_compressor/strategy/hawq_v2.py | 3 ++- neural_compressor/strategy/st_utils/tuning_sampler.py | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/neural_compressor/strategy/hawq_v2.py b/neural_compressor/strategy/hawq_v2.py index cb4d759bfcc..6645fd86153 100644 --- a/neural_compressor/strategy/hawq_v2.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -147,7 +147,8 @@ def next_tune_cfg(self): initial_op_tuning_cfg = deepcopy(op_tuning_cfg) fallback_sampler = FallbackTuningSampler(tuning_space, tuning_order_lst=[], initial_op_tuning_cfg=op_tuning_cfg, - op_dtypes=op_dtypes, accumulate=True) + op_dtypes=op_dtypes, accumulate=True, + skip_first=False) for op_tuning_cfg in fallback_sampler: op_tuning_cfg['calib_sampling_size'] = calib_size yield op_tuning_cfg diff --git a/neural_compressor/strategy/st_utils/tuning_sampler.py b/neural_compressor/strategy/st_utils/tuning_sampler.py index f311d7c16a4..9b5eff7dc1b 100644 --- a/neural_compressor/strategy/st_utils/tuning_sampler.py +++ b/neural_compressor/strategy/st_utils/tuning_sampler.py @@ -254,16 +254,18 @@ def __init__(self, tuning_order_lst: List[TuningOrder], initial_op_tuning_cfg: Dict[tuple, Any], op_dtypes: Dict[str, str], - accumulate: bool + accumulate: bool, + skip_first: bool = True ): super().__init__(tuning_space, tuning_order_lst, initial_op_tuning_cfg) self.op_dtypes = op_dtypes self.accumulate = accumulate + self.skip_first = skip_first pass def __iter__(self): new_tune_cfg = copy.deepcopy(self.initial_op_tuning_cfg) - skip_first = False + skip_first = self.skip_first for op_name_type, target_dtype in self.op_dtypes.items(): if not self.accumulate: new_tune_cfg = copy.deepcopy(self.initial_op_tuning_cfg) @@ -272,7 +274,7 @@ def __iter__(self): if self.accumulate and skip_first: # skip the first one skip_first = False continue - 
logger.info(f"fallback {op_name_type} to {target_dtype}") + logger.debug(f"fallback {op_name_type} to {target_dtype}") yield new_tune_cfg # need to skip the first one From 6aac6c510b2bed1194de602dc1a053114f96ae48 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 9 Dec 2022 08:57:48 +0800 Subject: [PATCH 126/128] fixed some format error Signed-off-by: yiliu30 --- neural_compressor/adaptor/pytorch.py | 6 ++- .../adaptor/torch_utils/hawq_metric.py | 46 ++++++++++--------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 2ff206c392c..238df231513 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -1113,7 +1113,11 @@ def calculate_hessian_trace(self, hessian_trace(Dict[Tuple, float]), key: (op_name, op_type); value: hessian trace. """ from .torch_utils.hawq_metric import hawq_top - op_to_traces=hawq_top(fp32_model=fp32_model,dataloader=dataloader,q_model=q_model,criterion=criterion,enable_act=enable_act) + op_to_traces=hawq_top(fp32_model=fp32_model, + dataloader=dataloader, + q_model=q_model, + criterion=criterion, + enable_act=enable_act) return op_to_traces pass diff --git a/neural_compressor/adaptor/torch_utils/hawq_metric.py b/neural_compressor/adaptor/torch_utils/hawq_metric.py index a63b2ef4c85..f68a1234164 100644 --- a/neural_compressor/adaptor/torch_utils/hawq_metric.py +++ b/neural_compressor/adaptor/torch_utils/hawq_metric.py @@ -41,8 +41,10 @@ def remove(self): class HessianTrace: """ please refer to - Yao, Zhewei, et al. "Pyhessian: Neural networks through the lens of the hessian." 2020 IEEE international conference on big data (Big data). IEEE, 2020. - Dong, Zhen, et al. "Hawq-v2: Hessian aware trace-weighted quantization of neural networks." Advances in neural information processing systems 33 (2020): 18518-18529. + Yao, Zhewei, et al. "Pyhessian: Neural networks through the lens of the hessian." + 2020 IEEE international conference on big data (Big data). IEEE, 2020. + Dong, Zhen, et al. "Hawq-v2: Hessian aware trace-weighted quantization of neural networks." + Advances in neural information processing systems 33 (2020): 18518-18529. 
https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/hessian_trace.py """ @@ -239,24 +241,25 @@ def get_vtHv_weight(self, params, num_samples): v_t_H_v = torch.stack([torch.mean(h_v * v_t) for (h_v, v_t) in zip(H_v, v)]) ##maybe sum is better return v_t_H_v - def get_vtHv_act(self, params, num_samples): - v = self.sample_rademacher(params) - H_v = [0] * len(v) - cnt = 0 - for step, data in enumerate(self.dataloader): - if cnt >= num_samples: - break - for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 - input = data[0][i:i + 1] - target = data[1][i:i + 1] - - self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) - layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] - layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] - hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, only_inputs=True, retain_graph=False) - cnt += 1 - if cnt >= num_samples: - break + # def get_vtHv_act(self, params, num_samples): + # v = self.sample_rademacher(params) + # H_v = [0] * len(v) + # cnt = 0 + # for step, data in enumerate(self.dataloader): + # if cnt >= num_samples: + # break + # for i in range(self.dataloader.batchsize): ##force to batchsize to be 1 + # input = data[0][i:i + 1] + # target = data[1][i:i + 1] + + # self.get_gradients(self.model, (input, target), self.criterion, create_graph=True) + # layer_acts = [self.layer_acts[key] for key in self.layer_acts.keys()] + # layer_act_gradients = [self.layer_acts_grads[key] for key in self.layer_acts.keys()] + # hv_one = torch.autograd.grad(layer_act_gradients, layer_acts, v, + # only_inputs=True, retain_graph=False) + # cnt += 1 + # if cnt >= num_samples: + # break def get_weight_traces(self, num_samples): import tqdm @@ -567,7 +570,8 @@ def hawq_top(fp32_model,q_model,dataloader,criterion,enable_act): if enable_act: act_to_traces=traces['activation'] for trace_i, pertur_i,act_i in zip(op_to_traces.keys(),pertur_lst.keys(),act_to_traces.keys()): - op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i]+act_to_traces[act_i] #Formula:Omig=Trace*L2+act_trace + #Formula:Omig=Trace*L2+act_trace + op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i]+act_to_traces[act_i] else: for trace_i, pertur_i in zip(op_to_traces.keys(),pertur_lst.keys()): op_to_traces[trace_i]=pertur_lst[pertur_i]*op_to_traces[trace_i] #Formula:Omig=Trace*L2 From e63195cf2f52d00e43539df380d1b354715dcdea Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 9 Dec 2022 20:59:26 +0800 Subject: [PATCH 127/128] resolved the conflicts Signed-off-by: yiliu30 --- neural_compressor/strategy/hawq_v2.py | 6 +++--- test/strategy/test_hawq_v2_2.x.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_compressor/strategy/hawq_v2.py b/neural_compressor/strategy/hawq_v2.py index 6645fd86153..2f33bf39ba4 100644 --- a/neural_compressor/strategy/hawq_v2.py +++ b/neural_compressor/strategy/hawq_v2.py @@ -20,9 +20,9 @@ from .strategy import strategy_registry, TuneStrategy -from .st_utils.tuning_sampler import OpTypeWiseTuningSampler, FallbackTuningSampler, ModelWiseTuningSampler -from .st_utils.tuning_structs import OpTuningConfig -from .st_utils.tuning_space import TUNING_ITEMS_LST +from .utils.tuning_sampler import OpTypeWiseTuningSampler, FallbackTuningSampler, ModelWiseTuningSampler +from .utils.tuning_structs import OpTuningConfig +from .utils.tuning_space import TUNING_ITEMS_LST from ..utils import logger @strategy_registry diff 
--git a/test/strategy/test_hawq_v2_2.x.py b/test/strategy/test_hawq_v2_2.x.py index 8442154acb5..19b52e07826 100644 --- a/test/strategy/test_hawq_v2_2.x.py +++ b/test/strategy/test_hawq_v2_2.x.py @@ -46,7 +46,7 @@ def _fake_eval(model): #tuning and accuracy criterion strategy_kwargs = {'hawq_v2_loss': hawq_v2_loss} tuning_criterion = TuningCriterion(strategy='hawq_v2', strategy_kwargs=strategy_kwargs, max_trials=5) - conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion) + conf = PostTrainingQuantConfig(approach="static", tuning_criterion=tuning_criterion) # fit q_model = fit(model=model, From 36137c266e876eadbd42366ad9dacd66a67a6220 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 9 Dec 2022 21:27:17 +0800 Subject: [PATCH 128/128] revert some change for test Signed-off-by: yiliu30 --- .../efficientnet/quantization/ptq/eager/run_tuning.sh | 3 +-- .../quantization/ptq/cpu/eager/run_tuning.sh | 7 +++---- .../quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh | 7 +++---- .../quantization/ptq/cpu/fx/run_tuning.sh | 7 +++---- .../quantization/ptq/gpu/eager/run_tuning.sh | 7 +++---- 5 files changed, 13 insertions(+), 18 deletions(-) diff --git a/examples/pytorch/image_recognition/efficientnet/quantization/ptq/eager/run_tuning.sh b/examples/pytorch/image_recognition/efficientnet/quantization/ptq/eager/run_tuning.sh index 588ec872406..c5c764b7155 100644 --- a/examples/pytorch/image_recognition/efficientnet/quantization/ptq/eager/run_tuning.sh +++ b/examples/pytorch/image_recognition/efficientnet/quantization/ptq/eager/run_tuning.sh @@ -41,8 +41,7 @@ function run_tuning { conf_yaml=conf_efficientnet_b0.yaml elif [ "${topology}" = "mobilenetv3_rw" ]; then conf_yaml=conf_mobilenetv3_rw.yaml - # TODO only for test, uncomment it before merge - # sed -i "/relative:/s|relative:.*|relative: 0.02|g" $conf_yaml + sed -i "/relative:/s|relative:.*|relative: 0.02|g" $conf_yaml fi sed -i "/\/path\/to\/calibration\/dataset/s|root:.*|root: $dataset_location/train|g" $conf_yaml sed -i "/\/path\/to\/evaluation\/dataset/s|root:.*|root: $dataset_location/val|g" $conf_yaml diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning.sh b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning.sh index 7752585ddb5..2f930ad1470 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning.sh +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning.sh @@ -37,10 +37,9 @@ function init_params { # run_tuning function run_tuning { - # TODO only for test, uncomment it before merge - # if [ "mobilenet_v2" = "$topology" ];then - # sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml - # fi + if [ "mobilenet_v2" = "$topology" ];then + sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml + fi extra_cmd="" if [ -n "$output_model" ];then extra_cmd = $extra_cmd"--tuned_checkpoint ${output_model}" diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh index 3c45fe25a32..02f968d7d23 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/eager/run_tuning_dump_tensor.sh @@ -39,10 +39,9 @@ 
function init_params { function run_tuning { sed -i "/\/path\/to\/calibration\/dataset/s|root:.*|root: $dataset_location/train|g" conf_dump_tensors.yaml sed -i "/\/path\/to\/evaluation\/dataset/s|root:.*|root: $dataset_location/val|g" conf_dump_tensors.yaml - # TODO only for test, uncomment it before merge - # if [ "mobilenet_v2" = "$topology" ];then - # sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf_dump_tensors.yaml - # fi + if [ "mobilenet_v2" = "$topology" ];then + sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf_dump_tensors.yaml + fi extra_cmd="" if [ -n "$output_model" ];then diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/run_tuning.sh b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/run_tuning.sh index eaa81d6e85c..054d4389d9c 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/run_tuning.sh +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx/run_tuning.sh @@ -37,10 +37,9 @@ function init_params { # run_tuning function run_tuning { - # TODO only for test, uncomment it before merge - # if [ "mobilenet_v2" = "$topology" ];then - # sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml - # fi + if [ "mobilenet_v2" = "$topology" ];then + sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml + fi extra_cmd="" if [ -n "$output_model" ];then extra_cmd = $extra_cmd"--tuned_checkpoint ${output_model}" diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/gpu/eager/run_tuning.sh b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/gpu/eager/run_tuning.sh index a4460264ee2..3a272f7e8eb 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/gpu/eager/run_tuning.sh +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/gpu/eager/run_tuning.sh @@ -39,10 +39,9 @@ function init_params { function run_tuning { sed -i "/\/path\/to\/calibration\/dataset/s|root:.*|root: $dataset_location/train|g" conf.yaml sed -i "/\/path\/to\/evaluation\/dataset/s|root:.*|root: $dataset_location/val|g" conf.yaml - # TODO only for test, uncomment it before merge - # if [ "mobilenet_v2" = "$topology" ];then - # sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml - # fi + if [ "mobilenet_v2" = "$topology" ];then + sed -i "/relative:/s|relative:.*|relative: 0.02|g" conf.yaml + fi extra_cmd="${dataset_location}"
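The script-level reverts above close out the series; the substantive piece remains the Hessian-trace estimate that `HessianTrace` in `hawq_metric.py` computes and that the HAWQ-V2 strategy sorts ops by. The sketch below is a minimal, self-contained illustration of that Hutchinson-style estimator, Tr(H) ~ E[v^T H v] with Rademacher probes; the function name, probe count, and the absence of per-layer bookkeeping are simplifications, not the library's implementation.

```python
# Minimal sketch of the Hutchinson trace estimator that hawq_metric.py builds on
# (illustrative only; the real HessianTrace class adds per-layer traces, hooks,
# activation traces, and a tolerance-based stopping rule).
import torch

def hutchinson_trace(model, criterion, data, target, n_probes=16):
    params = [p for p in model.parameters() if p.requires_grad]
    loss = criterion(model(data), target)
    # First backward pass keeps the graph so it can be differentiated again.
    grads = torch.autograd.grad(loss, params, create_graph=True)
    estimates = []
    for _ in range(n_probes):
        # Rademacher probe: entries are +1 or -1 with equal probability.
        v = [(torch.randint(0, 2, p.shape) * 2 - 1).to(p) for p in params]
        # Hessian-vector product via double backward: H v = d(g . v)/dp.
        Hv = torch.autograd.grad(grads, params, grad_outputs=v, retain_graph=True)
        estimates.append(sum((h * vi).sum() for h, vi in zip(Hv, v)))
    return torch.stack(estimates).mean()  # ~ Tr(H), averaged over the probes
```

In the strategy itself, the per-op traces come back from `adaptor.calculate_hessian_trace`, are sorted in descending order, and drive the fallback order in `next_tune_cfg`, with the loss supplied by the user through `strategy_kwargs={'hawq_v2_loss': ...}` as exercised in `test_hawq_v2_2.x.py`.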