Commit 00bbf84

add WeightOnlyLinear for low memory inference (#1076)
Signed-off-by: Xin He <[email protected]>
1 parent: 72f079b

6 files changed: +318 -45 lines


neural_compressor/adaptor/pytorch.py

Lines changed: 10 additions & 9 deletions
@@ -24,6 +24,7 @@
 from packaging.version import Version
 import yaml
 from functools import partial
+from neural_compressor.adaptor.torch_utils.util import set_module
 from neural_compressor.utils.utility import dump_elapsed_time
 from .adaptor import adaptor_registry, Adaptor
 from ..utils.utility import LazyImport, CpuInfo, GLOBAL_STATE, MODE
@@ -4548,7 +4549,8 @@ def rtn_quantize(self, model, tune_cfg):
                 if algorithm != 'RTN':
                     continue
                 m = fetch_module(model, op_name)
-                rtn_quantize(m, num_bits, group_size, scheme)
+                m = rtn_quantize(m, num_bits, group_size, scheme, return_int=False)
+                set_module(model, op_name, m)
         return model

     def gptq_quantize(self, model, tune_cfg, dataloader):
@@ -4591,6 +4593,7 @@ def awq_quantize(self, model, tune_cfg, dataloader, calib_func):
                 flipped_dict[m] = {'absorb_layer': k}

         # check tune_cfg to skip layers without AWQ config
+        weight_config = {}
         skipped_op_name_set = set()
         for key, config in tune_cfg['op'].items():
             op_name, op_type = key
@@ -4599,29 +4602,26 @@ def awq_quantize(self, model, tune_cfg, dataloader, calib_func):
                     absorb_to_layer.pop(flipped_dict[op_name]['absorb_layer'])
                 continue
             else:
+                weight_config[op_name] = {}
+                weight_config[op_name]['bits'] = config['weight']['bits']
+                weight_config[op_name]['group_size'] = config['weight']['group_size']
+                weight_config[op_name]['scheme'] = config['weight']['scheme']
                 if op_name in flipped_dict:
-                    flipped_dict[op_name]['bits'] = config['weight']['bits']
-                    flipped_dict[op_name]['group_size'] = config['weight']['group_size']
-                    flipped_dict[op_name]['scheme'] = config['weight']['scheme']
                     algorithm = config['weight']['algorithm']
                     if algorithm != 'AWQ':
-                        if op_name in flipped_dict:
-                            absorb_to_layer.pop(flipped_dict[op_name]['absorb_layer'])
+                        absorb_to_layer.pop(weight_config[op_name]['absorb_layer'])
                 else:
                     skipped_op_name_set.add(op_name)
         if skipped_op_name_set:
             logger.info("{} is skipped by AWQ algorithm".format(skipped_op_name_set))

         # collect AWQ config from tune_cfg for quantization.
-        weight_config = {}
         if len(absorb_to_layer) == 0:
             logger.warning('No absorb layer needs AWQ algorithim, skip it')
         else:
             logger.debug("**absorb layer**: **absorbed layers**")
             for k, v in absorb_to_layer.items():
                 logger.debug(f"{k}: {v}")
-                for m in v:
-                    weight_config[m] = flipped_dict[m]
             logger.info("Absorbed layers with the same absorb layer use the same config")

         if 'awq_args' in self.recipes:
@@ -4641,6 +4641,7 @@ def awq_quantize(self, model, tune_cfg, dataloader, calib_func):
                 mse_range=mse_range,
                 calib_func=calib_func,
                 n_blocks=n_blocks,
+                return_int=False,
             )
         return model
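
Note: in the RTN path above, the helper is now called with return_int=False and its return value replaces the original layer via set_module instead of the layer being modified in place; the AWQ call gains the same return_int=False argument. A minimal sketch of the fetch/replace pattern the RTN change relies on, assuming sub-modules are addressed by dotted names such as 'decoder.layers.0.fc1'; the real fetch_module/set_module live in neural_compressor.adaptor.torch_utils.util and may be implemented differently.

# Illustrative sketch only, not the library implementation.
import torch

def fetch_module(model: torch.nn.Module, op_name: str) -> torch.nn.Module:
    # Walk a dotted sub-module path and return the leaf module.
    module = model
    for attr in op_name.split('.'):
        module = getattr(module, attr)
    return module

def set_module(model: torch.nn.Module, op_name: str, new_module: torch.nn.Module) -> None:
    # Replace the sub-module at a dotted path with a new module.
    *parents, last = op_name.split('.')
    parent = model
    for attr in parents:
        parent = getattr(parent, attr)
    setattr(parent, last, new_module)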

neural_compressor/adaptor/torch_utils/model_wrapper.py

Lines changed: 155 additions & 0 deletions
@@ -18,7 +18,9 @@
 """Torch.nn.Module Class Defination."""
 # Note: Do not import this file unless you have already imported torch,
 # since the model classes inherit torch.nn.Module.
+import math
 import torch
+from torch.nn import functional as F
 from packaging.version import Version


@@ -146,3 +148,156 @@ def _wrapper_qdq_linear(tmp_model, module_name_list=[]):
         new_module = QDQLinear(module)
         set_module(tmp_model, name, new_module)
     return tmp_model
+
+
+class WeightOnlyLinear(torch.nn.Module):
+    def __init__(self, in_features, out_features, bits, groupsize):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.bits = bits
+        self.groupsize = groupsize if groupsize != -1 else in_features
+        self.n_pack = 32 // self.bits
+
+        self.register_buffer(
+            'packed_weight',
+            torch.zeros(
+                (out_features, math.ceil(in_features / self.n_pack)),
+                dtype=torch.int32,
+            )
+        )
+        self.register_buffer(
+            'scale',
+            torch.zeros(
+                (out_features, math.ceil(in_features / self.groupsize)),
+                dtype=torch.float,
+            )
+        )
+
+    def pack(self, int_weight, scale, zp, bias):
+        if bias is not None:
+            self.register_buffer('bias', torch.zeros(self.out_features, dtype=torch.float))
+        else:
+            self.bias = None
+        self.bias = bias
+        assert scale.shape == self.scale.shape, "Scale shape is mismatched."
+        self.scale = scale
+        origin_shape = int_weight.shape
+        target_shape = self.packed_weight.shape
+        assert origin_shape[0] == target_shape[0], "output channels mismatch, please check."
+        mask = torch.tensor(2**self.bits - 1, dtype=torch.int32)
+
+        # pack weight
+        for i in range(target_shape[0]):
+            for j in range(target_shape[1]):
+                start = self.n_pack * j
+                end = self.n_pack * (j + 1)
+                tmp = int_weight[i][start: end].type(torch.int32)
+                for e in range(len(tmp)):
+                    tmp[e] &= mask
+                    tmp[e] = tmp[e] << self.bits * (self.n_pack - 1 - e)
+                    self.packed_weight[i][j] |= tmp[e]
+
+        if zp is not None:
+            # pack zero_points
+            self.register_buffer(
+                'packed_zp',
+                torch.zeros(
+                    (self.out_features, math.ceil(self.in_features / self.groupsize / self.n_pack)),
+                    dtype=torch.int32,
+                )
+            )
+            target_shape = self.packed_zp.shape
+            for i in range(target_shape[0]):
+                for j in range(target_shape[1]):
+                    start = self.n_pack * j
+                    end = self.n_pack * (j + 1)
+                    tmp = zp[i][start: end].type(torch.int32)
+                    for e in range(len(tmp)):
+                        tmp[e] &= mask
+                        tmp[e] = tmp[e] << self.bits * (self.n_pack - 1 - e)
+                        self.packed_zp[i][j] |= tmp[e]
+
+    def recover(self):
+        mask = torch.tensor(2**self.bits - 1, dtype=torch.int32)
+        if hasattr(self, 'packed_zp'):
+            weight_dtype = torch.uint8
+        else:
+            weight_dtype = torch.int8
+        # unpack weight
+        weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype)
+        origin_shape = weight.shape
+        target_shape = self.packed_weight.shape
+        for i in range(target_shape[0]):
+            for j in range(target_shape[1]):
+                for e in range(self.n_pack):
+                    index = j * self.n_pack + e
+                    if index >= origin_shape[1]:
+                        continue
+                    tmp = self.packed_weight[i][j]
+                    tmp = tmp << 32 - self.bits * (self.n_pack - e)
+                    tmp = tmp >> 32 - self.bits
+                    if weight_dtype == torch.uint8:
+                        tmp &= mask  # remove sign bit
+                    weight[i][index] = tmp.type(weight_dtype)
+        # unpack zero_point
+        if hasattr(self, 'packed_zp'):
+            zp_dtype = torch.int32  # to avoid overflow when weight-zp
+            zp = torch.zeros(self.scale.shape, dtype=zp_dtype)
+            origin_shape = zp.shape
+            target_shape = self.packed_zp.shape
+            for i in range(target_shape[0]):
+                for j in range(target_shape[1]):
+                    for e in range(self.n_pack):
+                        index = j * self.n_pack + e
+                        if index >= origin_shape[1]:
+                            continue
+                        tmp = self.packed_zp[i][j]
+                        tmp = tmp << 32 - self.bits * (self.n_pack - e)
+                        tmp = tmp >> 32 - self.bits
+                        tmp &= mask
+                        zp[i][index] = tmp.type(zp_dtype)
+            # recover fp32 weight with int_weight, scale, and zero_point
+            left_element = self.in_features % self.groupsize
+            if left_element != 0:
+                split_index = self.in_features // self.groupsize * self.groupsize
+                weight1 = weight[:, :-split_index].reshape(-1, self.groupsize)
+                scale1 = self.scale[:, :-1].reshape(-1, 1)
+                zp1 = zp[:, :-1].reshape(-1, 1)
+                weight1 = ((weight1 - zp1) * scale1).reshape(self.out_features, -1)
+                weight2 = weight[:, -split_index:]
+                scale2 = self.scale[:, -1:]
+                zp2 = zp[:, -1].reshape(-1, 1)
+                weight2 = ((weight2 - zp2) * scale2)
+                fp32_weight = torch.cat((weight1, weight2), dim=1)
+            else:
+                weight = weight.reshape(-1, self.groupsize)
+                scale = self.scale.reshape(-1, 1)
+                zp = zp.reshape(-1, 1)
+                fp32_weight = ((weight - zp) * scale).reshape(self.out_features, -1)
+        else:
+            # recover fp32 weight with int_weight, scale
+            left_element = self.in_features % self.groupsize
+            if left_element != 0:
+                split_index = self.in_features // self.groupsize * self.groupsize
+                weight1 = weight[:, :split_index].reshape(-1, self.groupsize)
+                scale1 = self.scale[:, :-1].reshape(-1, 1)
+                weight1 = (weight1 * scale1).reshape(self.out_features, -1)
+                weight2 = weight[:, split_index:]
+                scale2 = self.scale[:, -1:]
+                weight2 = (weight2 * scale2)
+                fp32_weight = torch.cat((weight1, weight2), dim=1)
+            else:
+                weight = weight.reshape(-1, self.groupsize)
+                scale = self.scale.reshape(-1, 1)
+                fp32_weight = (weight * scale).reshape(self.out_features, -1)
+        return fp32_weight
+
+    def forward(self, input):
+        weight = self.recover()
+        return F.linear(input, weight, self.bias)
+
+    def extra_repr(self) -> str:
+        return 'in_features={}, out_features={}, bits={}, group_size={}, bias={}'.format(
+            self.in_features, self.out_features, self.bits, self.groupsize, self.bias is not None
+        )
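
For reference on what pack() and recover() above compute: with bits=4, n_pack = 32 // 4 = 8 integer weights share one int32 element. pack() masks each value to 4 bits and shifts it into its slot starting from the most significant end; recover() shifts a slot up to the top of the 32-bit word and then arithmetic-shifts it back down, which sign-extends int8 weights when no zero point is stored. A self-contained toy round trip with hypothetical values (not part of the commit):

# Toy illustration of the WeightOnlyLinear bit-packing scheme (hypothetical values).
import torch

bits, n_pack = 4, 8                     # 8 x 4-bit values per int32
mask = torch.tensor(2**bits - 1, dtype=torch.int32)
int_weight = torch.tensor([-2, -1, 0, 1, 2, 3, -4, 7], dtype=torch.int8)

# pack: mask to 4 bits, shift element e into bits [28-4e, 31-4e], OR into one word
packed = torch.tensor(0, dtype=torch.int32)
for e in range(n_pack):
    v = int_weight[e].type(torch.int32) & mask
    packed |= v << bits * (n_pack - 1 - e)

# recover: move the slot to the top of the word, then arithmetic right-shift
# so negative 4-bit values are sign-extended
recovered = torch.zeros(n_pack, dtype=torch.int8)
for e in range(n_pack):
    tmp = packed << 32 - bits * (n_pack - e)
    tmp = tmp >> 32 - bits
    recovered[e] = tmp.type(torch.int8)

assert torch.equal(recovered, int_weight)

A value such as -4 survives the round trip because the left shift places its low 4 bits (0xC) at the top of the word, making the int32 negative, and the arithmetic right shift then restores -4.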

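A possible end-to-end use of the new module for low-memory inference, assuming a 4-bit symmetric quantization with group size 32 has already produced an integer weight tensor and per-group scales. All names and values below are placeholders for illustration, not taken from the commit:

# Hypothetical usage sketch: swap an fp32 Linear for a packed WeightOnlyLinear.
import torch
from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear

in_features, out_features, bits, group_size = 64, 16, 4, 32
fp32_linear = torch.nn.Linear(in_features, out_features)

# Placeholder quantization results; a real flow would derive these from
# fp32_linear.weight (e.g. via the RTN helper called in pytorch.py above).
int_weight = torch.randint(-8, 8, (out_features, in_features), dtype=torch.int8)
scale = torch.rand(out_features, in_features // group_size) * 0.01

q_linear = WeightOnlyLinear(in_features, out_features, bits, group_size)
q_linear.pack(int_weight, scale, zp=None, bias=fp32_linear.bias.detach())

x = torch.randn(2, in_features)
y = q_linear(x)   # recovers fp32 weights on the fly, then applies F.linear
print(q_linear)   # extra_repr: in_features=64, out_features=16, bits=4, ...

The packed module keeps the weights in int32 buffers and only materializes the fp32 weight inside forward(), which is where the memory saving during inference comes from.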