
Commit 8b81146

xin3he, xinhe3, and Sylwester Fraczek authored
[SW-214269] support g_idx for uint4 (#246)
* support g_idx for uint4

Signed-off-by: Xin He <[email protected]>
Co-authored-by: Xin He <[email protected]>
Co-authored-by: Sylwester Fraczek <[email protected]>
1 parent e886cd9 commit 8b81146

File tree

2 files changed: +11 −7 lines changed


neural_compressor/torch/algorithms/weight_only/modules.py

Lines changed: 10 additions & 6 deletions
@@ -365,11 +365,6 @@ def unpack(self):
         qweight = self.qweight.T.contiguous() if self.use_optimum_format else self.qweight
 
         device = scales.device
-        if self.g_idx is None:
-            # used for recovering fp32_weight
-            self.g_idx = torch.tensor([i // self.group_size for i in range(self.in_features)], dtype=torch.int32).to(
-                device
-            )
         # unpack weight
         if not self.use_optimum_format and self.compression_dim == 0:
             qweight = qweight.T.contiguous()
@@ -413,6 +408,11 @@ def recover(self):
         fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device)
 
         # recover fp32 weight
+        if self.g_idx is None:
+            # used for recovering fp32_weight
+            self.g_idx = torch.tensor([i // self.group_size for i in range(self.in_features)], dtype=torch.int32).to(
+                device
+            )
         if zp is not None:
             # recover fp32 weight with int_weight, scale, and zero_point
             for idx in range(self.in_features):
@@ -729,7 +729,8 @@ def forward(self, input):
         scales = self.scales
         qweight = self.qweight
         zeros = self.qzeros
-        weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, input_dtype)
+        g_idx = self.g_idx
+        weight = torch.ops.hpu.convert_from_uint4(qweight, scales, zeros, input_dtype, g_idx)
         output = self.matmul_internal(input, weight)
         output = output.to(dtype=input_dtype).reshape(
             output_shape
@@ -760,6 +761,9 @@ def pack(self, int_weight, scales, zp, scale_bf16_to_fp8=None, bias=None, g_idx=
         if bias is not None:
             self.bias = bias.to("hpu").to(torch.bfloat16)
 
+        if g_idx is not None:
+            self.g_idx = g_idx.to("hpu").to(torch.int32)
+
     def unpack(self):
         """Unpack weight and zero point."""
         logger.debug("Unpacking from HPU")
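
Note on the moved block above: when no g_idx is stored, a default mapping is built so that input channel i belongs to group i // group_size, and recover() uses that mapping to pick each channel's scale and zero point. The following is a minimal standalone sketch of that idea; shapes, names, and the 4-bit value range are illustrative assumptions, not the library's exact code.

import torch

# Illustrative sizes (assumptions for the sketch).
in_features, out_features, group_size = 8, 4, 4

# Fake quantized data: per-group scales and zero points, 4-bit integer weights.
int_weight = torch.randint(0, 16, (out_features, in_features), dtype=torch.int32)
scales = torch.rand(out_features, in_features // group_size)
zp = torch.randint(0, 16, (out_features, in_features // group_size), dtype=torch.int32)

# Default g_idx when none is provided: channel i -> group i // group_size,
# mirroring the expression moved into recover() in this commit.
g_idx = torch.tensor([i // group_size for i in range(in_features)], dtype=torch.int32)

# Recover an fp32 weight column by column using the group mapping.
fp32_weight = torch.zeros(out_features, in_features)
for idx in range(in_features):
    g = g_idx[idx]
    fp32_weight[:, idx] = (int_weight[:, idx] - zp[:, g]) * scales[:, g]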

neural_compressor/torch/algorithms/weight_only/save_load.py

Lines changed: 1 addition & 1 deletion
@@ -470,7 +470,7 @@ def _replace_woqlinear_modules(self, name, linear_module, module_quantization_co
         module_kwargs["group_size"] = module_quantization_config.get("group_size", 32)
 
         # spceific initialization kwargs
-        module_kwargs["g_idx"] = True if name + ".g_idx" in self.loaded_state_dict_keys else False
+        module_kwargs["g_idx"] = module_quantization_config.get("desc_act", False)
         module_kwargs["zp"] = True if name + ".qzeros" in self.loaded_state_dict_keys else False
         module_kwargs["use_optimum_format"] = True
         module_kwargs["bias"] = linear_module.bias is not None
