From 4c45d10adbd9f64c7590f00bcdfc5e1649a7aa2c Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Mon, 24 Feb 2025 08:41:10 -0500
Subject: [PATCH 1/4] fix bug

Signed-off-by: n1ck-guo
---
 neural_compressor/adaptor/torch_utils/gptq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py
index 2f1c6cc0582..7bcefc154c3 100644
--- a/neural_compressor/adaptor/torch_utils/gptq.py
+++ b/neural_compressor/adaptor/torch_utils/gptq.py
@@ -745,6 +745,7 @@ def tmp(_, inp, out):
             for j in range(len(self.dataloader)):
                 cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
                 cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
+                transformer_block.to(cache_positional_batch[0].dtype)
                 out = transformer_block(*cache_positional_batch, **cache_keyword_batch)
                 out = self.track_hidden_states(out)
                 outs.append(out)
@@ -967,7 +968,6 @@ def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=F
                     if not static_groups:
                         if (i1 + i) % groupsize == 0:
                             self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + groupsize)], weight=True)
-                            scale.append(self.quantizer.scale)
                             zero.append(self.quantizer.zero)
                     else:
                         idx = i1 + i

From 63bea6bf2e596307623bf319760d0980e7a880ed Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Mon, 24 Feb 2025 08:52:07 -0500
Subject: [PATCH 2/4] update

Signed-off-by: n1ck-guo
---
 neural_compressor/adaptor/torch_utils/gptq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py
index 7bcefc154c3..bd40d3907fa 100644
--- a/neural_compressor/adaptor/torch_utils/gptq.py
+++ b/neural_compressor/adaptor/torch_utils/gptq.py
@@ -745,7 +745,7 @@ def tmp(_, inp, out):
             for j in range(len(self.dataloader)):
                 cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
                 cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
-                transformer_block.to(cache_positional_batch[0].dtype)
+                transformer_block = transformer_block.to(cache_positional_batch[0].dtype)
                 out = transformer_block(*cache_positional_batch, **cache_keyword_batch)
                 out = self.track_hidden_states(out)
                 outs.append(out)

From 15c39f19b55bd1efa22aeb16c7ca5f4a90bd1f6e Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Mon, 24 Feb 2025 20:48:50 -0500
Subject: [PATCH 3/4] fix

Signed-off-by: n1ck-guo
---
 neural_compressor/adaptor/torch_utils/gptq.py     | 8 +++++---
 .../adaptor/torch_utils/layer_wise_quant/utils.py | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py
index bd40d3907fa..ad597620fef 100644
--- a/neural_compressor/adaptor/torch_utils/gptq.py
+++ b/neural_compressor/adaptor/torch_utils/gptq.py
@@ -718,10 +718,10 @@ def tmp(_, inp, out):
                 for n, p in sub_layer.named_parameters():
                     param_name = full_layer_name + "." + n
                     if n == "weight":
-                        set_module_tensor_to_device(self.model, param_name, self.device, Q)
+                        set_module_tensor_to_device(self.model, param_name, self.device, Q, dtype=Q.dtype)
                     else:
                         value = load_value(self.model, param_name, model_path)
-                        set_module_tensor_to_device(self.model, param_name, self.device, value)
+                        set_module_tensor_to_device(self.model, param_name, self.device, value, dtype=value.dtype)
                     # sub_layer.weight.data = Q
                     torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt")
                     clean_module_weight(sub_layer)
@@ -745,7 +745,8 @@ def tmp(_, inp, out):
             for j in range(len(self.dataloader)):
                 cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
                 cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
-                transformer_block = transformer_block.to(cache_positional_batch[0].dtype)
+                # breakpoint()
+                # transformer_block = transformer_block.to(getattr(torch, self.model.config.torch_dtype))
                 out = transformer_block(*cache_positional_batch, **cache_keyword_batch)
                 out = self.track_hidden_states(out)
                 outs.append(out)
@@ -968,6 +969,7 @@ def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=F
                     if not static_groups:
                         if (i1 + i) % groupsize == 0:
                             self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + groupsize)], weight=True)
+                            scale.append(self.quantizer.scale)
                             zero.append(self.quantizer.zero)
                     else:
                         idx = i1 + i
diff --git a/neural_compressor/adaptor/torch_utils/layer_wise_quant/utils.py b/neural_compressor/adaptor/torch_utils/layer_wise_quant/utils.py
index 8bd3d32d320..211cfebbad1 100644
--- a/neural_compressor/adaptor/torch_utils/layer_wise_quant/utils.py
+++ b/neural_compressor/adaptor/torch_utils/layer_wise_quant/utils.py
@@ -221,7 +221,7 @@ def load_module(model, module_name, path, device="cpu"):
     for n, p in module.named_parameters():
         param_name = module_name + "." + n
         value = load_value(model, param_name, path)
-        set_module_tensor_to_device(model, param_name, device, value)
+        set_module_tensor_to_device(model, param_name, device, value, dtype=value.dtype)


 def register_weight_hooks(model, path, device="cpu", clean_weight=True, saved_path=None):
@@ -239,7 +239,7 @@ def hook(module, input):
                     value = state_dict[n]
                 else:
                     value = load_value(model, param_name, path)
-                set_module_tensor_to_device(model, param_name, device, value)
+                set_module_tensor_to_device(model, param_name, device, value, dtype=value.dtype)

         return hook


From d6b8524fe963efff4a3146087611d746ca84c73b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 25 Feb 2025 01:50:03 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/adaptor/torch_utils/gptq.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py
index ad597620fef..a077e932b33 100644
--- a/neural_compressor/adaptor/torch_utils/gptq.py
+++ b/neural_compressor/adaptor/torch_utils/gptq.py
@@ -721,7 +721,9 @@ def tmp(_, inp, out):
                         set_module_tensor_to_device(self.model, param_name, self.device, Q, dtype=Q.dtype)
                     else:
                         value = load_value(self.model, param_name, model_path)
-                        set_module_tensor_to_device(self.model, param_name, self.device, value, dtype=value.dtype)
+                        set_module_tensor_to_device(
+                            self.model, param_name, self.device, value, dtype=value.dtype
+                        )
                     # sub_layer.weight.data = Q
                     torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt")
                     clean_module_weight(sub_layer)
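Note on the recurring dtype= change in patches 3 and 4: when accelerate's
set_module_tensor_to_device is called without an explicit dtype, it casts the
incoming value to the dtype of the parameter already present in the module
(matching load_state_dict semantics), which can silently upcast fp16/bf16
weights restored during layer-wise quantization and cause a dtype mismatch
with the cached activations. Passing dtype=value.dtype pins the on-disk dtype
instead. Below is a minimal sketch of the difference, not part of the patches;
the Linear module and tensors are illustrative only, and it assumes only torch
and accelerate are installed:

import torch
from accelerate.utils import set_module_tensor_to_device

lin = torch.nn.Linear(4, 4)  # fp32 placeholder parameter, as after module init
value = torch.randn(4, 4, dtype=torch.float16)  # fp16 weight loaded from disk

# Default behavior: value is cast to the existing parameter's dtype (fp32 here),
# so the restored weight silently loses its fp16 dtype.
set_module_tensor_to_device(lin, "weight", "cpu", value)
print(lin.weight.dtype)  # torch.float32

# With dtype=value.dtype, the restored weight keeps its on-disk fp16 dtype,
# staying consistent with the inputs cached for the transformer block forward.
set_module_tensor_to_device(lin, "weight", "cpu", value, dtype=value.dtype)
print(lin.weight.dtype)  # torch.float16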