
Commit 9b8090f

yiliu30 authored and Yi4Liu committed
[SW-217321] Add autoround UTs Back (#197)
* add autoround UTs back

Change-Id: I0614ffd8be4f89e9787037ee99e24a60f8548b49

---------

Signed-off-by: Yi Liu <[email protected]>
Co-authored-by: Yi Liu <[email protected]>
Signed-off-by: Xin He <[email protected]>
1 parent 520f11f · commit 9b8090f

3 files changed, +9 -9 lines changed

neural_compressor/torch/algorithms/weight_only/save_load.py

Lines changed: 1 addition & 1 deletion
@@ -897,7 +897,7 @@ def _init_hf_model(self, model_class, config):
         from transformers.modeling_utils import no_init_weights
         from transformers.utils import ContextManagers

-        _fast_init = self.kwargs.pop("_fast_init", True)
+        _ = self.kwargs.pop("_fast_init", True)
         torch_dtype = self.kwargs.pop("torch_dtype", "auto")
         is_sharded = self.kwargs.pop("is_sharded", False)
         sharded_metadata = self.kwargs.pop("sharded_metadata", None)

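The change above only stops binding the popped "_fast_init" value to a name; the key is still removed from self.kwargs so it never reaches the downstream model constructor. A minimal, self-contained sketch of that sanitizing pattern (the helper name and standalone kwargs dict are illustrative, not the library's actual API; only the popped keys and defaults come from the diff):

def split_loader_kwargs(kwargs: dict) -> tuple[dict, dict]:
    # Drop loader-internal keys before the rest is forwarded to the HF model.
    _ = kwargs.pop("_fast_init", True)  # popped and discarded, as in the patch
    loader_opts = {
        "torch_dtype": kwargs.pop("torch_dtype", "auto"),
        "is_sharded": kwargs.pop("is_sharded", False),
        "sharded_metadata": kwargs.pop("sharded_metadata", None),
    }
    return loader_opts, kwargs  # remaining kwargs go to the model constructor


opts, rest = split_loader_kwargs({"_fast_init": False, "torch_dtype": "float16", "trust_remote_code": True})
print(opts)  # {'torch_dtype': 'float16', 'is_sharded': False, 'sharded_metadata': None}
print(rest)  # {'trust_remote_code': True}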
test/3x/torch/quantization/fp8_quant/test_save_load.py

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ def test_save_vllm_compatible_model():
     tokenizer = transformers.AutoTokenizer.from_pretrained(name)
     tokenizer.save_pretrained("saved_results_qwen")

-
+@pytest.mark.skip(reason="[SW-226589] Skip this test since the model was updated")
 def test_load_model_provided_by_neuralmagic():
     model_name_or_path = "neuralmagic/Qwen2-0.5B-Instruct-FP8"
     hpu_mem0 = get_used_hpu_mem_MB()

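For context, the marker added above is pytest's unconditional skip, while the guards in the next file use skipif; a minimal illustration (not from the repository) of the difference:

import sys

import pytest


@pytest.mark.skip(reason="[SW-226589] Skip this test since the model was updated")
def test_always_skipped():
    raise AssertionError("never executed")  # skip applies regardless of environment


@pytest.mark.skipif(sys.platform == "win32", reason="not supported on Windows")
def test_conditionally_skipped():
    assert True  # runs everywhere except Windows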
test/3x/torch/quantization/weight_only/test_autoround.py

Lines changed: 7 additions & 7 deletions
@@ -69,7 +69,6 @@ def run_fn(model, dataloader):
         else:
             model(data)

-@pytest.mark.skip(reason="SW-217321 pytorch inductor error")
 @pytest.mark.skipif(is_habana_framework_installed(), reason="These tests are not supported on HPU for now.")
 @pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed")
 class TestAutoRoundCPU:
@@ -284,7 +283,6 @@ def test_mllm(self):
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)


-@pytest.mark.skip(reason="SW-217321 pytorch inductor error")
 @pytest.mark.skipif(not is_habana_framework_installed(), reason="Habana framework is not installed")
 @pytest.mark.skipif(os.getenv("PT_HPU_LAZY_MODE", "0") == "1", reason="Lazy mode is enabled")
 @pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed")
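With the unconditional skip removed, collection of the two AutoRound test classes is again driven purely by these skipif guards: the CPU class runs only when the Habana framework is absent, the HPU-gated class only when it is present and lazy mode is off, and both require auto_round. A hedged sketch of how such guard flags are commonly derived; the names mirror the test file, but the bodies here are assumptions, not the file's actual helpers:

import importlib.util
import os


def is_habana_framework_installed() -> bool:
    # Assumed check: the Habana PyTorch bridge package can be found.
    return importlib.util.find_spec("habana_frameworks") is not None


auto_round_installed = importlib.util.find_spec("auto_round") is not None
lazy_mode_enabled = os.getenv("PT_HPU_LAZY_MODE", "0") == "1"  # same env var as the guard above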
@@ -366,7 +364,7 @@ def test_autoround_w4a8(self):
     @pytest.mark.parametrize("quant_lm_head", [True, False])
     def test_autoround(self, quant_lm_head):
         fp32_model = copy.deepcopy(self.tiny_llama_model)
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, act_dtype="fp32", scale_dtype="fp32")
         if quant_lm_head is False:
             quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))
         logger.info(f"Test AutoRound with config {quant_config}")
@@ -377,30 +375,32 @@ def test_autoround(self, quant_lm_head):
         run_fn(model, self.dataloader)
         q_model = convert(model)
         assert "model.layers.0.self_attn.k_proj" in q_model.autoround_config.keys()
-        assert "scale" in q_model.autoround_config["model.layers.0.self_attn.k_proj"].keys()
+        assert "scale_dtype" in q_model.autoround_config["model.layers.0.self_attn.k_proj"].keys()
         assert torch.float32 == q_model.autoround_config["model.layers.0.self_attn.k_proj"]["scale_dtype"]
         assert isinstance(q_model.model.layers[0].self_attn.k_proj, WeightOnlyLinear), "packing model failed."
         if quant_lm_head is True:
             assert isinstance(q_model.lm_head, WeightOnlyLinear), "quantization for lm_head failed."

     def test_int4_dtype(self):
         fp32_model = copy.deepcopy(self.tiny_llama_model)
-        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        quant_config = AutoRoundConfig(
+            dtype="int4", nsamples=32, seqlen=10, iters=10, act_dtype="fp32", scale_dtype="fp32"
+        )
         logger.info(f"Test AutoRound with config {quant_config}")

         # prepare + convert API
         model = prepare(model=fp32_model, quant_config=quant_config)
         run_fn(model, self.dataloader)
         q_model = convert(model)
         assert "model.layers.0.self_attn.k_proj" in q_model.autoround_config.keys()
-        assert "scale" in q_model.autoround_config["model.layers.0.self_attn.k_proj"].keys()
+        assert "scale_dtype" in q_model.autoround_config["model.layers.0.self_attn.k_proj"].keys()
         assert torch.float32 == q_model.autoround_config["model.layers.0.self_attn.k_proj"]["scale_dtype"]
         assert isinstance(q_model.model.layers[0].self_attn.k_proj, WeightOnlyLinear), "packing model failed."

     def test_autoround_with_quantize_API(self):
         model = copy.deepcopy(self.tiny_llama_model)

-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, act_dtype="fp32", scale_dtype="fp32")
         quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))

         logger.info(f"Test AutoRound with config {quant_config}")
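Taken together, the test updates keep exercising the same three-step flow, now with act_dtype="fp32" passed alongside scale_dtype="fp32" and with the per-layer metadata key checked as "scale_dtype" rather than "scale". A condensed, hedged sketch of that flow outside pytest; the import path and calibration setup are assumptions based on the test file, and the checkpoint name is a stand-in for its tiny llama model:

import torch
import transformers

from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # stand-in for the test's tiny_llama_model
model = transformers.AutoModelForCausalLM.from_pretrained(name)
tokenizer = transformers.AutoTokenizer.from_pretrained(name)

quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, act_dtype="fp32", scale_dtype="fp32")
quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))  # keep lm_head unquantized, as in the test

# prepare + calibrate + convert, mirroring prepare / run_fn / convert in the test
model = prepare(model=model, quant_config=quant_config)
for text in ["Hello world.", "AutoRound calibration sample."]:
    model(**tokenizer(text, return_tensors="pt"))
q_model = convert(model)

layer_cfg = q_model.autoround_config["model.layers.0.self_attn.k_proj"]
assert "scale_dtype" in layer_cfg and layer_cfg["scale_dtype"] == torch.float32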
