From 33e5062f7bb27497c0bc1a6dc55724bbd5a47ca3 Mon Sep 17 00:00:00 2001 From: xin3he Date: Mon, 3 Jun 2024 13:33:45 +0800 Subject: [PATCH 01/11] update UTs Signed-off-by: xin3he --- .../quantization/llm/run_clm_no_trainer.py | 4 ++-- neural_compressor/torch/utils/utility.py | 2 +- .../torch/{ => utils}/test_auto_accelerator.py | 0 .../{test_utils.py => utils/test_utility.py} | 18 +++--------------- 4 files changed, 6 insertions(+), 18 deletions(-) rename test/3x/torch/{ => utils}/test_auto_accelerator.py (100%) rename test/3x/torch/{test_utils.py => utils/test_utility.py} (87%) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py index 2556820284a..4e8e4b1c3e4 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py @@ -236,9 +236,9 @@ def get_user_model(): # 3.x api if args.approach == 'weight_only': from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize - from neural_compressor.torch.utils import get_double_quant_config + from neural_compressor.torch.utils import get_double_quant_config_dict weight_sym = True if args.woq_scheme == "sym" else False - double_quant_config_dict = get_double_quant_config(args.double_quant_type) + double_quant_config_dict = get_double_quant_config_dict(args.double_quant_type) if args.woq_algo == "RTN": if args.double_quant_type is not None: diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py index f88c768cfed..13942ea8083 100644 --- a/neural_compressor/torch/utils/utility.py +++ b/neural_compressor/torch/utils/utility.py @@ -120,7 +120,7 @@ def get_model_info(model: torch.nn.Module, white_module_list: List[Callable]) -> return filter_result -def get_double_quant_config(double_quant_type): +def get_double_quant_config_dict(double_quant_type): from neural_compressor.torch.utils.constants import DOUBLE_QUANT_CONFIGS if double_quant_type is None: diff --git a/test/3x/torch/test_auto_accelerator.py b/test/3x/torch/utils/test_auto_accelerator.py similarity index 100% rename from test/3x/torch/test_auto_accelerator.py rename to test/3x/torch/utils/test_auto_accelerator.py diff --git a/test/3x/torch/test_utils.py b/test/3x/torch/utils/test_utility.py similarity index 87% rename from test/3x/torch/test_utils.py rename to test/3x/torch/utils/test_utility.py index 00ca99a5734..dc4bc0b3739 100644 --- a/test/3x/torch/test_utils.py +++ b/test/3x/torch/utils/test_utility.py @@ -1,5 +1,3 @@ -import unittest - import torch from neural_compressor.torch.utils import logger @@ -38,19 +36,13 @@ def forward(self, x): from neural_compressor.torch.utils.utility import fetch_module, set_module -class TestTorchUtils(unittest.TestCase): - @classmethod - def setUpClass(self): +class TestTorchUtils: + def setup_class(self): self.model = get_gpt_j() - @classmethod - def tearDownClass(self): + def teardown_class(self): pass - def setUp(self): - # print the test name - logger.info(f"Running TestTorchUtils test: {self.id()}") - def test_fetch_module(self): result = fetch_module(self.model, "transformer.h.2.mlp.fc_in") self.assertIsInstance(result, torch.nn.Linear) @@ -80,7 +72,3 @@ def test_get_model_info(self): white_module_list = [torch.nn.Linear] model_info = get_model_info(build_simple_torch_model(), white_module_list) self.assertEqual(len(model_info), 4) - - -if __name__ == "__main__": - unittest.main() From f39fb4f406b17107715c6b46f2ec6d86a674361f Mon Sep 17 00:00:00 2001 From: xin3he Date: Mon, 3 Jun 2024 17:29:46 +0800 Subject: [PATCH 02/11] update utility Signed-off-by: xin3he --- .../torch/algorithms/pt2e_quant/core.py | 2 +- .../torch/algorithms/pt2e_quant/utility.py | 79 +++++++++++++++++++ neural_compressor/torch/utils/utility.py | 66 +--------------- test/3x/torch/utils/test_utility.py | 50 ++++++------ 4 files changed, 109 insertions(+), 88 deletions(-) create mode 100644 neural_compressor/torch/algorithms/pt2e_quant/utility.py diff --git a/neural_compressor/torch/algorithms/pt2e_quant/core.py b/neural_compressor/torch/algorithms/pt2e_quant/core.py index 129ca6f072a..a1b4d1f65b6 100644 --- a/neural_compressor/torch/algorithms/pt2e_quant/core.py +++ b/neural_compressor/torch/algorithms/pt2e_quant/core.py @@ -26,7 +26,7 @@ from neural_compressor.common.utils import logger from neural_compressor.torch.algorithms.base_algorithm import Quantizer from neural_compressor.torch.algorithms.pt2e_quant import half_precision_rewriter as hp_rewriter -from neural_compressor.torch.utils import create_xiq_quantizer_from_pt2e_config +from neural_compressor.torch.algorithms.pt2e_quant.utility import create_xiq_quantizer_from_pt2e_config class W8A8PT2EQuantizer(Quantizer): diff --git a/neural_compressor/torch/algorithms/pt2e_quant/utility.py b/neural_compressor/torch/algorithms/pt2e_quant/utility.py new file mode 100644 index 00000000000..92635db1f70 --- /dev/null +++ b/neural_compressor/torch/algorithms/pt2e_quant/utility.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +import torch +import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq +from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver +from torch.ao.quantization.quantizer import QuantizationSpec +from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer + + +def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec: + dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8} + select_dtype = dtype_mapping[dtype] + min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)} + qscheme_mapping = { + "per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine}, + "per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine}, + } + observer_mapping = { + "placeholder": PlaceholderObserver, + "minmax": MinMaxObserver, + "kl": HistogramObserver, + } + # Force to use placeholder observer for dynamic quantization + if is_dynamic: + algo = "placeholder" + # algo + observer_or_fake_quant_ctr = observer_mapping[algo] + # qscheme + qscheme = qscheme_mapping[granularity][sym] + quantization_spec = QuantizationSpec( + dtype=select_dtype, + quant_min=min_max_mapping[select_dtype][0], + quant_max=min_max_mapping[select_dtype][1], + observer_or_fake_quant_ctr=observer_or_fake_quant_ctr, + qscheme=qscheme, + is_dynamic=is_dynamic, + ) + return quantization_spec + + +def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig: + default_quant_config = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic) + input_act_quant_spec = create_quant_spec_from_config( + inc_config.act_dtype, inc_config.act_sym, inc_config.act_granularity, inc_config.act_algo, is_dynamic=is_dynamic + ) + weight_quant_spec = create_quant_spec_from_config( + inc_config.w_dtype, inc_config.w_sym, inc_config.w_granularity, inc_config.w_algo + ) + quant_config = QuantizationConfig( + input_activation=input_act_quant_spec, + output_activation=default_quant_config.output_activation, + weight=weight_quant_spec, + bias=default_quant_config.bias, + is_qat=False, + ) + return quant_config + + +def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer: + quantizer = xiq.X86InductorQuantizer() + # set global + global_config = _map_inc_config_to_torch_quant_config(config, is_dynamic) + quantizer.set_global(global_config) + # Skip the local config for now (need torch 2.4) + return quantizer diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py index 13942ea8083..95db23711cf 100644 --- a/neural_compressor/torch/utils/utility.py +++ b/neural_compressor/torch/utils/utility.py @@ -16,10 +16,6 @@ from typing import Callable, Dict, List, Tuple, Union import torch -import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq -from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver -from torch.ao.quantization.quantizer import QuantizationSpec -from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer from typing_extensions import TypeAlias from neural_compressor.common import logger @@ -120,11 +116,9 @@ def get_model_info(model: torch.nn.Module, white_module_list: List[Callable]) -> return filter_result -def get_double_quant_config_dict(double_quant_type): +def get_double_quant_config_dict(double_quant_type="BNB_NF4"): from neural_compressor.torch.utils.constants import DOUBLE_QUANT_CONFIGS - if double_quant_type is None: - return {} assert double_quant_type in DOUBLE_QUANT_CONFIGS, "Supported double quant configs: {}".format( list(DOUBLE_QUANT_CONFIGS.keys()) ) @@ -170,61 +164,3 @@ def postprocess_model(model, mode, quantizer): elif mode == Mode.CONVERT or mode == Mode.QUANTIZE: if getattr(model, "quantizer", False): del model.quantizer - - -def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec: - dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8} - select_dtype = dtype_mapping[dtype] - min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)} - qscheme_mapping = { - "per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine}, - "per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine}, - } - observer_mapping = { - "placeholder": PlaceholderObserver, - "minmax": MinMaxObserver, - "kl": HistogramObserver, - } - # Force to use placeholder observer for dynamic quantization - if is_dynamic: - algo = "placeholder" - # algo - observer_or_fake_quant_ctr = observer_mapping[algo] - # qscheme - qscheme = qscheme_mapping[granularity][sym] - quantization_spec = QuantizationSpec( - dtype=select_dtype, - quant_min=min_max_mapping[select_dtype][0], - quant_max=min_max_mapping[select_dtype][1], - observer_or_fake_quant_ctr=observer_or_fake_quant_ctr, - qscheme=qscheme, - is_dynamic=is_dynamic, - ) - return quantization_spec - - -def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig: - default_quant_config = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic) - input_act_quant_spec = create_quant_spec_from_config( - inc_config.act_dtype, inc_config.act_sym, inc_config.act_granularity, inc_config.act_algo, is_dynamic=is_dynamic - ) - weight_quant_spec = create_quant_spec_from_config( - inc_config.w_dtype, inc_config.w_sym, inc_config.w_granularity, inc_config.w_algo - ) - quant_config = QuantizationConfig( - input_activation=input_act_quant_spec, - output_activation=default_quant_config.output_activation, - weight=weight_quant_spec, - bias=default_quant_config.bias, - is_qat=False, - ) - return quant_config - - -def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer: - quantizer = xiq.X86InductorQuantizer() - # set global - global_config = _map_inc_config_to_torch_quant_config(config, is_dynamic) - quantizer.set_global(global_config) - # Skip the local config for now (need torch 2.4) - return quantizer diff --git a/test/3x/torch/utils/test_utility.py b/test/3x/torch/utils/test_utility.py index dc4bc0b3739..1c645c6dfcf 100644 --- a/test/3x/torch/utils/test_utility.py +++ b/test/3x/torch/utils/test_utility.py @@ -1,6 +1,7 @@ +import pytest import torch -from neural_compressor.torch.utils import logger +from neural_compressor.torch.utils.utility import get_double_quant_config_dict def get_gpt_j(): @@ -43,28 +44,28 @@ def setup_class(self): def teardown_class(self): pass - def test_fetch_module(self): - result = fetch_module(self.model, "transformer.h.2.mlp.fc_in") - self.assertIsInstance(result, torch.nn.Linear) - - def test_set_module(self): - module_name = "transformer.h.2.mlp.fc_in" - mew_value = torch.nn.Linear(32, 128, bias=False) - set_module(self.model, module_name, mew_value) + @pytest.mark.parametrize( + "module_name", + [ + "transformer.h.2.mlp.fc_in", + "transformer.nonexistent_attr", + ], + ) + def test_fetch_set_module(self, module_name): + # fetch result = fetch_module(self.model, module_name) - self.assertFalse(result.bias) - - def test_set_module_nonexistent_attribute(self): - new_value = torch.nn.Parameter(torch.Tensor([3.0])) - attr_name = "transformer.nonexistent_attr" - set_module(self.model, attr_name, new_value) - result = fetch_module(self.model, attr_name) - self.assertTrue(torch.equal(result, torch.Tensor([3.0]))) - - def test_fetch_module_nonexistent_attribute(self): - attr_name = "transformer.nonexistent_attr" - result = fetch_module(self.model, attr_name) - self.assertIsNone(result) + if "nonexistent_attr" in module_name: + self.assertIsNone(result) + else: + self.assertIsInstance(result, torch.nn.Linear) + # set + new_value = torch.nn.Linear(32, 128, bias=False) + set_module(self.model, module_name, new_value) + result = fetch_module(self.model, module_name) + if "nonexistent_attr" in module_name: + self.assertTrue(torch.equal(result, torch.Tensor([3.0]))) + else: + self.assertFalse(result.bias) def test_get_model_info(self): from neural_compressor.torch.utils.utility import get_model_info @@ -72,3 +73,8 @@ def test_get_model_info(self): white_module_list = [torch.nn.Linear] model_info = get_model_info(build_simple_torch_model(), white_module_list) self.assertEqual(len(model_info), 4) + + @pytest.mark.parametrize("double_quant_type", ["BNB_NF4", "GGML_TYPE_Q4_K"]) + def test_double_quant_config_dict(self, double_quant_type): + config_dict = get_double_quant_config_dict(double_quant_type) + assert isinstance(config_dict, dict), "The returned object should be a dict." From 08c097aea7bf087d846f8f98f17ff99599b98514 Mon Sep 17 00:00:00 2001 From: xin3he Date: Tue, 4 Jun 2024 15:14:49 +0800 Subject: [PATCH 03/11] add UTs Signed-off-by: xin3he --- .../torch/utils/auto_accelerator.py | 8 +- neural_compressor/torch/utils/environ.py | 38 ++++---- test/3x/torch/utils/test_auto_accelerator.py | 88 ++++++++++++++++++- test/3x/torch/utils/test_utility.py | 13 ++- 4 files changed, 117 insertions(+), 30 deletions(-) diff --git a/neural_compressor/torch/utils/auto_accelerator.py b/neural_compressor/torch/utils/auto_accelerator.py index 91ffc41fcac..fad0ca9d40b 100644 --- a/neural_compressor/torch/utils/auto_accelerator.py +++ b/neural_compressor/torch/utils/auto_accelerator.py @@ -98,7 +98,7 @@ class CUDA_Accelerator: return accelerator_registry.register_accelerator_impl(name=name, priority=priority) -class Auto_Accelerator(ABC): +class Auto_Accelerator(ABC): # pragma: no cover @classmethod @abstractmethod def is_available(cls) -> bool: @@ -175,7 +175,7 @@ def synchronize(self): @register_accelerator(name="cuda", priority=PRIORITY_CUDA) -class CUDA_Accelerator(Auto_Accelerator): +class CUDA_Accelerator(Auto_Accelerator): # pragma: no cover def __init__(self) -> None: self._name = "cuda" @@ -211,7 +211,7 @@ def empty_cache(self): @register_accelerator(name="xpu", priority=PRIORITY_XPU) -class XPU_Accelerator(Auto_Accelerator): +class XPU_Accelerator(Auto_Accelerator): # pragma: no cover def __init__(self) -> None: self._name = "xpu" @@ -250,7 +250,7 @@ def empty_cache(self): @register_accelerator(name="hpu", priority=PRIORITY_HPU) -class HPU_Accelerator(Auto_Accelerator): +class HPU_Accelerator(Auto_Accelerator): # pragma: no cover def __init__(self) -> None: self._name = "hpu" diff --git a/neural_compressor/torch/utils/environ.py b/neural_compressor/torch/utils/environ.py index 611ab5fda15..9e76558ee55 100644 --- a/neural_compressor/torch/utils/environ.py +++ b/neural_compressor/torch/utils/environ.py @@ -13,24 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib import sys import torch from packaging.version import Version -# pylint:disable=import-error -try: - import habana_frameworks.torch.hpex - - _hpex_available = True -except: - _hpex_available = False - - -def is_hpex_available(): - return _hpex_available - +################ Check imported sys.module first to decide behavior ################# def is_ipex_imported() -> bool: for name, _ in sys.modules.items(): if name == "intel_extension_for_pytorch": @@ -45,11 +35,27 @@ def is_transformers_imported() -> bool: return False -try: - import intel_extension_for_pytorch as ipex +################ Check available sys.module to decide behavior ################# +def is_package_available(package_name): + package_spec = importlib.util.find_spec(package_name) + return package_spec is not None + +## check hpex +if is_package_available("habana_frameworks.torch.hpex"): + _hpex_available = True +else: + _hpex_available = False + + +def is_hpex_available(): + return _hpex_available + + +## check ipex +if is_package_available("intel_extension_for_pytorch"): _ipex_available = True -except: +else: _ipex_available = False @@ -60,6 +66,8 @@ def is_ipex_available(): def get_ipex_version(): if is_ipex_available(): try: + import intel_extension_for_pytorch as ipex + ipex_version = ipex.__version__.split("+")[0] except ValueError as e: # pragma: no cover assert False, "Got an unknown version of intel_extension_for_pytorch: {}".format(e) diff --git a/test/3x/torch/utils/test_auto_accelerator.py b/test/3x/torch/utils/test_auto_accelerator.py index 918a54ebbd5..2c8a8c733da 100644 --- a/test/3x/torch/utils/test_auto_accelerator.py +++ b/test/3x/torch/utils/test_auto_accelerator.py @@ -4,10 +4,90 @@ import torch from neural_compressor.torch.utils import get_accelerator -from neural_compressor.torch.utils.auto_accelerator import accelerator_registry, auto_detect_accelerator +from neural_compressor.torch.utils.auto_accelerator import ( + CPU_Accelerator, + CUDA_Accelerator, + HPU_Accelerator, + XPU_Accelerator, + accelerator_registry, + auto_detect_accelerator, +) -class Test_CPU_Accelerator: +@pytest.mark.skipif(not HPU_Accelerator.is_available(), reason="CUDA is not available") +class TestHPUAccelerator: + + def test_cuda_accelerator(self): + assert os.environ.get("FORCE_DEVICE", None) is None, "FORCE_DEVICE shouldn't be set. HPU is the first priority." + accelerator = auto_detect_accelerator() + assert accelerator.current_device() == 0, f"{accelerator.current_device()}" + assert accelerator.current_device_name() == "hpu" + assert accelerator.device() is not None + assert accelerator.empty_cache() is None + assert accelerator.synchronize() is None + assert accelerator.set_device(0) is None + assert accelerator.device_name(0) == "cuda:0" + assert accelerator.is_available() is True + assert accelerator.name() == "cuda" + assert accelerator.device_name(1) == "cuda:1" + assert accelerator.set_device(1) is None + assert accelerator.device_name(1) == "cuda:1" + assert accelerator.current_device() == 1 + assert accelerator.current_device_name() == "cuda:1" + assert accelerator.synchronize() is None + assert accelerator.empty_cache() is None + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Only one GPU is available") + def test_get_device(self): + accelerator = auto_detect_accelerator() + assert accelerator.set_device(1) is None + assert accelerator.current_device_name() == "cuda:1" + cur_device = get_accelerator().current_device_name() + assert cur_device == "cuda:1" + tmp_tensor = torch.tensor([1, 2], device=cur_device) + assert "cuda:1" == str(tmp_tensor.device) + + +@pytest.mark.skipif(not XPU_Accelerator.is_available(), reason="CUDA is not available") +class TestXPUAccelerator: + + @pytest.fixture + def force_use_cuda(self, monkeypatch): + # Force use CUDA + monkeypatch.setenv("FORCE_DEVICE", "cuda") + + def test_cuda_accelerator(self, force_use_cuda): + print(f"FORCE_DEVICE: {os.environ.get('FORCE_DEVICE', None)}") + accelerator = auto_detect_accelerator() + assert accelerator.current_device() == 0, f"{accelerator.current_device()}" + assert accelerator.current_device_name() == "cuda:0" + assert accelerator.device() is not None + assert accelerator.empty_cache() is None + assert accelerator.synchronize() is None + assert accelerator.set_device(0) is None + assert accelerator.device_name(0) == "cuda:0" + assert accelerator.is_available() is True + assert accelerator.name() == "cuda" + assert accelerator.device_name(1) == "cuda:1" + assert accelerator.set_device(1) is None + assert accelerator.device_name(1) == "cuda:1" + assert accelerator.current_device() == 1 + assert accelerator.current_device_name() == "cuda:1" + assert accelerator.synchronize() is None + assert accelerator.empty_cache() is None + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Only one GPU is available") + def test_get_device(self): + accelerator = auto_detect_accelerator() + assert accelerator.set_device(1) is None + assert accelerator.current_device_name() == "cuda:1" + cur_device = get_accelerator().current_device_name() + assert cur_device == "cuda:1" + tmp_tensor = torch.tensor([1, 2], device=cur_device) + assert "cuda:1" == str(tmp_tensor.device) + + +class TestCPUAccelerator: @pytest.fixture def force_use_cpu(self, monkeypatch): # Force use CPU @@ -25,8 +105,8 @@ def test_cpu_accelerator(self, force_use_cpu): assert accelerator.synchronize() is None -@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") -class Test_CUDA_Accelerator: +@pytest.mark.skipif(not CUDA_Accelerator.is_available(), reason="CUDA is not available") +class TestCUDAAccelerator: @pytest.fixture def force_use_cuda(self, monkeypatch): diff --git a/test/3x/torch/utils/test_utility.py b/test/3x/torch/utils/test_utility.py index 1c645c6dfcf..b84db61ff7a 100644 --- a/test/3x/torch/utils/test_utility.py +++ b/test/3x/torch/utils/test_utility.py @@ -55,24 +55,23 @@ def test_fetch_set_module(self, module_name): # fetch result = fetch_module(self.model, module_name) if "nonexistent_attr" in module_name: - self.assertIsNone(result) + assert result is None, "result should be None" else: - self.assertIsInstance(result, torch.nn.Linear) + assert isinstance(result, torch.nn.Linear), "fetched module should be Linear" + assert result.bias is not None, "The bias of fetched module should not be None." # set new_value = torch.nn.Linear(32, 128, bias=False) set_module(self.model, module_name, new_value) result = fetch_module(self.model, module_name) - if "nonexistent_attr" in module_name: - self.assertTrue(torch.equal(result, torch.Tensor([3.0]))) - else: - self.assertFalse(result.bias) + print(result) + assert result.bias is None, "The bias of new module should be None." def test_get_model_info(self): from neural_compressor.torch.utils.utility import get_model_info white_module_list = [torch.nn.Linear] model_info = get_model_info(build_simple_torch_model(), white_module_list) - self.assertEqual(len(model_info), 4) + assert len(model_info) == 4, "The length of model_info should be 4." @pytest.mark.parametrize("double_quant_type", ["BNB_NF4", "GGML_TYPE_Q4_K"]) def test_double_quant_config_dict(self, double_quant_type): From 866fc373e6a651093dbb6b4c0cb6163a0a4e29b3 Mon Sep 17 00:00:00 2001 From: xinhe3 Date: Tue, 4 Jun 2024 10:59:52 +0300 Subject: [PATCH 04/11] checked on hpu Signed-off-by: xinhe3 --- .../torch/utils/auto_accelerator.py | 10 ++- .../quantization/weight_only/test_rtn.py | 4 +- test/3x/torch/utils/test_auto_accelerator.py | 64 +++++++------------ 3 files changed, 33 insertions(+), 45 deletions(-) diff --git a/neural_compressor/torch/utils/auto_accelerator.py b/neural_compressor/torch/utils/auto_accelerator.py index fad0ca9d40b..be7dd7f842f 100644 --- a/neural_compressor/torch/utils/auto_accelerator.py +++ b/neural_compressor/torch/utils/auto_accelerator.py @@ -275,7 +275,10 @@ def synchronize(self): return torch.hpu.synchronize() def set_device(self, device_index): - return torch.hpu.set_device(device_index) + try: + torch.hpu.set_device(device_index) + except Exception as e: + logger.warning(e) def current_device(self): return torch.hpu.current_device() @@ -287,7 +290,10 @@ def device(self, device_index=None): return torch.hpu.device(device_index) def empty_cache(self): - return torch.hpu.empty_cache() + try: + torch.hpu.empty_cache() + except Exception as e: + logger.warning(e) def mark_step(self): return htcore.mark_step() diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 53aa19f9424..bbf270f2321 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -14,7 +14,7 @@ prepare, quantize, ) -from neural_compressor.torch.utils import accelerator +from neural_compressor.torch.utils import accelerator, is_hpex_available device = accelerator.current_device_name() @@ -76,6 +76,8 @@ def test_int_params(self, bits, use_sym, group_size, group_dim): model = convert(model) out = model(self.example_inputs)[0] assert (out != self.label).any(), "WOQ output should be different with raw output" + if is_hpex_available: + assert "hpu" in out.device, "Neural Compressor should run on HPU when HPEX is available." if (bits, use_sym, group_size, group_dim) == (8, True, -1, 1): assert torch.allclose(out, self.label, atol=0.01), "Accuracy gap atol > 0.01 is unexpected." if (bits, use_sym, group_size, group_dim) == [(4, True, 128, 0), (4, True, 32, 1)]: diff --git a/test/3x/torch/utils/test_auto_accelerator.py b/test/3x/torch/utils/test_auto_accelerator.py index 2c8a8c733da..575ac517eec 100644 --- a/test/3x/torch/utils/test_auto_accelerator.py +++ b/test/3x/torch/utils/test_auto_accelerator.py @@ -14,77 +14,63 @@ ) -@pytest.mark.skipif(not HPU_Accelerator.is_available(), reason="CUDA is not available") +@pytest.mark.skipif(not HPU_Accelerator.is_available(), reason="HPEX is not available") class TestHPUAccelerator: - def test_cuda_accelerator(self): assert os.environ.get("FORCE_DEVICE", None) is None, "FORCE_DEVICE shouldn't be set. HPU is the first priority." accelerator = auto_detect_accelerator() assert accelerator.current_device() == 0, f"{accelerator.current_device()}" - assert accelerator.current_device_name() == "hpu" + assert accelerator.current_device_name() == "hpu:0" assert accelerator.device() is not None - assert accelerator.empty_cache() is None - assert accelerator.synchronize() is None - assert accelerator.set_device(0) is None - assert accelerator.device_name(0) == "cuda:0" + assert accelerator.device_name(0) == "hpu:0" assert accelerator.is_available() is True - assert accelerator.name() == "cuda" - assert accelerator.device_name(1) == "cuda:1" - assert accelerator.set_device(1) is None - assert accelerator.device_name(1) == "cuda:1" - assert accelerator.current_device() == 1 - assert accelerator.current_device_name() == "cuda:1" + assert accelerator.name() == "hpu" + assert accelerator.device_name(1) == "hpu:1" assert accelerator.synchronize() is None assert accelerator.empty_cache() is None - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Only one GPU is available") + @pytest.mark.skipif(torch.hpu.device_count() < 2, reason="Only one HPU is available") def test_get_device(self): accelerator = auto_detect_accelerator() assert accelerator.set_device(1) is None - assert accelerator.current_device_name() == "cuda:1" + assert accelerator.current_device_name() == "hpu:1" cur_device = get_accelerator().current_device_name() - assert cur_device == "cuda:1" + assert cur_device == "hpu:1" tmp_tensor = torch.tensor([1, 2], device=cur_device) - assert "cuda:1" == str(tmp_tensor.device) + assert "hpu:1" == str(tmp_tensor.device) -@pytest.mark.skipif(not XPU_Accelerator.is_available(), reason="CUDA is not available") +@pytest.mark.skipif(not XPU_Accelerator.is_available(), reason="XPU is not available") class TestXPUAccelerator: @pytest.fixture - def force_use_cuda(self, monkeypatch): - # Force use CUDA - monkeypatch.setenv("FORCE_DEVICE", "cuda") + def force_use_xpu(self, monkeypatch): + # Force use xpu + monkeypatch.setenv("FORCE_DEVICE", "xpu") - def test_cuda_accelerator(self, force_use_cuda): + def test_xpu_accelerator(self, force_use_xpu): print(f"FORCE_DEVICE: {os.environ.get('FORCE_DEVICE', None)}") accelerator = auto_detect_accelerator() assert accelerator.current_device() == 0, f"{accelerator.current_device()}" - assert accelerator.current_device_name() == "cuda:0" + assert accelerator.current_device_name() == "xpu:0" assert accelerator.device() is not None - assert accelerator.empty_cache() is None - assert accelerator.synchronize() is None assert accelerator.set_device(0) is None - assert accelerator.device_name(0) == "cuda:0" + assert accelerator.device_name(0) == "xpu:0" assert accelerator.is_available() is True - assert accelerator.name() == "cuda" - assert accelerator.device_name(1) == "cuda:1" - assert accelerator.set_device(1) is None - assert accelerator.device_name(1) == "cuda:1" - assert accelerator.current_device() == 1 - assert accelerator.current_device_name() == "cuda:1" + assert accelerator.name() == "xpu" + assert accelerator.device_name(1) == "xpu:1" assert accelerator.synchronize() is None assert accelerator.empty_cache() is None - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Only one GPU is available") + @pytest.mark.skipif(torch.xpu.device_count() < 2, reason="Only one GPU is available") def test_get_device(self): accelerator = auto_detect_accelerator() assert accelerator.set_device(1) is None - assert accelerator.current_device_name() == "cuda:1" + assert accelerator.current_device_name() == "xpu:1" cur_device = get_accelerator().current_device_name() - assert cur_device == "cuda:1" + assert cur_device == "xpu:1" tmp_tensor = torch.tensor([1, 2], device=cur_device) - assert "cuda:1" == str(tmp_tensor.device) + assert "xpu:1" == str(tmp_tensor.device) class TestCPUAccelerator: @@ -119,17 +105,11 @@ def test_cuda_accelerator(self, force_use_cuda): assert accelerator.current_device() == 0, f"{accelerator.current_device()}" assert accelerator.current_device_name() == "cuda:0" assert accelerator.device() is not None - assert accelerator.empty_cache() is None - assert accelerator.synchronize() is None assert accelerator.set_device(0) is None assert accelerator.device_name(0) == "cuda:0" assert accelerator.is_available() is True assert accelerator.name() == "cuda" assert accelerator.device_name(1) == "cuda:1" - assert accelerator.set_device(1) is None - assert accelerator.device_name(1) == "cuda:1" - assert accelerator.current_device() == 1 - assert accelerator.current_device_name() == "cuda:1" assert accelerator.synchronize() is None assert accelerator.empty_cache() is None From c046b92b1b519506683cedc0315c83766807276a Mon Sep 17 00:00:00 2001 From: xin3he Date: Tue, 4 Jun 2024 16:44:30 +0800 Subject: [PATCH 05/11] fix bug Signed-off-by: xin3he --- neural_compressor/torch/utils/environ.py | 2 +- test/3x/torch/utils/test_auto_accelerator.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/utils/environ.py b/neural_compressor/torch/utils/environ.py index 9e76558ee55..0e65cdd51fe 100644 --- a/neural_compressor/torch/utils/environ.py +++ b/neural_compressor/torch/utils/environ.py @@ -42,7 +42,7 @@ def is_package_available(package_name): ## check hpex -if is_package_available("habana_frameworks.torch.hpex"): +if is_package_available("habana_frameworks"): _hpex_available = True else: _hpex_available = False diff --git a/test/3x/torch/utils/test_auto_accelerator.py b/test/3x/torch/utils/test_auto_accelerator.py index 575ac517eec..dea9cdce918 100644 --- a/test/3x/torch/utils/test_auto_accelerator.py +++ b/test/3x/torch/utils/test_auto_accelerator.py @@ -29,8 +29,9 @@ def test_cuda_accelerator(self): assert accelerator.synchronize() is None assert accelerator.empty_cache() is None - @pytest.mark.skipif(torch.hpu.device_count() < 2, reason="Only one HPU is available") def test_get_device(self): + if torch.hpu.device_count() < 2: + return accelerator = auto_detect_accelerator() assert accelerator.set_device(1) is None assert accelerator.current_device_name() == "hpu:1" @@ -62,8 +63,9 @@ def test_xpu_accelerator(self, force_use_xpu): assert accelerator.synchronize() is None assert accelerator.empty_cache() is None - @pytest.mark.skipif(torch.xpu.device_count() < 2, reason="Only one GPU is available") def test_get_device(self): + if torch.xpu.device_count() < 2: + return accelerator = auto_detect_accelerator() assert accelerator.set_device(1) is None assert accelerator.current_device_name() == "xpu:1" From f015d42870bf71893a5fe044f408d788b15bb9a2 Mon Sep 17 00:00:00 2001 From: xin3he Date: Wed, 5 Jun 2024 10:13:29 +0800 Subject: [PATCH 06/11] change requirement Signed-off-by: xin3he --- .../language-modeling/quantization/llm/requirements.txt | 2 ++ neural_compressor/torch/utils/environ.py | 4 +++- requirements_pt.txt | 3 +-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt index f0b56e558d3..9688a4f6cb3 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt @@ -11,3 +11,5 @@ neural-compressor intel-extension-for-transformers lm_eval==0.4.2 peft +auto_round +intel_extension_for_pytorch diff --git a/neural_compressor/torch/utils/environ.py b/neural_compressor/torch/utils/environ.py index 0e65cdd51fe..3091aa83d88 100644 --- a/neural_compressor/torch/utils/environ.py +++ b/neural_compressor/torch/utils/environ.py @@ -37,7 +37,9 @@ def is_transformers_imported() -> bool: ################ Check available sys.module to decide behavior ################# def is_package_available(package_name): - package_spec = importlib.util.find_spec(package_name) + from importlib.util import find_spec + + package_spec = find_spec(package_name) return package_spec is not None diff --git a/requirements_pt.txt b/requirements_pt.txt index 67c5371c46c..6a012a75b5a 100644 --- a/requirements_pt.txt +++ b/requirements_pt.txt @@ -1,5 +1,4 @@ -auto-round -intel_extension_for_pytorch +numpy peft==0.10.0 psutil py-cpuinfo From 98301d5e254fdbf43146213796ac77da135d12ea Mon Sep 17 00:00:00 2001 From: xin3he Date: Wed, 5 Jun 2024 10:21:33 +0800 Subject: [PATCH 07/11] fix bug Signed-off-by: xin3he --- test/3x/torch/quantization/weight_only/test_rtn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index bbf270f2321..4fff24fb093 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -76,7 +76,7 @@ def test_int_params(self, bits, use_sym, group_size, group_dim): model = convert(model) out = model(self.example_inputs)[0] assert (out != self.label).any(), "WOQ output should be different with raw output" - if is_hpex_available: + if is_hpex_available(): assert "hpu" in out.device, "Neural Compressor should run on HPU when HPEX is available." if (bits, use_sym, group_size, group_dim) == (8, True, -1, 1): assert torch.allclose(out, self.label, atol=0.01), "Accuracy gap atol > 0.01 is unexpected." From a3362269044c21c34a16c839f285716e7e8f3ba7 Mon Sep 17 00:00:00 2001 From: xin3he Date: Thu, 6 Jun 2024 11:41:54 +0800 Subject: [PATCH 08/11] fix bug and rename ut Signed-off-by: xin3he --- .../language-modeling/quantization/llm/run_clm_no_trainer.py | 3 ++- test/3x/torch/utils/{test_utility.py => test_torch_utility.py} | 0 2 files changed, 2 insertions(+), 1 deletion(-) rename test/3x/torch/utils/{test_utility.py => test_torch_utility.py} (100%) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py index 4e8e4b1c3e4..c95eb9ed002 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py @@ -238,7 +238,8 @@ def get_user_model(): from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize from neural_compressor.torch.utils import get_double_quant_config_dict weight_sym = True if args.woq_scheme == "sym" else False - double_quant_config_dict = get_double_quant_config_dict(args.double_quant_type) + if args.double_quant_type is not None: + double_quant_config_dict = get_double_quant_config_dict(args.double_quant_type) if args.woq_algo == "RTN": if args.double_quant_type is not None: diff --git a/test/3x/torch/utils/test_utility.py b/test/3x/torch/utils/test_torch_utility.py similarity index 100% rename from test/3x/torch/utils/test_utility.py rename to test/3x/torch/utils/test_torch_utility.py From c35fa6b77176b7845a605445bb5f05c5d1bccb8e Mon Sep 17 00:00:00 2001 From: xin3he Date: Fri, 7 Jun 2024 10:09:18 +0800 Subject: [PATCH 09/11] add requirement Signed-off-by: xin3he --- test/3x/torch/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index 28a91bccca8..0fe83126bdf 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -1,3 +1,4 @@ +auto_round expecttest numpy peft==0.10.0 From a4faba2c7468997c5c5b56661a8e861b4be2ff46 Mon Sep 17 00:00:00 2001 From: xinhe Date: Tue, 11 Jun 2024 17:50:05 +0800 Subject: [PATCH 10/11] Update requirements.txt --- test/3x/torch/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index 0fe83126bdf..b68f3d7a6ab 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -6,3 +6,4 @@ prettytable psutil pytest transformers +intel_extension_for_pytorch From 66bb07694b08781c29918726dfca3a048836ec13 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Jun 2024 09:52:17 +0000 Subject: [PATCH 11/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/3x/torch/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index b68f3d7a6ab..bdf99d92cf0 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -1,9 +1,9 @@ auto_round expecttest +intel_extension_for_pytorch numpy peft==0.10.0 prettytable psutil pytest transformers -intel_extension_for_pytorch