From b2a4e9199b994589e2819b7fc5e5204924a411ef Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:25:58 +0800 Subject: [PATCH 01/12] uintegrate autoround v2.1 Signed-off-by: Kaihui-intel --- .../torch/algorithms/weight_only/autoround.py | 128 +++++++++++------- .../torch/quantization/algorithm_entry.py | 16 ++- .../torch/quantization/config.py | 24 +++- .../weight_only/test_autoround.py | 10 +- 4 files changed, 118 insertions(+), 60 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 2e97533c0bb..29b45a4ac85 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -31,69 +31,91 @@ class AutoRoundQuantizer(Quantizer): def __init__( self, quant_config: dict = {}, - enable_full_range: bool = False, + enable_full_range: bool = False, ##for symmetric, TODO support later batch_size: int = 8, amp: bool = True, - device=None, + device: str = None, lr_scheduler=None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", enable_quanted_input: bool = True, enable_minmax_tuning: bool = True, lr: float = None, minmax_lr: float = None, - low_gpu_mem_usage: bool = True, + low_gpu_mem_usage: bool = False, iters: int = 200, seqlen: int = 2048, - n_samples: int = 512, + nsamples: int = 128, sampler: str = "rand", seed: int = 42, - n_blocks: int = 1, + nblocks: int = 1, gradient_accumulate_steps: int = 1, not_use_best_mse: bool = False, dynamic_max_gap: int = -1, data_type: str = "int", scale_dtype: str = "fp16", + multimodal:bool = False, + act_bits: int = 32, + act_group_size: int = None, + act_sym: bool = None, + act_dynamic: bool = True, + low_cpu_mem_usage: bool = False, **kwargs, ): """Init a AutQRoundQuantizer object. Args: - quant_config (dict): Configuration for weight quantization (default is None). - quant_config={ - 'layer1':##layer_name - { - 'data_type': 'int', - 'bits': 4, - 'group_size': 32, - 'sym': False, + quant_config (dict): Configuration for weight quantization (default is None). + quant_config={ + 'layer1':##layer_name + { + 'data_type': 'int', + 'bits': 4, + 'group_size': 32, + 'sym': False, + } + ..., } - ... - } - keys: - data_type (str): The data type to be used (default is "int"). - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether to use symmetric quantization. (default is None). - enable_full_range (bool): Whether to enable full range quantization (default is False). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). Automatically detect and set. - device: The device to be used for tuning (default is None). Automatically detect and set. - lr_scheduler: The learning rate scheduler to be used. - use_quant_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - n_samples (int): Number of samples (default is 512). - sampler (str): The sampling method (default is "rand"). 
- seed (int): The random seed (default is 42). - n_blocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels + keys: + data_type (str): The data type to be used (default is "int"). + bits (int): Number of bits for quantization (default is 4). + group_size (int): Size of the quantization group (default is 128). + sym (bool): Whether to use symmetric quantization. (default is None). + bits (int): Number of bits for quantization (default is 4). + group_size (int): Size of the quantization group (default is 128). + sym (bool): Whether symmetric quantization is to be used (default is False). + + enable_full_range (bool): Whether to enable full range quantization (default is False). + batch_size (int): Batch size for training (default is 8). + amp (bool): Whether to use automatic mixed precision (default is True). + device: The device to be used for tuning (default is "auto"). + lr_scheduler: The learning rate scheduler to be used. + dataset (str): The default dataset name (default is "NeelNanda/pile-10k"). + enable_quanted_input (bool): Whether to use the output of the previous quantized block as + the input for the current block (default is True). + enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True). + lr (float): The learning rate (default is None, will be set to 1.0/iters). + minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically). + low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). + iters (int): Number of iterations (default is 200). + seqlen (int): Data length of the sequence for tuning (default is 2048). + nsamples (int): Number of samples (default is 128). + sampler (str): The sampling method (default is "rand"). + seed (int): The random seed (default is 42). + nblocks (int): Number of blocks (default is 1). + gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). + not_use_best_mse (bool): Whether to use mean squared error (default is False). + dynamic_max_gap (int): The dynamic maximum gap (default is -1). + data_type (str): The data type to be used (default is "int"). + scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels have different choices. + multimodal(bool): Enable multimodal model quantization, (default is "False"). + act_bits (int): Number of bits for activation quantization. Default is 32. + act_group_size (int): Group size for activation quantization. Default is None. + act_sym (bool): Whether to use symmetric activation quantization. Default is None. + act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. + + Returns: + The quantized model. 
""" super().__init__(quant_config) self.tokenizer = None @@ -109,15 +131,21 @@ def __init__( self.low_gpu_mem_usage = low_gpu_mem_usage self.iters = iters self.seqlen = seqlen - self.n_samples = n_samples + self.nsamples = nsamples self.sampler = sampler self.seed = seed - self.n_blocks = n_blocks + self.nblocks = nblocks self.gradient_accumulate_steps = gradient_accumulate_steps self.not_use_best_mse = not_use_best_mse self.dynamic_max_gap = dynamic_max_gap self.data_type = data_type self.scale_dtype = scale_dtype + self.multimodal = multimodal + self.act_bits = act_bits + self.act_group_size = act_group_size + self.act_sym = act_sym + self.act_dynamic = act_dynamic + self.low_cpu_mem_usage = low_cpu_mem_usage def prepare(self, model: torch.nn.Module, *args, **kwargs): """Prepares a given model for quantization. @@ -137,7 +165,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): model=model, tokenizer=None, dataset=dataloader, - weight_config=self.quant_config or {}, + layer_config=self.quant_config or {}, enable_full_range=self.enable_full_range, batch_size=self.batch_size, amp=self.amp, @@ -150,15 +178,21 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): low_gpu_mem_usage=self.low_gpu_mem_usage, iters=self.iters, seqlen=self.seqlen, - n_samples=self.n_samples, + nsamples=self.nsamples, sampler=self.sampler, seed=self.seed, - n_blocks=self.n_blocks, + nblocks=self.nblocks, gradient_accumulate_steps=self.gradient_accumulate_steps, not_use_best_mse=self.not_use_best_mse, dynamic_max_gap=self.dynamic_max_gap, data_type=self.data_type, scale_dtype=self.scale_dtype, + multimodal=self.multimodal, + act_bits=self.act_bits, + act_group_size=self.act_group_size, + act_sym = self.act_sym, + act_dynamic = self.act_dynamic, + low_cpu_mem_usage = self.low_cpu_mem_usage, ) model, weight_config = rounder.quantize() model.autoround_config = weight_config @@ -166,7 +200,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): return model -def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, n_samples=512): +def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=512): """Generate a DataLoader for calibration using specified parameters. 
Args: @@ -186,6 +220,6 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 from auto_round.calib_dataset import get_dataloader # pylint: disable=E0401 dataloader = get_dataloader( - tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, n_samples=n_samples + tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, nsamples=nsamples ) return dataloader diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 2a3eada9bf5..69496ae4842 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -459,6 +459,10 @@ def autoround_quantize_entry( "bits": quant_config.bits, "sym": quant_config.use_sym, "group_size": quant_config.group_size, + "act_bits": quant_config.act_bits, + "act_group_size": quant_config.act_group_size, + "act_sym": quant_config.act_sym, + "act_dynamic": quant_config.act_dynamic, } enable_full_range = quant_config.enable_full_range batch_size = quant_config.batch_size @@ -470,14 +474,16 @@ def autoround_quantize_entry( low_gpu_mem_usage = quant_config.low_gpu_mem_usage iters = quant_config.iters seqlen = quant_config.seqlen - n_samples = quant_config.n_samples + nsamples = quant_config.nsamples sampler = quant_config.sampler seed = quant_config.seed - n_blocks = quant_config.n_blocks + nblocks = quant_config.nblocks gradient_accumulate_steps = quant_config.gradient_accumulate_steps not_use_best_mse = quant_config.not_use_best_mse dynamic_max_gap = quant_config.dynamic_max_gap scale_dtype = quant_config.scale_dtype + multimodal = quant_config.multimodal + low_cpu_mem_usage = quant_config.use_layer_wise kwargs.pop("example_inputs") @@ -495,14 +501,16 @@ def autoround_quantize_entry( low_gpu_mem_usage=low_gpu_mem_usage, iters=iters, seqlen=seqlen, - n_samples=n_samples, + nsamples=nsamples, sampler=sampler, seed=seed, - n_blocks=n_blocks, + nblocks=nblocks, gradient_accumulate_steps=gradient_accumulate_steps, not_use_best_mse=not_use_best_mse, dynamic_max_gap=dynamic_max_gap, scale_dtype=scale_dtype, + multimodal=multimodal, + low_cpu_mem_usage=low_cpu_mem_usage, ) model = quantizer.execute(model=model, mode=mode, *args, **kwargs) model.qconfig = configs_mapping diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 27d30753cdb..03a5bb0745e 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -731,6 +731,10 @@ def __init__( use_sym: bool = False, group_size: int = 128, # AUTOROUND + act_bits: int = 32, + act_group_size: int = None, + act_sym: bool = None, + act_dynamic: bool = True, enable_full_range: bool = False, batch_size: int = 8, lr_scheduler=None, @@ -741,15 +745,16 @@ def __init__( low_gpu_mem_usage: bool = True, iters: int = 200, seqlen: int = 2048, - n_samples: int = 512, + nsamples: int = 512, sampler: str = "rand", seed: int = 42, - n_blocks: int = 1, + nblocks: int = 1, gradient_accumulate_steps: int = 1, not_use_best_mse: bool = False, dynamic_max_gap: int = -1, scale_dtype: str = "fp16", use_layer_wise: bool = False, + multimodal:bool = False, white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, ): """Init AUTOROUND weight-only quantization config. @@ -759,6 +764,10 @@ def __init__( bits (int): Number of bits used to represent weights, default is 4. 
use_sym (bool): Indicates whether weights are symmetric, default is False. group_size (int): Size of weight groups, default is 128. + act_bits (int): Number of bits for activation quantization. Default is 32. + act_group_size (int): Group size for activation quantization. Default is None. + act_sym (bool): Whether to use symmetric activation quantization. Default is None. + act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. enable_full_range (bool): Whether to enable full range quantization (default is False). batch_size (int): Batch size for training (default is 8). lr_scheduler: The learning rate scheduler to be used. @@ -778,12 +787,18 @@ def __init__( dynamic_max_gap (int): The dynamic maximum gap (default is -1). scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels have different choices. + use_layer_wise (bool): Enables quantize model per layer. Defaults to False. + multimodal(bool): Enable multimodal model quantization, (default is "False"). """ super().__init__(white_list=white_list) self.dtype = dtype self.bits = bits self.use_sym = use_sym self.group_size = group_size + self.act_bits = act_bits + self.act_group_size = act_group_size + self.act_sym = act_sym + self.act_dynamic = act_dynamic self.enable_full_range = enable_full_range self.batch_size = batch_size self.lr_scheduler = lr_scheduler @@ -794,15 +809,16 @@ def __init__( self.low_gpu_mem_usage = low_gpu_mem_usage self.iters = iters self.seqlen = seqlen - self.n_samples = n_samples + self.nsamples = nsamples self.sampler = sampler self.seed = seed - self.n_blocks = n_blocks + self.nblocks = nblocks self.gradient_accumulate_steps = gradient_accumulate_steps self.not_use_best_mse = not_use_best_mse self.dynamic_max_gap = dynamic_max_gap self.scale_dtype = scale_dtype self.use_layer_wise = use_layer_wise + self.multimodal = multimodal self._post_init() @classmethod diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py index f1539b072b7..f5351656595 100644 --- a/test/3x/torch/quantization/weight_only/test_autoround.py +++ b/test/3x/torch/quantization/weight_only/test_autoround.py @@ -49,7 +49,7 @@ def setup_class(self): tokenizer = transformers.AutoTokenizer.from_pretrained( "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True ) - self.dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, n_samples=10) + self.dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=10) self.label = self.gptj(self.inp)[0] def teardown_class(self): @@ -61,7 +61,7 @@ def setup_method(self, method): @pytest.mark.parametrize("quant_lm_head", [True, False]) def test_autoround(self, quant_lm_head): fp32_model = copy.deepcopy(self.gptj) - quant_config = AutoRoundConfig(n_samples=32, seqlen=10, iters=10, scale_dtype="fp32") + quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32") if quant_lm_head is False: quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) logger.info(f"Test AutoRound with config {quant_config}") @@ -83,7 +83,7 @@ def test_autoround(self, quant_lm_head): def test_autoround_with_quantize_API(self): gpt_j_model = copy.deepcopy(self.gptj) - quant_config = AutoRoundConfig(n_samples=32, seqlen=10, iters=10, scale_dtype="fp32") + quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32") 
quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) logger.info(f"Test AutoRound with config {quant_config}") @@ -101,7 +101,7 @@ def test_autoround_with_quantize_API(self): def test_save_and_load(self): fp32_model = copy.deepcopy(self.gptj) - quant_config = AutoRoundConfig(n_samples=32, seqlen=10, iters=10, scale_dtype="fp32") + quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32") # quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) logger.info(f"Test AutoRound with config {quant_config}") @@ -133,7 +133,7 @@ def test_conv1d(self): text = "Replace me by any text you'd like." encoded_input = tokenizer(text, return_tensors="pt") out1 = model(**encoded_input)[0] - quant_config = AutoRoundConfig(n_samples=32, seqlen=10, iters=10, scale_dtype="fp32") + quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32") model = prepare(model=model, quant_config=quant_config) run_fn(model, self.dataloader) q_model = convert(model) From f94d122f8f8c6fd2d25f72219d806c2cc50fb769 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:29:16 +0800 Subject: [PATCH 02/12] update dosstring Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/autoround.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 29b45a4ac85..9a92a801fab 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -72,6 +72,10 @@ def __init__( 'bits': 4, 'group_size': 32, 'sym': False, + 'act_data_type': None, + 'act_bits': 32, + 'group_size': None, + 'sym': None, } ..., } From b99997ef133ce80739287e5b62b00614a89db8dc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 03:31:26 +0000 Subject: [PATCH 03/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/weight_only/autoround.py | 10 +++++----- .../torch/quantization/algorithm_entry.py | 4 ++-- neural_compressor/torch/quantization/config.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 9a92a801fab..228c1122c5f 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -53,7 +53,7 @@ def __init__( dynamic_max_gap: int = -1, data_type: str = "int", scale_dtype: str = "fp16", - multimodal:bool = False, + multimodal: bool = False, act_bits: int = 32, act_group_size: int = None, act_sym: bool = None, @@ -87,7 +87,7 @@ def __init__( bits (int): Number of bits for quantization (default is 4). group_size (int): Size of the quantization group (default is 128). sym (bool): Whether symmetric quantization is to be used (default is False). - + enable_full_range (bool): Whether to enable full range quantization (default is False). batch_size (int): Batch size for training (default is 8). amp (bool): Whether to use automatic mixed precision (default is True). 
@@ -194,9 +194,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): multimodal=self.multimodal, act_bits=self.act_bits, act_group_size=self.act_group_size, - act_sym = self.act_sym, - act_dynamic = self.act_dynamic, - low_cpu_mem_usage = self.low_cpu_mem_usage, + act_sym=self.act_sym, + act_dynamic=self.act_dynamic, + low_cpu_mem_usage=self.low_cpu_mem_usage, ) model, weight_config = rounder.quantize() model.autoround_config = weight_config diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 69496ae4842..91aefa39673 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -482,8 +482,8 @@ def autoround_quantize_entry( not_use_best_mse = quant_config.not_use_best_mse dynamic_max_gap = quant_config.dynamic_max_gap scale_dtype = quant_config.scale_dtype - multimodal = quant_config.multimodal - low_cpu_mem_usage = quant_config.use_layer_wise + multimodal = quant_config.multimodal + low_cpu_mem_usage = quant_config.use_layer_wise kwargs.pop("example_inputs") diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 03a5bb0745e..4d9e5cd4b9c 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -754,7 +754,7 @@ def __init__( dynamic_max_gap: int = -1, scale_dtype: str = "fp16", use_layer_wise: bool = False, - multimodal:bool = False, + multimodal: bool = False, white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, ): """Init AUTOROUND weight-only quantization config. From 7f35ffc1e6c5d0c2b6678829525a0c5199e3139d Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:46:44 +0800 Subject: [PATCH 04/12] fix config Signed-off-by: Kaihui-intel --- neural_compressor/torch/quantization/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 03a5bb0745e..7f64878f91f 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -716,8 +716,8 @@ class AutoRoundConfig(TorchBaseConfig): "minmax_lr", "iters", "seqlen", - "n_samples", - "n_blocks", + "nsamples", + "nblocks", "gradient_accumulate_steps", "not_use_best_mse", "dynamic_max_gap", @@ -778,10 +778,10 @@ def __init__( low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). iters (int): Number of iterations (default is 200). seqlen (int): Length of the sequence. - n_samples (int): Number of samples (default is 512). + nsamples (int): Number of samples (default is 512). sampler (str): The sampling method (default is "rand"). seed (int): The random seed (default is 42). - n_blocks (int): Number of blocks (default is 1). + nblocks (int): Number of blocks (default is 1). gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). not_use_best_mse (bool): Whether to use mean squared error (default is False). dynamic_max_gap (int): The dynamic maximum gap (default is -1). 
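To make the renamed knobs above concrete (n_samples becomes nsamples, n_blocks becomes nblocks), here is a minimal end-to-end sketch patterned on the updated unit tests in this series. The tiny GPT-J checkpoint and the config values are taken from those tests; the import locations for prepare/convert/get_dataloader and the calibration loop's batch handling are assumptions for illustration.

import transformers

from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# nsamples/nblocks replace the old n_samples/n_blocks spellings.
quant_config = AutoRoundConfig(nsamples=32, nblocks=1, seqlen=10, iters=10, scale_dtype="fp32")
dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=10)

model = prepare(model=model, quant_config=quant_config)
for batch in dataloader:
    # Calibration forward pass; the batch layout yielded by auto_round's calib dataset is assumed here.
    if isinstance(batch, dict):
        model(**batch)
    else:
        model(batch)
q_model = convert(model)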
From 1c2e0d032009f49667e9c3e096f1d7cd921deb7f Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:49:23 +0800 Subject: [PATCH 05/12] nsamples=128 Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/autoround.py | 2 +- neural_compressor/torch/quantization/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 228c1122c5f..19b995ccb3f 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -204,7 +204,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): return model -def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=512): +def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=128): """Generate a DataLoader for calibration using specified parameters. Args: diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 885befa8bf8..0b6fee38204 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -745,7 +745,7 @@ def __init__( low_gpu_mem_usage: bool = True, iters: int = 200, seqlen: int = 2048, - nsamples: int = 512, + nsamples: int = 128, sampler: str = "rand", seed: int = 42, nblocks: int = 1, From 6687d0fbf96b5b125e86d38fc4704c3603d53016 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:54:04 +0800 Subject: [PATCH 06/12] fix docstring Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/autoround.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 19b995ccb3f..562e31d6be6 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -74,8 +74,8 @@ def __init__( 'sym': False, 'act_data_type': None, 'act_bits': 32, - 'group_size': None, - 'sym': None, + 'act_sym': None, + 'act_dynamic': True, } ..., } From fe1dda1baaa71f7e16ed7152cad2773aefae5df6 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 12:15:51 +0800 Subject: [PATCH 07/12] update commit Signed-off-by: Kaihui-intel --- .azure-pipelines/scripts/ut/env_setup.sh | 2 +- .../torch/algorithms/weight_only/autoround.py | 8 ++++---- test/3x/torch/requirements.txt | 2 +- test/requirements.txt | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index d5876b07cef..84e7fc654ec 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then fi if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then - pip install auto-round + pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c fi # test deps diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 562e31d6be6..08c31de8459 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ 
b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -87,7 +87,6 @@ def __init__( bits (int): Number of bits for quantization (default is 4). group_size (int): Size of the quantization group (default is 128). sym (bool): Whether symmetric quantization is to be used (default is False). - enable_full_range (bool): Whether to enable full range quantization (default is False). batch_size (int): Batch size for training (default is 8). amp (bool): Whether to use automatic mixed precision (default is True). @@ -95,10 +94,11 @@ def __init__( lr_scheduler: The learning rate scheduler to be used. dataset (str): The default dataset name (default is "NeelNanda/pile-10k"). enable_quanted_input (bool): Whether to use the output of the previous quantized block as - the input for the current block (default is True). + the input for the current block (default is True). enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True). lr (float): The learning rate (default is None, will be set to 1.0/iters). - minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically). + minmax_lr (float): The learning rate for min-max tuning + (default is None, it will be set to lr automatically). low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). iters (int): Number of iterations (default is 200). seqlen (int): Data length of the sequence for tuning (default is 2048). @@ -111,7 +111,7 @@ def __init__( dynamic_max_gap (int): The dynamic maximum gap (default is -1). data_type (str): The data type to be used (default is "int"). scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. + have different choices. multimodal(bool): Enable multimodal model quantization, (default is "False"). act_bits (int): Number of bits for activation quantization. Default is 32. act_group_size (int): Group size for activation quantization. Default is None. 
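Since the hunk above documents the new activation-quantization switches, a short hedged sketch of how they sit on AutoRoundConfig follows. The fallback behavior noted in the comments (act_group_size and act_sym inheriting the weight settings when left as None) reflects how upstream auto-round is understood to behave and is an assumption, not something this patch states.

from neural_compressor.torch.quantization import AutoRoundConfig

# Weight-only by default: act_bits=32 leaves activations in full precision.
cfg = AutoRoundConfig(
    bits=4,
    group_size=128,
    use_sym=False,
    act_bits=32,          # lower this (e.g. to 8) to also quantize activations
    act_group_size=None,  # assumed to fall back to the weight group_size upstream
    act_sym=None,         # assumed to follow the weight use_sym setting upstream
    act_dynamic=True,     # dynamic activation scales when activations are quantized
    multimodal=False,     # opt in to the multimodal calibration path
)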
diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index bdf99d92cf0..ea647ff587f 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -1,4 +1,4 @@ -auto_round +auto_round @ git+https://github.com/intel/auto-round.git@c7751c49853eb3497e7df9b92dc4733a80df34a0 expecttest intel_extension_for_pytorch numpy diff --git a/test/requirements.txt b/test/requirements.txt index 3a24001cfd2..386b3ee4f2b 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,6 +1,6 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate==0.21.0 -auto-round +auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c dynast==1.6.0rc1 horovod intel-extension-for-pytorch From f2aa279a66b0ae3dc54d229eefb2d597c29c3d12 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 04:18:21 +0000 Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/weight_only/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 08c31de8459..6f5a022cfee 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -97,7 +97,7 @@ def __init__( the input for the current block (default is True). enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True). lr (float): The learning rate (default is None, will be set to 1.0/iters). - minmax_lr (float): The learning rate for min-max tuning + minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically). low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). iters (int): Number of iterations (default is 200). 
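The reflowed docstring above spells out the learning-rate defaulting chain. As a quick worked example of that rule (plain local variables standing in for the quantizer attributes, not the attributes themselves):

# lr defaults to 1.0 / iters when left as None, and minmax_lr then follows lr.
iters = 200
lr = None
minmax_lr = None

effective_lr = lr if lr is not None else 1.0 / iters              # 0.005
effective_minmax_lr = minmax_lr if minmax_lr is not None else effective_lr
print(effective_lr, effective_minmax_lr)                          # 0.005 0.005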
From c69f79eb2a6ce5ca88da468b40bd212b5d333d58 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 13:06:12 +0800 Subject: [PATCH 09/12] update commit version Signed-off-by: Kaihui-intel --- test/3x/torch/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index ea647ff587f..cc1ee22fe83 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -1,4 +1,4 @@ -auto_round @ git+https://github.com/intel/auto-round.git@c7751c49853eb3497e7df9b92dc4733a80df34a0 +auto_round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c expecttest intel_extension_for_pytorch numpy From d7da8a60b6e5c71edd125431cacdb61840a07a26 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 13:18:00 +0800 Subject: [PATCH 10/12] reset 2x ut commit Signed-off-by: Kaihui-intel --- test/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/requirements.txt b/test/requirements.txt index 386b3ee4f2b..3a24001cfd2 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,6 +1,6 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate==0.21.0 -auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c +auto-round dynast==1.6.0rc1 horovod intel-extension-for-pytorch From 1324d049cc0253733a8a09537e9fab63db4d40b4 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 16:17:10 +0800 Subject: [PATCH 11/12] update 2x version Signed-off-by: Kaihui-intel --- .azure-pipelines/scripts/ut/env_setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index 84e7fc654ec..d5876b07cef 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then fi if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then - pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c + pip install auto-round fi # test deps From 10dd0cbfe4b529c70fe20419b8d8302dbe392587 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 21:57:15 +0800 Subject: [PATCH 12/12] upodate autotune config Signed-off-by: Kaihui-intel --- neural_compressor/torch/quantization/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index c358d39c604..70907f91c64 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -1538,7 +1538,7 @@ def get_woq_tuning_config() -> list: the list of WOQ quant config. """ RTN_G32ASYM = RTNConfig(use_sym=False, group_size=32) - AUTO_ROUND_CONFIG = AutoRoundConfig(use_sym=False, group_size=32) + AUTO_ROUND_CONFIG = AutoRoundConfig(use_sym=False, group_size=32, seqlen=512) GPTQ_G32ASYM = GPTQConfig(use_sym=False, group_size=32) AWQ_G32ASYM = AWQConfig(use_sym=False, group_size=32) return [RTN_G32ASYM, AUTO_ROUND_CONFIG, GPTQ_G32ASYM, AWQ_G32ASYM]
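The final patch above trims the AutoRound entry of get_woq_tuning_config() to seqlen=512. To close, a hedged sketch of how that tuning list is typically consumed through the 3.x autotune flow; the model choice, the eval_fn body, and the exact autotune/TuningConfig keyword names are assumptions for illustration rather than part of this series.

import transformers

from neural_compressor.torch.quantization import TuningConfig, autotune, get_woq_tuning_config

model = transformers.AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-GPTJForCausalLM")

def eval_fn(q_model) -> float:
    # Placeholder: return the task metric for the candidate quantized model (higher is better).
    return 1.0

# The config set now tries RTN, AutoRound (seqlen=512), GPTQ, then AWQ, all with group_size=32 and asym weights.
tune_config = TuningConfig(config_set=get_woq_tuning_config())
best_model = autotune(model=model, tune_config=tune_config, eval_fn=eval_fn)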