From b2a4e9199b994589e2819b7fc5e5204924a411ef Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:25:58 +0800 Subject: [PATCH 01/12] uintegrate autoround v2.1 Signed-off-by: Kaihui-intel --- .../torch/algorithms/weight_only/autoround.py | 128 +++++++++++------- .../torch/quantization/algorithm_entry.py | 16 ++- .../torch/quantization/config.py | 24 +++- .../weight_only/test_autoround.py | 10 +- 4 files changed, 118 insertions(+), 60 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 2e97533c0bb..29b45a4ac85 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -31,69 +31,91 @@ class AutoRoundQuantizer(Quantizer): def __init__( self, quant_config: dict = {}, - enable_full_range: bool = False, + enable_full_range: bool = False, ##for symmetric, TODO support later batch_size: int = 8, amp: bool = True, - device=None, + device: str = None, lr_scheduler=None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", enable_quanted_input: bool = True, enable_minmax_tuning: bool = True, lr: float = None, minmax_lr: float = None, - low_gpu_mem_usage: bool = True, + low_gpu_mem_usage: bool = False, iters: int = 200, seqlen: int = 2048, - n_samples: int = 512, + nsamples: int = 128, sampler: str = "rand", seed: int = 42, - n_blocks: int = 1, + nblocks: int = 1, gradient_accumulate_steps: int = 1, not_use_best_mse: bool = False, dynamic_max_gap: int = -1, data_type: str = "int", scale_dtype: str = "fp16", + multimodal:bool = False, + act_bits: int = 32, + act_group_size: int = None, + act_sym: bool = None, + act_dynamic: bool = True, + low_cpu_mem_usage: bool = False, **kwargs, ): """Init a AutQRoundQuantizer object. Args: - quant_config (dict): Configuration for weight quantization (default is None). - quant_config={ - 'layer1':##layer_name - { - 'data_type': 'int', - 'bits': 4, - 'group_size': 32, - 'sym': False, + quant_config (dict): Configuration for weight quantization (default is None). + quant_config={ + 'layer1':##layer_name + { + 'data_type': 'int', + 'bits': 4, + 'group_size': 32, + 'sym': False, + } + ..., } - ... - } - keys: - data_type (str): The data type to be used (default is "int"). - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether to use symmetric quantization. (default is None). - enable_full_range (bool): Whether to enable full range quantization (default is False). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). Automatically detect and set. - device: The device to be used for tuning (default is None). Automatically detect and set. - lr_scheduler: The learning rate scheduler to be used. - use_quant_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - n_samples (int): Number of samples (default is 512). - sampler (str): The sampling method (default is "rand"). 
- seed (int): The random seed (default is 42). - n_blocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels + keys: + data_type (str): The data type to be used (default is "int"). + bits (int): Number of bits for quantization (default is 4). + group_size (int): Size of the quantization group (default is 128). + sym (bool): Whether to use symmetric quantization. (default is None). + bits (int): Number of bits for quantization (default is 4). + group_size (int): Size of the quantization group (default is 128). + sym (bool): Whether symmetric quantization is to be used (default is False). + + enable_full_range (bool): Whether to enable full range quantization (default is False). + batch_size (int): Batch size for training (default is 8). + amp (bool): Whether to use automatic mixed precision (default is True). + device: The device to be used for tuning (default is "auto"). + lr_scheduler: The learning rate scheduler to be used. + dataset (str): The default dataset name (default is "NeelNanda/pile-10k"). + enable_quanted_input (bool): Whether to use the output of the previous quantized block as + the input for the current block (default is True). + enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True). + lr (float): The learning rate (default is None, will be set to 1.0/iters). + minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically). + low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). + iters (int): Number of iterations (default is 200). + seqlen (int): Data length of the sequence for tuning (default is 2048). + nsamples (int): Number of samples (default is 128). + sampler (str): The sampling method (default is "rand"). + seed (int): The random seed (default is 42). + nblocks (int): Number of blocks (default is 1). + gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). + not_use_best_mse (bool): Whether to use mean squared error (default is False). + dynamic_max_gap (int): The dynamic maximum gap (default is -1). + data_type (str): The data type to be used (default is "int"). + scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels have different choices. + multimodal(bool): Enable multimodal model quantization, (default is "False"). + act_bits (int): Number of bits for activation quantization. Default is 32. + act_group_size (int): Group size for activation quantization. Default is None. + act_sym (bool): Whether to use symmetric activation quantization. Default is None. + act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. + + Returns: + The quantized model. 
""" super().__init__(quant_config) self.tokenizer = None @@ -109,15 +131,21 @@ def __init__( self.low_gpu_mem_usage = low_gpu_mem_usage self.iters = iters self.seqlen = seqlen - self.n_samples = n_samples + self.nsamples = nsamples self.sampler = sampler self.seed = seed - self.n_blocks = n_blocks + self.nblocks = nblocks self.gradient_accumulate_steps = gradient_accumulate_steps self.not_use_best_mse = not_use_best_mse self.dynamic_max_gap = dynamic_max_gap self.data_type = data_type self.scale_dtype = scale_dtype + self.multimodal = multimodal + self.act_bits = act_bits + self.act_group_size = act_group_size + self.act_sym = act_sym + self.act_dynamic = act_dynamic + self.low_cpu_mem_usage = low_cpu_mem_usage def prepare(self, model: torch.nn.Module, *args, **kwargs): """Prepares a given model for quantization. @@ -137,7 +165,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): model=model, tokenizer=None, dataset=dataloader, - weight_config=self.quant_config or {}, + layer_config=self.quant_config or {}, enable_full_range=self.enable_full_range, batch_size=self.batch_size, amp=self.amp, @@ -150,15 +178,21 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): low_gpu_mem_usage=self.low_gpu_mem_usage, iters=self.iters, seqlen=self.seqlen, - n_samples=self.n_samples, + nsamples=self.nsamples, sampler=self.sampler, seed=self.seed, - n_blocks=self.n_blocks, + nblocks=self.nblocks, gradient_accumulate_steps=self.gradient_accumulate_steps, not_use_best_mse=self.not_use_best_mse, dynamic_max_gap=self.dynamic_max_gap, data_type=self.data_type, scale_dtype=self.scale_dtype, + multimodal=self.multimodal, + act_bits=self.act_bits, + act_group_size=self.act_group_size, + act_sym = self.act_sym, + act_dynamic = self.act_dynamic, + low_cpu_mem_usage = self.low_cpu_mem_usage, ) model, weight_config = rounder.quantize() model.autoround_config = weight_config @@ -166,7 +200,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): return model -def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, n_samples=512): +def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=512): """Generate a DataLoader for calibration using specified parameters. 
Args: @@ -186,6 +220,6 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 from auto_round.calib_dataset import get_dataloader # pylint: disable=E0401 dataloader = get_dataloader( - tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, n_samples=n_samples + tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, nsamples=nsamples ) return dataloader diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 2a3eada9bf5..69496ae4842 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -459,6 +459,10 @@ def autoround_quantize_entry( "bits": quant_config.bits, "sym": quant_config.use_sym, "group_size": quant_config.group_size, + "act_bits": quant_config.act_bits, + "act_group_size": quant_config.act_group_size, + "act_sym": quant_config.act_sym, + "act_dynamic": quant_config.act_dynamic, } enable_full_range = quant_config.enable_full_range batch_size = quant_config.batch_size @@ -470,14 +474,16 @@ def autoround_quantize_entry( low_gpu_mem_usage = quant_config.low_gpu_mem_usage iters = quant_config.iters seqlen = quant_config.seqlen - n_samples = quant_config.n_samples + nsamples = quant_config.nsamples sampler = quant_config.sampler seed = quant_config.seed - n_blocks = quant_config.n_blocks + nblocks = quant_config.nblocks gradient_accumulate_steps = quant_config.gradient_accumulate_steps not_use_best_mse = quant_config.not_use_best_mse dynamic_max_gap = quant_config.dynamic_max_gap scale_dtype = quant_config.scale_dtype + multimodal = quant_config.multimodal + low_cpu_mem_usage = quant_config.use_layer_wise kwargs.pop("example_inputs") @@ -495,14 +501,16 @@ def autoround_quantize_entry( low_gpu_mem_usage=low_gpu_mem_usage, iters=iters, seqlen=seqlen, - n_samples=n_samples, + nsamples=nsamples, sampler=sampler, seed=seed, - n_blocks=n_blocks, + nblocks=nblocks, gradient_accumulate_steps=gradient_accumulate_steps, not_use_best_mse=not_use_best_mse, dynamic_max_gap=dynamic_max_gap, scale_dtype=scale_dtype, + multimodal=multimodal, + low_cpu_mem_usage=low_cpu_mem_usage, ) model = quantizer.execute(model=model, mode=mode, *args, **kwargs) model.qconfig = configs_mapping diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 27d30753cdb..03a5bb0745e 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -731,6 +731,10 @@ def __init__( use_sym: bool = False, group_size: int = 128, # AUTOROUND + act_bits: int = 32, + act_group_size: int = None, + act_sym: bool = None, + act_dynamic: bool = True, enable_full_range: bool = False, batch_size: int = 8, lr_scheduler=None, @@ -741,15 +745,16 @@ def __init__( low_gpu_mem_usage: bool = True, iters: int = 200, seqlen: int = 2048, - n_samples: int = 512, + nsamples: int = 512, sampler: str = "rand", seed: int = 42, - n_blocks: int = 1, + nblocks: int = 1, gradient_accumulate_steps: int = 1, not_use_best_mse: bool = False, dynamic_max_gap: int = -1, scale_dtype: str = "fp16", use_layer_wise: bool = False, + multimodal:bool = False, white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, ): """Init AUTOROUND weight-only quantization config. @@ -759,6 +764,10 @@ def __init__( bits (int): Number of bits used to represent weights, default is 4. 
use_sym (bool): Indicates whether weights are symmetric, default is False. group_size (int): Size of weight groups, default is 128. + act_bits (int): Number of bits for activation quantization. Default is 32. + act_group_size (int): Group size for activation quantization. Default is None. + act_sym (bool): Whether to use symmetric activation quantization. Default is None. + act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. enable_full_range (bool): Whether to enable full range quantization (default is False). batch_size (int): Batch size for training (default is 8). lr_scheduler: The learning rate scheduler to be used. @@ -778,12 +787,18 @@ def __init__( dynamic_max_gap (int): The dynamic maximum gap (default is -1). scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels have different choices. + use_layer_wise (bool): Enables quantize model per layer. Defaults to False. + multimodal(bool): Enable multimodal model quantization, (default is "False"). """ super().__init__(white_list=white_list) self.dtype = dtype self.bits = bits self.use_sym = use_sym self.group_size = group_size + self.act_bits = act_bits + self.act_group_size = act_group_size + self.act_sym = act_sym + self.act_dynamic = act_dynamic self.enable_full_range = enable_full_range self.batch_size = batch_size self.lr_scheduler = lr_scheduler @@ -794,15 +809,16 @@ def __init__( self.low_gpu_mem_usage = low_gpu_mem_usage self.iters = iters self.seqlen = seqlen - self.n_samples = n_samples + self.nsamples = nsamples self.sampler = sampler self.seed = seed - self.n_blocks = n_blocks + self.nblocks = nblocks self.gradient_accumulate_steps = gradient_accumulate_steps self.not_use_best_mse = not_use_best_mse self.dynamic_max_gap = dynamic_max_gap self.scale_dtype = scale_dtype self.use_layer_wise = use_layer_wise + self.multimodal = multimodal self._post_init() @classmethod diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py index f1539b072b7..f5351656595 100644 --- a/test/3x/torch/quantization/weight_only/test_autoround.py +++ b/test/3x/torch/quantization/weight_only/test_autoround.py @@ -49,7 +49,7 @@ def setup_class(self): tokenizer = transformers.AutoTokenizer.from_pretrained( "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True ) - self.dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, n_samples=10) + self.dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=10) self.label = self.gptj(self.inp)[0] def teardown_class(self): @@ -61,7 +61,7 @@ def setup_method(self, method): @pytest.mark.parametrize("quant_lm_head", [True, False]) def test_autoround(self, quant_lm_head): fp32_model = copy.deepcopy(self.gptj) - quant_config = AutoRoundConfig(n_samples=32, seqlen=10, iters=10, scale_dtype="fp32") + quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32") if quant_lm_head is False: quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) logger.info(f"Test AutoRound with config {quant_config}") @@ -83,7 +83,7 @@ def test_autoround(self, quant_lm_head): def test_autoround_with_quantize_API(self): gpt_j_model = copy.deepcopy(self.gptj) - quant_config = AutoRoundConfig(n_samples=32, seqlen=10, iters=10, scale_dtype="fp32") + quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32") 
quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) logger.info(f"Test AutoRound with config {quant_config}") @@ -101,7 +101,7 @@ def test_autoround_with_quantize_API(self): def test_save_and_load(self): fp32_model = copy.deepcopy(self.gptj) - quant_config = AutoRoundConfig(n_samples=32, seqlen=10, iters=10, scale_dtype="fp32") + quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32") # quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) logger.info(f"Test AutoRound with config {quant_config}") @@ -133,7 +133,7 @@ def test_conv1d(self): text = "Replace me by any text you'd like." encoded_input = tokenizer(text, return_tensors="pt") out1 = model(**encoded_input)[0] - quant_config = AutoRoundConfig(n_samples=32, seqlen=10, iters=10, scale_dtype="fp32") + quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32") model = prepare(model=model, quant_config=quant_config) run_fn(model, self.dataloader) q_model = convert(model) From f94d122f8f8c6fd2d25f72219d806c2cc50fb769 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:29:16 +0800 Subject: [PATCH 02/12] update dosstring Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/autoround.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 29b45a4ac85..9a92a801fab 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -72,6 +72,10 @@ def __init__( 'bits': 4, 'group_size': 32, 'sym': False, + 'act_data_type': None, + 'act_bits': 32, + 'group_size': None, + 'sym': None, } ..., } From b99997ef133ce80739287e5b62b00614a89db8dc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 03:31:26 +0000 Subject: [PATCH 03/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/weight_only/autoround.py | 10 +++++----- .../torch/quantization/algorithm_entry.py | 4 ++-- neural_compressor/torch/quantization/config.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 9a92a801fab..228c1122c5f 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -53,7 +53,7 @@ def __init__( dynamic_max_gap: int = -1, data_type: str = "int", scale_dtype: str = "fp16", - multimodal:bool = False, + multimodal: bool = False, act_bits: int = 32, act_group_size: int = None, act_sym: bool = None, @@ -87,7 +87,7 @@ def __init__( bits (int): Number of bits for quantization (default is 4). group_size (int): Size of the quantization group (default is 128). sym (bool): Whether symmetric quantization is to be used (default is False). - + enable_full_range (bool): Whether to enable full range quantization (default is False). batch_size (int): Batch size for training (default is 8). amp (bool): Whether to use automatic mixed precision (default is True). 
@@ -194,9 +194,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): multimodal=self.multimodal, act_bits=self.act_bits, act_group_size=self.act_group_size, - act_sym = self.act_sym, - act_dynamic = self.act_dynamic, - low_cpu_mem_usage = self.low_cpu_mem_usage, + act_sym=self.act_sym, + act_dynamic=self.act_dynamic, + low_cpu_mem_usage=self.low_cpu_mem_usage, ) model, weight_config = rounder.quantize() model.autoround_config = weight_config diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 69496ae4842..91aefa39673 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -482,8 +482,8 @@ def autoround_quantize_entry( not_use_best_mse = quant_config.not_use_best_mse dynamic_max_gap = quant_config.dynamic_max_gap scale_dtype = quant_config.scale_dtype - multimodal = quant_config.multimodal - low_cpu_mem_usage = quant_config.use_layer_wise + multimodal = quant_config.multimodal + low_cpu_mem_usage = quant_config.use_layer_wise kwargs.pop("example_inputs") diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 03a5bb0745e..4d9e5cd4b9c 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -754,7 +754,7 @@ def __init__( dynamic_max_gap: int = -1, scale_dtype: str = "fp16", use_layer_wise: bool = False, - multimodal:bool = False, + multimodal: bool = False, white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, ): """Init AUTOROUND weight-only quantization config. From 7f35ffc1e6c5d0c2b6678829525a0c5199e3139d Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:46:44 +0800 Subject: [PATCH 04/12] fix config Signed-off-by: Kaihui-intel --- neural_compressor/torch/quantization/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 03a5bb0745e..7f64878f91f 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -716,8 +716,8 @@ class AutoRoundConfig(TorchBaseConfig): "minmax_lr", "iters", "seqlen", - "n_samples", - "n_blocks", + "nsamples", + "nblocks", "gradient_accumulate_steps", "not_use_best_mse", "dynamic_max_gap", @@ -778,10 +778,10 @@ def __init__( low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). iters (int): Number of iterations (default is 200). seqlen (int): Length of the sequence. - n_samples (int): Number of samples (default is 512). + nsamples (int): Number of samples (default is 512). sampler (str): The sampling method (default is "rand"). seed (int): The random seed (default is 42). - n_blocks (int): Number of blocks (default is 1). + nblocks (int): Number of blocks (default is 1). gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). not_use_best_mse (bool): Whether to use mean squared error (default is False). dynamic_max_gap (int): The dynamic maximum gap (default is -1). 
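To make the renamed knobs above concrete (n_samples becomes nsamples, n_blocks becomes nblocks), here is a minimal end-to-end sketch patterned on the updated unit tests in this series. The tiny GPT-J checkpoint and the config values are taken from those tests; the import locations for prepare/convert/get_dataloader and the calibration loop's batch handling are assumptions for illustration.

import transformers

from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# nsamples/nblocks replace the old n_samples/n_blocks spellings.
quant_config = AutoRoundConfig(nsamples=32, nblocks=1, seqlen=10, iters=10, scale_dtype="fp32")
dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=10)

model = prepare(model=model, quant_config=quant_config)
for batch in dataloader:
    # Calibration forward pass; the batch layout yielded by auto_round's calib dataset is assumed here.
    if isinstance(batch, dict):
        model(**batch)
    else:
        model(batch)
q_model = convert(model)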
From 1c2e0d032009f49667e9c3e096f1d7cd921deb7f Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:49:23 +0800 Subject: [PATCH 05/12] nsamples=128 Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/autoround.py | 2 +- neural_compressor/torch/quantization/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 228c1122c5f..19b995ccb3f 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -204,7 +204,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): return model -def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=512): +def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=128): """Generate a DataLoader for calibration using specified parameters. Args: diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 885befa8bf8..0b6fee38204 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -745,7 +745,7 @@ def __init__( low_gpu_mem_usage: bool = True, iters: int = 200, seqlen: int = 2048, - nsamples: int = 512, + nsamples: int = 128, sampler: str = "rand", seed: int = 42, nblocks: int = 1, From 6687d0fbf96b5b125e86d38fc4704c3603d53016 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 11:54:04 +0800 Subject: [PATCH 06/12] fix docstring Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/autoround.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 19b995ccb3f..562e31d6be6 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -74,8 +74,8 @@ def __init__( 'sym': False, 'act_data_type': None, 'act_bits': 32, - 'group_size': None, - 'sym': None, + 'act_sym': None, + 'act_dynamic': True, } ..., } From fe1dda1baaa71f7e16ed7152cad2773aefae5df6 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 12:15:51 +0800 Subject: [PATCH 07/12] update commit Signed-off-by: Kaihui-intel --- .azure-pipelines/scripts/ut/env_setup.sh | 2 +- .../torch/algorithms/weight_only/autoround.py | 8 ++++---- test/3x/torch/requirements.txt | 2 +- test/requirements.txt | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index d5876b07cef..84e7fc654ec 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then fi if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then - pip install auto-round + pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c fi # test deps diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 562e31d6be6..08c31de8459 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ 
b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -87,7 +87,6 @@ def __init__( bits (int): Number of bits for quantization (default is 4). group_size (int): Size of the quantization group (default is 128). sym (bool): Whether symmetric quantization is to be used (default is False). - enable_full_range (bool): Whether to enable full range quantization (default is False). batch_size (int): Batch size for training (default is 8). amp (bool): Whether to use automatic mixed precision (default is True). @@ -95,10 +94,11 @@ def __init__( lr_scheduler: The learning rate scheduler to be used. dataset (str): The default dataset name (default is "NeelNanda/pile-10k"). enable_quanted_input (bool): Whether to use the output of the previous quantized block as - the input for the current block (default is True). + the input for the current block (default is True). enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True). lr (float): The learning rate (default is None, will be set to 1.0/iters). - minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically). + minmax_lr (float): The learning rate for min-max tuning + (default is None, it will be set to lr automatically). low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). iters (int): Number of iterations (default is 200). seqlen (int): Data length of the sequence for tuning (default is 2048). @@ -111,7 +111,7 @@ def __init__( dynamic_max_gap (int): The dynamic maximum gap (default is -1). data_type (str): The data type to be used (default is "int"). scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. + have different choices. multimodal(bool): Enable multimodal model quantization, (default is "False"). act_bits (int): Number of bits for activation quantization. Default is 32. act_group_size (int): Group size for activation quantization. Default is None. 
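Since the hunk above documents the new activation-quantization switches, a short hedged sketch of how they sit on AutoRoundConfig follows. The fallback behavior noted in the comments (act_group_size and act_sym inheriting the weight settings when left as None) reflects how upstream auto-round is understood to behave and is an assumption, not something this patch states.

from neural_compressor.torch.quantization import AutoRoundConfig

# Weight-only by default: act_bits=32 leaves activations in full precision.
cfg = AutoRoundConfig(
    bits=4,
    group_size=128,
    use_sym=False,
    act_bits=32,          # lower this (e.g. to 8) to also quantize activations
    act_group_size=None,  # assumed to fall back to the weight group_size upstream
    act_sym=None,         # assumed to follow the weight use_sym setting upstream
    act_dynamic=True,     # dynamic activation scales when activations are quantized
    multimodal=False,     # opt in to the multimodal calibration path
)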
diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index bdf99d92cf0..ea647ff587f 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -1,4 +1,4 @@ -auto_round +auto_round @ git+https://github.com/intel/auto-round.git@c7751c49853eb3497e7df9b92dc4733a80df34a0 expecttest intel_extension_for_pytorch numpy diff --git a/test/requirements.txt b/test/requirements.txt index 3a24001cfd2..386b3ee4f2b 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,6 +1,6 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate==0.21.0 -auto-round +auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c dynast==1.6.0rc1 horovod intel-extension-for-pytorch From f2aa279a66b0ae3dc54d229eefb2d597c29c3d12 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 04:18:21 +0000 Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/weight_only/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 08c31de8459..6f5a022cfee 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -97,7 +97,7 @@ def __init__( the input for the current block (default is True). enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True). lr (float): The learning rate (default is None, will be set to 1.0/iters). - minmax_lr (float): The learning rate for min-max tuning + minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically). low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). iters (int): Number of iterations (default is 200). 
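The reflowed docstring above spells out the learning-rate defaulting chain. As a quick worked example of that rule (plain local variables standing in for the quantizer attributes, not the attributes themselves):

# lr defaults to 1.0 / iters when left as None, and minmax_lr then follows lr.
iters = 200
lr = None
minmax_lr = None

effective_lr = lr if lr is not None else 1.0 / iters              # 0.005
effective_minmax_lr = minmax_lr if minmax_lr is not None else effective_lr
print(effective_lr, effective_minmax_lr)                          # 0.005 0.005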
From c69f79eb2a6ce5ca88da468b40bd212b5d333d58 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 13:06:12 +0800 Subject: [PATCH 09/12] update commit version Signed-off-by: Kaihui-intel --- test/3x/torch/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index ea647ff587f..cc1ee22fe83 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -1,4 +1,4 @@ -auto_round @ git+https://github.com/intel/auto-round.git@c7751c49853eb3497e7df9b92dc4733a80df34a0 +auto_round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c expecttest intel_extension_for_pytorch numpy From d7da8a60b6e5c71edd125431cacdb61840a07a26 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 13:18:00 +0800 Subject: [PATCH 10/12] reset 2x ut commit Signed-off-by: Kaihui-intel --- test/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/requirements.txt b/test/requirements.txt index 386b3ee4f2b..3a24001cfd2 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,6 +1,6 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate==0.21.0 -auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c +auto-round dynast==1.6.0rc1 horovod intel-extension-for-pytorch From 1324d049cc0253733a8a09537e9fab63db4d40b4 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 16:17:10 +0800 Subject: [PATCH 11/12] update 2x version Signed-off-by: Kaihui-intel --- .azure-pipelines/scripts/ut/env_setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index 84e7fc654ec..d5876b07cef 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then fi if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then - pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c + pip install auto-round fi # test deps From 10dd0cbfe4b529c70fe20419b8d8302dbe392587 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 21:57:15 +0800 Subject: [PATCH 12/12] upodate autotune config Signed-off-by: Kaihui-intel --- neural_compressor/torch/quantization/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index c358d39c604..70907f91c64 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -1538,7 +1538,7 @@ def get_woq_tuning_config() -> list: the list of WOQ quant config. """ RTN_G32ASYM = RTNConfig(use_sym=False, group_size=32) - AUTO_ROUND_CONFIG = AutoRoundConfig(use_sym=False, group_size=32) + AUTO_ROUND_CONFIG = AutoRoundConfig(use_sym=False, group_size=32, seqlen=512) GPTQ_G32ASYM = GPTQConfig(use_sym=False, group_size=32) AWQ_G32ASYM = AWQConfig(use_sym=False, group_size=32) return [RTN_G32ASYM, AUTO_ROUND_CONFIG, GPTQ_G32ASYM, AWQ_G32ASYM]
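The final patch above trims the AutoRound entry of get_woq_tuning_config() to seqlen=512. To close, a hedged sketch of how that tuning list is typically consumed through the 3.x autotune flow; the model choice, the eval_fn body, and the exact autotune/TuningConfig keyword names are assumptions for illustration rather than part of this series.

import transformers

from neural_compressor.torch.quantization import TuningConfig, autotune, get_woq_tuning_config

model = transformers.AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-GPTJForCausalLM")

def eval_fn(q_model) -> float:
    # Placeholder: return the task metric for the candidate quantized model (higher is better).
    return 1.0

# The config set now tries RTN, AutoRound (seqlen=512), GPTQ, then AWQ, all with group_size=32 and asym weights.
tune_config = TuningConfig(config_set=get_woq_tuning_config())
best_model = autotune(model=model, tune_config=tune_config, eval_fn=eval_fn)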