From e586fd568f2599edaf1ec771c4a16eb4948224e3 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Fri, 10 Oct 2025 02:54:22 -0400
Subject: [PATCH 1/7] better help printing information

Signed-off-by: n1ck-guo
---
 auto_round/__main__.py      | 354 ++++++++++++------------------
 auto_round/eval/eval_cli.py |  34 ++--
 2 files changed, 132 insertions(+), 256 deletions(-)

diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index 07bc3f273..195f61c8f 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -27,45 +27,35 @@
     set_cuda_visible_devices,
 )
 
+RECIPES = {
+    "default": {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsample": 128, "lr": None},
+    "best": {"batch_size": 8, "iters": 1000, "seqlen": 2048, "nsample": 512, "lr": None},
+    "light": {"batch_size": 8, "iters": 50, "seqlen": 2048, "nsample": 128, "lr": 5e-3},
+    "fast": {"batch_size": 4, "iters": 200, "seqlen": 512, "nsample": 128, "lr": None},
+}
 
-class BasicArgumentParser(argparse.ArgumentParser):
 
+class BasicArgumentParser(argparse.ArgumentParser):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.add_argument(
+        basic = self.add_argument_group("basic arguments")
+        basic.add_argument(
             "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path"
         )
-
-        self.add_argument("--mllm", action="store_true", help="whether to quant multi-modal model.")
-
-        self.add_argument("--eval", action="store_true", help="whether to use eval only mode")
-
-        self.add_argument(
+        basic.add_argument(
             "--scheme",
             default="W4A16",
             type=str,
             # choices=["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"],
             help="quantization scheme",
         )
-
-        self.add_argument("--bits", default=None, type=int, help="number of weight bits")
-        self.add_argument("--group_size", default=None, type=int, help="group size")
-        self.add_argument("--asym", action="store_true", help="whether to use asym quantization")
-        self.add_argument("--data_type", "--dtype", default=None, help="data type for tuning, 'int', 'mx_fp' and etc")
-        self.add_argument("--act_bits", default=None, type=int, help="activation bits")
-        self.add_argument("--act_group_size", default=None, type=int, help="activation group size")
-        self.add_argument(
-            "--super_group_size", default=None, type=int, help="the number of super group size when use double quant."
+        basic.add_argument("--batch_size", "--train_bs", "--bs", default=None, type=int, help="train batch size")
+        basic.add_argument("--iters", "--iter", default=None, type=int, help="iteration to tune each block")
+        basic.add_argument(
+            "--seqlen", "--seq_len", default=None, type=int, help="sequence length of the calibration samples"
         )
-
-        self.add_argument(
-            "--super_bits", default=None, type=int, help="number of scale and mins quant bits for double quant."
-        )
-        self.add_argument("--act_data_type", "--act_dtype", default=None, type=str, help="activation data type")
-
-        self.add_argument("--disable_act_dynamic", action="store_true", help="activation static quantization")
-
-        self.add_argument(
+        basic.add_argument("--nsamples", "--nsample", default=None, type=int, help="number of samples")
+        basic.add_argument(
             "--device_map",
             "--device",
             "--devices",
             default="0",
             type=str,
             help="the device to be used for tuning. "
             "Currently, device settings support CPU, GPU, and HPU."
             "The default is set to cuda:0,"
             "allowing for automatic detection and switch to HPU or CPU."
"set --device 0,1,2 to use multiple cards.", ) - - self.add_argument( + basic.add_argument( "--dataset", default="NeelNanda/pile-10k", type=str, help="the dataset for quantization training" ) + basic.add_argument("--seed", default=42, type=int, help="random seed") + basic.add_argument("--adam", action="store_true", help="whether to use adam optimizer instead of SignSGD") + basic.add_argument("--low_gpu_mem_usage", action="store_true", help="offload intermediate features to cpu") + basic.add_argument("--format", default="auto_round", type=str, help="the format to save the model") + basic.add_argument( + "--output_dir", default="./tmp_autoround", type=str, help="the directory to save quantized model" + ) + basic.add_argument( + "--not_use_best_mse", + action="store_true", + help="whether to use the iter of best mes loss in the tuning phase", + ) + basic.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch compile") - self.add_argument( + tuning = self.add_argument_group("tuning arguments") + tuning.add_argument( + "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically" + ) + tuning.add_argument( "--minmax_lr", default=None, type=float, help="minmax learning rate, if None, it will beset to be the same with lr", ) - - self.add_argument( + tuning.add_argument( "--mem_per_param_scale", default=13, type=float, help="Scale factor for memory per parameter, used to adjust memory usage estimation for tuning", ) - - self.add_argument("--seed", default=42, type=int, help="random seed") - - self.add_argument("--adam", action="store_true", help="whether to use adam optimizer instead of SignSGD") - - self.add_argument("--gradient_accumulate_steps", default=1, type=int, help="gradient accumulate steps") - - self.add_argument("--nblocks", default=1, type=int, help="how many blocks to tune together") - - self.add_argument("--low_gpu_mem_usage", action="store_true", help="offload intermediate features to cpu") - - self.add_argument("--format", default="auto_round", type=str, help="the format to save the model") - - self.add_argument( + tuning.add_argument("--gradient_accumulate_steps", default=1, type=int, help="gradient accumulate steps") + tuning.add_argument("--nblocks", default=1, type=int, help="how many blocks to tune together") + tuning.add_argument( "--scale_dtype", default="fp16", choices=["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"], help="scale data type to use for quantization", ) - - self.add_argument( - "--output_dir", default="./tmp_autoround", type=str, help="the directory to save quantized model" - ) - - self.add_argument("--disable_amp", action="store_true", help="disable amp") - - self.add_argument( + tuning.add_argument("--disable_amp", action="store_true", help="disable amp") + tuning.add_argument( "--disable_minmax_tuning", action="store_true", help="whether to disable enable weight minmax tuning" ) - - self.add_argument("--enable_norm_bias_tuning", action="store_true", help="whether to enable norm bias tuning") - - self.add_argument( - "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code" - ) - - self.add_argument( + tuning.add_argument("--enable_norm_bias_tuning", action="store_true", help="whether to enable norm bias tuning") + tuning.add_argument( "--disable_quanted_input", action="store_true", help="whether to disuse the output of quantized block to tune the next block", ) - - self.add_argument("--quant_lm_head", action="store_true", 
help="whether to quant lm_head") - - self.add_argument( - "--low_cpu_mem_mode", - default=0, - type=int, - choices=[0, 1, 2], - help="choose which low cpu memory mode to use. " - "Can significantly reduce cpu memory footprint but cost more time." - "1 means choose block-wise mode, load the weights of each block" - " from disk when tuning and release the memory of the block after tuning." - "2 means choose layer-wise mode, load the weights of each layer from disk when tuning," - " minimum memory consumption and also slowest running speed." - "others means not use low cpu memory. Default to 0, not use low cpu memory.", - ) - - self.add_argument( - "--low_cpu_mem_tmp_dir", - default=None, - type=str, - help="temporary work space to store the temporary files " - "when using low cpu memory mode. Will remove after tuning.", - ) - - self.add_argument( - "--model_dtype", - default=None, - type=str, - choices=["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"], - help="force to convert the dtype, some backends supports fp16 dtype better", - ) - - self.add_argument( - "--fp_layers", default="", type=str, help="list of Layer names to maintain original data type" - ) - - self.add_argument( - "--not_use_best_mse", - action="store_true", - help="whether to use the iter of best mes loss in the tuning phase", - ) - - self.add_argument( + tuning.add_argument( "--to_quant_block_names", default=None, type=str, help="Names of quantitative blocks, please use commas to separate them.", ) - - self.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch compile") - - self.add_argument("--enable_alg_ext", action="store_true", help="whether to enable probably better algorithm") - - self.add_argument( + tuning.add_argument("--enable_alg_ext", action="store_true", help="whether to enable probably better algorithm") + tuning.add_argument( "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms.", ) - self.add_argument( + tuning.add_argument( "--enable_deterministic_algorithms", action="store_true", help="enable torch deterministic algorithms." ) - - self.add_argument( + tuning.add_argument( "--disable_opt_rtn", action="store_true", help="whether to disable optimization of the RTN mode(iters=0) (default is False).", ) - ## ======================= MLLM ======================= - self.add_argument( - "--quant_nontext_module", - action="store_true", - help="whether to quantize non-text module, e.g. 
vision component", + scheme = self.add_argument_group("scheme arguments") + scheme.add_argument("--bits", default=None, type=int, help="number of weight bits") + scheme.add_argument("--group_size", default=None, type=int, help="group size") + scheme.add_argument("--asym", action="store_true", help="whether to use asym quantization") + scheme.add_argument("--data_type", "--dtype", default=None, help="data type for tuning, 'int', 'mx_fp' and etc") + scheme.add_argument("--act_bits", default=None, type=int, help="activation bits") + scheme.add_argument("--act_group_size", default=None, type=int, help="activation group size") + scheme.add_argument("--act_data_type", "--act_dtype", default=None, type=str, help="activation data type") + scheme.add_argument("--disable_act_dynamic", action="store_true", help="activation static quantization") + scheme.add_argument("--quant_lm_head", action="store_true", help="whether to quant lm_head") + scheme.add_argument( + "--fp_layers", default="", type=str, help="list of Layer names to maintain original data type" ) - self.add_argument( - "--extra_data_dir", - default=None, - type=str, - help="dataset dir for storing images/audio/videos. " - "Can be a dir path or multiple dir path with format as " - "'image=path_to_image,video=path_to_video,audio=path_to_audio'" - "By default, it will search in the relative path, " - "and if not find, will automatic download.", + gguf = self.add_argument_group("double quant arguments") + gguf.add_argument( + "--super_group_size", default=None, type=int, help="the number of super group size when use double quant." ) - - self.add_argument( - "--template", - default=None, - type=str, - help="the template for building training dataset. It can be a custom one.", + gguf.add_argument( + "--super_bits", default=None, type=int, help="number of scale and mins quant bits for double quant." ) ## ======================= eval ======================= - self.add_argument( + eval_args = self.add_argument_group("eval arguments") + eval_args.add_argument( + "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code" + ) + eval_args.add_argument( "--tasks", "--task", nargs="?", @@ -240,10 +175,8 @@ def __init__(self, *args, **kwargs): default=None, help="lm-eval tasks", ) - - self.add_argument("--eval_bs", default=None, type=int, help="batch size in evaluation") - - self.add_argument( + eval_args.add_argument("--eval_bs", default=None, type=int, help="batch size in evaluation") + eval_args.add_argument( "--limit", type=float, default=None, @@ -251,106 +184,46 @@ def __init__(self, *args, **kwargs): help="Limit the number of examples per task. " "If <1, limit is a percentage of the total number of examples.", ) - - self.add_argument("--eval_task_by_task", action="store_true", help="whether to eval task by task.") - - self.add_argument( + eval_args.add_argument("--eval_task_by_task", action="store_true", help="whether to eval task by task.") + eval_args.add_argument( "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation." 
) - -def setup_parser(): - parser = BasicArgumentParser() - - parser.add_argument("--batch_size", "--train_bs", "--bs", default=8, type=int, help="train batch size") - - parser.add_argument("--iters", "--iter", default=200, type=int, help="iteration to tune each block") - - parser.add_argument( - "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples" - ) - - parser.add_argument("--nsamples", "--nsample", default=128, type=int, help="number of samples") - - parser.add_argument( - "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically" - ) - - args = parser.parse_args() - return args - - -def setup_best_parser(): - parser = BasicArgumentParser() - - parser.add_argument("--batch_size", "--train_bs", "--bs", default=8, type=int, help="train batch size") - - parser.add_argument("--iters", "--iter", default=1000, type=int, help="iterations to tune each block") - - parser.add_argument( - "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples" - ) - - parser.add_argument("--nsamples", "--nsample", default=512, type=int, help="number of samples") - - parser.add_argument( - "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically" - ) - - args = parser.parse_args() - args.low_gpu_mem_usage = True - - return args - - -def setup_light_parser(): - parser = BasicArgumentParser() - - parser.add_argument("--batch_size", "--train_bs", "--bs", default=8, type=int, help="train batch size") - - parser.add_argument("--iters", "--iter", default=50, type=int, help="iterations to tune each block") - - parser.add_argument( - "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples" - ) - - parser.add_argument("--nsamples", "--nsample", default=128, type=int, help="number of samples") - - parser.add_argument( - "--lr", default=5e-3, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically" - ) - - args = parser.parse_args() - - return args + ## ======================= MLLM ======================= + mllm_args = self.add_argument_group("Multimodal Large Language Model(MLLM) arguments") + mllm_args.add_argument( + "--mllm", action="store_true", help="deprecated, auto_round can auto detect and use mllm mode." + ) + mllm_args.add_argument( + "--quant_nontext_module", + action="store_true", + help="whether to quantize non-text module, e.g. vision component", + ) + mllm_args.add_argument( + "--extra_data_dir", + default=None, + type=str, + help="dataset dir for storing images/audio/videos. " + "Can be a dir path or multiple dir path with format as " + "'image=path_to_image,video=path_to_video,audio=path_to_audio'" + "By default, it will search in the relative path, " + "and if not find, will automatic download.", + ) + mllm_args.add_argument( + "--template", + default=None, + type=str, + help="the template for building training dataset. 
It can be a custom one.", + ) -def setup_fast_parser(): +def setup_parser(recipe="default"): + recipe = RECIPES[recipe] parser = BasicArgumentParser() - - parser.add_argument("--batch_size", "--train_bs", "--bs", default=4, type=int, help="train batch size") - - parser.add_argument("--iters", default=200, type=int, help="iterations to tune each block") - - parser.add_argument( - "--seqlen", "--seq_len", default=512, type=int, help="sequence length of the calibration samples" - ) - - parser.add_argument("--nsamples", "--nsample", default=128, type=int, help="number of samples") - - parser.add_argument( - "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically" - ) - - args = parser.parse_args() - - return args - - -def setup_eval_parser(): - - parser = EvalArgumentParser() args = parser.parse_args() + for k, v in recipe.items(): + if getattr(args, k) is None: + setattr(args, k, v) return args @@ -652,6 +525,12 @@ def tune(args): print("evaluation running time=%ds" % (time.time() - st)) +def setup_eval_parser(): + parser = EvalArgumentParser() + args = parser.parse_args() + return args + + def run_eval(): args = setup_eval_parser() if args.eval_task_by_task: @@ -676,23 +555,18 @@ def run(): tune(args) -def run_mllm(): - sys.argv.append("--mllm") - run() - - def run_best(): - args = setup_best_parser() + args = setup_parser("best") tune(args) def run_light(): - args = setup_light_parser() + args = setup_parser("light") tune(args) def run_fast(): - args = setup_fast_parser() + args = setup_parser("fast") tune(args) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index f66d93316..de76305c0 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -74,27 +74,29 @@ def __init__(self, *args, **kwargs): choices=["hf", "vllm"], help="Use hf backend for evaluation by default.", ) + # vllm related arguments - self.add_argument("--revision", default=None, type=str, help="model revision for vllm") - self.add_argument("--tokenizer", default=None, type=str, help="tokenizer to use with vllm") - self.add_argument( + vllm_args = self.add_argument_group("vllm backend arguments") + vllm_args.add_argument("--revision", default=None, type=str, help="model revision for vllm") + vllm_args.add_argument("--tokenizer", default=None, type=str, help="tokenizer to use with vllm") + vllm_args.add_argument( "--tokenizer_mode", default="auto", type=str, help="tokenizer mode for vllm (e.g. 
auto/fast/slow)" ) - self.add_argument("--tokenizer_revision", default=None, type=str, help="tokenizer revision for vllm") - self.add_argument("--add_bos_token", action="store_true", help="add BOS token when using vllm") - self.add_argument("--prefix_token_id", default=None, type=int, help="prefix token id for vllm") - self.add_argument("--tensor_parallel_size", default=1, type=int, help="tensor parallel size for vllm") - self.add_argument("--data_parallel_size", default=1, type=int, help="data parallel size for vllm") - self.add_argument("--quantization", default=None, type=str, help="quantization setting for vllm") - self.add_argument("--max_gen_toks", default=256, type=int, help="max generation tokens for vllm") - self.add_argument("--swap_space", default=4, type=float, help="swap space (GB) for vllm") - self.add_argument("--max_batch_size", default=None, type=int, help="max batch size for vllm") - self.add_argument("--max_length", default=None, type=int, help="max generation length for vllm") - self.add_argument("--max_model_len", default=None, type=int, help="maximum model sequence length for vllm") - self.add_argument( + vllm_args.add_argument("--tokenizer_revision", default=None, type=str, help="tokenizer revision for vllm") + vllm_args.add_argument("--add_bos_token", action="store_true", help="add BOS token when using vllm") + vllm_args.add_argument("--prefix_token_id", default=None, type=int, help="prefix token id for vllm") + vllm_args.add_argument("--tensor_parallel_size", default=1, type=int, help="tensor parallel size for vllm") + vllm_args.add_argument("--data_parallel_size", default=1, type=int, help="data parallel size for vllm") + vllm_args.add_argument("--quantization", default=None, type=str, help="quantization setting for vllm") + vllm_args.add_argument("--max_gen_toks", default=256, type=int, help="max generation tokens for vllm") + vllm_args.add_argument("--swap_space", default=4, type=float, help="swap space (GB) for vllm") + vllm_args.add_argument("--max_batch_size", default=None, type=int, help="max batch size for vllm") + vllm_args.add_argument("--max_length", default=None, type=int, help="max generation length for vllm") + vllm_args.add_argument("--max_model_len", default=None, type=int, help="maximum model sequence length for vllm") + vllm_args.add_argument( "--gpu_memory_utilization", default=0.9, type=float, help="target GPU memory utilization for vllm" ) - self.add_argument("--lora_local_path", default=None, type=str, help="local LoRA path for vllm") + vllm_args.add_argument("--lora_local_path", default=None, type=str, help="local LoRA path for vllm") def _eval_init(tasks, model_path, device, disable_trust_remote_code=False, dtype="auto"): From b9f9d358bf167bb9096d94b35300e8cd04172594 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 10 Oct 2025 03:27:42 -0400 Subject: [PATCH 2/7] fix Signed-off-by: n1ck-guo --- auto_round/__main__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 195f61c8f..b4e1a7dd9 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -28,10 +28,10 @@ ) RECIPES = { - "default": {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsample": 128, "lr": None}, - "best": {"batch_size": 8, "iters": 1000, "seqlen": 2048, "nsample": 512, "lr": None}, - "light": {"batch_size": 8, "iters": 50, "seqlen": 2048, "nsample": 128, "lr": 5e-3}, - "fast": {"batch_size": 4, "iters": 200, "seqlen": 512, "nsample": 128, "lr": None}, + "default": {"batch_size": 8, 
"iters": 200, "seqlen": 2048, "nsamples": 128, "lr": None}, + "best": {"batch_size": 8, "iters": 1000, "seqlen": 2048, "nsamples": 512, "lr": None}, + "light": {"batch_size": 8, "iters": 50, "seqlen": 2048, "nsamples": 128, "lr": 5e-3}, + "fast": {"batch_size": 4, "iters": 200, "seqlen": 512, "nsamples": 128, "lr": None}, } From f142ebfc1079d1f6d7d7fae3b2b2050ea7bdf9a3 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 10 Oct 2025 03:36:06 -0400 Subject: [PATCH 3/7] fix Signed-off-by: n1ck-guo --- auto_round/__main__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index b4e1a7dd9..d2af7934a 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -389,11 +389,6 @@ def tune(args): model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101 tokenizer = autoround.tokenizer # pylint: disable=E1101 - if args.low_cpu_mem_mode == 1 or args.low_cpu_mem_mode == 2: - import shutil - - shutil.rmtree(args.low_cpu_mem_tmp_dir, ignore_errors=True) - model.eval() clear_memory() From 203f46ebed85a410addc22ebd854a52ec480fc44 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 13 Oct 2025 00:54:46 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 9764cf45c..f95edac2b 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -234,7 +234,7 @@ def __init__(self, *args, **kwargs): type=str, help="the template for building training dataset. It can be a custom one.", ) - + ## ======================= diffusion model eval ======================= diffusion_args = self.add_argument_group("diffusion model arguments") diffusion_args.add_argument("--prompt_file", default=None, type=str, help="the prompt file to load prmpt.") From 6fdc8dfe4d683cd6c4f8b2076e55d5285e9c7f5b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 12 Oct 2025 21:43:36 -0400 Subject: [PATCH 5/7] refine the help information Signed-off-by: n1ck-guo --- auto_round/__main__.py | 363 +++++++++++++++++++++++------------- auto_round/eval/eval_cli.py | 37 +++- 2 files changed, 259 insertions(+), 141 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 9764cf45c..9ae0ae1c9 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -38,127 +38,252 @@ class BasicArgumentParser(argparse.ArgumentParser): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - basic = self.add_argument_group("basic arguments") + basic = self.add_argument_group("Basic Arguments") basic.add_argument( - "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path" + "--model", + "--model_name", + "--model_name_or_path", + default="facebook/opt-125m", + help="Path to the pre-trained model or model identifier from huggingface.co/models. " + "Examples: 'facebook/opt-125m', 'bert-base-uncased', or local path like '/path/to/model'", ) basic.add_argument( "--scheme", default="W4A16", type=str, # choices=["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"], - help="quantization scheme", + help="Quantization scheme to use. " + "W4A16: 4-bit weights with 16-bit activations (default). 
" + "Other options include W2A16, W3A16, W8A16 for different bit widths, " + "and MXFP4/MXFP8/NVFP4 for different data type.", ) - basic.add_argument("--batch_size", "--train_bs", "--bs", default=None, type=int, help="train batch size") - basic.add_argument("--iters", "--iter", default=None, type=int, help="iteration to tune each block") basic.add_argument( - "--seqlen", "--seq_len", default=None, type=int, help="sequence length of the calibration samples" + "--batch_size", + "--train_bs", + "--bs", + default=None, + type=int, + help="The batch size for tuning/calibration." + "Larger batch sizes may improve stability but require more memory.", + ) + basic.add_argument( + "--iters", + "--iter", + default=None, + type=int, + help="Number of iterations to tune each block. " + "More iterations may lead to better quantization quality but take longer.", + ) + basic.add_argument( + "--seqlen", + "--seq_len", + default=None, + type=int, + help="Sequence length of the calibration samples" + "Longer sequences capture more context but use more memory.", + ) + basic.add_argument( + "--nsamples", + "--nsample", + default=None, + type=int, + help="Number of calibration samples to use for quantization.", ) - basic.add_argument("--nsamples", "--nsample", default=None, type=int, help="number of samples") basic.add_argument( "--device_map", "--device", "--devices", default="0", type=str, - help="the device to be used for tuning. " + help="The device to be used for tuning. " "Currently, device settings support CPU, GPU, and HPU." "The default is set to cuda:0," "allowing for automatic detection and switch to HPU or CPU." "set --device 0,1,2 to use multiple cards.", ) basic.add_argument( - "--dataset", default="NeelNanda/pile-10k", type=str, help="the dataset for quantization training" + "--dataset", + default="NeelNanda/pile-10k", + type=str, + help="Calibration dataset for quantization. " + "Should be a dataset from huggingface datasets or local path. ", ) - basic.add_argument("--seed", default=42, type=int, help="random seed") - basic.add_argument("--adam", action="store_true", help="whether to use adam optimizer instead of SignSGD") - basic.add_argument("--low_gpu_mem_usage", action="store_true", help="offload intermediate features to cpu") - basic.add_argument("--format", default="auto_round", type=str, help="the format to save the model") + basic.add_argument("--seed", default=42, type=int, help="Random seed for reproducibility.") + basic.add_argument("--adam", action="store_true", help="Use Adam optimizer instead of SignSGD.") basic.add_argument( - "--output_dir", default="./tmp_autoround", type=str, help="the directory to save quantized model" + "--low_gpu_mem_usage", + action="store_true", + help="Enable memory-efficient mode by offloading intermediate features to CPU. " + "Useful when working with large models that don't fit in GPU memory.", + ) + basic.add_argument( + "--format", + default="auto_round", + type=str, + help="Output format for the quantized model." "'auto_round' is the recommended format", + ) + basic.add_argument( + "--output_dir", + default="./tmp_autoround", + type=str, + help="Directory to save the quantized model and related files", ) basic.add_argument( "--not_use_best_mse", action="store_true", - help="whether to use the iter of best mes loss in the tuning phase", + help="Disable using the iteration with best MSE loss during tuning.", + ) + basic.add_argument( + "--enable_torch_compile", action="store_true", help="Enable PyTorch compilation for faster execution. 
" ) - basic.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch compile") - tuning = self.add_argument_group("tuning arguments") + tuning = self.add_argument_group("Tuning Arguments") tuning.add_argument( - "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically" + "--lr", + default=None, + type=float, + help="Learning rate for tuning. " "If None, automatically sets to 1.0/iters. ", ) tuning.add_argument( "--minmax_lr", default=None, type=float, - help="minmax learning rate, if None, it will beset to be the same with lr", + help="Learning rate specifically for min-max tuning. " "If None, uses the same value as --lr. ", ) tuning.add_argument( "--mem_per_param_scale", default=13, type=float, - help="Scale factor for memory per parameter, used to adjust memory usage estimation for tuning", + help="Memory scaling factor for parameter memory estimation. " + "Adjust this if you need to control memory usage during tuning. " + "Lower values reduce memory usage but may affect accuracy.", + ) + tuning.add_argument( + "--gradient_accumulate_steps", + default=1, + type=int, + help="Number of steps to accumulate gradients before updating weights. " + "Effectively increases batch size without requiring more GPU memory. " + "Useful for large models with limited memory.", + ) + tuning.add_argument( + "--nblocks", + default=1, + type=int, + help="Number of blocks to tune simultaneously. " + "Higher values may speed up tuning but require more memory. " + "Recommended to keep at 1 for stability with large models.", ) - tuning.add_argument("--gradient_accumulate_steps", default=1, type=int, help="gradient accumulate steps") - tuning.add_argument("--nblocks", default=1, type=int, help="how many blocks to tune together") tuning.add_argument( "--scale_dtype", default="fp16", choices=["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"], - help="scale data type to use for quantization", + help="Data type for quantization scales. " + "fp16/bf16: lower memory, fp32: higher precision. " + "Choose based on your hardware support and accuracy requirements.", + ) + tuning.add_argument( + "--disable_amp", + action="store_true", + help="Disable Automatic Mixed Precision (AMP). " + "AMP speeds up training but may affect numerical stability in some cases.", ) - tuning.add_argument("--disable_amp", action="store_true", help="disable amp") tuning.add_argument( - "--disable_minmax_tuning", action="store_true", help="whether to disable enable weight minmax tuning" + "--disable_minmax_tuning", + action="store_true", + help="Disable weight min-max range tuning. " + "Not recommended as it may significantly reduce quantization accuracy.", + ) + tuning.add_argument( + "--enable_norm_bias_tuning", action="store_true", help="Enable normalization layer bias tuning. " ) - tuning.add_argument("--enable_norm_bias_tuning", action="store_true", help="whether to enable norm bias tuning") tuning.add_argument( "--disable_quanted_input", action="store_true", - help="whether to disuse the output of quantized block to tune the next block", + help="Use original (non-quantized) inputs for each block instead of quantized outputs from previous blocks. ", ) tuning.add_argument( "--to_quant_block_names", default=None, type=str, - help="Names of quantitative blocks, please use commas to separate them.", + help="Specific blocks to quantize, separated by commas. " + "Example: 'block1,block2,block3'. 
" + "If None, all blocks will be quantized.", + ) + tuning.add_argument( + "--enable_alg_ext", + action="store_true", + help="Enable experimental algorithms that may provide better quantization results. " + "These are newer methods that might improve accuracy but are less tested.", ) - tuning.add_argument("--enable_alg_ext", action="store_true", help="whether to enable probably better algorithm") tuning.add_argument( "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms.", ) tuning.add_argument( - "--enable_deterministic_algorithms", action="store_true", help="enable torch deterministic algorithms." + "--enable_deterministic_algorithms", + action="store_true", + help="Enable PyTorch deterministic algorithms for reproducible results. ", ) tuning.add_argument( "--disable_opt_rtn", action="store_true", - help="whether to disable optimization of the RTN mode(iters=0) (default is False).", - ) - - scheme = self.add_argument_group("scheme arguments") - scheme.add_argument("--bits", default=None, type=int, help="number of weight bits") - scheme.add_argument("--group_size", default=None, type=int, help="group size") - scheme.add_argument("--asym", action="store_true", help="whether to use asym quantization") - scheme.add_argument("--data_type", "--dtype", default=None, help="data type for tuning, 'int', 'mx_fp' and etc") - scheme.add_argument("--act_bits", default=None, type=int, help="activation bits") - scheme.add_argument("--act_group_size", default=None, type=int, help="activation group size") - scheme.add_argument("--act_data_type", "--act_dtype", default=None, type=str, help="activation data type") - scheme.add_argument("--disable_act_dynamic", action="store_true", help="activation static quantization") - scheme.add_argument("--quant_lm_head", action="store_true", help="whether to quant lm_head") + help="Disable optimization for RTN (Round-To-Nearest) mode when iters=0. " + "RTN is fast but less accurate; keeping optimization enabled is recommended.", + ) + + scheme = self.add_argument_group("Scheme Arguments") + scheme.add_argument("--bits", default=None, type=int, help="Number of bits for weight quantization. ") + scheme.add_argument("--group_size", default=None, type=int, help="Group size for weight quantization.") + scheme.add_argument("--asym", action="store_true", help="Use asymmetric quantization instead of symmetric.") + scheme.add_argument( + "--data_type", + "--dtype", + default=None, + help="Data type for quantization. Options: 'int' for integer, 'mx_fp' for mixed floating-point, etc.", + ) + scheme.add_argument( + "--act_bits", + default=None, + type=int, + help="Number of bits for activation quantization. " + "Activation quantization significantly impacts performance and accuracy.", + ) + scheme.add_argument( + "--act_group_size", + default=None, + type=int, + help="Group size for activation quantization. " "Similar to weight group size but for activations.", + ) + scheme.add_argument( + "--act_data_type", "--act_dtype", default=None, type=str, help="Data type for activation quantization. " + ) + scheme.add_argument( + "--disable_act_dynamic", action="store_true", help="Use static instead of dynamic activation quantization. " + ) + scheme.add_argument( + "--quant_lm_head", + action="store_true", + help="Quantize the lm_head. 
" "Usually kept in higher precision for better output quality.", + ) scheme.add_argument( - "--fp_layers", default="", type=str, help="list of Layer names to maintain original data type" + "--fp_layers", + default="", + type=str, + help="List of layer names to keep in original precision (not quantized). " + "Useful for preserving critical layers. Separate multiple names with commas.", ) - gguf = self.add_argument_group("double quant arguments") + gguf = self.add_argument_group("Double Quant Arguments") gguf.add_argument( - "--super_group_size", default=None, type=int, help="the number of super group size when use double quant." + "--super_group_size", default=None, type=int, help="Super group size for double quantization." ) gguf.add_argument( - "--super_bits", default=None, type=int, help="number of scale and mins quant bits for double quant." + "--super_bits", + default=None, + type=int, + help="Number of bits for scale and zero-point quantization in double quantization. ", ) ## ===================== diffusion model ================== @@ -166,24 +291,35 @@ def __init__(self, *args, **kwargs): "--guidance_scale", default=7.5, type=float, + help="Classifier-free guidance scale for diffusion models. " + "Higher values (7-20) make the model follow the prompt more closely. " + "Lower values give more creative/random results.", ) self.add_argument( "--num_inference_steps", default=50, type=int, + help="Number of denoising steps in the diffusion process. " + "More steps (50-100) usually give better quality but take longer. " + "Fewer steps (10-30) are faster but lower quality.", ) self.add_argument( "--generator_seed", default=None, type=int, + help="Random seed for image generation reproducibility. " + "Using the same seed produces identical results across runs.", ) ## ======================= eval ======================= eval_args = self.add_argument_group("eval arguments") eval_args.add_argument( - "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code" + "--disable_trust_remote_code", + action="store_true", + help="Disable trusting remote code when loading models. " + "Use for security if you don't trust the model source.", ) eval_args.add_argument( "--tasks", @@ -192,130 +328,93 @@ def __init__(self, *args, **kwargs): const="lambada_openai,hellaswag,winogrande,piqa,mmlu,wikitext,truthfulqa_mc1," "openbookqa,boolq,arc_easy,arc_challenge", default=None, - help="lm-eval tasks", + help="LM-Evaluation-Harness tasks to run. " + "Specify specific tasks like 'mmlu,wikitext' for custom evaluation.", ) - eval_args.add_argument("--eval_bs", default=None, type=int, help="batch size in evaluation") + eval_args.add_argument("--eval_bs", default=None, type=int, help="Batch size for evaluation.") eval_args.add_argument( "--limit", type=float, default=None, metavar="N|0 Date: Sun, 12 Oct 2025 22:13:16 -0400 Subject: [PATCH 6/7] fix Signed-off-by: n1ck-guo --- auto_round/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 9ae0ae1c9..5a1ecd2bf 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -389,7 +389,7 @@ def __init__(self, *args, **kwargs): "--prompt_file", default=None, type=str, - hhelp="File containing prompts for evaluation, one per line. " + help="File containing prompts for evaluation, one per line. 
" "Use this for batch evaluation with multiple prompts.", ) diffusion_args.add_argument( From 65ad4a7e47268ca873e0a0e1269c1c325d74aa23 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 13 Oct 2025 22:01:14 -0400 Subject: [PATCH 7/7] fix Signed-off-by: n1ck-guo --- auto_round/__main__.py | 5 +++-- auto_round/eval/eval_cli.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 5a1ecd2bf..242498afb 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -200,7 +200,8 @@ def __init__(self, *args, **kwargs): tuning.add_argument( "--disable_quanted_input", action="store_true", - help="Use original (non-quantized) inputs for each block instead of quantized outputs from previous blocks. ", + help="Use original (non-quantized) inputs for each block instead of" + " quantized outputs from previous blocks. ", ) tuning.add_argument( "--to_quant_block_names", @@ -339,7 +340,7 @@ def __init__(self, *args, **kwargs): metavar="N|0