
Commit 873114a

enable regex quantization config saving for mixed bits (#825)
* enable dynamic quantization config saving
* fix typo
* rebase code, refine config saving
* refine UT
* fix UT
* enable hf loading for regex, add UTs
* refine export, enhance gptq UT
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci

Signed-off-by: Zhang, Weiwei1 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent a00fcc6 commit 873114a

File tree: 12 files changed (+984, -56 lines)


auto_round/compressors/base.py (5 additions, 2 deletions)

@@ -439,7 +439,7 @@ def _gen_auto_scheme(
                 "Please save the model using the `fake` format for now."
             )
 
-        layer_config, self.has_qlayer_outside_block = set_layer_config(
+        layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config(
             self.model,
             self.layer_config,
             self.scheme,
@@ -1653,7 +1653,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
             enable_gguf_official_mixed = True
         else:
             enable_gguf_official_mixed = False
-        self.layer_config, self.has_qlayer_outside_block = set_layer_config(
+        self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config(
            self.model,
            self.layer_config,
            self.scheme,
@@ -2937,6 +2937,8 @@ def save_quantized(
                 "Support for exporting activation quantization is limited. "
                 "Please ensure that your configuration is supported."
             )
+        # if format == "llm_compressor" and (is_nv_fp(self.data_type) or is_mx_fp(self.data_type)):
+        #     format = format.replace("llm_compressor", f"llm_compressor:{self.data_type}")
         if format == "llm_compressor" and (is_nv_fp(self.data_type) or is_mx_fp(self.data_type)):
             format = format.replace("llm_compressor", f"llm_compressor:{self.data_type}")
         if format == "llm_compressor" and is_static_wfp8afp8(self):
@@ -2985,6 +2987,7 @@ def save_quantized(
             "act_data_type",
             "super_bits",
             "super_group_size",
+            "regex_config",
         ]
         if isinstance(self.dataset, str):
             serialization_keys.append("dataset")
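
With `regex_config` now returned by set_layer_config and added to the serialization keys, pattern-keyed mixed-bits overrides survive saving. A minimal usage sketch (the model name, pattern, and bit widths below are illustrative, constructor arguments may differ across auto-round versions, and the exact pattern syntax is whatever set_layer_config accepts):

from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"  # placeholder model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Mixed bits: default W4, bump a matching subset of layers to 8 bits via a pattern key.
layer_config = {
    "model.decoder.layers.*.self_attn.k_proj": {"bits": 8},  # illustrative pattern
}

ar = AutoRound(model, tokenizer, bits=4, group_size=128, layer_config=layer_config)
ar.quantize()
# Pattern-keyed overrides are exported under quantization_config["regex_config"]
# (or folded into format-specific fields, see the exporters below).
ar.save_quantized("./opt-125m-w4-mixed", format="auto_round")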

auto_round/export/export_to_autogptq/export.py (131 additions, 33 deletions)

@@ -17,6 +17,8 @@
 import json
 import os
 from concurrent.futures import ThreadPoolExecutor
+from dataclasses import fields
+from typing import Any, Dict
 
 import threadpoolctl as tctl
 
@@ -48,16 +50,28 @@
 
 import auto_round.export.export_to_autogptq.qlinear_triton
 from auto_round.export.utils import save_model
+from auto_round.schemes import QuantizationScheme
+
+GPTQ_REQUIRED_CONFIG_KEYS = (
+    "bits",
+    "group_size",
+    "sym",
+)
+
 from auto_round.logger import logger
 from auto_round.utils import (
     SUPPORTED_LAYER_TYPES,
+    check_start_with_block_name,
     check_to_quantized,
     copy_python_files_from_model_cache,
     filter_quantization_config,
     get_autogptq_packing_qlinear,
     get_block_names,
     get_module,
+    json_serialize,
+    matches_any_regex,
     set_module,
+    to_standard_regex,
 )
 
 BLOCK_PATTERNS = [ ## copy from transformers optimum
@@ -66,6 +80,54 @@
     "gpt_neox.layers",
     "model.layers",
 ]
+from auto_round.export.export_to_autoround.utils import check_neq_config
+
+
+def convert_to_autogptq_dynamic(regex_config: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
+    """
+    Convert AutoRound-style regex_config into AutoGPTQ-style QuantizerConfig.dynamic.
+
+    Rules:
+      - bits < 16  -> quantize      -> positive match `+:regex`
+      - bits == 16 -> skip quantize -> negative match `-:regex`
+    """
+    converted = {}
+    for name, cfg in regex_config.items():
+        bits = cfg.get("bits")
+        regex = to_standard_regex(name)
+
+        if bits is None:
+            continue  # ignore invalid entries
+        elif bits < 16:
+            converted[f"+:{regex}"] = {"bits": bits}
+            for key in GPTQ_REQUIRED_CONFIG_KEYS:  # only save keys gptq supported
+                converted[f"+:{regex}"][key] = regex_config[name][key]
+        else:
+            # skip quantization
+            converted[f"-:{regex}"] = {}
+    return converted
+
+
+def convert_from_autogptq_dynamic(dynamic_config: dict) -> dict:
+    """
+    Convert AutoGPTQ-style QuantizerConfig.dynamic into AutoRound-style extra_config.
+
+    Rules:
+      - '+:regex' => quantize      => keep bits and other quantization keys
+      - '-:regex' => skip quantize => set bits to 16 (FP16 passthrough)
+    """
+    converted = {}
+    for name, cfg in dynamic_config.items():
+        # Strip the +: or -:
+        if name.startswith("+:"):
+            regex = name[2:]
+            # keep all config fields (bits, group_size, sym, etc.)
+            converted[regex] = dict(cfg)
+        elif name.startswith("-:"):
+            regex = name[2:]
+            # mark skipped layers with bits=16
+            converted[regex] = {"bits": 16, "act_bits": 16}
+    return converted
 
 
 def pack_layer(name, model, backend, device=None):
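
Taken together, the two helpers are near-inverses on the pattern keys. The mapping they implement, shown as plain data (pattern strings are left un-normalized here for readability; in practice each key is first passed through to_standard_regex):

# AutoRound-style regex_config: pattern -> per-layer scheme overrides
regex_config = {
    "model.layers.*.self_attn.k_proj": {"bits": 8, "group_size": 128, "sym": True},
    "lm_head": {"bits": 16, "group_size": 128, "sym": True},
}

# convert_to_autogptq_dynamic(regex_config) produces GPTQ "dynamic" entries:
dynamic = {
    "+:model.layers.*.self_attn.k_proj": {"bits": 8, "group_size": 128, "sym": True},  # quantize
    "-:lm_head": {},                                                                    # skip
}

# convert_from_autogptq_dynamic(dynamic) maps them back for HF-side loading:
extra_config = {
    "model.layers.*.self_attn.k_proj": {"bits": 8, "group_size": 128, "sym": True},
    "lm_head": {"bits": 16, "act_bits": 16},
}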
@@ -132,58 +194,93 @@ def pack_layer(name, model, backend, device=None):
 def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exllamav2", **kwargs):
     """Export the model to autogptq format to easily leverage cuda kernel."""
 
+    # --- 1️⃣ Extract inputs & configs ---
     model = kwargs["model"]
-    safe_serialization = True if "safe_serialization" not in kwargs.keys() else kwargs["safe_serialization"]
-    quant_block_list = kwargs.get("quant_block_list", get_block_names(model))
-    tokenizer = kwargs.get("tokenizer", None)
-    processor = kwargs.get("processor", None)
-    device = kwargs.get("device", None)
-    image_processor = kwargs.get("image_processor", None)
-    if output_dir is not None and os.path.exists(output_dir):
-        logger.warning(f"{output_dir} already exists, this may cause model conflict")
-    if output_dir is not None and tokenizer is not None and hasattr(tokenizer, "save_pretrained"):
-        tokenizer.save_pretrained(output_dir)
-    if output_dir is not None and processor is not None:
-        processor.save_pretrained(output_dir)
-    if output_dir is not None and image_processor is not None:
-        image_processor.save_pretrained(output_dir)
-    ##check module quantized in block, this may have bug for mixed precision quantization
     quantization_config = kwargs["serialization_dict"]
+    layer_config = kwargs["layer_config"]
+    quant_block_list = kwargs.get("quant_block_list", get_block_names(model))
+    tokenizer = kwargs.get("tokenizer")
+    processor = kwargs.get("processor")
+    image_processor = kwargs.get("image_processor")
+    device = kwargs.get("device")
+    safe_serialization = kwargs.get("safe_serialization", True)
+
+    # --- Save metadata (tokenizer, processor, etc.) ---
+    if output_dir:
+        if os.path.exists(output_dir):
+            logger.warning(f"{output_dir} already exists, may cause overwrite conflicts.")
+        for comp in (tokenizer, processor, image_processor):
+            if comp is not None and hasattr(comp, "save_pretrained"):
+                comp.save_pretrained(output_dir)
+
+    # --- Handle quantization structure ---
     all_blocks = quant_block_list
-    flattened_list = [item for sublist in all_blocks for item in sublist]
-    common_prefix = os.path.commonprefix(flattened_list).rstrip(".")
-    if common_prefix not in BLOCK_PATTERNS:
-        logger.error("auto-gptq format may not support loading this quantized model")
+    flattened = [x for sub in all_blocks for x in sub]
+    common_prefix = os.path.commonprefix(flattened).rstrip(".")
+
+    if "BLOCK_PATTERNS" in kwargs and common_prefix not in kwargs["BLOCK_PATTERNS"]:
+        logger.error(f"Unsupported block prefix '{common_prefix}' for AutoGPTQ format.")
     quantization_config["block_name_to_quantize"] = common_prefix
     quantization_config.pop("to_quant_block_names", None)
 
-    ## as layers maybe already packed, we need to check in layer_config
-    layer_config = kwargs["layer_config"]
+    # --- Build per-layer dynamic overrides ---
+    regex_config = quantization_config.pop("regex_config", {})
+    block_name_to_quantize = quantization_config.get("block_name_to_quantize")
+    extra_config = {}
+    lm_head_quantized = False
+    scheme_keys = [f.name for f in fields(QuantizationScheme)]
+    for layer_name, cfg in layer_config.items():
+        bits = cfg.get("bits", 16)
+        in_blocks = cfg.get("in_blocks", False)
+        # Handle non-block layers (e.g., LM head)
+        if not in_blocks and bits <= 8:
+            lm_head_quantized = True
+            extra_config[layer_name] = {k: cfg[k] for k in GPTQ_REQUIRED_CONFIG_KEYS}
+            continue
+        # Handle block layers
+        if in_blocks or (block_name_to_quantize and check_start_with_block_name(layer_name, block_name_to_quantize)):
+            neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys})
+            if neq_keys:
+                if matches_any_regex(layer_name, regex_config):
+                    continue
+                extra_config[layer_name] = {k: cfg[k] for k in GPTQ_REQUIRED_CONFIG_KEYS}
+
+    # --- Merge regex_config + extra_config into GPTQ dynamic config ---
+    dynamic = {}
+    if regex_config:
+        dynamic.update(convert_to_autogptq_dynamic(regex_config))
+    if extra_config:
+        dynamic.update(convert_to_autogptq_dynamic(extra_config))
+    if dynamic:
+        quantization_config["dynamic"] = dynamic
+
+    # --- Block-wise quantization verification ---
     for n, m in model.named_modules():
         m.tmp_name = n
 
     all_to_quantized = True
     modules_in_block_to_quantize = []
-    for block_names in all_blocks:
-        first_block = get_module(model, block_names[0])
-        for n, m in first_block.named_modules():
-            if m.tmp_name not in layer_config.keys():
-                continue
-            if not check_to_quantized(layer_config[m.tmp_name]):
-                all_to_quantized = False
-            else:
-                modules_in_block_to_quantize.append(n)
-    modules_in_block_to_quantize = [modules_in_block_to_quantize]
+    if not dynamic:  # Only uniform precision
+        for block_names in all_blocks:
+            first_block = get_module(model, block_names[0])
+            for n, m in first_block.named_modules():
+                if m.tmp_name not in layer_config:
+                    continue
+                if not check_to_quantized(layer_config[m.tmp_name]):
+                    all_to_quantized = False
+                else:
+                    modules_in_block_to_quantize.append(n)
+        modules_in_block_to_quantize = [modules_in_block_to_quantize]
+
     if all_to_quantized:
         modules_in_block_to_quantize = None
 
-    for n, m in model.named_modules():
+    for _, m in model.named_modules():
         delattr(m, "tmp_name")
 
     if not inplace:
         model = copy.deepcopy(model.to("cpu"))
 
-    layer_config = kwargs["layer_config"]
     names = list(layer_config.keys())
     max_workers = 1
     if not torch.cuda.is_available() and not torch.xpu.is_available():
@@ -202,6 +299,7 @@ def wrapper(name):
             pass
     if output_dir is None:
         return model
+    quantization_config["lm_head"] = lm_head_quantized
     quantization_config["provider"] = "auto-round"
     quantization_config["quant_method"] = "gptq"
     quantization_config.pop("dataset", None) ## pile-10k is not supported in gptq
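
In the written-out config, the merged overrides end up under a GPTQ-style `dynamic` table next to the new `lm_head` flag; a hedged sketch of the relevant excerpt of the saved quantization config (keys taken from the code above, pattern strings again shown un-normalized):

quantization_config_excerpt = {
    "quant_method": "gptq",
    "provider": "auto-round",
    "lm_head": False,  # True when a non-block layer such as lm_head is quantized
    "dynamic": {
        "+:model.layers.*.self_attn.k_proj": {"bits": 8, "group_size": 128, "sym": True},
        "-:lm_head": {},
    },
}

Note that when `dynamic` is present, the per-block `modules_in_block_to_quantize` probe is skipped, since a mixed-precision block cannot be summarized by a single module list.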

auto_round/export/export_to_autoround/export.py (8 additions, 0 deletions)

@@ -44,6 +44,7 @@
     is_nv_fp,
     is_standard_fp,
     set_module,
+    to_standard_regex,
 )
 
 
@@ -340,8 +341,15 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
                 if cfg[key] is not None:
                     extra_config[layer_name][key] = cfg[key]
 
+    regex_config = quantization_config.pop("regex_config")
+    if regex_config is not None:
+        for name in regex_config.keys():
+            regex_name = to_standard_regex(name)
+            extra_config[regex_name] = {**{k: regex_config[name][k] for k in scheme_keys}}
+
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
+
     names = list(layer_config.keys())
     max_workers = 1
     if not torch.cuda.is_available() and not torch.xpu.is_available():
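
For the auto_round format, the pattern entries are folded directly into `extra_config`, alongside any exact-name overrides collected by the existing per-layer loop. An illustrative shape of the result (scheme field names follow QuantizationScheme, the values are made up, and the exact regex form depends on to_standard_regex):

extra_config = {
    # exact layer name, emitted by the per-layer loop
    "model.layers.0.mlp.down_proj": {"bits": 8, "group_size": 128, "sym": True},
    # normalized pattern key derived from regex_config
    r"model\.layers\..*\.self_attn\.k_proj": {"bits": 8, "group_size": 128, "sym": True},
}
# then: quantization_config["extra_config"] = extra_config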

auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py (7 additions, 0 deletions)

@@ -41,6 +41,7 @@
     is_nv_fp,
     set_amax_for_all_moe_layers,
     set_module,
+    to_standard_regex,
 )
 from auto_round.wrapper import WrapperWALayer
 
@@ -211,6 +212,12 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs):
                 if cfg[key] is not None:
                     extra_config[layer_name][key] = cfg[key]
 
+    regex_config = quantization_config.pop("regex_config")
+    if regex_config is not None:
+        for name in regex_config.keys():
+            regex_name = to_standard_regex(name)
+            extra_config[regex_name] = {**{k: regex_config[name][k] for k in scheme_keys}}
+
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())

auto_round/export/export_to_awq/export.py (7 additions, 1 deletion)

@@ -137,6 +137,7 @@ def wrapper(name):
         return model
 
     quantization_config = kwargs["serialization_dict"]
+    regex_config = quantization_config.pop("regex_config", {})  # awq do not support mixed bits config saving
 
     if output_dir is None:
         return compressed_model
@@ -145,11 +146,16 @@ def wrapper(name):
     for key in layer_config.keys():
         if not check_to_quantized(layer_config[key]) and not any(name in key for name in modules_to_not_convert):
            modules_to_not_convert.append(key)
+    for key, cfg in regex_config.items():
+        bits = cfg.get("bits")
+        if bits > 8:  # save fp_layer regexs
+            modules_to_not_convert.append(key)
+
     quantization_config["provider"] = "auto-round"
     quantization_config["quant_method"] = "awq"
     quantization_config["zero_point"] = not quantization_config["sym"]
     quantization_config["version"] = "gemm"
-    quantization_config["modules_to_not_convert"] = modules_to_not_convert
+    quantization_config["modules_to_not_convert"] = list(dict.fromkeys(modules_to_not_convert))
     ##check module quantized in block, this may have bug for mixed precision quantization
     filter_quantization_config(quantization_config)
     if hasattr(compressed_model, "config"):
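
Because the regex keys with bits > 8 are appended on top of the exact names already collected, `modules_to_not_convert` can now contain duplicates; `list(dict.fromkeys(...))` removes them while preserving insertion order, which a plain `set` would not. For example:

modules_to_not_convert = ["lm_head", "model.layers.0.mlp.gate", "lm_head"]
deduped = list(dict.fromkeys(modules_to_not_convert))
print(deduped)  # ['lm_head', 'model.layers.0.mlp.gate']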

auto_round/export/export_to_llmcompressor/export_to_fp.py (6 additions, 8 deletions)

@@ -25,6 +25,7 @@
 from tqdm import tqdm
 
 from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear
+from auto_round.export.export_to_llmcompressor.utils import generate_ignore_regex_list
 from auto_round.export.utils import save_model
 from auto_round.logger import logger
 from auto_round.utils import (
@@ -114,9 +115,8 @@ def pack_layer(name, model, backend, device=None):
         scale = layer.scale
         global_scale = getattr(layer, "weight_global_scale", None)
         input_global_scale = getattr(layer, "input_global_scale", None)
-        # zero = layer.zp
+        # zero = layer.zp # no zeros to handle, as mxfp not support asym quantization
         qlayer.pack(layer, scale, global_scale=global_scale, input_global_scale=input_global_scale, device=device)
-        ## no zeros to handle, as mxfp not support asym quantization
         qlayer.to(orig_device)
 
 
@@ -155,6 +155,9 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs):
     device = kwargs.get("device", None)
     tokenizer = kwargs.get("tokenizer", None)
     processor = kwargs.get("processor", None)
+    ar_quantization_config = kwargs["serialization_dict"]
+    regex_config = ar_quantization_config.pop("regex_config")
+    layer_config = kwargs["layer_config"]
     extra_config = {}
 
     if act_bits <= 8:
@@ -199,12 +202,7 @@ def wrapper(name):
         for _ in executor.map(wrapper, names):
             pass
 
-    # TODO fix the ignore re match issue, compile with fp8 & int8 config
-    ignore = ["lm_head"]
-    for layer_name in layer_config:
-        if layer_config[layer_name]["bits"] > 8: ## find ignore layers
-            ignore.append(layer_name)
-    ignore = list(set(ignore))
+    ignore = generate_ignore_regex_list(regex_config=regex_config, layer_config=layer_config)
 
     # get llm-compressor format config
     check_compressed_tensors_supported()
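
The inline TODO block that only collected exact layer names with bits > 8 is replaced by `generate_ignore_regex_list`, which also understands the pattern entries in regex_config. The helper itself is not part of this diff; below is a rough stand-in for the shape of its output (a list for llm-compressor's `ignore` field), assuming it mirrors the old logic plus the regex entries:

def generate_ignore_regex_list_sketch(regex_config, layer_config):
    """Illustrative stand-in, not the actual auto_round helper."""
    ignore = ["lm_head"]
    # pattern entries kept in high precision (bits > 8) are ignored by pattern
    for pattern, cfg in (regex_config or {}).items():
        if cfg.get("bits", 16) > 8:
            ignore.append(pattern)
    # exact layers left unquantized are still ignored, as before
    for layer_name, cfg in layer_config.items():
        if cfg.get("bits", 16) > 8:
            ignore.append(layer_name)
    return list(dict.fromkeys(ignore))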
