
Commit ee54507

support lwq for gptq (#1324)
* [LLM] support lwq for gptq
Signed-off-by: Guo, Heng <[email protected]>
1 parent 30142f1 commit ee54507

File tree

11 files changed: +200 −71 lines changed


docs/source/quantization_weight_only.md

Lines changed: 4 additions & 6 deletions
@@ -173,22 +173,19 @@ Large language models (LLMs) have shown exceptional performance across various t
 |:--------------:|:----------:|
 | RTN | &#10004; |
 | AWQ | &#10005; |
-| GPTQ | &#10005; |
+| GPTQ | &#10004; |
 | TEQ | &#10005; |
 
 ### Example
 ```python
 from neural_compressor import PostTrainingQuantConfig, quantization
-from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_shell
+from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model
 
-fp32_model = load_shell(model_name_or_path, AutoModelForCausalLM, torchscript=True)
+fp32_model = load_empty_model(model_name_or_path, torchscript=True)
 conf = PostTrainingQuantConfig(
     approach="weight_only",
     recipes={
         "layer_wise_quant": True,
-        "layer_wise_quant_args": {
-            "model_path": "facebook/opt-125m",
-        },
         "rtn_args": {"enable_full_range": True},
     },
 )
@@ -201,6 +198,7 @@ q_model = quantization.fit(
 )
 ouput_dir = "./saved_model"
 q_model.save(ouput_dir)
+q_model = load(ouput_dir, fp32_model, weight_only=True, layer_wise=True)
 ```
 
 ## Reference
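Since the table above now marks GPTQ as supported for layer-wise quantization, the same recipe can also drive a GPTQ run. A minimal sketch follows; the `op_type_dict` weight settings and the `gptq_args` recipe entry are assumptions about the weight-only API rather than part of this diff, so treat the exact keys as illustrative only (`model_name_or_path` and `dataloader` are defined elsewhere, as in the documented example).

```python
# Hedged sketch: layer-wise GPTQ through the weight-only PTQ config.
# The op_type_dict / gptq_args keys are assumptions, not taken from this commit.
from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model

fp32_model = load_empty_model(model_name_or_path, torchscript=True)
conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {"weight": {"bits": 4, "group_size": 128, "scheme": "sym", "algorithm": "GPTQ"}},
    },
    recipes={
        "layer_wise_quant": True,
        "gptq_args": {"percdamp": 0.01, "act_order": False, "use_max_length": True, "pad_max_length": 2048},
    },
)
q_model = quantization.fit(fp32_model, conf, calib_dataloader=dataloader)
```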

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/run-gptq-llm.py

Lines changed: 3 additions & 2 deletions
@@ -219,7 +219,7 @@ def skip(*args, **kwargs):
     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
     model = AutoModel.from_pretrained(args.model_name_or_path, trust_remote_code=True)
 else:
-    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, low_cpu_mem_usage=True, trust_remote_code=True)
 model = model.eval()
 
@@ -294,7 +294,8 @@ def skip(*args, **kwargs):
     dataloader=calib_dataloader,
     nsamples = args.nsamples,
    use_max_length = args.use_max_length,
-    pad_max_length = args.pad_max_length
+    pad_max_length = args.pad_max_length,
+    device = DEV,
 )
 
 results = lm_evaluate(
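For context, `calib_dataloader` and `DEV` in the hunk above come from earlier parts of the script that this diff does not touch. A minimal, hypothetical stand-in is sketched below; the model name, calibration texts, and batch size are placeholders, not taken from the script.

```python
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Device that the new `device=DEV` argument forwards to the GPTQ pass.
DEV = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Placeholder model; the real script uses args.model_name_or_path.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", use_fast=True)

texts = ["Layer-wise GPTQ keeps only one transformer block's weights resident at a time."] * 8
samples = [tokenizer(t, return_tensors="pt").input_ids.squeeze(0) for t in texts]
calib_dataloader = DataLoader(samples, batch_size=1, shuffle=False)
```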

neural_compressor/adaptor/pytorch.py

Lines changed: 37 additions & 14 deletions
@@ -3502,13 +3502,13 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
         ):
             from .torch_utils.layer_wise_quant import LayerWiseQuant
 
-            model_path = recipe_cfgs["layer_wise_quant_args"].get("model_path", None)
+            # model_path = recipe_cfgs["layer_wise_quant_args"].get("model_path", None)
+            model_path = model._model.path
             smooth_quant = recipe_cfgs["layer_wise_quant_args"].get("smooth_quant", False)
             alpha = recipe_cfgs["layer_wise_quant_args"].get("smooth_quant_alpha", 0.5)
-            assert (
-                model_path is not None
-            ), "the layer_wise_quant_args should have args model_path to load the weight of model."
-            device = recipe_cfgs["layer_wise_quant_args"].get("decvice", "cpu")
+            # device = recipe_cfgs["layer_wise_quant_args"].get("decvice", "cpu")
+            assert model_path is not None, "The model_path should not be None."
+            device = self.device
             lw_quant = LayerWiseQuant(
                 q_model._model,
                 model_path,
@@ -4541,14 +4541,12 @@ def rtn_quantize(self, model, tune_cfg):
         # for layer_wise quant mode
         recipe_cfgs = tune_cfg.get("recipe_cfgs", None)
         if recipe_cfgs.get("layer_wise_quant", False):
-            from neural_compressor.config import options
-
-            from .torch_utils.layer_wise_quant.utils import _get_path, load_module
+            from .torch_utils.layer_wise_quant.utils import LWQ_WORKSPACE, _get_path, load_module
 
-            lwq_workspace = os.path.join(options.workspace, "lwq_tmpdir")
-            os.makedirs(lwq_workspace, exist_ok=True)
-            model_path = recipe_cfgs["layer_wise_quant_args"].get("model_path", None)
-            assert model_path, "model_path should specify in layer_wise_quant_args."
+            os.makedirs(LWQ_WORKSPACE, exist_ok=True)
+            # model_path = recipe_cfgs["layer_wise_quant_args"].get("model_path", None)
+            model_path = model.path
+            assert model_path, "model_path should not be None."
             model_path = _get_path(model_path)
 
         for key, config in tune_cfg["op"].items():
@@ -4584,7 +4582,7 @@ def rtn_quantize(self, model, tune_cfg):
                 # save and clean weight
                 from .torch_utils.layer_wise_quant.utils import clean_module_weight
 
-                torch.save(m.state_dict(), os.path.join(lwq_workspace, f"{op_name}.pt"))
+                torch.save(m.state_dict(), os.path.join(LWQ_WORKSPACE, f"{op_name}.pt"))
                 clean_module_weight(m)
             set_module(model, op_name, m)
         if recipe_cfgs.get("layer_wise_quant", False):
@@ -4619,6 +4617,23 @@ def gptq_quantize(self, model, tune_cfg, dataloader):
             ...
         }
         """
+        # for layer_wise quant mode
+        recipe_cfgs = tune_cfg.get("recipe_cfgs", None)
+        model_path = None
+        layer_wise = False
+        if recipe_cfgs.get("layer_wise_quant", False):
+            layer_wise = True
+            from .torch_utils.layer_wise_quant.utils import LWQ_WORKSPACE, _get_path, register_weight_hooks
+
+            os.makedirs(LWQ_WORKSPACE, exist_ok=True)
+            # model_path = recipe_cfgs["layer_wise_quant_args"].get("model_path", None)
+            model_path = model.path
+            assert model_path, "model_path should not be None."
+            model_path = _get_path(model_path)
+            lwq_handles = register_weight_hooks(
+                model, model_path, device=self.device, clean_weight=True, saved_path=LWQ_WORKSPACE
+            )
+
         weight_config = {}
         for key, config in tune_cfg["op"].items():
             op_name, op_type = key
@@ -4643,7 +4658,15 @@ def gptq_quantize(self, model, tune_cfg, dataloader):
         )
         # tune_cfg => weight_config
         model, quantization_perm = gptq_quantize(
-            model, weight_config, dataloader, nsamples, use_max_length, pad_max_length, self.device
+            model,
+            weight_config,
+            dataloader,
+            nsamples,
+            use_max_length,
+            pad_max_length,
+            self.device,
+            layer_wise,
+            model_path,
         )
         return model, quantization_perm
 
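Condensed, the new layer-wise branch of the adaptor's GPTQ path does three things before calling `gptq_quantize`: resolve the checkpoint path from the shell model, create the LWQ workspace, and register hooks that materialize and then clean each layer's weights on demand. A sketch of that preparation step is below; the helper name `prepare_layer_wise_gptq` is hypothetical, while the imported utilities are the ones used in the hunks above.

```python
import os

from neural_compressor.adaptor.torch_utils.layer_wise_quant.utils import (
    LWQ_WORKSPACE,
    _get_path,
    register_weight_hooks,
)


def prepare_layer_wise_gptq(model, device="cpu"):
    """Hypothetical helper mirroring the layer-wise setup in gptq_quantize above."""
    os.makedirs(LWQ_WORKSPACE, exist_ok=True)
    # `model.path` is recorded when the empty shell model is created.
    model_path = _get_path(model.path)
    # Hooks load a layer's weights from the checkpoint right before its forward
    # pass and strip them again afterwards, keeping peak memory low.
    handles = register_weight_hooks(
        model, model_path, device=device, clean_weight=True, saved_path=LWQ_WORKSPACE
    )
    return model_path, handles
```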
neural_compressor/adaptor/torch_utils/gptq.py

Lines changed: 81 additions & 23 deletions
@@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import gc
 import math
 import random
 import re
@@ -175,6 +176,7 @@ def __init__(
         use_max_length=True,
         pad_max_length=2048,
         device=None,
+        layer_wise=False,
     ):
         """
         Args:
@@ -215,9 +217,13 @@ def __init__(
         self.check_layer_config()
 
         # device
-        self.device = model.device
+        self.device = device
+        if str(self.model.device).startswith("cuda"):
+            self.device = self.model.device
         self.is_ready = False
 
+        self.layer_wise = layer_wise
+
         # dataloader
         self.use_max_length = use_max_length
         self.pad_max_length = pad_max_length
@@ -438,11 +444,13 @@ def forward(layer, *args, **kwargs):
             raise ValueError
 
         # Step1: fetch the embeddings and other layers before the transformer stack.
-        for embedding_name, embedding_layer in self.gptq_related_blocks["embeddings"].items():
-            embedding_layer = embedding_layer.to(self.device)
+        if not self.layer_wise:
+            for embedding_name, embedding_layer in self.gptq_related_blocks["embeddings"].items():
+                embedding_layer = embedding_layer.to(self.device)
 
         # Step2: modify the first transformer block's forward function to obtain inputs for calibration
-        self.gptq_related_blocks["transformers"][0] = self.gptq_related_blocks["transformers"][0].to(self.device)
+        if not self.layer_wise:
+            self.gptq_related_blocks["transformers"][0] = self.gptq_related_blocks["transformers"][0].to(self.device)
         forward_cache = self.gptq_related_blocks["transformers"][0].forward
         self.gptq_related_blocks["transformers"][0].forward = partial(
             forward, self.gptq_related_blocks["transformers"][0]
@@ -451,7 +459,8 @@ def forward(layer, *args, **kwargs):
         # Step3: run forward to obtain calibration datasets
         logger.info("Collecting calibration inputs...")
         for batch in tqdm(self.dataloader):
-            batch = move_input_to_device(batch, self.device)
+            if not self.layer_wise:
+                batch = move_input_to_device(batch, self.device)
             try:
                 if isinstance(batch, tuple) or isinstance(batch, list):
                     self.model(batch[0])
@@ -473,9 +482,10 @@ def forward(layer, *args, **kwargs):
 
         # Step 4: restore original forward function, relocate layers back to cpu.
         self.gptq_related_blocks["transformers"][0].forward = forward_cache
-        self.gptq_related_blocks["transformers"][0] = self.gptq_related_blocks["transformers"][0].cpu()
-        for embedding_name, embedding_layer in self.gptq_related_blocks["embeddings"].items():
-            embedding_layer.to(self.device)
+        if not self.layer_wise:
+            self.gptq_related_blocks["transformers"][0] = self.gptq_related_blocks["transformers"][0].cpu()
+            for embedding_name, embedding_layer in self.gptq_related_blocks["embeddings"].items():
+                embedding_layer.to(self.device)
         torch.cuda.empty_cache()
         # end
         logger.info("GPTQ quantization prepared.")
@@ -501,7 +511,7 @@ def update_blockwise_hidden_states(self, outs):
             self.cache_positional_arguments[0] = outs[:]
 
     @torch.no_grad()
-    def execute_quantization(self, means=None, stds=None):
+    def execute_quantization(self, means=None, stds=None, model_path=None):
         """Run quantization."""
         # Step1: prepare quantization (calibration datasets)
 
@@ -513,7 +523,11 @@ def execute_quantization(self, means=None, stds=None):
         tblock_length = len(self.gptq_related_blocks["transformers"])
         for block_idx in range(tblock_length):
             logger.info(f"Quantizing layer {block_idx + 1} / {tblock_length}..")
-            transformer_block = self.gptq_related_blocks["transformers"][block_idx].to(self.device)
+            if not self.layer_wise:
+                # if we do not apply layer-wise feature, we still place the entire block on the GPU
+                transformer_block = self.gptq_related_blocks["transformers"][block_idx].to(self.device)
+            else:
+                transformer_block = self.gptq_related_blocks["transformers"][block_idx]  # .to(self.device)
             # Step2.1: obtain all layers (Linear, Conv2d, etc) in the block which can be quantized.
             sub_layers = find_layers(transformer_block)
             sub_layers_to_quant = {}
@@ -534,8 +548,16 @@ def execute_quantization(self, means=None, stds=None):
                 # weight_config_this_layer = self.weight_config.get(
                 #     self.get_full_layer_name(layer_name, block_idx), None
                 # )
-                weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx))
-                gptq_for_this_block[layer_name] = GPTQ(sub_layers[layer_name])
+                full_layer_name = self.get_full_layer_name(layer_name, block_idx)
+                weight_config_this_layer = self.get_layer_config(full_layer_name)
+                if self.layer_wise:
+                    from ..torch_utils.layer_wise_quant.utils import load_value
+
+                    W = load_value(self.model, full_layer_name + ".weight", model_path)
+                else:
+                    W = sub_layers[layer_name].weight.data.clone()
+
+                gptq_for_this_block[layer_name] = GPTQ(sub_layers[layer_name], W, self.device)
                 # gptq_for_this_block[layer_name].quantizer = Quantizer()
                 gptq_for_this_block[layer_name].quantizer.configure(
                     weight_config_this_layer["wbits"],
@@ -555,7 +577,6 @@ def tmp(_, inp, out):
             for layer_name in sub_layers:
                 handles.append(sub_layers[layer_name].register_forward_hook(add_batch(layer_name)))
             idx = self.cache_key_arguments.pop("i")
-            # import pdb;pdb.set_trace()
             for j in range(len(self.dataloader)):
                 cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
                 cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
@@ -570,12 +591,44 @@ def tmp(_, inp, out):
                 # )
                 weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx))
                 logger.info(f"Quantizing layer {layer_name}")
-                scale, zp = gptq_for_this_block[layer_name].fasterquant(
+                if self.layer_wise:
+                    from ..torch_utils.layer_wise_quant.utils import load_value
+
+                    full_layer_name = self.get_full_layer_name(layer_name, block_idx)
+                    W = load_value(self.model, full_layer_name + ".weight", model_path)
+                else:
+                    W = sub_layers[layer_name].weight.data.clone()
+                scale, zp, Q = gptq_for_this_block[layer_name].fasterquant(
+                    W,
                     blocksize=weight_config_this_layer["block_size"],
                     percdamp=weight_config_this_layer["percdamp"],
                     groupsize=weight_config_this_layer["group_size"],
                     act_order=weight_config_this_layer["act_order"],
                 )
+                if self.layer_wise:
+                    from ..torch_utils.layer_wise_quant.utils import (
+                        LWQ_WORKSPACE,
+                        clean_module_weight,
+                        load_value,
+                        set_module_tensor_to_device,
+                    )
+
+                    sub_layer = sub_layers[layer_name]
+                    full_layer_name = self.get_full_layer_name(layer_name, block_idx)
+                    for n, p in sub_layer.named_parameters():
+                        param_name = full_layer_name + "." + n
+                        if n == "weight":
+                            set_module_tensor_to_device(self.model, param_name, self.device, Q)
+                        else:
+                            value = load_value(self.model, param_name, model_path)
+                            set_module_tensor_to_device(self.model, param_name, self.device, value)
+                    # sub_layer.weight.data = Q
+                    torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt")
+                    clean_module_weight(sub_layer)
+                    del Q
+                    gc.collect()
+                else:
+                    sub_layers[layer_name].weight.data = Q
                 gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale}
                 if not weight_config_this_layer["sym"]:
                     gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp
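The layer-wise branch added above follows a fixed per-layer cycle: pull the FP32 weight from the checkpoint, quantize it, push the quantized tensor (plus any other checkpoint parameters) back onto the module, dump the module's state dict into the LWQ workspace, and finally strip the weights again. A condensed sketch of that cycle; the helper name `offload_quantized_layer` is hypothetical, while the utilities are the ones imported in the hunk.

```python
import gc

import torch

from neural_compressor.adaptor.torch_utils.layer_wise_quant.utils import (
    LWQ_WORKSPACE,
    clean_module_weight,
    load_value,
    set_module_tensor_to_device,
)


def offload_quantized_layer(model, sub_layer, full_layer_name, Q, model_path, device):
    """Hypothetical helper: persist one quantized layer and free its weights."""
    for n, _ in sub_layer.named_parameters():
        param_name = full_layer_name + "." + n
        if n == "weight":
            # The quantized weight produced by fasterquant().
            set_module_tensor_to_device(model, param_name, device, Q)
        else:
            # Biases etc. are re-loaded from the original checkpoint.
            value = load_value(model, param_name, model_path)
            set_module_tensor_to_device(model, param_name, device, value)
    torch.save(sub_layer.state_dict(), f"{LWQ_WORKSPACE}/{full_layer_name}.pt")
    clean_module_weight(sub_layer)  # drop the materialized weights again
    del Q
    gc.collect()
```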
@@ -594,7 +647,10 @@ def tmp(_, inp, out):
                 out = transformer_block(*cache_positional_batch, **cache_keyword_batch)[0]
                 outs.append(out)
             self.cache_key_arguments["i"] = idx
-            self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu()
+            if self.layer_wise:
+                self.gptq_related_blocks["transformers"][block_idx] = transformer_block
+            else:
+                self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu()
             del gptq_for_this_block
             torch.cuda.empty_cache()
             # iteratively replace the input with output, thus layerwise quantization can continue.
@@ -617,10 +673,10 @@ class GPTQ:
     GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers (https://arxiv.org/abs/2210.17323)
     """
 
-    def __init__(self, layer):
+    def __init__(self, layer, W, device="cpu"):
         self.layer = layer
-        self.device = self.layer.weight.device
-        W = layer.weight.data.clone()
+        self.device = device
+        # W = layer.weight.data.clone()
         if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
             W = W.flatten(1)
         if isinstance(self.layer, transformers.Conv1D):
@@ -661,8 +717,9 @@ def add_batch(self, inp, out):
         # self.H += 2 / self.nsamples * inp.matmul(inp.t())
         self.H += inp.matmul(inp.t())  # H = X*X, which should be a sysm matrix
 
-    def fasterquant(self, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False):
-        W = self.layer.weight.data.clone()
+    def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False):
+        # W = self.layer.weight.data.clone()
+        weight_shape, weight_dtype = W.shape, W.data.dtype
         if isinstance(self.layer, nn.Conv2d):
             W = W.flatten(1)
         if isinstance(self.layer, transformers.Conv1D):
@@ -740,7 +797,7 @@ def fasterquant(self, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False):
         # logger.info(f"{torch.sum((self.layer(self.inp1) - self.out1) ** 2)}")
         # logger.info(f"{torch.sum(Losses)}")
 
-        if self.device != torch.device("cpu"):
+        if str(self.device).startswith("cuda"):
             torch.cuda.synchronize()
         logger.info(f"time {(time.time() - tick)}")
         logger.info(f"error {torch.sum(Losses).item()}")
@@ -751,7 +808,8 @@ def fasterquant(self, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False):
 
         if isinstance(self.layer, transformers.Conv1D):
             Q = Q.t()
-        self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
+        # self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
+        Q = Q.reshape(weight_shape).to(weight_dtype)
         if DEBUG:
             logger.info(f"{torch.sum((self.layer(self.inp1) - self.out1) ** 2)}")
 
@@ -760,7 +818,7 @@ def fasterquant(self, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False):
             zero.append(self.quantizer.zero)
         scale = torch.cat(scale, dim=1)
         zero = torch.cat(zero, dim=1)
-        return scale, zero
+        return scale, zero, Q
 
     def free(self):
         if DEBUG:
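After this refactor the GPTQ object no longer reads or writes `layer.weight` on its own: the caller supplies `W` (either cloned from the module or loaded lazily via `load_value`), and `fasterquant` returns the quantized tensor `Q` for the caller to place. A standalone sketch for one `nn.Linear`; the `configure` keyword names are assumed to follow the reference GPTQ quantizer and are not shown in this diff.

```python
import torch
import torch.nn as nn

from neural_compressor.adaptor.torch_utils.gptq import GPTQ

layer = nn.Linear(64, 64)
W = layer.weight.data.clone()            # the caller now owns W
gptq = GPTQ(layer, W, device="cpu")
# Assumed keyword names (perchannel/sym/mse) from the reference GPTQ quantizer.
gptq.quantizer.configure(4, perchannel=True, sym=False, mse=False)

# Accumulate the Hessian from (here: random) calibration activations.
for _ in range(8):
    gptq.add_batch(torch.randn(2, 64), None)

scale, zero, Q = gptq.fasterquant(W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False)
layer.weight.data = Q                    # writing Q back is now the caller's job
```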

neural_compressor/adaptor/torch_utils/layer_wise_quant/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -15,5 +15,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Torch layer-wise quantization module."""
-from .utils import load_shell
+from .utils import load_empty_model
 from .quantize import LayerWiseQuant
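Callers now import `load_empty_model` from the package root, matching the documentation change above. A minimal usage sketch ("facebook/opt-125m" is just an example checkpoint):

```python
from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model

# Builds a weight-less shell of the model; the actual tensors are streamed in
# later, layer by layer, from the checkpoint at this path.
fp32_model = load_empty_model("facebook/opt-125m", torchscript=True)
```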

neural_compressor/adaptor/torch_utils/layer_wise_quant/quantize.py

Lines changed: 2 additions & 2 deletions
@@ -40,7 +40,7 @@
     update_module,
 )
 
-TMP_DIR = os.path.join(default_workspace, "layer_wise_quant_tmp_dir")
+TMP_DIR = os.path.join(default_workspace, "lwq_tmpdir")
 
 
 def mk_tmp_dir():
@@ -92,7 +92,7 @@ def __init__(
         alpha=0.5,
     ):
         """Init LayerWiseQuant."""
-        # self.q_model = load_shell(pretrained_model_name_or_path, cls)
+        # self.q_model = load_empty_model(pretrained_model_name_or_path, cls)
         self.q_model = q_model
         self.fp32_model = deepcopy(self.q_model)
         self.path = _get_path(pretrained_model_name_or_path)
