From a9efdeccf14d866c4ee13c189d1915e76dcb90b2 Mon Sep 17 00:00:00 2001 From: changwangss Date: Sun, 25 Aug 2024 18:15:32 -0700 Subject: [PATCH 01/18] add quantize function for transformers-like api Signed-off-by: changwangss --- .../transformers/quantization/utils.py | 611 ++++++++++++++++++ .../transformers/utils/quantization_config.py | 401 ++++++++++++ .../transformers/utils/utility.py | 17 + 3 files changed, 1029 insertions(+) create mode 100644 neural_compressor/transformers/quantization/utils.py create mode 100644 neural_compressor/transformers/utils/quantization_config.py create mode 100644 neural_compressor/transformers/utils/utility.py diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py new file mode 100644 index 00000000000..4ea2d63ce73 --- /dev/null +++ b/neural_compressor/transformers/quantization/utils.py @@ -0,0 +1,611 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Intel Neural Compressor model convert.""" + +import gc +import logging +import math +import os +from ....tools.utils import _ipex_version +from accelerate import init_empty_weights +from datasets import load_dataset +from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear as WeightOnlyLinear +from neural_compressor.torch.quantization import ( + GPTQConfig, + RTNConfig, + convert, + prepare, +) +from neural_compressor.utils.utility import LazyImport +from transformers import AutoTokenizer + +from neural_compressor.torch.utils import is_ipex_available +if is_ipex_available(): + import intel_extension_for_pytorch as ipex + +from ...utils import CpuInfo + +torch = LazyImport("torch") + + +logger = logging.getLogger(__name__) + +def convert_dtype_str2torch(str_dtype): + if str_dtype == "int8": + return torch.int8 + elif str_dtype == "fp32" or str_dtype == "auto": + return torch.float + elif str_dtype == "fp16": + return torch.float16 + elif str_dtype == "bf16": + return torch.bfloat16 + else: + assert False, "Unsupported str dtype {} to torch dtype".format(str_dtype) + + +def convert_dtype_torch2str(dtype): + if dtype == torch.int8: + return "int8" + elif dtype == torch.float: + return "fp32" + elif dtype == torch.float16: + return "fp16" + elif dtype == torch.bfloat16: + return "bf16" + elif isinstance(dtype, str) and dtype in ["int8", "fp32", "fp16", "bf16"]: + return dtype + else: + assert False, "Unsupported pytorch dtype {} to str dtype".format(dtype) + +def replace_linear( + model, + modules_to_not_convert=None, + current_key_name=None, + quantization_config=None, + device="cpu", + empty_weights=False, +): + if modules_to_not_convert is None: + # output_layer is chatglm last layer name + # embed_out is dolly_v2 last layer name + modules_to_not_convert = [] + if quantization_config.llm_int8_skip_modules: + modules_to_not_convert.extend( + quantization_config.llm_int8_skip_modules + ) + modules_to_not_convert = list(set(modules_to_not_convert)) + 
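+    # _replace_linear below walks the module tree recursively and swaps every
+    # eligible torch.nn.Linear / WeightOnlyLinear for a device-specific
+    # quantized linear, skipping anything listed in modules_to_not_convert.
+    # convert_to_quantized_model calls this entry point as
+    # replace_linear(model, None, None, config, device=device).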
model, is_replaced = _replace_linear( + model, + modules_to_not_convert, + current_key_name, + quantization_config, + device=device, + empty_weights=empty_weights, + ) + + if not is_replaced: + logger.warning( + "You are loading your model in 8bit or 4bit but no linear modules were found in your model." + " Please double check your model architecture, or submit an issue on github if you think this is" + " a bug." + ) + + return model + + +def _replace_linear( + model, + modules_to_not_convert=None, + current_key_name=None, + quantization_config=None, + is_replaced=False, + device="cpu", + empty_weights=False, +): + """Private method that wraps the recursion for module replacement. + + Returns the converted model and a boolean that indicates if the conversion has been successfully or not. + """ + for name, module in model.named_children(): + if current_key_name is None: + current_key_name = [] + current_key_name.append(name) + is_removed = False + use_optimum_format = getattr(module, "use_optimum_format", False) + + if ( + isinstance(module, torch.nn.Linear) + or isinstance(module, WeightOnlyLinear) + or ( + is_ipex_available() + and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear) + ) + ) and (name not in modules_to_not_convert): + # Check if the current key is not in the `modules_to_not_convert` + if not any( + key in ".".join(current_key_name) for key in modules_to_not_convert + ): + with init_empty_weights(): + in_features = module.in_features + out_features = module.out_features + if ( + device == "cpu" + or device == torch.device("cpu") + or device == "auto" + ): + if is_ipex_available() and quantization_config.use_ipex: + from intel_extension_for_pytorch.nn.modules import ( + WeightOnlyQuantizedLinear as ipex_linear, + ) + from intel_extension_for_pytorch.utils.weight_only_quantization import ( + _convert_optimum_format_to_desired, + ) + + qweight, scales, qzeros = ( + _convert_optimum_format_to_desired( + module.qweight, module.scales, module.qzeros + ) + ) + + weight_dtype = { + 4: ipex.quantization.WoqWeightDtype.INT4, + 8: ipex.quantization.WoqWeightDtype.INT8, + } + compute_dtype = { + "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. 
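+                                # This mapping is passed as lowp_mode below and
+                                # selects the compute precision of the IPEX
+                                # weight-only kernels; the packed weights stay
+                                # int4/int8 regardless.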
+ "bf16": ipex.quantization.WoqLowpMode.BF16, + "fp16": ipex.quantization.WoqLowpMode.FP16, + "int8": ipex.quantization.WoqLowpMode.INT8, + } + + ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype[quantization_config.bits], + lowp_mode=compute_dtype[ + quantization_config.compute_dtype + ], + act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + group_size=quantization_config.group_size, + ) + tmp_linear = torch.nn.Linear( + in_features, + out_features, + True if hasattr(module, "bias") else False, + ) + tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig + model._modules[name] = ( + ipex_linear.from_float_and_int4_weight( + mod=tmp_linear, + qweight=qweight, + scales=scales, + zero_points=qzeros, + bias=( + module.bias if hasattr(module, "bias") else None + ), + group_size=quantization_config.group_size, + g_idx=( + module.g_idx + if hasattr(module, "g_idx") + else None + ), + ) + ) + + elif device == "xpu" or device == torch.device("xpu"): + from intel_extension_for_pytorch.nn.utils._quantize_convert import \ + WeightOnlyQuantizedLinear as ipex_linear # pylint: disable=E0401 + model._modules[name] = ipex_linear( + in_features, + out_features, + module.bias is not None, + compute_dtype=quantization_config.compute_dtype, + compress_statistics=False, + weight_dtype=quantization_config.weight_dtype, + scale_dtype=quantization_config.scale_dtype, + blocksize=quantization_config.group_size, + scheme=quantization_config.scheme, + compression_dtype=getattr(module, "compression_dtype", + torch.int8 if _ipex_version < "2.3.10" else torch.int32), + compression_dim=getattr(module, "compression_dim", 0 if _ipex_version < "2.3.10" else 1), + device=device, + use_optimum_format=getattr(module, "use_optimum_format", + False if _ipex_version < "2.3.10" else True), + ) + if quantization_config.quant_method.value == "gptq": + g_idx = getattr( + module, + "g_idx", + torch.zeros(in_features, dtype=torch.int32).to(device), + ) + else: + g_idx = None + model._modules[name].set_scales_zps_gidx( + ( + module.scales + if hasattr(module, "scales") + else torch.ones( + ( + out_features, + math.ceil( + in_features / quantization_config.group_size + ), + ), + dtype=convert_dtype_str2torch( + quantization_config.compute_dtype + ), + device=torch.device(device), + ) if _ipex_version < "2.3.10" else torch.ones( + ( + math.ceil( + in_features / quantization_config.group_size + ), + out_features, + ), + dtype=convert_dtype_str2torch( + quantization_config.compute_dtype + ), + device=torch.device(device), + ) + ), + module.qzeros if hasattr(module, "qzeros") else None, + g_idx, + ) + else: + raise Exception( + "{} device Unsupported weight only quantization!".format( + device + ) + ) + + is_replaced = True + # Store the module class in case we need to transpose the weight later + model._modules[name].source_cls = type(module) + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) + if quantization_config.use_ipex: + pass + elif ( + device == "cpu" or device == torch.device("cpu") or device == "auto" + ): + if quantization_config.weight_dtype in [ + "fp8_e5m2", + "fp8_e4m3", + ]: + model._modules[name].set_fp_weights_bias( + module.weight.data, + None if module.bias is None else module.bias.data, + ) + else: + if quantization_config.weight_dtype in ["int4", "int4_clip", "int8"]: + int_weight, scales, zeros = unpack_weight( + module.qweight, + module.scales, + module.qzeros if hasattr(module, "qzeros") 
else None, + quantization_config, + ) + int_weight = int_weight.view(-1, int_weight.shape[-1]) + else: + int_weight = module.unpack_tensor_with_numpy(module.qweight) + scales = module.scales + zeros = module.qzeros if hasattr(module, "qzeros") else None + + model._modules[name].set_weights_bias( + int_weight, + scales, + zeros, + module.g_idx if hasattr(module, "g_idx") else None, + quantization_config, + bias=None if module.bias is None else module.bias.data, + ) + else: + if not hasattr(module, "qweight"): + n_pack = ( + (8 if _ipex_version < "2.3.10" else 32) + // DTYPE_BITS_MAPPING[quantization_config.weight_dtype] + ) + weight = torch.zeros( + (math.ceil(out_features / n_pack), in_features) if _ipex_version < "2.3.10" else + (math.ceil(in_features / n_pack), out_features), + dtype=torch.int8 if _ipex_version < "2.3.10" else torch.int32, + device=torch.device(device), + ) + model._modules[name].set_weights_bias( + module.qweight.data if hasattr(module, "qweight") else weight, + None if module.bias is None else module.bias.data, + ) + del module + gc.collect() + is_removed = True + + if not is_removed and len(list(module.children())) > 0: # pylint: disable=E1101 + _, is_replaced = _replace_linear( + module, + modules_to_not_convert, + current_key_name, + quantization_config, + is_replaced=is_replaced, + device=device, + empty_weights=empty_weights, + ) + # Remove the last key for recursion + current_key_name.pop(-1) + return model, is_replaced + + +def default_run_fn( + model, tokenizer, dataset, max_length=512, n_samples=100, batch_size=8, algo="rtn" +): + from torch.utils.data import DataLoader + + if isinstance(dataset, (str, bytes, os.PathLike)): + calib_dataset = load_dataset(dataset, split="train") + calib_dataset = calib_dataset.shuffle(seed=42) + if tokenizer is None: + logger.error("Please provide the tokenizer in quantization_config.") + exit(0) + + def tokenize_function(examples): + if algo == "teq": + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if "prompt" in examples: + if algo == "teq": + example = tokenizer( + examples["prompt"], padding="max_length", max_length=max_length + ) + else: + example = tokenizer(examples["prompt"]) + elif "code" in examples: + if algo == "teq": + example = tokenizer( + examples["code"], padding="max_length", max_length=max_length + ) + else: + example = tokenizer(examples["code"]) + elif "text" in examples: + if algo == "teq": + example = tokenizer( + examples["text"], padding="max_length", max_length=max_length + ) + else: + example = tokenizer(examples["text"]) + else: + logger.error( + "Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset." + ) + exit(0) + return example + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + tokenized_dataset = tokenized_dataset.filter(lambda x: x["input_ids"].shape[-1] >= max_length) + + def collate_batch(batch): + input_ids_padded = [] + for text in batch: + input_ids = text["input_ids"] + if len(input_ids) >= max_length: + input_ids = input_ids[:max_length] + input_ids_padded.append(input_ids) + else: + continue + assert input_ids_padded != [], \ + "The dataset does not have data that meets the required input length. Please reduce seq_len." 
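+        # Only samples with at least max_length tokens are kept (truncated to
+        # max_length), so every row in the stacked batch has the same length.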
+ return torch.vstack(input_ids_padded) + + + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=collate_batch, + ) + total_cnt = 0 + for i, (input_ids) in enumerate(calib_dataloader): + if total_cnt + input_ids.shape[0] > n_samples: + input_ids = input_ids[: n_samples - total_cnt, ...] + total_cnt += input_ids.shape[0] + if total_cnt >= n_samples: + break + + try: + model( + input_ids=input_ids, + ) + except ValueError: + pass + +@torch.no_grad() +def run_fn_for_autoround(model, dataloader): + for data in dataloader: + if isinstance(data, tuple) or isinstance(data, list): + model(*data) + elif isinstance(data, dict): + model(**data) + else: + model(data) + +def convert_to_quantized_model(model, config, device="cpu"): + if device == "xpu" or device == torch.device("xpu"): + import intel_extension_for_pytorch + + assert ( + hasattr(torch, "xpu") and torch.xpu.is_available() + ), "There is no xpu device in this system!" + + orig_dtype = torch.float32 + for param in model.parameters(): + orig_dtype = param.dtype + if orig_dtype != torch.float32: + model.to(dtype=torch.float32) + break + if config.weight_dtype in ["fp8_e4m3", "fp8_e5m2"]: + return replace_linear(model, None, None, config, device=device) + else: + if config.weight_dtype == "int8": + dtype = "int8" + elif "int4" in config.weight_dtype: + dtype = "int4" + else: + dtype = config.weight_dtype + # mapping to INC config + if config.quant_method.value == "rtn": + quant_config = RTNConfig( + dtype=dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + use_layer_wise=config.layer_wise, + ) + if config.llm_int8_skip_modules != []: + for module in config.llm_int8_skip_modules: + module_name = ".*" + module + quant_config.set_local(module_name, RTNConfig(dtype="fp32")) + logger.info(f"Do RTN algorithm with config {quant_config}") + model = prepare(model, quant_config) + model = convert(model) + elif config.quant_method.value == "awq": + quant_config = AWQConfig( + dtype=dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + use_layer_wise=config.layer_wise, + use_auto_scale=config.auto_scale, + use_auto_clip=config.auto_clip, + folding=True, + ) + if config.llm_int8_skip_modules != []: + for module in config.llm_int8_skip_modules: + module_name = ".*" + module + quant_config.set_local(module_name, AWQConfig(dtype="fp32")) + logger.info(f"Do AWQ algorithm with config {quant_config}") + run_fn = default_run_fn + run_args = ( + config.tokenizer, + config.dataset, + config.seq_len, # max_length + config.n_samples, # n_samples + config.batch_size, # batch_size + config.quant_method.value, # algo + ) + example_inputs = torch.ones([1, 512], dtype=torch.long).to(device) + model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(model, *run_args) + model = convert(model) + elif config.quant_method.value == "teq": + quant_config = TEQConfig( + dtype=dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + use_layer_wise=config.layer_wise, + absorb_to_layer=config.absorb_to_layer + ) + if config.llm_int8_skip_modules != []: + for module in config.llm_int8_skip_modules: + module_name = ".*" + module + quant_config.set_local(module_name, TEQConfig(dtype="fp32")) + logger.info(f"Do TEQ algorithm with config {quant_config}") + run_fn = default_run_fn + run_args = ( + config.tokenizer, + config.dataset, + config.seq_len, # max_length + config.n_samples, # n_samples + 
config.batch_size, # batch_size + config.quant_method.value, # algo + ) + example_inputs = torch.ones([1, 512], dtype=torch.long).to(device) + model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(model, *run_args) + model = convert(model) + + elif config.quant_method.value == "gptq": + model.seqlen = config.seq_len + quant_config = GPTQConfig( + dtype=dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + use_layer_wise=config.layer_wise, + act_order=config.desc_act, + percdamp=config.damp_percent, + block_size=config.blocksize, + static_groups=config.static_groups, + use_mse_search=config.use_mse_search, + ) + if config.llm_int8_skip_modules != []: + for module in config.llm_int8_skip_modules: + module_name = ".*" + module + quant_config.set_local(module_name, GPTQConfig(dtype="fp32")) + logger.info(f"Do GPTQ algorithm with config {quant_config}") + run_fn = default_run_fn + run_args = ( + config.tokenizer, + config.dataset, + config.seq_len, # max_length + config.n_samples, # n_samples + config.batch_size, # batch_size + config.quant_method.value, # algo + ) + model = prepare(model=model, quant_config=quant_config) + run_fn(model, *run_args) + model = convert(model) + elif config.quant_method.value == "autoround": + quant_config = AutoRoundConfig( + dtype=dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + enable_quanted_input=not config.disable_quanted_input, + lr=config.lr, + minmax_lr=config.minmax_lr, + seqlen=config.seq_len, + nsamples=config.n_samples, + iters=config.iters, + scale_dtype=config.scale_dtype, + ) + if config.llm_int8_skip_modules != []: + for module in config.llm_int8_skip_modules: + module_name = ".*" + module + quant_config.set_local(module_name, AutoRoundConfig(dtype="fp32")) + logger.info(f"Do AutoRound algorithm with config {quant_config}") + dataloader = get_autoround_dataloader(tokenizer=config.tokenizer, + seqlen=config.seq_len, + dataset_name="NeelNanda/pile-10k", + seed=42, + bs=config.batch_size, + nsamples=config.n_samples) + run_fn = run_fn_for_autoround + run_args = (dataloader,) + model = prepare(model=model, quant_config=quant_config) + run_fn(model, *run_args) + model = convert(model) + else: + assert False, "The Supported algorithm are RTN, AWQ, TEQ, GPTQ, AUTOROUND" + + if device == "xpu" or device == torch.device("xpu"): + logger.warning("The recommended ipex version is higher than 2.3.10 for xpu device.") + + model.eval() + # INC attribute conflicted with transformers when use nf4/int8 training. + del model.is_quantized + q_model = replace_linear(model, None, None, config, device=device) + + if orig_dtype != torch.float32: + q_model.to(dtype=orig_dtype) + + return q_model.to(device) diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py new file mode 100644 index 00000000000..1c14d37f5e7 --- /dev/null +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -0,0 +1,401 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Intel Neural Compressor Transformers-like Config.""" + +import os +import transformers + +from typing import Any, Dict, List, Optional, Tuple, Union +from neural_compressor.utils import logger +from neural_compressor.utils.utility import LazyImport +torch = LazyImport("torch") + +QUANT_CONFIG = "quantize_config.json" + +if transformers.__version__ >= "4.32.0": + from transformers.utils.quantization_config import QuantizationConfigMixin + QuantizationConfig = QuantizationConfigMixin +else: + from transformers import PretrainedConfig + QuantizationConfig = PretrainedConfig +from enum import Enum + +class QuantizationMethod(str, Enum): + GPTQ = "gptq" + RTN = "rtn" + +class INCQuantizationConfigMixin(QuantizationConfig): + """Mixin class for quantization config.""" + + def update(self, **kwargs): + """Updates attributes of this class instance with attributes from `kwargs` if they match existing atributtes, + returning all the unused kwargs. + + Args: + kwargs (`Dict[str, Any]`): + Dictionary of attributes to tentatively update this class. + + Returns: + `Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance. + """ + to_remove = [] + for key, value in kwargs.items(): + if hasattr(self, key): + setattr(self, key, value) + to_remove.append(key) + + # Remove all the attributes that were updated, without modifying the input dict + unused_kwargs = { + key: value for key, value in kwargs.items() if key not in to_remove + } + return unused_kwargs + + def post_init_cpu(self): + r"""Safety checker that arguments are correct.""" + + if self.compute_dtype is not None and self.compute_dtype not in [ + "fp32", + "bf16", + "int8", + ]: + raise ValueError("compute_dtype must be 'fp32', 'bf16', 'int8'.") + elif self.compute_dtype is None: + self.compute_dtype = "fp32" + + if self.bits is None: + self.bits = 4 + elif self.bits is not None and self.bits not in [4, 8]: + raise ValueError( + f"Only support quantization to [4, 8] bits but found {self.bits}" + ) + + + if self.scale_dtype is not None and self.scale_dtype not in [ + "fp32", + "bf16", + ]: + raise ValueError( + "scale_dtype must be a string in 'fp32', 'bf16' " + ) + elif self.scale_dtype is None: + self.scale_dtype = "fp32" + + if not isinstance(self.group_size, int): + raise ValueError("group_size must be a int") + + if not isinstance(self.scheme, str): + raise ValueError("scheme must be a string") + + def post_init_xpu(self): + r""" + Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. 
+ """ + + if self.compute_dtype is not None and self.compute_dtype not in ["fp16"]: + raise ValueError("compute_dtype must be 'fp16'.") + elif self.compute_dtype is None: + self.compute_dtype = "fp16" + + if self.bits is None: + self.bits = 4 + elif self.bits not in [4]: + raise ValueError( + f"Only support quantization to [4] bits but found {self.bits}" + ) + + if self.weight_dtype is None: + self.weight_dtype = "int4_fullrange" + elif self.weight_dtype == "int4": + self.weight_dtype = "int4_fullrange" + elif self.weight_dtype not in [ + "int4_fullrange", + ]: + raise ValueError( + f"weight_dtype must be a string in 'int4_fullrange', but get {self.weight_dtype}." + ) + + if self.scale_dtype is not None and self.scale_dtype not in ["fp16"]: + raise ValueError("scale_dtype must be a string in 'fp16'") + elif self.scale_dtype is None: + self.scale_dtype = "fp16" + + if not isinstance(self.group_size, int): + raise ValueError("group_size must be a int") + + if self.scheme not in ["sym"]: + raise ValueError( + "scheme: {} is not support, only support 'sym' now!".format(self.scheme) + ) + + + def to_json_file( + self, json_file_path: Union[str, os.PathLike], use_diff: bool = True + ): + """Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + """ + # set tokenizer to None due to it doesn't support write to json + if hasattr(self, "tokenizer"): + self.tokenizer = None + if hasattr(self, "calib_dataloader"): + self.calib_dataloader = None + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string(use_diff=use_diff)) + + def remove_redundant_parameters(self): + remove_parameters = [ + "calib_dataloader", + "dataset", + "calib_func", + "calib_iters", + "calib_len", + "double_quant_scale_dtype", + "use_double_quant", + "mse_range", + "scheme", + "tokenizer", + "use_ggml", + "use_neural_speed", + "use_quant", + "layer_wise", + "blocksize", + "nsamples", + "max_input_length", + "static_groups", + "lr", + "minmax_lr", + "iters", + "use_quant_input", + "device", + "calib_dataset", + "calib_pad_val", + "calib_shuffle", + "calib_padding", + "example_inputs", + "excluded_precisions", + "op_name_dict", + "op_type_dict", + "train_dataloader", + "train_func", + "train_iters", + "train_len", + "train_padding", + "train_dataset", + "train_pad_val", + "train_shuffle", + "train_batch_size", + ] + for parameter in remove_parameters: + if hasattr(self, parameter): + delattr(self, parameter) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + push_to_hub: bool = False, + **kwargs, + ): + """Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the + [`~PretrainedConfig.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the configuration JSON file will be saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
+ """ + if os.path.isfile(save_directory): + raise AssertionError( + f"Provided path ({save_directory}) should be a directory, not a file" + ) + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, QUANT_CONFIG) + + self.to_json_file(output_config_file, use_diff=False) + logger.info(f"Configuration saved in {output_config_file}") + + if push_to_hub: + self._upload_modified_files( + save_directory, + repo_id, + files_timestamps, + commit_message=commit_message, + token=kwargs.get("token", None), + ) + + @classmethod + def get_config_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + cf = kwargs.pop("_configuration_file", QUANT_CONFIG) + return super().get_config_dict( + pretrained_model_name_or_path, _configuration_file=cf, **kwargs + ) + +class RtnConfig(INCQuantizationConfigMixin): + def __init__( + self, + bits: int = 4, + group_size: int = 32, + compute_dtype: Any = None, + scale_dtype: Any = None, + sym: bool = True, + use_layer_wise: bool = False, + **kwargs, + ): + self.quant_method = QuantizationMethod.RTN + self.bits = bits + self.compute_dtype = compute_dtype + self.weight_dtype = "int4" if self.bits==4 else "int8" + self.scale_dtype = scale_dtype + self.group_size = group_size + self.use_layer_wise = use_layer_wise + self.sym = sym + self.scheme = "sym" if self.sym else "asym" + + # "transformer.output_layer" for chatglm series model. + # "embed_out" for dolly v2 series model. + self.modules_to_not_convert = kwargs.get("modules_to_not_convert", + ["lm_head", "transformer.output_layer", "embed_out"]) + self.device = kwargs.get("device", "auto") + + def to_diff_dict(self) -> Dict[str, Any]: + """Removes all attributes from config which correspond to the default config attributes + for better readability and serializes to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = RtnConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict + +class GPTQConfig(INCQuantizationConfigMixin): + def __init__( + self, + bits: int = 4, + tokenizer: Any = None, + dataset: str = "NeelNanda/pile-10k", + batch_size: int = 8, + group_size: int = 32, + compute_dtype: Any = None, + scale_dtype: Any = None, + sym: bool = True, + blocksize: int = 128, + damp_percent: float = 0.1, + desc_act: bool = False, + n_samples: int = 128, + seq_len: int = 2048, + static_groups: bool = False, + use_mse_search: bool = False, + true_sequential: bool = False, + use_layer_wise: bool = False, + **kwargs, + ): + + self.quant_method = QuantizationMethod.GPTQ + self.bits = bits + self.tokenizer = tokenizer + self.dataset = dataset + self.batch_size = batch_size + self.compute_dtype = compute_dtype + self.weight_dtype = "int4" if self.bits==4 else "int8" + self.scale_dtype = scale_dtype + self.sym = sym + self.blocksize = blocksize + self.n_samples = n_samples + self.group_size = group_size + self.damp_percent = damp_percent + self.desc_act = desc_act + self.static_groups = static_groups + self.use_mse_search = use_mse_search + self.true_sequential = true_sequential + self.use_layer_wise = use_layer_wise + self.seq_len = seq_len + self.modules_to_not_convert = kwargs.get("modules_to_not_convert", + ["lm_head", "transformer.output_layer", "embed_out"]) + self.device = kwargs.get("device", "auto") + self.scheme = "sym" if self.sym else "asym" + + if isinstance(compute_dtype, torch.dtype): + self.compute_dtype = compute_dtype + else: + self.compute_dtype = compute_dtype + + if isinstance(scale_dtype, torch.dtype): + self.scale_dtype = scale_dtype + else: + self.scale_dtype = scale_dtype + + self.post_init_gptq() + + def post_init_gptq(self): + r"""Safety checker that arguments are correct.""" + + if self.bits not in [4, 8]: + raise ValueError( + f"Only support quantization to [4, 8] bits but found {self.bits}" + ) + + if not (0 < self.damp_percent < 1): + raise ValueError("damp_percent must between 0 and 1.") + + def to_diff_dict(self) -> Dict[str, Any]: + """Removes all attributes from config which correspond to the default config attributes + for better readability and serializes to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = GPTQConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict + diff --git a/neural_compressor/transformers/utils/utility.py b/neural_compressor/transformers/utils/utility.py new file mode 100644 index 00000000000..9b4943ff1df --- /dev/null +++ b/neural_compressor/transformers/utils/utility.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Intel Neural Compressor based Transformers API utility.""" + +QUANT_CONFIG = "quantize_config.json" \ No newline at end of file From 98acdda6cc475c6d0f0b2de39ef580897ae74941 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 01:17:08 +0000 Subject: [PATCH 02/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../transformers/quantization/utils.py | 25 +++---- .../transformers/utils/quantization_config.py | 67 ++++++++----------- .../transformers/utils/utility.py | 2 +- 3 files changed, 39 insertions(+), 55 deletions(-) diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index 4ea2d63ce73..b6fc002af77 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -18,20 +18,18 @@ import logging import math import os -from ....tools.utils import _ipex_version + from accelerate import init_empty_weights from datasets import load_dataset -from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear as WeightOnlyLinear -from neural_compressor.torch.quantization import ( - GPTQConfig, - RTNConfig, - convert, - prepare, -) -from neural_compressor.utils.utility import LazyImport from transformers import AutoTokenizer +from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear as WeightOnlyLinear +from neural_compressor.torch.quantization import GPTQConfig, RTNConfig, convert, prepare from neural_compressor.torch.utils import is_ipex_available +from neural_compressor.utils.utility import LazyImport + +from ....tools.utils import _ipex_version + if is_ipex_available(): import intel_extension_for_pytorch as ipex @@ -146,9 +144,7 @@ def _replace_linear( or device == "auto" ): if is_ipex_available() and quantization_config.use_ipex: - from intel_extension_for_pytorch.nn.modules import ( - WeightOnlyQuantizedLinear as ipex_linear, - ) + from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear from 
intel_extension_for_pytorch.utils.weight_only_quantization import ( _convert_optimum_format_to_desired, ) @@ -203,8 +199,9 @@ def _replace_linear( ) elif device == "xpu" or device == torch.device("xpu"): - from intel_extension_for_pytorch.nn.utils._quantize_convert import \ - WeightOnlyQuantizedLinear as ipex_linear # pylint: disable=E0401 + from intel_extension_for_pytorch.nn.utils._quantize_convert import ( + WeightOnlyQuantizedLinear as ipex_linear, # pylint: disable=E0401 + ) model._modules[name] = ipex_linear( in_features, out_features, diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py index 1c14d37f5e7..1381293a211 100644 --- a/neural_compressor/transformers/utils/quantization_config.py +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -15,27 +15,33 @@ """Intel Neural Compressor Transformers-like Config.""" import os +from typing import Any, Dict, List, Optional, Tuple, Union + import transformers -from typing import Any, Dict, List, Optional, Tuple, Union from neural_compressor.utils import logger from neural_compressor.utils.utility import LazyImport + torch = LazyImport("torch") QUANT_CONFIG = "quantize_config.json" if transformers.__version__ >= "4.32.0": from transformers.utils.quantization_config import QuantizationConfigMixin + QuantizationConfig = QuantizationConfigMixin else: from transformers import PretrainedConfig + QuantizationConfig = PretrainedConfig from enum import Enum + class QuantizationMethod(str, Enum): GPTQ = "gptq" RTN = "rtn" + class INCQuantizationConfigMixin(QuantizationConfig): """Mixin class for quantization config.""" @@ -57,9 +63,7 @@ def update(self, **kwargs): to_remove.append(key) # Remove all the attributes that were updated, without modifying the input dict - unused_kwargs = { - key: value for key, value in kwargs.items() if key not in to_remove - } + unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove} return unused_kwargs def post_init_cpu(self): @@ -77,18 +81,13 @@ def post_init_cpu(self): if self.bits is None: self.bits = 4 elif self.bits is not None and self.bits not in [4, 8]: - raise ValueError( - f"Only support quantization to [4, 8] bits but found {self.bits}" - ) - + raise ValueError(f"Only support quantization to [4, 8] bits but found {self.bits}") if self.scale_dtype is not None and self.scale_dtype not in [ "fp32", "bf16", ]: - raise ValueError( - "scale_dtype must be a string in 'fp32', 'bf16' " - ) + raise ValueError("scale_dtype must be a string in 'fp32', 'bf16' ") elif self.scale_dtype is None: self.scale_dtype = "fp32" @@ -111,9 +110,7 @@ def post_init_xpu(self): if self.bits is None: self.bits = 4 elif self.bits not in [4]: - raise ValueError( - f"Only support quantization to [4] bits but found {self.bits}" - ) + raise ValueError(f"Only support quantization to [4] bits but found {self.bits}") if self.weight_dtype is None: self.weight_dtype = "int4_fullrange" @@ -122,9 +119,7 @@ def post_init_xpu(self): elif self.weight_dtype not in [ "int4_fullrange", ]: - raise ValueError( - f"weight_dtype must be a string in 'int4_fullrange', but get {self.weight_dtype}." 
- ) + raise ValueError(f"weight_dtype must be a string in 'int4_fullrange', but get {self.weight_dtype}.") if self.scale_dtype is not None and self.scale_dtype not in ["fp16"]: raise ValueError("scale_dtype must be a string in 'fp16'") @@ -135,14 +130,9 @@ def post_init_xpu(self): raise ValueError("group_size must be a int") if self.scheme not in ["sym"]: - raise ValueError( - "scheme: {} is not support, only support 'sym' now!".format(self.scheme) - ) - + raise ValueError("scheme: {} is not support, only support 'sym' now!".format(self.scheme)) - def to_json_file( - self, json_file_path: Union[str, os.PathLike], use_diff: bool = True - ): + def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True): """Save this instance to a JSON file. Args: @@ -224,9 +214,7 @@ def save_pretrained( Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. """ if os.path.isfile(save_directory): - raise AssertionError( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") os.makedirs(save_directory, exist_ok=True) @@ -256,9 +244,8 @@ def get_config_dict( cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs ) -> Tuple[Dict[str, Any], Dict[str, Any]]: cf = kwargs.pop("_configuration_file", QUANT_CONFIG) - return super().get_config_dict( - pretrained_model_name_or_path, _configuration_file=cf, **kwargs - ) + return super().get_config_dict(pretrained_model_name_or_path, _configuration_file=cf, **kwargs) + class RtnConfig(INCQuantizationConfigMixin): def __init__( @@ -274,7 +261,7 @@ def __init__( self.quant_method = QuantizationMethod.RTN self.bits = bits self.compute_dtype = compute_dtype - self.weight_dtype = "int4" if self.bits==4 else "int8" + self.weight_dtype = "int4" if self.bits == 4 else "int8" self.scale_dtype = scale_dtype self.group_size = group_size self.use_layer_wise = use_layer_wise @@ -283,8 +270,9 @@ def __init__( # "transformer.output_layer" for chatglm series model. # "embed_out" for dolly v2 series model. 
- self.modules_to_not_convert = kwargs.get("modules_to_not_convert", - ["lm_head", "transformer.output_layer", "embed_out"]) + self.modules_to_not_convert = kwargs.get( + "modules_to_not_convert", ["lm_head", "transformer.output_layer", "embed_out"] + ) self.device = kwargs.get("device", "auto") def to_diff_dict(self) -> Dict[str, Any]: @@ -308,6 +296,7 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict + class GPTQConfig(INCQuantizationConfigMixin): def __init__( self, @@ -337,7 +326,7 @@ def __init__( self.dataset = dataset self.batch_size = batch_size self.compute_dtype = compute_dtype - self.weight_dtype = "int4" if self.bits==4 else "int8" + self.weight_dtype = "int4" if self.bits == 4 else "int8" self.scale_dtype = scale_dtype self.sym = sym self.blocksize = blocksize @@ -350,8 +339,9 @@ def __init__( self.true_sequential = true_sequential self.use_layer_wise = use_layer_wise self.seq_len = seq_len - self.modules_to_not_convert = kwargs.get("modules_to_not_convert", - ["lm_head", "transformer.output_layer", "embed_out"]) + self.modules_to_not_convert = kwargs.get( + "modules_to_not_convert", ["lm_head", "transformer.output_layer", "embed_out"] + ) self.device = kwargs.get("device", "auto") self.scheme = "sym" if self.sym else "asym" @@ -371,9 +361,7 @@ def post_init_gptq(self): r"""Safety checker that arguments are correct.""" if self.bits not in [4, 8]: - raise ValueError( - f"Only support quantization to [4, 8] bits but found {self.bits}" - ) + raise ValueError(f"Only support quantization to [4, 8] bits but found {self.bits}") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") @@ -398,4 +386,3 @@ def to_diff_dict(self) -> Dict[str, Any]: serializable_config_dict[key] = value return serializable_config_dict - diff --git a/neural_compressor/transformers/utils/utility.py b/neural_compressor/transformers/utils/utility.py index 9b4943ff1df..f7f81c5c5df 100644 --- a/neural_compressor/transformers/utils/utility.py +++ b/neural_compressor/transformers/utils/utility.py @@ -14,4 +14,4 @@ # limitations under the License. 
"""Intel Neural Compressor based Transformers API utility.""" -QUANT_CONFIG = "quantize_config.json" \ No newline at end of file +QUANT_CONFIG = "quantize_config.json" From f7dc2ef5b9ea586730ad2eab3f6f6e36efbb405a Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 26 Aug 2024 06:17:28 -0700 Subject: [PATCH 03/18] addd save and load Signed-off-by: changwangss --- neural_compressor/__init__.py | 4 + neural_compressor/transformers/__init__.py | 1 + .../transformers/models/__init__.py | 1 + .../transformers/models/modeling_auto.py | 711 ++++++++++++++++++ .../transformers/quantization/__init__.py | 1 + .../transformers/quantization/utils.py | 552 ++++++++------ .../transformers/utils/quantization_config.py | 16 +- 7 files changed, 1054 insertions(+), 232 deletions(-) create mode 100644 neural_compressor/transformers/__init__.py create mode 100644 neural_compressor/transformers/models/__init__.py create mode 100644 neural_compressor/transformers/models/modeling_auto.py create mode 100644 neural_compressor/transformers/quantization/__init__.py diff --git a/neural_compressor/__init__.py b/neural_compressor/__init__.py index 5ee86bf561a..74ad88db685 100644 --- a/neural_compressor/__init__.py +++ b/neural_compressor/__init__.py @@ -25,6 +25,10 @@ QuantizationAwareTrainingConfig, MixedPrecisionConfig, ) +from .transformers import( + GPTQConfig, + RtnConfig +) from .contrib import * from .model import * from .metric import * diff --git a/neural_compressor/transformers/__init__.py b/neural_compressor/transformers/__init__.py new file mode 100644 index 00000000000..f2ecbd3f05a --- /dev/null +++ b/neural_compressor/transformers/__init__.py @@ -0,0 +1 @@ +from .utils.quantization_config import GPTQConfig, RtnConfig diff --git a/neural_compressor/transformers/models/__init__.py b/neural_compressor/transformers/models/__init__.py new file mode 100644 index 00000000000..b10c092933e --- /dev/null +++ b/neural_compressor/transformers/models/__init__.py @@ -0,0 +1 @@ +from .modeling_auto import _BaseQBitsAutoModelClass diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py new file mode 100644 index 00000000000..465cc171da1 --- /dev/null +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -0,0 +1,711 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +import re +import types +from threading import Thread +from typing import Union + +import torch +import transformers +from accelerate import init_empty_weights + +from transformers import AutoConfig +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import load_state_dict +from transformers.utils import has_file, is_safetensors_available +from neural_compressor.transformers.quantization.utils import save_low_bit, replace_linear +from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear + +from ..quantization.utils import ( + convert_dtype_torch2str, + replace_linear, +) +from neural_compressor.transformers import GPTQConfig, RtnConfig +from neural_compressor.utils.utility import LazyImport, CpuInfo +from neural_compressor.utils import logger +from neural_compressor.torch.utils import is_ipex_available +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME + + +torch = LazyImport("torch") + + + +def build_woq_model(model, quantization_config): + from neural_compressor.adaptor.torch_utils.util import set_module + + bits = quantization_config.bits + for n, m in model.named_modules(): + if n in quantization_config.modules_to_not_convert: + continue + if isinstance(m, torch.nn.Linear): + zp = getattr( + quantization_config, + "zero_point", + not getattr(quantization_config, "sym", False), + ) + use_optimum_format = True + with init_empty_weights(): + new_module = INCWeightOnlyLinear( + m.in_features, + m.out_features, + dtype="int4" if bits == 4 else "int8", + bits=quantization_config.bits, + group_size=quantization_config.group_size, + zp=zp, + bias=m.bias is not None, + g_idx=True, + use_optimum_format=use_optimum_format, + ) + set_module(model, n, new_module) + return model + + +class _BaseQBitsAutoModelClass: + ORIG_MODEL = None + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): + config, _ = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + return_unused_kwargs=True, + **kwargs, + ) + + if hasattr(config, "quantization_config"): + if config.quantization_config is None: + logger.warning( + "Quantization_config loading failed. If you want to load saved " + "low bit model, please check your quantizate_config.json." + ) + + else: + logger.info("quantization_config: {}".format(config.quantization_config)) + try: + model = cls.load_low_bit( + pretrained_model_name_or_path, + *model_args, + config=config, + **kwargs, + ) + logger.info("Saved low bit model loading successfully. Other input args " "will be ignored.") + return model + except Exception as e: + logger.error(e) + logger.error("Saved low bit model loading failed, please check your model.") + exit(0) + + @classmethod + def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """Load a low bit optimized model (including INT4, INT5 and INT8) from a saved ckpt. + + :param pretrained_model_name_or_path: str value, Path to load the optimized model ckpt. + # :param optimize_model: boolean value, Whether to further optimize the low_bit llm model. + # Default to be True. 
+ :return: a model instance + """ + from accelerate.big_modeling import init_empty_weights + from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code + from transformers.generation.configuration_utils import GenerationConfig + from transformers.modeling_utils import _add_variant, get_checkpoint_shard_files, no_init_weights + from transformers.models.auto.auto_factory import _get_model_class + from transformers.models.auto.configuration_auto import AutoConfig + from transformers.utils import ContextManagers, cached_file, download_url, extract_commit_hash, is_remote_url + + # Autofactory + kwargs_orig = copy.deepcopy(kwargs) + # modules_to_not_convert = kwargs.pop("modules_to_not_convert", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + # Maybe needed when extract_local_archive_file + subfolder = kwargs.pop("subfolder", "") + variant = kwargs.pop("variant", None) + offload_folder = kwargs.pop("offload_folder", None) + offload_state_dict = kwargs.pop("offload_state_dict", False) + torch_dtype = kwargs.pop("torch_dtype", "auto") + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + resume_download = kwargs.pop("resume_download", False) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + token = kwargs.pop("token", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + revision = kwargs.pop("revision", "main") + commit_hash = kwargs.pop("_commit_hash", None) + _fast_init = kwargs.pop("_fast_init", True) + device_map = kwargs.pop("device_map", "auto") + use_safetensors = kwargs.pop("use_safetensors", None) + kwarg_attn_imp = kwargs.pop("attn_implementation", None) + + # lm-eval device map is dictionary + device_map = device_map[""] if isinstance(device_map, dict) and "" in device_map else device_map + + if use_safetensors is None and not is_safetensors_available(): + use_safetensors = False + + if use_auth_token is not None: + logger.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. " + "Please use `token` instead." + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + use_cpu = True if device_map == torch.device("cpu") or device_map == "cpu" else False + use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False + + user_agent = { + "file_type": "model", + "framework": "pytorch", + "from_auto_class": from_auto_class, + } + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + config = kwargs.pop("config", None) + if kwarg_attn_imp is not None and config._attn_implementation != kwarg_attn_imp: + config._attn_implementation = kwarg_attn_imp + + quantization_config = config.quantization_config + + if quantization_config["quant_method"] == "rtn": + quantization_config = RtnConfig.from_dict(quantization_config) + elif quantization_config["quant_method"] == "gptq": + quantization_config = GPTQConfig.from_dict(quantization_config) + assert quantization_config is not None, "Detect this model is not a low-bit model." + + if commit_hash is None: + if not isinstance(config, PretrainedConfig): + # We make a call to the config file first (which may be absent) + # to get the commit hash as soon as possible. 
+ resolved_config_file = cached_file( + pretrained_model_name_or_path, + "config.json", + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + ) + commit_hash = extract_commit_hash(resolved_config_file, commit_hash) + else: + commit_hash = getattr(config, "_commit_hash", None) + + has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map + + has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, + pretrained_model_name_or_path, + has_local_code, + has_remote_code, + ) + if has_remote_code and trust_remote_code: + class_ref = config.auto_map[cls.ORIG_MODEL.__name__] + model_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs_orig) + if os.path.isdir(pretrained_model_name_or_path): + model_class.register_for_auto_class(cls.ORIG_MODEL.__name__) + else: + cls.ORIG_MODEL.register(config.__class__, model_class, exist_ok=True) + elif type(config) in cls.ORIG_MODEL._model_mapping.keys(): + model_class = _get_model_class(config, cls.ORIG_MODEL._model_mapping) + + # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the + # index of the files. + is_sharded = False + sharded_metadata = None + + if pretrained_model_name_or_path is not None: + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if is_local: + if os.path.isfile( + os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(WEIGHTS_NAME, variant), + ) + ): + # Load from a PyTorch checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(WEIGHTS_NAME, variant), + ) + elif os.path.isfile( + os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(WEIGHTS_INDEX_NAME, variant), + ) + ): + # Load from a sharded PyTorch checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(WEIGHTS_INDEX_NAME, variant), + ) + is_sharded = True + elif os.path.isfile( + os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(SAFE_WEIGHTS_NAME, variant), + ) + ): + # Load from a safetensors checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(SAFE_WEIGHTS_NAME, variant), + ) + elif os.path.isfile( + os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + ) + ): + # Load from a safetensors checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + ) + is_sharded = True + elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)): + archive_file = pretrained_model_name_or_path + is_local = True + elif is_remote_url(pretrained_model_name_or_path): + filename = pretrained_model_name_or_path + resolved_archive_file = download_url(pretrained_model_name_or_path) + else: + if use_safetensors is not False: + filename = _add_variant(SAFE_WEIGHTS_NAME, variant) + else: + filename = _add_variant(WEIGHTS_NAME, variant) + try: + # Load from URL or cache if already cached + cached_file_kwargs = { + "cache_dir": 
cache_dir, + "force_download": force_download, + "proxies": proxies, + "resume_download": resume_download, + "local_files_only": local_files_only, + "token": token, + "user_agent": user_agent, + "revision": revision, + "subfolder": subfolder, + "_raise_exceptions_for_gated_repo": False, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + } + resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) + + # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None + # result when internet is up, the repo and revision exist, but the file does not. + if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + resolved_archive_file = cached_file( + pretrained_model_name_or_path, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, + ) + if resolved_archive_file is not None: + is_sharded = True + elif use_safetensors: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or " + f"{_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} " + "and thus cannot be loaded with `safetensors`. Please make sure that the model has " + "been saved with `safe_serialization=True` or do not set `use_safetensors=True`." + ) + else: + # This repo has no safetensors file of any kind, we switch to PyTorch. + filename = _add_variant(WEIGHTS_NAME, variant) + resolved_archive_file = cached_file( + pretrained_model_name_or_path, + filename, + **cached_file_kwargs, + ) + if resolved_archive_file is None and filename == _add_variant(WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + resolved_archive_file = cached_file( + pretrained_model_name_or_path, + _add_variant(WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, + ) + if resolved_archive_file is not None: + is_sharded = True + + if resolved_archive_file is None: + # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error + # message. + has_file_kwargs = { + "revision": revision, + "proxies": proxies, + "token": token, + } + if variant is not None and has_file( + pretrained_model_name_or_path, + WEIGHTS_NAME, + **has_file_kwargs, + ): + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file without the variant" + f" {variant}. Use `variant=None` to load this model from those weights." + ) + else: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(WEIGHTS_NAME, variant)}." + ) + except EnvironmentError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + except Exception as e: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)}." 
+ ) from e + if is_local: + logger.info(f"loading weights file {archive_file}") + resolved_archive_file = archive_file + else: + logger.info(f"loading weights file {filename} from cache at {resolved_archive_file}") + else: + resolved_archive_file = None + + # We'll need to download and cache each checkpoint shard if the checkpoint is sharded. + if is_sharded: + # rsolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _commit_hash=commit_hash, + ) + + # set dtype to instantiate the model under: + # 1. If torch_dtype is not None, we use that dtype + # 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, + # by checking its first weights entry that is of a floating type + # - we assume all floating dtype weights are of the same dtype + # we also may have config.torch_dtype available, but we won't rely on it till v5 + # Pretrained Model + + dtype_orig = None + if torch_dtype is not None: + if isinstance(torch_dtype, str): + if torch_dtype == "auto": + if ( + hasattr(config, "torch_dtype") + and config.torch_dtype is not None + and config.torch_dtype != "auto" + ): + torch_dtype = config.torch_dtype + else: + if is_sharded and "dtype" in sharded_metadata: + torch_dtype = sharded_metadata["dtype"] + else: + torch_dtype = torch.float32 + else: + assert False, f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}' + + dtype_orig = model_class._set_default_torch_dtype(torch_dtype) + if quantization_config.compute_dtype is None: + if use_xpu: + quantization_config.compute_dtype = ( + "fp16" + if (torch_dtype is None or torch_dtype == torch.bfloat16) + else convert_dtype_torch2str(torch_dtype) + ) + else: + quantization_config.compute_dtype = ( + "fp32" + if ( + torch_dtype is None + or (not CpuInfo().bf16 and torch_dtype == torch.bfloat16) + or (torch_dtype == torch.float16) + ) + else convert_dtype_torch2str(torch_dtype) + ) + else: + if (not CpuInfo().bf16 and quantization_config.compute_dtype == "bf16") or ( + use_cpu and quantization_config.compute_dtype == "fp16" + ): + quantization_config.compute_dtype = "fp32" + + if quantization_config.scale_dtype is None: + quantization_config.scale_dtype = "fp32" + if quantization_config.scale_dtype not in ["fp32", "fp16", "bf16"]: + logger.warning("scale_dtype only supports fp32, bf16, fp16.") + quantization_config.scale_dtype = "fp32" + logger.warning("fp32 scale_dtype is used, please change the config.json if you don't want to use it.") + + # weight dtype is higher priority than bits in config.json when both existed. 
+ if quantization_config.weight_dtype is None: + if quantization_config.bits == 4: + if use_xpu: + quantization_config.weight_dtype = "int4_fullrange" + else: + quantization_config.weight_dtype = "int4" + logger.info( + "{} quantization weight_dtype is used due to bits is 4 in config.json.".format( + quantization_config.weight_dtype + ) + ) + elif quantization_config.bits == 8: + quantization_config.weight_dtype = "int8" + logger.info( + "{} quantization weight_dtype is used due to bits is 8 in config.json.".format( + quantization_config.weight_dtype + ) + ) + else: + logger.warning("bits number only supports 4, 8.") + quantization_config.weight_dtype = "int4" + logger.warning( + "int4 weight_dtype is used, please change the config.json if you don't want to use it." + ) + else: + if quantization_config.weight_dtype not in [ + "int4_fullrange", + "int4", + "int8", + "fp8_e5m2", + "fp8_e4m3", + "nf4", + "fp4_e2m1_bnb", + "fp4_e2m1", + ]: + logger.warning("Please provide the correct bits number or weight_dtype in config.json.") + raise ValueError( + "weight_dtype must be a string in " + "'int8', 'int4', 'int4_fullrange', 'int4', 'nf4', " + "'fp4', 'fp4_e2m1', 'fp8', 'fp8_e5m2, fp8_e4m3'" + ) + else: + logger.info("{} quantization weight_dtype is used.".format(quantization_config.weight_dtype)) + + init_contexts = [no_init_weights(_enable=_fast_init)] + init_contexts.append(init_empty_weights()) + + with ContextManagers(init_contexts): + model = model_class(config, *model_args, **kwargs) + + model = build_woq_model(model, quantization_config) + + + if is_sharded: + loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"] + else: + # Time to load the checkpoint + state_dict = load_state_dict(resolved_archive_file) + loaded_state_dict_keys = list(state_dict.keys()) + + # restore default dtype + if dtype_orig is not None: + torch.set_default_dtype(dtype_orig) + + if is_ipex_available(): + model = replace_linear( + model, + quantization_config=quantization_config, + device="cpu" if device_map == "auto" else device_map, + empty_weights=True, + ) + # if (device_map == "cpu" or device_map == torch.device("cpu")): + # import intel_extension_for_pytorch as ipex + # from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear + + # def replace_ipex_cpu_woq_linear(model, current_name=[]): + # for name, module in model.named_children(): + # current_name.append(name) + # if isinstance(module, INCWeightOnlyLinear): + # weight_dtype = { + # 4: ipex.quantization.WoqWeightDtype.INT4, + # 8: ipex.quantization.WoqWeightDtype.INT8, + # } + # compute_dtype = { + # "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. 
+ # "bf16": ipex.quantization.WoqLowpMode.BF16, + # "fp16": ipex.quantization.WoqLowpMode.FP16, + # "int8": ipex.quantization.WoqLowpMode.INT8, + # } + + # ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( + # weight_dtype=weight_dtype[quantization_config.bits], + # lowp_mode=compute_dtype[quantization_config.compute_dtype], + # act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + # group_size=quantization_config.group_size, + # ) + # tmp_linear = torch.nn.Linear( + # module.in_features, + # module.out_features, + # True if hasattr(module, "bias") else False, + # ) + # tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig + # target_linear = ipex_linear.from_float_and_int4_weight( + # mod=tmp_linear, + # qweight=state_dict.pop(".".join(current_name) + ".qweight"), + # scales=state_dict.pop(".".join(current_name) + ".scales"), + # zero_points=state_dict.pop(".".join(current_name) + ".qzeros"), + # bias=( + # state_dict.pop(".".join(current_name) + ".bias") + # if ".".join(current_name) + ".bias" in state_dict + # else None + # ), + # group_size=quantization_config.group_size, + # g_idx=( + # state_dict.pop(".".join(current_name) + ".g_idx") + # if ".".join(current_name) + ".g_idx" in state_dict + # else None + # ), + # ) + # setattr(model, name, target_linear) + # else: + # replace_ipex_cpu_woq_linear(module, current_name) + # current_name.pop() + + # replace_ipex_cpu_woq_linear(model) + # model.load_state_dict(state_dict, strict=False, assign=True) + # else: + ( + model, + missing_keys, + unexpected_keys, + mismatched_keys, + offload_index, + error_msgs, + ) = model_class._load_pretrained_model( + model, + None, + loaded_state_dict_keys, # XXX: rename? + resolved_archive_file, + pretrained_model_name_or_path, + sharded_metadata=sharded_metadata, + _fast_init=_fast_init, + low_cpu_mem_usage=True, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + keep_in_fp32_modules=[], + ) + else: + raise AssertionError("Please install intel_extension_for_pytorch.") + + # make sure token embedding weights are still tied if needed + model.tie_weights() + + # Set model in evaluation mode to deactivate DropOut modules by default + model.eval() + + # model = replace_linear( + # model, + # quantization_config=quantization_config, + # device="cpu" if device_map == "auto" else device_map, + # empty_weights=True, + # ) + + if (not use_xpu and torch_dtype == torch.float16) or ( + not use_xpu and not CpuInfo().bf16 and torch_dtype == torch.bfloat16 + ): + model.to(dtype=torch.float32) + + # If it is a model with generation capabilities, attempt to load the generation config + if model.can_generate(): + try: + model.generation_config = GenerationConfig.from_pretrained( + pretrained_model_name_or_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + **kwargs, + ) + except (OSError, TypeError): + pass + for param in model.parameters(): + param.requires_grad_(False) + if device_map == "xpu": + model = model.to("xpu") + model.quantization_config = quantization_config + model.save_pretrained = types.MethodType(save_low_bit, model) + return model + + +class AutoModelForCausalLM(_BaseQBitsAutoModelClass): + ORIG_MODEL = transformers.AutoModelForCausalLM + + +class AutoModel(_BaseQBitsAutoModelClass): + ORIG_MODEL = 
transformers.AutoModel + + +class AutoModelForSeq2SeqLM(_BaseQBitsAutoModelClass): + ORIG_MODEL = transformers.AutoModelForSeq2SeqLM \ No newline at end of file diff --git a/neural_compressor/transformers/quantization/__init__.py b/neural_compressor/transformers/quantization/__init__.py new file mode 100644 index 00000000000..f07504d8f01 --- /dev/null +++ b/neural_compressor/transformers/quantization/__init__.py @@ -0,0 +1 @@ +from .utils import convert_to_quantized_model, save_low_bit diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index b6fc002af77..f30640f70a7 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -18,10 +18,26 @@ import logging import math import os +<<<<<<< Updated upstream from accelerate import init_empty_weights from datasets import load_dataset +======= +import types +import json + +from datasets import load_dataset +from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear +from neural_compressor.torch.quantization import ( + GPTQConfig, + RTNConfig, + convert, + prepare, +) +from neural_compressor.utils.utility import LazyImport, CpuInfo +>>>>>>> Stashed changes from transformers import AutoTokenizer +from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear as WeightOnlyLinear from neural_compressor.torch.quantization import GPTQConfig, RTNConfig, convert, prepare @@ -33,7 +49,7 @@ if is_ipex_available(): import intel_extension_for_pytorch as ipex -from ...utils import CpuInfo +from typing import Union torch = LazyImport("torch") @@ -79,9 +95,9 @@ def replace_linear( # output_layer is chatglm last layer name # embed_out is dolly_v2 last layer name modules_to_not_convert = [] - if quantization_config.llm_int8_skip_modules: + if quantization_config.modules_to_not_convert: modules_to_not_convert.extend( - quantization_config.llm_int8_skip_modules + quantization_config.modules_to_not_convert ) modules_to_not_convert = list(set(modules_to_not_convert)) model, is_replaced = _replace_linear( @@ -121,11 +137,9 @@ def _replace_linear( current_key_name = [] current_key_name.append(name) is_removed = False - use_optimum_format = getattr(module, "use_optimum_format", False) - if ( isinstance(module, torch.nn.Linear) - or isinstance(module, WeightOnlyLinear) + or isinstance(module, INCWeightOnlyLinear) or ( is_ipex_available() and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear) @@ -135,6 +149,7 @@ def _replace_linear( if not any( key in ".".join(current_key_name) for key in modules_to_not_convert ): +<<<<<<< Updated upstream with init_empty_weights(): in_features = module.in_features out_features = module.out_features @@ -274,56 +289,149 @@ def _replace_linear( pass elif ( device == "cpu" or device == torch.device("cpu") or device == "auto" +======= + in_features = module.in_features + out_features = module.out_features + if ( + device == "cpu" + or device == torch.device("cpu") + or device == "auto" +>>>>>>> Stashed changes ): - if quantization_config.weight_dtype in [ - "fp8_e5m2", - "fp8_e4m3", - ]: - model._modules[name].set_fp_weights_bias( - module.weight.data, - None if module.bias is None else module.bias.data, + from intel_extension_for_pytorch.nn.modules import ( + WeightOnlyQuantizedLinear as ipex_linear, + ) + from intel_extension_for_pytorch.utils.weight_only_quantization import ( + 
_convert_optimum_format_to_desired, + ) + qweight = module.qweight + scales = module.scales + qzeros = module.qzeros + + qweight, scales, qzeros = _convert_optimum_format_to_desired( + qweight, scales, qzeros + ) + weight_dtype = { + 4: ipex.quantization.WoqWeightDtype.INT4, + 8: ipex.quantization.WoqWeightDtype.INT8, + } + compute_dtype = { + "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. + "bf16": ipex.quantization.WoqLowpMode.BF16, + "fp16": ipex.quantization.WoqLowpMode.FP16, + "int8": ipex.quantization.WoqLowpMode.INT8, + } + + ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype[quantization_config.bits], + lowp_mode=compute_dtype[ + quantization_config.compute_dtype + ], + act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + group_size=quantization_config.group_size, + ) + tmp_linear = torch.nn.Linear( + in_features, + out_features, + True if hasattr(module, "bias") else False, + ) + tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig + model._modules[name] = ( + ipex_linear.from_float_and_int4_weight( + mod=tmp_linear, + qweight=qweight, + scales=scales, + zero_points= qzeros, + bias=( + module.bias if hasattr(module, "bias") else None + ), + group_size=quantization_config.group_size, + g_idx=( + module.g_idx + if hasattr(module, "g_idx") + else None + ), + ) + ) + + elif device == "xpu" or device == torch.device("xpu"): + from intel_extension_for_pytorch.nn.utils._quantize_convert import \ + WeightOnlyQuantizedLinear as ipex_linear # pylint: disable=E0401 + model._modules[name] = ipex_linear( + in_features, + out_features, + module.bias is not None, + compute_dtype=quantization_config.compute_dtype, + compress_statistics=False, + weight_dtype=quantization_config.weight_dtype, + scale_dtype=quantization_config.scale_dtype, + blocksize=quantization_config.group_size, + scheme=quantization_config.scheme, + compression_dtype=getattr(module, "compression_dtype", torch.int32), + compression_dim=getattr(module, "compression_dim", 1), + device=device, + use_optimum_format=getattr(module, "use_optimum_format", True), + ) + if quantization_config.quant_method.value == "gptq": + g_idx = getattr( + module, + "g_idx", + torch.zeros(in_features, dtype=torch.int32).to(device), ) else: - if quantization_config.weight_dtype in ["int4", "int4_clip", "int8"]: - int_weight, scales, zeros = unpack_weight( - module.qweight, - module.scales, - module.qzeros if hasattr(module, "qzeros") else None, - quantization_config, + g_idx = None + model._modules[name].set_scales_zps_gidx( + ( + module.scales + if hasattr(module, "scales") + else torch.ones( + ( + math.ceil( + in_features / quantization_config.group_size + ), + out_features, + ), + dtype=convert_dtype_str2torch( + quantization_config.compute_dtype + ), + device=torch.device(device), ) - int_weight = int_weight.view(-1, int_weight.shape[-1]) - else: - int_weight = module.unpack_tensor_with_numpy(module.qweight) - scales = module.scales - zeros = module.qzeros if hasattr(module, "qzeros") else None - - model._modules[name].set_weights_bias( - int_weight, - scales, - zeros, - module.g_idx if hasattr(module, "g_idx") else None, - quantization_config, - bias=None if module.bias is None else module.bias.data, - ) + ), + module.qzeros if hasattr(module, "qzeros") else None, + g_idx, + ) else: - if not hasattr(module, "qweight"): - n_pack = ( - (8 if _ipex_version < "2.3.10" else 32) - // DTYPE_BITS_MAPPING[quantization_config.weight_dtype] - ) - weight = 
torch.zeros( - (math.ceil(out_features / n_pack), in_features) if _ipex_version < "2.3.10" else - (math.ceil(in_features / n_pack), out_features), - dtype=torch.int8 if _ipex_version < "2.3.10" else torch.int32, - device=torch.device(device), + raise Exception( + "{} device Unsupported weight only quantization!".format( + device ) - model._modules[name].set_weights_bias( - module.qweight.data if hasattr(module, "qweight") else weight, - None if module.bias is None else module.bias.data, ) - del module - gc.collect() - is_removed = True + + is_replaced = True + # Store the module class in case we need to transpose the weight later + model._modules[name].source_cls = type(module) + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) + + if ( + device == "xpu" or device == torch.device("xpu") + ): + + if not hasattr(module, "qweight"): + n_pack = 32 // quantization_config.bits + + weight = torch.zeros( + (math.ceil(in_features / n_pack), out_features), + dtype=torch.int32, + device=torch.device(device), + ) + model._modules[name].set_weights_bias( + module.qweight.data if hasattr(module, "qweight") else weight, + None if module.bias is None else module.bias.data, + ) + del module + gc.collect() + is_removed = True if not is_removed and len(list(module.children())) > 0: # pylint: disable=E1101 _, is_replaced = _replace_linear( @@ -353,30 +461,12 @@ def default_run_fn( exit(0) def tokenize_function(examples): - if algo == "teq": - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token if "prompt" in examples: - if algo == "teq": - example = tokenizer( - examples["prompt"], padding="max_length", max_length=max_length - ) - else: - example = tokenizer(examples["prompt"]) + example = tokenizer(examples["prompt"]) elif "code" in examples: - if algo == "teq": - example = tokenizer( - examples["code"], padding="max_length", max_length=max_length - ) - else: - example = tokenizer(examples["code"]) + example = tokenizer(examples["code"]) elif "text" in examples: - if algo == "teq": - example = tokenizer( - examples["text"], padding="max_length", max_length=max_length - ) - else: - example = tokenizer(examples["text"]) + example = tokenizer(examples["text"]) else: logger.error( "Please check dataset prompt identifier," @@ -424,15 +514,6 @@ def collate_batch(batch): except ValueError: pass -@torch.no_grad() -def run_fn_for_autoround(model, dataloader): - for data in dataloader: - if isinstance(data, tuple) or isinstance(data, list): - model(*data) - elif isinstance(data, dict): - model(**data) - else: - model(data) def convert_to_quantized_model(model, config, device="cpu"): if device == "xpu" or device == torch.device("xpu"): @@ -448,161 +529,170 @@ def convert_to_quantized_model(model, config, device="cpu"): if orig_dtype != torch.float32: model.to(dtype=torch.float32) break - if config.weight_dtype in ["fp8_e4m3", "fp8_e5m2"]: - return replace_linear(model, None, None, config, device=device) - else: - if config.weight_dtype == "int8": - dtype = "int8" - elif "int4" in config.weight_dtype: - dtype = "int4" - else: - dtype = config.weight_dtype - # mapping to INC config - if config.quant_method.value == "rtn": - quant_config = RTNConfig( - dtype=dtype, - bits=config.bits, - use_sym=config.sym, - group_size=config.group_size, - use_layer_wise=config.layer_wise, - ) - if config.llm_int8_skip_modules != []: - for module in config.llm_int8_skip_modules: - module_name = ".*" + module - quant_config.set_local(module_name, 
RTNConfig(dtype="fp32")) - logger.info(f"Do RTN algorithm with config {quant_config}") - model = prepare(model, quant_config) - model = convert(model) - elif config.quant_method.value == "awq": - quant_config = AWQConfig( - dtype=dtype, - bits=config.bits, - use_sym=config.sym, - group_size=config.group_size, - use_layer_wise=config.layer_wise, - use_auto_scale=config.auto_scale, - use_auto_clip=config.auto_clip, - folding=True, - ) - if config.llm_int8_skip_modules != []: - for module in config.llm_int8_skip_modules: - module_name = ".*" + module - quant_config.set_local(module_name, AWQConfig(dtype="fp32")) - logger.info(f"Do AWQ algorithm with config {quant_config}") - run_fn = default_run_fn - run_args = ( - config.tokenizer, - config.dataset, - config.seq_len, # max_length - config.n_samples, # n_samples - config.batch_size, # batch_size - config.quant_method.value, # algo - ) - example_inputs = torch.ones([1, 512], dtype=torch.long).to(device) - model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs) - run_fn(model, *run_args) - model = convert(model) - elif config.quant_method.value == "teq": - quant_config = TEQConfig( - dtype=dtype, - bits=config.bits, - use_sym=config.sym, - group_size=config.group_size, - use_layer_wise=config.layer_wise, - absorb_to_layer=config.absorb_to_layer - ) - if config.llm_int8_skip_modules != []: - for module in config.llm_int8_skip_modules: - module_name = ".*" + module - quant_config.set_local(module_name, TEQConfig(dtype="fp32")) - logger.info(f"Do TEQ algorithm with config {quant_config}") - run_fn = default_run_fn - run_args = ( - config.tokenizer, - config.dataset, - config.seq_len, # max_length - config.n_samples, # n_samples - config.batch_size, # batch_size - config.quant_method.value, # algo - ) - example_inputs = torch.ones([1, 512], dtype=torch.long).to(device) - model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs) - run_fn(model, *run_args) - model = convert(model) - - elif config.quant_method.value == "gptq": - model.seqlen = config.seq_len - quant_config = GPTQConfig( - dtype=dtype, - bits=config.bits, - use_sym=config.sym, - group_size=config.group_size, - use_layer_wise=config.layer_wise, - act_order=config.desc_act, - percdamp=config.damp_percent, - block_size=config.blocksize, - static_groups=config.static_groups, - use_mse_search=config.use_mse_search, - ) - if config.llm_int8_skip_modules != []: - for module in config.llm_int8_skip_modules: - module_name = ".*" + module - quant_config.set_local(module_name, GPTQConfig(dtype="fp32")) - logger.info(f"Do GPTQ algorithm with config {quant_config}") - run_fn = default_run_fn - run_args = ( - config.tokenizer, - config.dataset, - config.seq_len, # max_length - config.n_samples, # n_samples - config.batch_size, # batch_size - config.quant_method.value, # algo - ) - model = prepare(model=model, quant_config=quant_config) - run_fn(model, *run_args) - model = convert(model) - elif config.quant_method.value == "autoround": - quant_config = AutoRoundConfig( - dtype=dtype, + + # mapping to INC config + if config.quant_method.value == "rtn": + quant_config = RTNConfig( + dtype=config.weight_dtype, bits=config.bits, use_sym=config.sym, - group_size=config.group_size, - enable_quanted_input=not config.disable_quanted_input, - lr=config.lr, - minmax_lr=config.minmax_lr, - seqlen=config.seq_len, - nsamples=config.n_samples, - iters=config.iters, - scale_dtype=config.scale_dtype, - ) - if config.llm_int8_skip_modules != []: - for module 
in config.llm_int8_skip_modules: - module_name = ".*" + module - quant_config.set_local(module_name, AutoRoundConfig(dtype="fp32")) - logger.info(f"Do AutoRound algorithm with config {quant_config}") - dataloader = get_autoround_dataloader(tokenizer=config.tokenizer, - seqlen=config.seq_len, - dataset_name="NeelNanda/pile-10k", - seed=42, - bs=config.batch_size, - nsamples=config.n_samples) - run_fn = run_fn_for_autoround - run_args = (dataloader,) - model = prepare(model=model, quant_config=quant_config) - run_fn(model, *run_args) - model = convert(model) - else: - assert False, "The Supported algorithm are RTN, AWQ, TEQ, GPTQ, AUTOROUND" + group_size=config.group_size + ) + if config.use_layer_wise: + quant_config.user_layer_wise = config.use_layer_wise + quant_config.model_path = config.model_path + if config.modules_to_not_convert != []: + for module in config.modules_to_not_convert: + module_name = ".*" + module + quant_config.set_local(module_name, RTNConfig(dtype="fp32")) + logger.info(f"Do RTN algorithm with config {quant_config}") + model = prepare(model, quant_config) + model = convert(model) + elif config.quant_method.value == "gptq": + model.seqlen = config.seq_len + quant_config = GPTQConfig( + dtype=config.weight_dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + use_layer_wise=config.layer_wise, + act_order=config.desc_act, + percdamp=config.damp_percent, + block_size=config.blocksize, + static_groups=config.static_groups, + use_mse_search=config.use_mse_search, + true_sequential=config.true_sequential, + ) + if config.use_layer_wise: + quant_config.user_layer_wise = config.use_layer_wise + quant_config.model_path = config.model_path + if config.modules_to_not_convert != []: + for module in config.modules_to_not_convert: + module_name = ".*" + module + quant_config.set_local(module_name, GPTQConfig(dtype="fp32")) + logger.info(f"Do GPTQ algorithm with config {quant_config}") + run_fn = default_run_fn + run_args = ( + config.tokenizer, + config.dataset, + config.seq_len, # max_length + config.n_samples, # n_samples + config.batch_size, # batch_size + config.quant_method.value, # algo + ) + model = prepare(model=model, quant_config=quant_config) + run_fn(model, *run_args) + model = convert(model) + else: + assert False, "The Supported algorithm are RTN, GPTQ." 
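For reference, the RTN branch wired up above reduces to the following standalone INC 3.x flow (a minimal sketch, not part of the patch; the model name and quantization parameters are illustrative, and only APIs already imported by this file are used):

    # Minimal sketch of the RTN path mapped above; values are illustrative.
    import transformers
    from neural_compressor.torch.quantization import RTNConfig, convert, prepare

    model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

    quant_config = RTNConfig(dtype="int4", bits=4, use_sym=True, group_size=32)
    # Keep selected modules in FP32, mirroring the modules_to_not_convert handling above.
    quant_config.set_local(".*lm_head", RTNConfig(dtype="fp32"))

    model = prepare(model, quant_config)  # RTN needs no calibration run between prepare and convert
    model = convert(model)                # linear weights are quantized in place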
- if device == "xpu" or device == torch.device("xpu"): - logger.warning("The recommended ipex version is higher than 2.3.10 for xpu device.") + if device == "xpu" or device == torch.device("xpu"): + logger.warning("The recommended ipex version is higher than 2.3.10 for xpu device.") + + model.eval() + q_model = replace_linear(model, None, None, config, device=device) + + if orig_dtype != torch.float32: + q_model.to(dtype=orig_dtype) + + return q_model.to(device) + +# def save_linear_parameters(model, save_directory): + +# weights_file = os.path.join( +# os.path.abspath(os.path.expanduser(save_directory)), WEIGHTS_NAME +# ) +# linear_parameters = {} +# from intel_extension_for_pytorch.nn.modules import ( +# WeightOnlyQuantizedLinear as ipex_cpu_linear, +# ) + +# for name, module in model.named_modules(): +# if isinstance(module, ipex_cpu_linear): +# linear_parameters[name + ".qweight"] = ( +# module._op_context.to_public( +# module._op_context.get_weight() +# ).contiguous() +# ) +# linear_parameters[name + ".scales"] = ( +# module._op_context.get_scales().contiguous() +# ) +# linear_parameters[name + ".qzeros"] = ( +# module._op_context.get_zero_points().contiguous() +# ) +# if module._op_context.get_bias() is not None: +# linear_parameters[name + ".bias"] = ( +# module._op_context.get_bias().contiguous() +# ) +# if module._op_context.get_g_idx() is not None: +# linear_parameters[name + ".g_idx"] = ( +# module._op_context.get_g_idx().contiguous() +# ) + +# others_parameters = model.state_dict() +# linear_parameters.update(others_parameters) +# torch.save(linear_parameters, weights_file) + + +def save_low_bit( + self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs +): - model.eval() - # INC attribute conflicted with transformers when use nf4/int8 training. - del model.is_quantized - q_model = replace_linear(model, None, None, config, device=device) + assert hasattr( + self, "quantization_config" + ), f"Detected this model is not a low-bit model." - if orig_dtype != torch.float32: - q_model.to(dtype=orig_dtype) + if os.path.isfile(save_directory): + logger.error( + f"Provided path ({save_directory}) should be a directory, not a file" + ) + return + + + os.makedirs(save_directory, exist_ok=True) + # use transformers original `save_pretrained` function + del self.save_pretrained + + self.save_pretrained( + save_directory=save_directory, push_to_hub=push_to_hub, **kwargs + ) - return q_model.to(device) + # if self.device == "cpu" or self.device == torch.device("cpu") or self.device == "auto": + # save_linear_parameters(self, save_directory) + self.save_pretrained = types.MethodType(save_low_bit, self) + # We conveniently save all the keys of the model to have them on hand, + # so that when using 'low_cpumem load', + # it's not necessary to load the entire model to extract its keys + # and we can avoid gc not triggered potentially. + all_checkpoint_keys = {"all_checkpoint_keys": list(self.state_dict().keys())} + json_file_path = os.path.join(save_directory, "all_checkpoint_keys.json") + with open(json_file_path, "w") as json_file: + json.dump(all_checkpoint_keys, json_file) + if push_to_hub: + use_auth_token = kwargs.pop("use_auth_token", None) + + if use_auth_token is not None: + logger.warning.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." 
+ ) + token = use_auth_token + + if token is not None: + kwargs["token"] = token + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + self._upload_modified_files( + save_directory, + repo_id, + files_timestamps, + commit_message=commit_message, + token=kwargs.get("token"), + ) + self.quantization_config.save_pretrained(save_directory, **kwargs) diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py index 1381293a211..17d87d16213 100644 --- a/neural_compressor/transformers/utils/quantization_config.py +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -86,8 +86,15 @@ def post_init_cpu(self): if self.scale_dtype is not None and self.scale_dtype not in [ "fp32", "bf16", +<<<<<<< Updated upstream ]: raise ValueError("scale_dtype must be a string in 'fp32', 'bf16' ") +======= + "fp16"]: + raise ValueError( + "scale_dtype must be a string in 'fp32', 'bf16' " + ) +>>>>>>> Stashed changes elif self.scale_dtype is None: self.scale_dtype = "fp32" @@ -274,7 +281,10 @@ def __init__( "modules_to_not_convert", ["lm_head", "transformer.output_layer", "embed_out"] ) self.device = kwargs.get("device", "auto") - + if self.use_layer_wise: + self.model_path = kwargs("model_path", None) + if self.model_path is None: + raise AssertionError("model_path is necessary if you would like to use_layer_wise for weight only quantization.") def to_diff_dict(self) -> Dict[str, Any]: """Removes all attributes from config which correspond to the default config attributes for better readability and serializes to a Python dictionary. 
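The use_layer_wise/model_path contract added above is easiest to read as a usage sketch (hypothetical values; the keyword names follow the attributes this patch reads and may not match the final constructor exactly, and model_path is expected to point at the original checkpoint on disk):

    # Hypothetical construction showing the new layer-wise requirement.
    from neural_compressor.transformers import RtnConfig

    woq_config = RtnConfig(
        bits=4,
        group_size=128,
        use_layer_wise=True,                    # stream weights layer by layer to cut peak memory
        model_path="/path/to/fp32/checkpoint",  # required once use_layer_wise=True, per the check above
    )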
@@ -344,6 +354,10 @@ def __init__( ) self.device = kwargs.get("device", "auto") self.scheme = "sym" if self.sym else "asym" + if self.use_layer_wise: + self.model_path = kwargs("model_path", None) + if self.model_path is None: + raise AssertionError("model_path is necessary if you would like to use_layer_wise for weight only quantization.") if isinstance(compute_dtype, torch.dtype): self.compute_dtype = compute_dtype From 5304d4192de71afb58f56aaac055243efd175c83 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:18:40 +0000 Subject: [PATCH 04/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/__init__.py | 5 +-- neural_compressor/transformers/__init__.py | 14 ++++++++ .../transformers/models/__init__.py | 14 ++++++++ .../transformers/models/modeling_auto.py | 32 +++++++++---------- .../transformers/quantization/__init__.py | 14 ++++++++ .../transformers/quantization/utils.py | 26 +++++++-------- 6 files changed, 70 insertions(+), 35 deletions(-) diff --git a/neural_compressor/__init__.py b/neural_compressor/__init__.py index 74ad88db685..0340b7576b3 100644 --- a/neural_compressor/__init__.py +++ b/neural_compressor/__init__.py @@ -25,10 +25,7 @@ QuantizationAwareTrainingConfig, MixedPrecisionConfig, ) -from .transformers import( - GPTQConfig, - RtnConfig -) +from .transformers import GPTQConfig, RtnConfig from .contrib import * from .model import * from .metric import * diff --git a/neural_compressor/transformers/__init__.py b/neural_compressor/transformers/__init__.py index f2ecbd3f05a..7701cea89d9 100644 --- a/neural_compressor/transformers/__init__.py +++ b/neural_compressor/transformers/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .utils.quantization_config import GPTQConfig, RtnConfig diff --git a/neural_compressor/transformers/models/__init__.py b/neural_compressor/transformers/models/__init__.py index b10c092933e..c44b58bb461 100644 --- a/neural_compressor/transformers/models/__init__.py +++ b/neural_compressor/transformers/models/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .modeling_auto import _BaseQBitsAutoModelClass diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 465cc171da1..3940aaeaa68 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -41,29 +41,30 @@ import torch import transformers from accelerate import init_empty_weights - from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig from transformers.modeling_utils import load_state_dict -from transformers.utils import has_file, is_safetensors_available -from neural_compressor.transformers.quantization.utils import save_low_bit, replace_linear -from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear - -from ..quantization.utils import ( - convert_dtype_torch2str, - replace_linear, +from transformers.utils import ( + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, + has_file, + is_safetensors_available, ) + +from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear +from neural_compressor.torch.utils import is_ipex_available from neural_compressor.transformers import GPTQConfig, RtnConfig -from neural_compressor.utils.utility import LazyImport, CpuInfo +from neural_compressor.transformers.quantization.utils import replace_linear, save_low_bit from neural_compressor.utils import logger -from neural_compressor.torch.utils import is_ipex_available -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME +from neural_compressor.utils.utility import CpuInfo, LazyImport +from ..quantization.utils import convert_dtype_torch2str, replace_linear torch = LazyImport("torch") - def build_woq_model(model, quantization_config): from neural_compressor.adaptor.torch_utils.util import set_module @@ -523,9 +524,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): else: logger.warning("bits number only supports 4, 8.") quantization_config.weight_dtype = "int4" - logger.warning( - "int4 weight_dtype is used, please change the config.json if you don't want to use it." - ) + logger.warning("int4 weight_dtype is used, please change the config.json if you don't want to use it.") else: if quantization_config.weight_dtype not in [ "int4_fullrange", @@ -554,7 +553,6 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): model = build_woq_model(model, quantization_config) - if is_sharded: loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"] else: @@ -708,4 +706,4 @@ class AutoModel(_BaseQBitsAutoModelClass): class AutoModelForSeq2SeqLM(_BaseQBitsAutoModelClass): - ORIG_MODEL = transformers.AutoModelForSeq2SeqLM \ No newline at end of file + ORIG_MODEL = transformers.AutoModelForSeq2SeqLM diff --git a/neural_compressor/transformers/quantization/__init__.py b/neural_compressor/transformers/quantization/__init__.py index f07504d8f01..5dd8b2769f6 100644 --- a/neural_compressor/transformers/quantization/__init__.py +++ b/neural_compressor/transformers/quantization/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .utils import convert_to_quantized_model, save_low_bit diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index f30640f70a7..c89fbe7100b 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -18,26 +18,25 @@ import logging import math import os + <<<<<<< Updated upstream from accelerate import init_empty_weights from datasets import load_dataset + ======= -import types import json +import types from datasets import load_dataset + from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear -from neural_compressor.torch.quantization import ( - GPTQConfig, - RTNConfig, - convert, - prepare, -) -from neural_compressor.utils.utility import LazyImport, CpuInfo +from neural_compressor.torch.quantization import GPTQConfig, RTNConfig, convert, prepare +from neural_compressor.utils.utility import CpuInfo, LazyImport + >>>>>>> Stashed changes from transformers import AutoTokenizer -from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME +from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear as WeightOnlyLinear from neural_compressor.torch.quantization import GPTQConfig, RTNConfig, convert, prepare @@ -298,9 +297,7 @@ def _replace_linear( or device == "auto" >>>>>>> Stashed changes ): - from intel_extension_for_pytorch.nn.modules import ( - WeightOnlyQuantizedLinear as ipex_linear, - ) + from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear from intel_extension_for_pytorch.utils.weight_only_quantization import ( _convert_optimum_format_to_desired, ) @@ -355,8 +352,9 @@ def _replace_linear( ) elif device == "xpu" or device == torch.device("xpu"): - from intel_extension_for_pytorch.nn.utils._quantize_convert import \ - WeightOnlyQuantizedLinear as ipex_linear # pylint: disable=E0401 + from intel_extension_for_pytorch.nn.utils._quantize_convert import ( + WeightOnlyQuantizedLinear as ipex_linear, # pylint: disable=E0401 + ) model._modules[name] = ipex_linear( in_features, out_features, From 627d7f318dd34690f5db22683bd80961e1a2e9a0 Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 26 Aug 2024 06:20:05 -0700 Subject: [PATCH 05/18] fix style Signed-off-by: changwangss --- neural_compressor/transformers/utils/quantization_config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py index 17d87d16213..94aa22b6453 100644 --- a/neural_compressor/transformers/utils/quantization_config.py +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -86,15 +86,10 @@ def post_init_cpu(self): if self.scale_dtype is not None and self.scale_dtype not in [ "fp32", "bf16", -<<<<<<< Updated upstream - ]: - raise ValueError("scale_dtype must be a string in 'fp32', 'bf16' ") -======= "fp16"]: raise ValueError( "scale_dtype must be a string in 'fp32', 'bf16' " ) 
->>>>>>> Stashed changes elif self.scale_dtype is None: self.scale_dtype = "fp32" From a816736c12adb6e1652d73a3d1f7243cdb9f65a9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:21:12 +0000 Subject: [PATCH 06/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../transformers/utils/quantization_config.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py index 94aa22b6453..1ff39b66483 100644 --- a/neural_compressor/transformers/utils/quantization_config.py +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -83,13 +83,8 @@ def post_init_cpu(self): elif self.bits is not None and self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4, 8] bits but found {self.bits}") - if self.scale_dtype is not None and self.scale_dtype not in [ - "fp32", - "bf16", - "fp16"]: - raise ValueError( - "scale_dtype must be a string in 'fp32', 'bf16' " - ) + if self.scale_dtype is not None and self.scale_dtype not in ["fp32", "bf16", "fp16"]: + raise ValueError("scale_dtype must be a string in 'fp32', 'bf16' ") elif self.scale_dtype is None: self.scale_dtype = "fp32" @@ -279,7 +274,10 @@ def __init__( if self.use_layer_wise: self.model_path = kwargs("model_path", None) if self.model_path is None: - raise AssertionError("model_path is necessary if you would like to use_layer_wise for weight only quantization.") + raise AssertionError( + "model_path is necessary if you would like to use_layer_wise for weight only quantization." + ) + def to_diff_dict(self) -> Dict[str, Any]: """Removes all attributes from config which correspond to the default config attributes for better readability and serializes to a Python dictionary. @@ -352,7 +350,9 @@ def __init__( if self.use_layer_wise: self.model_path = kwargs("model_path", None) if self.model_path is None: - raise AssertionError("model_path is necessary if you would like to use_layer_wise for weight only quantization.") + raise AssertionError( + "model_path is necessary if you would like to use_layer_wise for weight only quantization." 
+ ) if isinstance(compute_dtype, torch.dtype): self.compute_dtype = compute_dtype From ea62bb352e5b68ef42bff8e0048c518faf743cca Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 26 Aug 2024 06:32:15 -0700 Subject: [PATCH 07/18] fix style Signed-off-by: changwangss --- .../transformers/models/modeling_auto.py | 5 +- .../transformers/quantization/utils.py | 282 +++--------------- 2 files changed, 42 insertions(+), 245 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 3940aaeaa68..c1848fdfe37 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -38,7 +38,6 @@ from threading import Thread from typing import Union -import torch import transformers from accelerate import init_empty_weights from transformers import AutoConfig @@ -56,12 +55,10 @@ from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear from neural_compressor.torch.utils import is_ipex_available from neural_compressor.transformers import GPTQConfig, RtnConfig -from neural_compressor.transformers.quantization.utils import replace_linear, save_low_bit +from neural_compressor.transformers.quantization.utils import convert_dtype_torch2str, replace_linear, save_low_bit from neural_compressor.utils import logger from neural_compressor.utils.utility import CpuInfo, LazyImport -from ..quantization.utils import convert_dtype_torch2str, replace_linear - torch = LazyImport("torch") diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index c89fbe7100b..65e1858e0a1 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -15,35 +15,21 @@ """Intel Neural Compressor model convert.""" import gc +import json import logging import math import os - -<<<<<<< Updated upstream - -from accelerate import init_empty_weights -from datasets import load_dataset - -======= -import json import types from datasets import load_dataset - -from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear -from neural_compressor.torch.quantization import GPTQConfig, RTNConfig, convert, prepare -from neural_compressor.utils.utility import CpuInfo, LazyImport - ->>>>>>> Stashed changes from transformers import AutoTokenizer from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME +from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear as WeightOnlyLinear from neural_compressor.torch.quantization import GPTQConfig, RTNConfig, convert, prepare from neural_compressor.torch.utils import is_ipex_available -from neural_compressor.utils.utility import LazyImport - -from ....tools.utils import _ipex_version +from neural_compressor.utils.utility import CpuInfo, LazyImport if is_ipex_available(): import intel_extension_for_pytorch as ipex @@ -55,6 +41,7 @@ logger = logging.getLogger(__name__) + def convert_dtype_str2torch(str_dtype): if str_dtype == "int8": return torch.int8 @@ -82,6 +69,7 @@ def convert_dtype_torch2str(dtype): else: assert False, "Unsupported pytorch dtype {} to str dtype".format(dtype) + def replace_linear( model, modules_to_not_convert=None, @@ -95,9 +83,7 @@ def replace_linear( # embed_out is dolly_v2 last layer name modules_to_not_convert = [] if 
quantization_config.modules_to_not_convert: - modules_to_not_convert.extend( - quantization_config.modules_to_not_convert - ) + modules_to_not_convert.extend(quantization_config.modules_to_not_convert) modules_to_not_convert = list(set(modules_to_not_convert)) model, is_replaced = _replace_linear( model, @@ -139,175 +125,23 @@ def _replace_linear( if ( isinstance(module, torch.nn.Linear) or isinstance(module, INCWeightOnlyLinear) - or ( - is_ipex_available() - and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear) - ) + or (is_ipex_available() and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear)) ) and (name not in modules_to_not_convert): # Check if the current key is not in the `modules_to_not_convert` - if not any( - key in ".".join(current_key_name) for key in modules_to_not_convert - ): -<<<<<<< Updated upstream - with init_empty_weights(): - in_features = module.in_features - out_features = module.out_features - if ( - device == "cpu" - or device == torch.device("cpu") - or device == "auto" - ): - if is_ipex_available() and quantization_config.use_ipex: - from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear - from intel_extension_for_pytorch.utils.weight_only_quantization import ( - _convert_optimum_format_to_desired, - ) - - qweight, scales, qzeros = ( - _convert_optimum_format_to_desired( - module.qweight, module.scales, module.qzeros - ) - ) - - weight_dtype = { - 4: ipex.quantization.WoqWeightDtype.INT4, - 8: ipex.quantization.WoqWeightDtype.INT8, - } - compute_dtype = { - "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. - "bf16": ipex.quantization.WoqLowpMode.BF16, - "fp16": ipex.quantization.WoqLowpMode.FP16, - "int8": ipex.quantization.WoqLowpMode.INT8, - } - - ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( - weight_dtype=weight_dtype[quantization_config.bits], - lowp_mode=compute_dtype[ - quantization_config.compute_dtype - ], - act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, - group_size=quantization_config.group_size, - ) - tmp_linear = torch.nn.Linear( - in_features, - out_features, - True if hasattr(module, "bias") else False, - ) - tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig - model._modules[name] = ( - ipex_linear.from_float_and_int4_weight( - mod=tmp_linear, - qweight=qweight, - scales=scales, - zero_points=qzeros, - bias=( - module.bias if hasattr(module, "bias") else None - ), - group_size=quantization_config.group_size, - g_idx=( - module.g_idx - if hasattr(module, "g_idx") - else None - ), - ) - ) - - elif device == "xpu" or device == torch.device("xpu"): - from intel_extension_for_pytorch.nn.utils._quantize_convert import ( - WeightOnlyQuantizedLinear as ipex_linear, # pylint: disable=E0401 - ) - model._modules[name] = ipex_linear( - in_features, - out_features, - module.bias is not None, - compute_dtype=quantization_config.compute_dtype, - compress_statistics=False, - weight_dtype=quantization_config.weight_dtype, - scale_dtype=quantization_config.scale_dtype, - blocksize=quantization_config.group_size, - scheme=quantization_config.scheme, - compression_dtype=getattr(module, "compression_dtype", - torch.int8 if _ipex_version < "2.3.10" else torch.int32), - compression_dim=getattr(module, "compression_dim", 0 if _ipex_version < "2.3.10" else 1), - device=device, - use_optimum_format=getattr(module, "use_optimum_format", - False if _ipex_version < "2.3.10" else True), - ) - if quantization_config.quant_method.value 
== "gptq": - g_idx = getattr( - module, - "g_idx", - torch.zeros(in_features, dtype=torch.int32).to(device), - ) - else: - g_idx = None - model._modules[name].set_scales_zps_gidx( - ( - module.scales - if hasattr(module, "scales") - else torch.ones( - ( - out_features, - math.ceil( - in_features / quantization_config.group_size - ), - ), - dtype=convert_dtype_str2torch( - quantization_config.compute_dtype - ), - device=torch.device(device), - ) if _ipex_version < "2.3.10" else torch.ones( - ( - math.ceil( - in_features / quantization_config.group_size - ), - out_features, - ), - dtype=convert_dtype_str2torch( - quantization_config.compute_dtype - ), - device=torch.device(device), - ) - ), - module.qzeros if hasattr(module, "qzeros") else None, - g_idx, - ) - else: - raise Exception( - "{} device Unsupported weight only quantization!".format( - device - ) - ) - - is_replaced = True - # Store the module class in case we need to transpose the weight later - model._modules[name].source_cls = type(module) - # Force requires grad to False to avoid unexpected errors - model._modules[name].requires_grad_(False) - if quantization_config.use_ipex: - pass - elif ( - device == "cpu" or device == torch.device("cpu") or device == "auto" -======= + if not any(key in ".".join(current_key_name) for key in modules_to_not_convert): in_features = module.in_features out_features = module.out_features - if ( - device == "cpu" - or device == torch.device("cpu") - or device == "auto" ->>>>>>> Stashed changes - ): + if device == "cpu" or device == torch.device("cpu") or device == "auto": from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear from intel_extension_for_pytorch.utils.weight_only_quantization import ( _convert_optimum_format_to_desired, ) + qweight = module.qweight scales = module.scales qzeros = module.qzeros - qweight, scales, qzeros = _convert_optimum_format_to_desired( - qweight, scales, qzeros - ) + qweight, scales, qzeros = _convert_optimum_format_to_desired(qweight, scales, qzeros) weight_dtype = { 4: ipex.quantization.WoqWeightDtype.INT4, 8: ipex.quantization.WoqWeightDtype.INT8, @@ -321,9 +155,7 @@ def _replace_linear( ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( weight_dtype=weight_dtype[quantization_config.bits], - lowp_mode=compute_dtype[ - quantization_config.compute_dtype - ], + lowp_mode=compute_dtype[quantization_config.compute_dtype], act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, group_size=quantization_config.group_size, ) @@ -333,28 +165,21 @@ def _replace_linear( True if hasattr(module, "bias") else False, ) tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig - model._modules[name] = ( - ipex_linear.from_float_and_int4_weight( - mod=tmp_linear, - qweight=qweight, - scales=scales, - zero_points= qzeros, - bias=( - module.bias if hasattr(module, "bias") else None - ), - group_size=quantization_config.group_size, - g_idx=( - module.g_idx - if hasattr(module, "g_idx") - else None - ), - ) + model._modules[name] = ipex_linear.from_float_and_int4_weight( + mod=tmp_linear, + qweight=qweight, + scales=scales, + zero_points=qzeros, + bias=(module.bias if hasattr(module, "bias") else None), + group_size=quantization_config.group_size, + g_idx=(module.g_idx if hasattr(module, "g_idx") else None), ) elif device == "xpu" or device == torch.device("xpu"): from intel_extension_for_pytorch.nn.utils._quantize_convert import ( WeightOnlyQuantizedLinear as ipex_linear, # pylint: disable=E0401 ) + 
model._modules[name] = ipex_linear( in_features, out_features, @@ -384,14 +209,10 @@ def _replace_linear( if hasattr(module, "scales") else torch.ones( ( - math.ceil( - in_features / quantization_config.group_size - ), + math.ceil(in_features / quantization_config.group_size), out_features, ), - dtype=convert_dtype_str2torch( - quantization_config.compute_dtype - ), + dtype=convert_dtype_str2torch(quantization_config.compute_dtype), device=torch.device(device), ) ), @@ -399,11 +220,7 @@ def _replace_linear( g_idx, ) else: - raise Exception( - "{} device Unsupported weight only quantization!".format( - device - ) - ) + raise Exception("{} device Unsupported weight only quantization!".format(device)) is_replaced = True # Store the module class in case we need to transpose the weight later @@ -411,9 +228,7 @@ def _replace_linear( # Force requires grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) - if ( - device == "xpu" or device == torch.device("xpu") - ): + if device == "xpu" or device == torch.device("xpu"): if not hasattr(module, "qweight"): n_pack = 32 // quantization_config.bits @@ -446,9 +261,7 @@ def _replace_linear( return model, is_replaced -def default_run_fn( - model, tokenizer, dataset, max_length=512, n_samples=100, batch_size=8, algo="rtn" -): +def default_run_fn(model, tokenizer, dataset, max_length=512, n_samples=100, batch_size=8, algo="rtn"): from torch.utils.data import DataLoader if isinstance(dataset, (str, bytes, os.PathLike)): @@ -467,8 +280,7 @@ def tokenize_function(examples): example = tokenizer(examples["text"]) else: logger.error( - "Please check dataset prompt identifier," - + " NeelNanda/pile-10k is default used calibration dataset." + "Please check dataset prompt identifier," + " NeelNanda/pile-10k is default used calibration dataset." ) exit(0) return example @@ -486,11 +298,11 @@ def collate_batch(batch): input_ids_padded.append(input_ids) else: continue - assert input_ids_padded != [], \ - "The dataset does not have data that meets the required input length. Please reduce seq_len." + assert ( + input_ids_padded != [] + ), "The dataset does not have data that meets the required input length. Please reduce seq_len." return torch.vstack(input_ids_padded) - calib_dataloader = DataLoader( tokenized_dataset, batch_size=batch_size, @@ -517,9 +329,7 @@ def convert_to_quantized_model(model, config, device="cpu"): if device == "xpu" or device == torch.device("xpu"): import intel_extension_for_pytorch - assert ( - hasattr(torch, "xpu") and torch.xpu.is_available() - ), "There is no xpu device in this system!" + assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!" 
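For orientation, the CPU branch of _replace_linear above (the from_float_and_int4_weight path) amounts to roughly the per-module helper below (a sketch that mirrors the IPEX calls used in this patch; `woq_module` is a placeholder for one INCWeightOnlyLinear holding optimum-format tensors, and an IPEX build with weight-only quantization support is assumed):

    import torch
    import intel_extension_for_pytorch as ipex
    from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear
    from intel_extension_for_pytorch.utils.weight_only_quantization import _convert_optimum_format_to_desired

    def to_ipex_woq_linear(woq_module, bits=4, group_size=128):
        # Repack the optimum-format tensors into the layout the IPEX kernel expects.
        qweight, scales, qzeros = _convert_optimum_format_to_desired(
            woq_module.qweight, woq_module.scales, woq_module.qzeros
        )
        qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping(
            weight_dtype={4: ipex.quantization.WoqWeightDtype.INT4,
                          8: ipex.quantization.WoqWeightDtype.INT8}[bits],
            lowp_mode=ipex.quantization.WoqLowpMode.NONE,  # compute follows the activation dtype
            act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
            group_size=group_size,
        )
        # A throwaway FP32 Linear only carries shape and qconfig into the IPEX constructor.
        tmp = torch.nn.Linear(woq_module.in_features, woq_module.out_features, hasattr(woq_module, "bias"))
        tmp.qconfig = qconfig_mapping.global_qconfig
        return WeightOnlyQuantizedLinear.from_float_and_int4_weight(
            mod=tmp,
            qweight=qweight,
            scales=scales,
            zero_points=qzeros,
            bias=getattr(woq_module, "bias", None),
            group_size=group_size,
            g_idx=getattr(woq_module, "g_idx", None),
        )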
orig_dtype = torch.float32 for param in model.parameters(): @@ -531,10 +341,7 @@ def convert_to_quantized_model(model, config, device="cpu"): # mapping to INC config if config.quant_method.value == "rtn": quant_config = RTNConfig( - dtype=config.weight_dtype, - bits=config.bits, - use_sym=config.sym, - group_size=config.group_size + dtype=config.weight_dtype, bits=config.bits, use_sym=config.sym, group_size=config.group_size ) if config.use_layer_wise: quant_config.user_layer_wise = config.use_layer_wise @@ -595,6 +402,7 @@ def convert_to_quantized_model(model, config, device="cpu"): return q_model.to(device) + # def save_linear_parameters(model, save_directory): # weights_file = os.path.join( @@ -632,28 +440,19 @@ def convert_to_quantized_model(model, config, device="cpu"): # torch.save(linear_parameters, weights_file) -def save_low_bit( - self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs -): +def save_low_bit(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): - assert hasattr( - self, "quantization_config" - ), f"Detected this model is not a low-bit model." + assert hasattr(self, "quantization_config"), "Detected this model is not a low-bit model." if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return - os.makedirs(save_directory, exist_ok=True) # use transformers original `save_pretrained` function del self.save_pretrained - self.save_pretrained( - save_directory=save_directory, push_to_hub=push_to_hub, **kwargs - ) + self.save_pretrained(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) # if self.device == "cpu" or self.device == torch.device("cpu") or self.device == "auto": # save_linear_parameters(self, save_directory) @@ -674,14 +473,15 @@ def save_low_bit( "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning, ) + + token = use_auth_token if token is not None: raise ValueError( "`token` and `use_auth_token` are both specified. Please set only the argument `token`." ) - token = use_auth_token - if token is not None: - kwargs["token"] = token + if token is not None: + kwargs["token"] = token commit_message = kwargs.pop("commit_message", None) repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) repo_id = self._create_repo(repo_id, **kwargs) From b7da9199a26f8aed46a55104f8a43b1f13e70201 Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 27 Aug 2024 06:26:13 -0700 Subject: [PATCH 08/18] support save and load Signed-off-by: changwangss --- .../transformers/models/modeling_auto.py | 85 ++---------- .../transformers/quantization/utils.py | 123 ++++++++++++------ test/3x/torch/test_transformers.py | 38 ++++++ 3 files changed, 130 insertions(+), 116 deletions(-) create mode 100644 test/3x/torch/test_transformers.py diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index c1848fdfe37..ab0e47ed101 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -31,12 +31,8 @@ # limitations under the License. 
import copy -import json import os -import re import types -from threading import Thread -from typing import Union import transformers from accelerate import init_empty_weights @@ -52,8 +48,8 @@ is_safetensors_available, ) +from neural_compressor.adaptor.torch_utils.util import set_module from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear -from neural_compressor.torch.utils import is_ipex_available from neural_compressor.transformers import GPTQConfig, RtnConfig from neural_compressor.transformers.quantization.utils import convert_dtype_torch2str, replace_linear, save_low_bit from neural_compressor.utils import logger @@ -63,8 +59,6 @@ def build_woq_model(model, quantization_config): - from neural_compressor.adaptor.torch_utils.util import set_module - bits = quantization_config.bits for n, m in model.named_modules(): if n in quantization_config.modules_to_not_convert: @@ -560,70 +554,6 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): # restore default dtype if dtype_orig is not None: torch.set_default_dtype(dtype_orig) - - if is_ipex_available(): - model = replace_linear( - model, - quantization_config=quantization_config, - device="cpu" if device_map == "auto" else device_map, - empty_weights=True, - ) - # if (device_map == "cpu" or device_map == torch.device("cpu")): - # import intel_extension_for_pytorch as ipex - # from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear - - # def replace_ipex_cpu_woq_linear(model, current_name=[]): - # for name, module in model.named_children(): - # current_name.append(name) - # if isinstance(module, INCWeightOnlyLinear): - # weight_dtype = { - # 4: ipex.quantization.WoqWeightDtype.INT4, - # 8: ipex.quantization.WoqWeightDtype.INT8, - # } - # compute_dtype = { - # "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. 
- # "bf16": ipex.quantization.WoqLowpMode.BF16, - # "fp16": ipex.quantization.WoqLowpMode.FP16, - # "int8": ipex.quantization.WoqLowpMode.INT8, - # } - - # ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( - # weight_dtype=weight_dtype[quantization_config.bits], - # lowp_mode=compute_dtype[quantization_config.compute_dtype], - # act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, - # group_size=quantization_config.group_size, - # ) - # tmp_linear = torch.nn.Linear( - # module.in_features, - # module.out_features, - # True if hasattr(module, "bias") else False, - # ) - # tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig - # target_linear = ipex_linear.from_float_and_int4_weight( - # mod=tmp_linear, - # qweight=state_dict.pop(".".join(current_name) + ".qweight"), - # scales=state_dict.pop(".".join(current_name) + ".scales"), - # zero_points=state_dict.pop(".".join(current_name) + ".qzeros"), - # bias=( - # state_dict.pop(".".join(current_name) + ".bias") - # if ".".join(current_name) + ".bias" in state_dict - # else None - # ), - # group_size=quantization_config.group_size, - # g_idx=( - # state_dict.pop(".".join(current_name) + ".g_idx") - # if ".".join(current_name) + ".g_idx" in state_dict - # else None - # ), - # ) - # setattr(model, name, target_linear) - # else: - # replace_ipex_cpu_woq_linear(module, current_name) - # current_name.pop() - - # replace_ipex_cpu_woq_linear(model) - # model.load_state_dict(state_dict, strict=False, assign=True) - # else: ( model, missing_keys, @@ -645,6 +575,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): dtype=torch_dtype, keep_in_fp32_modules=[], ) + else: raise AssertionError("Please install intel_extension_for_pytorch.") @@ -654,12 +585,12 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): # Set model in evaluation mode to deactivate DropOut modules by default model.eval() - # model = replace_linear( - # model, - # quantization_config=quantization_config, - # device="cpu" if device_map == "auto" else device_map, - # empty_weights=True, - # ) + model = replace_linear( + model, + quantization_config=quantization_config, + device="cpu" if device_map == "auto" else device_map, + empty_weights=True, + ) if (not use_xpu and torch_dtype == torch.float16) or ( not use_xpu and not CpuInfo().bf16 and torch_dtype == torch.bfloat16 diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index 65e1858e0a1..6d0009eb0b5 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -170,10 +170,13 @@ def _replace_linear( qweight=qweight, scales=scales, zero_points=qzeros, - bias=(module.bias if hasattr(module, "bias") else None), + # bias=(module.bias if (hasattr(module, "bias") and not torch.all(module.bias.eq(0))) else None), + bias=(module.bias.float() if hasattr(module, "bias") else None), group_size=quantization_config.group_size, g_idx=(module.g_idx if hasattr(module, "g_idx") else None), ) + # print(current_key_name) + # print(module.bias.float()) elif device == "xpu" or device == torch.device("xpu"): from intel_extension_for_pytorch.nn.utils._quantize_convert import ( @@ -403,41 +406,84 @@ def convert_to_quantized_model(model, config, device="cpu"): return q_model.to(device) -# def save_linear_parameters(model, save_directory): - -# weights_file = os.path.join( -# os.path.abspath(os.path.expanduser(save_directory)), 
WEIGHTS_NAME -# ) -# linear_parameters = {} -# from intel_extension_for_pytorch.nn.modules import ( -# WeightOnlyQuantizedLinear as ipex_cpu_linear, -# ) - -# for name, module in model.named_modules(): -# if isinstance(module, ipex_cpu_linear): -# linear_parameters[name + ".qweight"] = ( -# module._op_context.to_public( -# module._op_context.get_weight() -# ).contiguous() -# ) -# linear_parameters[name + ".scales"] = ( -# module._op_context.get_scales().contiguous() -# ) -# linear_parameters[name + ".qzeros"] = ( -# module._op_context.get_zero_points().contiguous() -# ) -# if module._op_context.get_bias() is not None: -# linear_parameters[name + ".bias"] = ( -# module._op_context.get_bias().contiguous() -# ) -# if module._op_context.get_g_idx() is not None: -# linear_parameters[name + ".g_idx"] = ( -# module._op_context.get_g_idx().contiguous() -# ) - -# others_parameters = model.state_dict() -# linear_parameters.update(others_parameters) -# torch.save(linear_parameters, weights_file) +def pack_tensor_with_torch(raw_tensor, bits, compression_dtype=torch.int32): + """Pack the tensor with torch. + + Args: + raw_tensor (tensor): raw tensor. + + Returns: + tensor: packed tensor. + """ + n_pack = 32 // bits + target_len = math.ceil(raw_tensor.shape[1] / n_pack) + packed_tensor = torch.zeros(raw_tensor.shape[0], target_len, dtype=compression_dtype).to(raw_tensor.device) + mask = torch.tensor(2**bits - 1, dtype=compression_dtype).to(raw_tensor.device) + for j in range(packed_tensor.shape[1]): + start = n_pack * j + end = n_pack * (j + 1) + tmp = raw_tensor[:, start:end].type(compression_dtype) + tmp &= mask + for e in range(tmp.shape[1]): + tmp[:, e] = tmp[:, e] << (bits * e) + packed_tensor[:, j] |= tmp[:, e] + + return packed_tensor + + +def convert_to_GPTQ_checkpoints(model, quantization_config): + from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_cpu_linear + + from neural_compressor.adaptor.torch_utils.util import set_module + from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear + + dtype = "int4" if quantization_config.bits == 4 else "int8" + bits = quantization_config.bits + group_size = quantization_config.group_size + zp = False if quantization_config.sym else True + scale_dtype = quantization_config.scale_dtype + desc_act = (True if hasattr(quantization_config, "desc_act") else False,) + + for name, module in model.named_modules(): + if isinstance(module, ipex_cpu_linear): + in_features = module.in_features + out_features = module.out_features + new_module = INCWeightOnlyLinear( + in_features, + out_features, + dtype=dtype, + bits=bits, + group_size=group_size, + zp=zp, + bias=True if hasattr(module, "bias") else False, + scale_dtype=scale_dtype, + g_idx=desc_act, + use_optimum_format=True, + ) + + new_module.bits = 8 + new_module.n_pack = 32 // 8 + qweight = ( + new_module.pack_tensor_with_numpy(module._op_context.to_public(module._op_context.get_weight())) + .t() + .contiguous() + ) + new_module.bits = bits + new_module.n_pack = 32 // bits + scales = module._op_context.get_scales().t().contiguous() + bias = module._op_context.get_bias() + qzeros = new_module.pack_tensor_with_numpy(module._op_context.get_zero_points().t()).contiguous() + g_idx = module._op_context.get_g_idx() + + new_module.qweight = qweight + new_module.scales = scales + new_module.qzeros = qzeros + if g_idx is not None: + new_module.g_idx = g_idx.contiguous() + if bias is not None: + new_module.bias = bias.contiguous() + set_module(model, name, 
new_module) + return model def save_low_bit(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): @@ -452,10 +498,9 @@ def save_low_bit(self, save_directory: Union[str, os.PathLike], push_to_hub: boo # use transformers original `save_pretrained` function del self.save_pretrained + if self.device == "cpu" or self.device == torch.device("cpu") or self.device == "auto": + convert_to_GPTQ_checkpoints(self, self.quantization_config) self.save_pretrained(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) - - # if self.device == "cpu" or self.device == torch.device("cpu") or self.device == "auto": - # save_linear_parameters(self, save_directory) self.save_pretrained = types.MethodType(save_low_bit, self) # We conveniently save all the keys of the model to have them on hand, # so that when using 'low_cpumem load', diff --git a/test/3x/torch/test_transformers.py b/test/3x/torch/test_transformers.py new file mode 100644 index 00000000000..16b962198fb --- /dev/null +++ b/test/3x/torch/test_transformers.py @@ -0,0 +1,38 @@ +import unittest +import torch +import pytest +import shutil +from transformers import AutoTokenizer +from optimum.intel import INCModelForCausalLM +from neural_compressor.transformers import GPTQConfig, RtnConfig +class TestQuantizationConfig(unittest.TestCase): + @classmethod + def setUpClass(self): + self.model_name = "TheBlokeAI/Mixtral-tiny-GPTQ" + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.prompt = "One day, the little girl" + self.input_ids = self.tokenizer(self.prompt, return_tensors="pt")["input_ids"] + + @classmethod + def tearDownClass(self): + shutil.rmtree("tmp_gptq") + shutil.rmtree("tmp_rtn") + def test_gptq(self): + quantization_config = GPTQConfig( + bits=4, sym=True, damp_percent=0.01,desc_act=True + ) + user_model = INCModelForCausalLM.from_pretrained(self.model_name, quantization_config=quantization_config) + output = user_model(self.input_ids) + user_model.save_pretrained("tmp_gptq") + loaded_model = INCModelForCausalLM.from_pretrained("tmp_gptq") + loaded_output = loaded_model(self.input_ids) + assert torch.allclose(output, loaded_output, atol=1e-2), "Compare failed!" + + def test_rtn(self): + quantization_config = RtnConfig(bits=4) + user_model = INCModelForCausalLM.from_pretrained(self.model_name, quantization_config=quantization_config) + output = user_model(self.input_ids) + user_model.save_pretrained("tmp_rtn") + loaded_model = INCModelForCausalLM.from_pretrained("tmp_rtn") + loaded_output = loaded_model(self.input_ids) + assert torch.allclose(output, loaded_output, atol=1e-2), "Compare failed!" 
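For readers unfamiliar with the weight-only checkpoint layout this patch converts to: qweight and qzeros store 32 // bits low-precision values per int32 word, which is why convert_to_GPTQ_checkpoints above temporarily sets new_module.bits to 8 while repacking the public weight and then restores it before packing the zero points. The standalone sketch below is illustrative only and not part of the patch (the function name and toy tensor are invented); it packs a row of 4-bit values the same way pack_tensor_with_torch introduced above does.

import math
import torch

def pack_int4_rows(raw, bits=4, compression_dtype=torch.int32):
    # Pack 32 // bits low-precision values into each int32 word, column by column,
    # mirroring the loop structure of pack_tensor_with_torch in this patch.
    n_pack = 32 // bits
    target_len = math.ceil(raw.shape[1] / n_pack)
    packed = torch.zeros(raw.shape[0], target_len, dtype=compression_dtype)
    mask = torch.tensor(2**bits - 1, dtype=compression_dtype)
    for j in range(target_len):
        chunk = raw[:, n_pack * j : n_pack * (j + 1)].type(compression_dtype) & mask
        for e in range(chunk.shape[1]):
            packed[:, j] |= chunk[:, e] << (bits * e)
    return packed

# Eight 4-bit values fit into one 32-bit word: 0x76543210.
values = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=torch.int32)
print(pack_int4_rows(values))  # tensor([[1985229328]], dtype=torch.int32)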
From cd383e618242b872096713aa00be7d4bc5b12d87 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 13:28:17 +0000 Subject: [PATCH 09/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/3x/torch/test_transformers.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test/3x/torch/test_transformers.py b/test/3x/torch/test_transformers.py index 16b962198fb..39612bcd9bd 100644 --- a/test/3x/torch/test_transformers.py +++ b/test/3x/torch/test_transformers.py @@ -1,10 +1,14 @@ +import shutil import unittest -import torch + import pytest -import shutil -from transformers import AutoTokenizer +import torch from optimum.intel import INCModelForCausalLM +from transformers import AutoTokenizer + from neural_compressor.transformers import GPTQConfig, RtnConfig + + class TestQuantizationConfig(unittest.TestCase): @classmethod def setUpClass(self): @@ -17,10 +21,9 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("tmp_gptq") shutil.rmtree("tmp_rtn") + def test_gptq(self): - quantization_config = GPTQConfig( - bits=4, sym=True, damp_percent=0.01,desc_act=True - ) + quantization_config = GPTQConfig(bits=4, sym=True, damp_percent=0.01, desc_act=True) user_model = INCModelForCausalLM.from_pretrained(self.model_name, quantization_config=quantization_config) output = user_model(self.input_ids) user_model.save_pretrained("tmp_gptq") From c02db4683ee05e78971953b0c503c1f691d4a431 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 28 Aug 2024 00:49:40 -0700 Subject: [PATCH 10/18] fix bias issue Signed-off-by: changwangss --- .../transformers/quantization/utils.py | 20 ++++++------ test/3x/torch/test_transformers.py | 32 +++++++++++++++---- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index 6d0009eb0b5..f03f192d0db 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -22,11 +22,8 @@ import types from datasets import load_dataset -from transformers import AutoTokenizer -from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear -from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear as WeightOnlyLinear from neural_compressor.torch.quantization import GPTQConfig, RTNConfig, convert, prepare from neural_compressor.torch.utils import is_ipex_available from neural_compressor.utils.utility import CpuInfo, LazyImport @@ -162,21 +159,21 @@ def _replace_linear( tmp_linear = torch.nn.Linear( in_features, out_features, - True if hasattr(module, "bias") else False, + True if hasattr(module, "bias") and module.bias is not None else False, ) + if tmp_linear.bias is not None and module.bias is not None: + tmp_linear.bias = torch.nn.Parameter(module.bias.float()) + tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig model._modules[name] = ipex_linear.from_float_and_int4_weight( mod=tmp_linear, qweight=qweight, scales=scales, zero_points=qzeros, - # bias=(module.bias if (hasattr(module, "bias") and not torch.all(module.bias.eq(0))) else None), - bias=(module.bias.float() if hasattr(module, "bias") else None), + bias=(module.bias.float() if hasattr(module, "bias") and module.bias is not None else None), 
group_size=quantization_config.group_size, g_idx=(module.g_idx if hasattr(module, "g_idx") else None), ) - # print(current_key_name) - # print(module.bias.float()) elif device == "xpu" or device == torch.device("xpu"): from intel_extension_for_pytorch.nn.utils._quantize_convert import ( @@ -363,7 +360,7 @@ def convert_to_quantized_model(model, config, device="cpu"): bits=config.bits, use_sym=config.sym, group_size=config.group_size, - use_layer_wise=config.layer_wise, + use_layer_wise=config.use_layer_wise, act_order=config.desc_act, percdamp=config.damp_percent, block_size=config.blocksize, @@ -472,7 +469,9 @@ def convert_to_GPTQ_checkpoints(model, quantization_config): new_module.n_pack = 32 // bits scales = module._op_context.get_scales().t().contiguous() bias = module._op_context.get_bias() - qzeros = new_module.pack_tensor_with_numpy(module._op_context.get_zero_points().t()).contiguous() + qzeros = new_module.pack_tensor_with_numpy( + module._op_context.get_zero_points().t().to(torch.uint8) - 1 + ).contiguous() g_idx = module._op_context.get_g_idx() new_module.qweight = qweight @@ -482,6 +481,7 @@ def convert_to_GPTQ_checkpoints(model, quantization_config): new_module.g_idx = g_idx.contiguous() if bias is not None: new_module.bias = bias.contiguous() + set_module(model, name, new_module) return model diff --git a/test/3x/torch/test_transformers.py b/test/3x/torch/test_transformers.py index 39612bcd9bd..e6e553cd2ce 100644 --- a/test/3x/torch/test_transformers.py +++ b/test/3x/torch/test_transformers.py @@ -1,3 +1,4 @@ +import os import shutil import unittest @@ -12,30 +13,47 @@ class TestQuantizationConfig(unittest.TestCase): @classmethod def setUpClass(self): - self.model_name = "TheBlokeAI/Mixtral-tiny-GPTQ" + self.model_name = "hf-internal-testing/tiny-random-gptj" self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) self.prompt = "One day, the little girl" self.input_ids = self.tokenizer(self.prompt, return_tensors="pt")["input_ids"] @classmethod def tearDownClass(self): - shutil.rmtree("tmp_gptq") - shutil.rmtree("tmp_rtn") + if os.path.exists("tmp_gptq") and os.path.isdir("tmp_gptq"): + shutil.rmtree("tmp_gptq") + if os.path.exists("tmp_rtn") and os.path.isdir("tmp_rtn"): + shutil.rmtree("tmp_rtn") def test_gptq(self): - quantization_config = GPTQConfig(bits=4, sym=True, damp_percent=0.01, desc_act=True) + quantization_config = GPTQConfig( + bits=4, + sym=True, + damp_percent=0.01, + desc_act=True, + tokenizer=self.tokenizer, + n_samples=20, + group_size=8, + batch_size=5, + seq_len=32, + block_size=16, + ) user_model = INCModelForCausalLM.from_pretrained(self.model_name, quantization_config=quantization_config) output = user_model(self.input_ids) user_model.save_pretrained("tmp_gptq") loaded_model = INCModelForCausalLM.from_pretrained("tmp_gptq") loaded_output = loaded_model(self.input_ids) - assert torch.allclose(output, loaded_output, atol=1e-2), "Compare failed!" + assert torch.allclose(output.logits, loaded_output.logits, atol=1e-2), "Compare failed!" def test_rtn(self): - quantization_config = RtnConfig(bits=4) + quantization_config = RtnConfig(bits=4, group_size=8, sym=False) user_model = INCModelForCausalLM.from_pretrained(self.model_name, quantization_config=quantization_config) output = user_model(self.input_ids) user_model.save_pretrained("tmp_rtn") loaded_model = INCModelForCausalLM.from_pretrained("tmp_rtn") loaded_output = loaded_model(self.input_ids) - assert torch.allclose(output, loaded_output, atol=1e-2), "Compare failed!" 
+ assert torch.allclose(output.logits, loaded_output.logits, atol=1e-2), "Compare failed!" + + +if __name__ == "__main__": + unittest.main() From 4b1059787e49e4572f697613ebb12ceccc3f1022 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 28 Aug 2024 02:19:33 -0700 Subject: [PATCH 11/18] add __init__.py Signed-off-by: changwangss --- .../utils/{utility.py => __init__.py} | 7 +-- test/3x/torch/test_transformers.py | 59 ------------------- 2 files changed, 2 insertions(+), 64 deletions(-) rename neural_compressor/transformers/utils/{utility.py => __init__.py} (75%) delete mode 100644 test/3x/torch/test_transformers.py diff --git a/neural_compressor/transformers/utils/utility.py b/neural_compressor/transformers/utils/__init__.py similarity index 75% rename from neural_compressor/transformers/utils/utility.py rename to neural_compressor/transformers/utils/__init__.py index f7f81c5c5df..0370d3c0a4e 100644 --- a/neural_compressor/transformers/utils/utility.py +++ b/neural_compressor/transformers/utils/__init__.py @@ -1,17 +1,14 @@ -# -*- coding: utf-8 -*- # Copyright (c) 2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Intel Neural Compressor based Transformers API utility.""" - -QUANT_CONFIG = "quantize_config.json" +"""initialization.""" diff --git a/test/3x/torch/test_transformers.py b/test/3x/torch/test_transformers.py deleted file mode 100644 index e6e553cd2ce..00000000000 --- a/test/3x/torch/test_transformers.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import shutil -import unittest - -import pytest -import torch -from optimum.intel import INCModelForCausalLM -from transformers import AutoTokenizer - -from neural_compressor.transformers import GPTQConfig, RtnConfig - - -class TestQuantizationConfig(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "hf-internal-testing/tiny-random-gptj" - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - self.prompt = "One day, the little girl" - self.input_ids = self.tokenizer(self.prompt, return_tensors="pt")["input_ids"] - - @classmethod - def tearDownClass(self): - if os.path.exists("tmp_gptq") and os.path.isdir("tmp_gptq"): - shutil.rmtree("tmp_gptq") - if os.path.exists("tmp_rtn") and os.path.isdir("tmp_rtn"): - shutil.rmtree("tmp_rtn") - - def test_gptq(self): - quantization_config = GPTQConfig( - bits=4, - sym=True, - damp_percent=0.01, - desc_act=True, - tokenizer=self.tokenizer, - n_samples=20, - group_size=8, - batch_size=5, - seq_len=32, - block_size=16, - ) - user_model = INCModelForCausalLM.from_pretrained(self.model_name, quantization_config=quantization_config) - output = user_model(self.input_ids) - user_model.save_pretrained("tmp_gptq") - loaded_model = INCModelForCausalLM.from_pretrained("tmp_gptq") - loaded_output = loaded_model(self.input_ids) - assert torch.allclose(output.logits, loaded_output.logits, atol=1e-2), "Compare failed!" 
- - def test_rtn(self): - quantization_config = RtnConfig(bits=4, group_size=8, sym=False) - user_model = INCModelForCausalLM.from_pretrained(self.model_name, quantization_config=quantization_config) - output = user_model(self.input_ids) - user_model.save_pretrained("tmp_rtn") - loaded_model = INCModelForCausalLM.from_pretrained("tmp_rtn") - loaded_output = loaded_model(self.input_ids) - assert torch.allclose(output.logits, loaded_output.logits, atol=1e-2), "Compare failed!" - - -if __name__ == "__main__": - unittest.main() From 3183b54d6039ca52e643a52fef15838e7612f91d Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 28 Aug 2024 02:51:14 -0700 Subject: [PATCH 12/18] add transformers Signed-off-by: changwangss --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a21a8702c9b..81b3bc05e10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ pyyaml requests schema scikit-learn +transformers From 9cae1845e0e158de9afec2a453b5088df5201850 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 28 Aug 2024 03:02:21 -0700 Subject: [PATCH 13/18] lazy import transformers Signed-off-by: changwangss --- .../transformers/models/modeling_auto.py | 31 +++++++++---------- .../transformers/utils/quantization_config.py | 3 +- requirements.txt | 1 - 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index ab0e47ed101..12cf412e04b 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -34,19 +34,7 @@ import os import types -import transformers from accelerate import init_empty_weights -from transformers import AutoConfig -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import load_state_dict -from transformers.utils import ( - SAFE_WEIGHTS_INDEX_NAME, - SAFE_WEIGHTS_NAME, - WEIGHTS_INDEX_NAME, - WEIGHTS_NAME, - has_file, - is_safetensors_available, -) from neural_compressor.adaptor.torch_utils.util import set_module from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear @@ -56,6 +44,9 @@ from neural_compressor.utils.utility import CpuInfo, LazyImport torch = LazyImport("torch") +transformers = LazyImport("transformers") +transformers_configuration_utils = LazyImport("transformers.configuration_utils") + def build_woq_model(model, quantization_config): @@ -92,8 +83,8 @@ class _BaseQBitsAutoModelClass: @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): config = kwargs.pop("config", None) - if not isinstance(config, PretrainedConfig): - config, _ = AutoConfig.from_pretrained( + if not isinstance(config, transformers_configuration_utils.PretrainedConfig): + config, _ = transformers.AutoConfig.from_pretrained( pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs, @@ -138,7 +129,15 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): from transformers.models.auto.auto_factory import _get_model_class from transformers.models.auto.configuration_auto import AutoConfig from transformers.utils import ContextManagers, cached_file, download_url, extract_commit_hash, is_remote_url - + from transformers.modeling_utils import load_state_dict + from transformers.utils import ( + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, + has_file, + 
is_safetensors_available, + ) # Autofactory kwargs_orig = copy.deepcopy(kwargs) # modules_to_not_convert = kwargs.pop("modules_to_not_convert", None) @@ -206,7 +205,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): assert quantization_config is not None, "Detect this model is not a low-bit model." if commit_hash is None: - if not isinstance(config, PretrainedConfig): + if not isinstance(config, transformers_configuration_utils.PretrainedConfig): # We make a call to the config file first (which may be absent) # to get the commit hash as soon as possible. resolved_config_file = cached_file( diff --git a/neural_compressor/transformers/utils/quantization_config.py b/neural_compressor/transformers/utils/quantization_config.py index 1ff39b66483..a9769512b40 100644 --- a/neural_compressor/transformers/utils/quantization_config.py +++ b/neural_compressor/transformers/utils/quantization_config.py @@ -17,12 +17,11 @@ import os from typing import Any, Dict, List, Optional, Tuple, Union -import transformers - from neural_compressor.utils import logger from neural_compressor.utils.utility import LazyImport torch = LazyImport("torch") +transformers = LazyImport("transformers") QUANT_CONFIG = "quantize_config.json" diff --git a/requirements.txt b/requirements.txt index 81b3bc05e10..a21a8702c9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,3 @@ pyyaml requests schema scikit-learn -transformers From a58111fa4d3be12a3adb5af99b0b51227ede06cc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Aug 2024 10:03:31 +0000 Subject: [PATCH 14/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../transformers/models/modeling_auto.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index 12cf412e04b..ac650ef10dd 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -48,7 +48,6 @@ transformers_configuration_utils = LazyImport("transformers.configuration_utils") - def build_woq_model(model, quantization_config): bits = quantization_config.bits for n, m in model.named_modules(): @@ -125,19 +124,28 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): from accelerate.big_modeling import init_empty_weights from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from transformers.generation.configuration_utils import GenerationConfig - from transformers.modeling_utils import _add_variant, get_checkpoint_shard_files, no_init_weights + from transformers.modeling_utils import ( + _add_variant, + get_checkpoint_shard_files, + load_state_dict, + no_init_weights, + ) from transformers.models.auto.auto_factory import _get_model_class from transformers.models.auto.configuration_auto import AutoConfig - from transformers.utils import ContextManagers, cached_file, download_url, extract_commit_hash, is_remote_url - from transformers.modeling_utils import load_state_dict from transformers.utils import ( SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME, + ContextManagers, + cached_file, + download_url, + extract_commit_hash, has_file, + is_remote_url, is_safetensors_available, ) + # Autofactory kwargs_orig = copy.deepcopy(kwargs) # 
modules_to_not_convert = kwargs.pop("modules_to_not_convert", None) From 62deebc74da921516a1aa375e81c69bece1f885e Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 28 Aug 2024 03:09:37 -0700 Subject: [PATCH 15/18] add transformers dependency Signed-off-by: changwangss --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a21a8702c9b..81b3bc05e10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ pyyaml requests schema scikit-learn +transformers From 53e0fe31b24adf031739330ee58895cf7414156d Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 28 Aug 2024 05:11:02 -0700 Subject: [PATCH 16/18] lazy import transformers Signed-off-by: changwangss --- neural_compressor/__init__.py | 1 - requirements.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/neural_compressor/__init__.py b/neural_compressor/__init__.py index 0340b7576b3..5ee86bf561a 100644 --- a/neural_compressor/__init__.py +++ b/neural_compressor/__init__.py @@ -25,7 +25,6 @@ QuantizationAwareTrainingConfig, MixedPrecisionConfig, ) -from .transformers import GPTQConfig, RtnConfig from .contrib import * from .model import * from .metric import * diff --git a/requirements.txt b/requirements.txt index 81b3bc05e10..a21a8702c9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,3 @@ pyyaml requests schema scikit-learn -transformers From 3cd59d212eaf352e1717666a9845bd693cd7e375 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 28 Aug 2024 18:37:04 -0700 Subject: [PATCH 17/18] rename Signed-off-by: changwangss --- .../transformers/models/__init__.py | 2 +- .../transformers/models/modeling_auto.py | 8 ++--- .../transformers/quantization/utils.py | 29 +++---------------- 3 files changed, 9 insertions(+), 30 deletions(-) diff --git a/neural_compressor/transformers/models/__init__.py b/neural_compressor/transformers/models/__init__.py index c44b58bb461..fcaf093c802 100644 --- a/neural_compressor/transformers/models/__init__.py +++ b/neural_compressor/transformers/models/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .modeling_auto import _BaseQBitsAutoModelClass +from .modeling_auto import _BaseINCAutoModelClass diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index ac650ef10dd..b3da93818d3 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -76,7 +76,7 @@ def build_woq_model(model, quantization_config): return model -class _BaseQBitsAutoModelClass: +class _BaseINCAutoModelClass: ORIG_MODEL = None @classmethod @@ -632,13 +632,13 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): return model -class AutoModelForCausalLM(_BaseQBitsAutoModelClass): +class AutoModelForCausalLM(_BaseINCAutoModelClass): ORIG_MODEL = transformers.AutoModelForCausalLM -class AutoModel(_BaseQBitsAutoModelClass): +class AutoModel(_BaseINCAutoModelClass): ORIG_MODEL = transformers.AutoModel -class AutoModelForSeq2SeqLM(_BaseQBitsAutoModelClass): +class AutoModelForSeq2SeqLM(_BaseINCAutoModelClass): ORIG_MODEL = transformers.AutoModelForSeq2SeqLM diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index f03f192d0db..64902991eb6 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -330,6 +330,10 @@ def convert_to_quantized_model(model, config, device="cpu"): import intel_extension_for_pytorch assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!" + os.environ["FORCE_DEVICE"] = "cpu" + logger.info( + "Set the environment variable FORCE_DEVICE='cpu' to ensure the quantization process occurs on the CPU." + ) orig_dtype = torch.float32 for param in model.parameters(): @@ -403,31 +407,6 @@ def convert_to_quantized_model(model, config, device="cpu"): return q_model.to(device) -def pack_tensor_with_torch(raw_tensor, bits, compression_dtype=torch.int32): - """Pack the tensor with torch. - - Args: - raw_tensor (tensor): raw tensor. - - Returns: - tensor: packed tensor. 
- """ - n_pack = 32 // bits - target_len = math.ceil(raw_tensor.shape[1] / n_pack) - packed_tensor = torch.zeros(raw_tensor.shape[0], target_len, dtype=compression_dtype).to(raw_tensor.device) - mask = torch.tensor(2**bits - 1, dtype=compression_dtype).to(raw_tensor.device) - for j in range(packed_tensor.shape[1]): - start = n_pack * j - end = n_pack * (j + 1) - tmp = raw_tensor[:, start:end].type(compression_dtype) - tmp &= mask - for e in range(tmp.shape[1]): - tmp[:, e] = tmp[:, e] << (bits * e) - packed_tensor[:, j] |= tmp[:, e] - - return packed_tensor - - def convert_to_GPTQ_checkpoints(model, quantization_config): from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_cpu_linear From 20f5152066f09a274e0acd8c3f01840b44598160 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 29 Aug 2024 23:29:12 -0700 Subject: [PATCH 18/18] adapt fx Signed-off-by: changwangss --- .../transformers/models/modeling_auto.py | 105 +++++++----------- .../transformers/quantization/utils.py | 21 +++- 2 files changed, 57 insertions(+), 69 deletions(-) diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py index b3da93818d3..b200548d4cc 100644 --- a/neural_compressor/transformers/models/modeling_auto.py +++ b/neural_compressor/transformers/models/modeling_auto.py @@ -35,6 +35,7 @@ import types from accelerate import init_empty_weights +from accelerate.utils import is_xpu_available from neural_compressor.adaptor.torch_utils.util import set_module from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear @@ -168,7 +169,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): revision = kwargs.pop("revision", "main") commit_hash = kwargs.pop("_commit_hash", None) _fast_init = kwargs.pop("_fast_init", True) - device_map = kwargs.pop("device_map", "auto") + device_map = kwargs.pop("device_map", "xpu" if is_xpu_available() else "cpu") use_safetensors = kwargs.pop("use_safetensors", None) kwarg_attn_imp = kwargs.pop("attn_implementation", None) @@ -210,6 +211,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): quantization_config = RtnConfig.from_dict(quantization_config) elif quantization_config["quant_method"] == "gptq": quantization_config = GPTQConfig.from_dict(quantization_config) + assert quantization_config is not None, "Detect this model is not a low-bit model." if commit_hash is None: @@ -501,47 +503,27 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): logger.warning("fp32 scale_dtype is used, please change the config.json if you don't want to use it.") # weight dtype is higher priority than bits in config.json when both existed. 
- if quantization_config.weight_dtype is None: - if quantization_config.bits == 4: - if use_xpu: - quantization_config.weight_dtype = "int4_fullrange" - else: - quantization_config.weight_dtype = "int4" - logger.info( - "{} quantization weight_dtype is used due to bits is 4 in config.json.".format( - quantization_config.weight_dtype - ) - ) - elif quantization_config.bits == 8: - quantization_config.weight_dtype = "int8" - logger.info( - "{} quantization weight_dtype is used due to bits is 8 in config.json.".format( - quantization_config.weight_dtype - ) - ) + if quantization_config.bits == 4: + if use_xpu: + quantization_config.weight_dtype = "int4_fullrange" else: - logger.warning("bits number only supports 4, 8.") quantization_config.weight_dtype = "int4" - logger.warning("int4 weight_dtype is used, please change the config.json if you don't want to use it.") - else: - if quantization_config.weight_dtype not in [ - "int4_fullrange", - "int4", - "int8", - "fp8_e5m2", - "fp8_e4m3", - "nf4", - "fp4_e2m1_bnb", - "fp4_e2m1", - ]: - logger.warning("Please provide the correct bits number or weight_dtype in config.json.") - raise ValueError( - "weight_dtype must be a string in " - "'int8', 'int4', 'int4_fullrange', 'int4', 'nf4', " - "'fp4', 'fp4_e2m1', 'fp8', 'fp8_e5m2, fp8_e4m3'" + logger.info( + "{} quantization weight_dtype is used due to bits is 4 in config.json.".format( + quantization_config.weight_dtype ) - else: - logger.info("{} quantization weight_dtype is used.".format(quantization_config.weight_dtype)) + ) + elif quantization_config.bits == 8: + quantization_config.weight_dtype = "int8" + logger.info( + "{} quantization weight_dtype is used due to bits is 8 in config.json.".format( + quantization_config.weight_dtype + ) + ) + else: + logger.warning("bits number only supports 4, 8.") + quantization_config.weight_dtype = "int4" + logger.warning("int4 weight_dtype is used, please change the config.json if you don't want to use it.") init_contexts = [no_init_weights(_enable=_fast_init)] init_contexts.append(init_empty_weights()) @@ -561,30 +543,27 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): # restore default dtype if dtype_orig is not None: torch.set_default_dtype(dtype_orig) - ( - model, - missing_keys, - unexpected_keys, - mismatched_keys, - offload_index, - error_msgs, - ) = model_class._load_pretrained_model( - model, - None, - loaded_state_dict_keys, # XXX: rename? - resolved_archive_file, - pretrained_model_name_or_path, - sharded_metadata=sharded_metadata, - _fast_init=_fast_init, - low_cpu_mem_usage=True, - offload_folder=offload_folder, - offload_state_dict=offload_state_dict, - dtype=torch_dtype, - keep_in_fp32_modules=[], - ) - - else: - raise AssertionError("Please install intel_extension_for_pytorch.") + ( + model, + missing_keys, + unexpected_keys, + mismatched_keys, + offload_index, + error_msgs, + ) = model_class._load_pretrained_model( + model, + None, + loaded_state_dict_keys, # XXX: rename? 
+ resolved_archive_file, + pretrained_model_name_or_path, + sharded_metadata=sharded_metadata, + _fast_init=_fast_init, + low_cpu_mem_usage=True, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + keep_in_fp32_modules=[], + ) # make sure token embedding weights are still tied if needed model.tie_weights() diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index 64902991eb6..d4739a38562 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -229,7 +229,6 @@ def _replace_linear( model._modules[name].requires_grad_(False) if device == "xpu" or device == torch.device("xpu"): - if not hasattr(module, "qweight"): n_pack = 32 // quantization_config.bits @@ -343,10 +342,9 @@ def convert_to_quantized_model(model, config, device="cpu"): break # mapping to INC config + dtype = "int4" if config.weight_dtype == "int4_fullrange" else config.weight_dtype if config.quant_method.value == "rtn": - quant_config = RTNConfig( - dtype=config.weight_dtype, bits=config.bits, use_sym=config.sym, group_size=config.group_size - ) + quant_config = RTNConfig(dtype=dtype, bits=config.bits, use_sym=config.sym, group_size=config.group_size) if config.use_layer_wise: quant_config.user_layer_wise = config.use_layer_wise quant_config.model_path = config.model_path @@ -360,7 +358,7 @@ def convert_to_quantized_model(model, config, device="cpu"): elif config.quant_method.value == "gptq": model.seqlen = config.seq_len quant_config = GPTQConfig( - dtype=config.weight_dtype, + dtype=dtype, bits=config.bits, use_sym=config.sym, group_size=config.group_size, @@ -399,6 +397,7 @@ def convert_to_quantized_model(model, config, device="cpu"): logger.warning("The recommended ipex version is higher than 2.3.10 for xpu device.") model.eval() + q_model = replace_linear(model, None, None, config, device=device) if orig_dtype != torch.float32: @@ -477,8 +476,18 @@ def save_low_bit(self, save_directory: Union[str, os.PathLike], push_to_hub: boo # use transformers original `save_pretrained` function del self.save_pretrained - if self.device == "cpu" or self.device == torch.device("cpu") or self.device == "auto": + if self.device == "cpu" or self.device == torch.device("cpu"): convert_to_GPTQ_checkpoints(self, self.quantization_config) + if self.device == "xpu" or (isinstance(self.device, torch.device) and self.device.type == "xpu"): + from intel_extension_for_pytorch.nn.utils._quantize_convert import WeightOnlyQuantizedLinear + + for name, module in self.named_modules(): + if isinstance(module, WeightOnlyQuantizedLinear): + if module.weight_transposed: + module.qweight.data = module.qweight.t_().contiguous() + module.scales.data = module.scales.t_().contiguous() + module.weight_transposed = False + self.save_pretrained(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) self.save_pretrained = types.MethodType(save_low_bit, self) # We conveniently save all the keys of the model to have them on hand,
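To see the flow these patches enable end to end, the sketch below (illustrative, not part of the series) mirrors the round trip the removed test/3x/torch/test_transformers.py exercised: quantize on load with RtnConfig, save through the patched save_low_bit path, and reload the low-bit checkpoint. It assumes optimum-intel's INCModelForCausalLM routes quantization_config into this API, as those tests did.

import torch
from optimum.intel import INCModelForCausalLM
from transformers import AutoTokenizer

from neural_compressor.transformers import RtnConfig

model_name = "hf-internal-testing/tiny-random-gptj"  # tiny model, as in the removed tests
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_ids = tokenizer("One day, the little girl", return_tensors="pt")["input_ids"]

# Weight-only RTN quantization on load, then save in the GPTQ-style checkpoint
# format produced by convert_to_GPTQ_checkpoints / save_low_bit.
config = RtnConfig(bits=4, group_size=8, sym=False)
model = INCModelForCausalLM.from_pretrained(model_name, quantization_config=config)
model.save_pretrained("tmp_rtn")

# Reload the low-bit checkpoint and check the outputs still agree.
reloaded = INCModelForCausalLM.from_pretrained("tmp_rtn")
assert torch.allclose(model(input_ids).logits, reloaded(input_ids).logits, atol=1e-2)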