diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/README.md b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/README.md index 03b91b484ef..a2033427b81 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/README.md +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/README.md @@ -9,7 +9,6 @@ This example loads the LayoutLMv3 model and confirms its accuracy and speed based on [ ```shell pip install neural-compressor pip install -r requirements.txt -bash install_layoutlmft.sh ``` > Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment). diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/data_utils.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/data_utils.py new file mode 100644 index 00000000000..ef83f822b1f --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/data_utils.py @@ -0,0 +1,179 @@ +from dataclasses import dataclass, field +from typing import Optional, Union + +import torch + +from detectron2.structures import ImageList +from detectron2.data.detection_utils import read_image +from detectron2.data.transforms import ResizeTransform, TransformList + +from transformers import PreTrainedTokenizerBase +from transformers.file_utils import PaddingStrategy + + +@dataclass +class DataCollatorForKeyValueExtraction: + """ + Data collator that will dynamically pad the inputs received, as well as the labels. + + Args: + tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): + The tokenizer used for encoding the data. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding index) + among: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence is provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set, will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.5 (Volta). + label_pad_token_id (:obj:`int`, `optional`, defaults to -100): + The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
+ """ + + tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + label_pad_token_id: int = -100 + + def __call__(self, features): + label_name = "label" if "label" in features[0].keys() else "labels" + labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None + + has_image_input = "image" in features[0] + has_bbox_input = "bbox" in features[0] + if has_image_input: + image = ImageList.from_tensors([torch.tensor(feature["image"]) for feature in features], 32) + for feature in features: + del feature["image"] + batch = self.tokenizer.pad( + features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + # Conversion to tensors will fail if we have labels as they are not of the same length yet. + return_tensors="pt" if labels is None else None, + ) + + if labels is None: + return batch + + sequence_length = torch.tensor(batch["input_ids"]).shape[1] + padding_side = self.tokenizer.padding_side + if padding_side == "right": + batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels] + if has_bbox_input: + batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]] + else: + batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels] + if has_bbox_input: + batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]] + + batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()} + if has_image_input: + batch["image"] = image + return batch + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."}) + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "The input training data file (a csv or JSON file)."} + ) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."}, + ) + test_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." 
+ }, + ) + max_val_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of validation examples to this " + "value if set." + }, + ) + max_test_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of test examples to this " + "value if set." + }, + ) + label_all_tokens: bool = field( + default=False, + metadata={ + "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " + "one (in which case the other tokens will have a padding index)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, + metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."}, + ) + + +@dataclass +class XFUNDataTrainingArguments(DataTrainingArguments): + lang: Optional[str] = field(default="en") + additional_langs: Optional[str] = field(default=None) + +def normalize_bbox(bbox, size): + return [ + int(1000 * bbox[0] / size[0]), + int(1000 * bbox[1] / size[1]), + int(1000 * bbox[2] / size[0]), + int(1000 * bbox[3] / size[1]), + ] + +def load_image(image_path): + image = read_image(image_path, format="BGR") + h = image.shape[0] + w = image.shape[1] + img_trans = TransformList([ResizeTransform(h=h, w=w, new_h=224, new_w=224)]) + image = torch.tensor(img_trans.apply_image(image).copy()).permute(2, 0, 1) # copy to make it writeable + return image, (w, h) \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/funsd.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/funsd.py new file mode 100644 index 00000000000..f91f4fd3dd8 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/funsd.py @@ -0,0 +1,116 @@ +# coding=utf-8 + +import json +import os + +import datasets + +from data_utils import load_image, normalize_bbox + + +logger = datasets.logging.get_logger(__name__) + + +_CITATION = """\ +@article{Jaume2019FUNSDAD, + title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, + author={Guillaume Jaume and H. K. Ekenel and J. Thiran}, + journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, + year={2019}, + volume={2}, + pages={1-6} +} +""" + +_DESCRIPTION = """\ +https://guillaumejaume.github.io/FUNSD/ +""" + + +class FunsdConfig(datasets.BuilderConfig): + """BuilderConfig for FUNSD""" + + def __init__(self, **kwargs): + """BuilderConfig for FUNSD. + + Args: + **kwargs: keyword arguments forwarded to super. 
+ """ + super(FunsdConfig, self).__init__(**kwargs) + + +class Funsd(datasets.GeneratorBasedBuilder): + """Conll2003 dataset.""" + + BUILDER_CONFIGS = [ + FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), + "ner_tags": datasets.Sequence( + datasets.features.ClassLabel( + names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"] + ) + ), + "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), + } + ), + supervised_keys=None, + homepage="https://guillaumejaume.github.io/FUNSD/", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip") + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"} + ), + ] + + def _generate_examples(self, filepath): + logger.info("⏳ Generating examples from = %s", filepath) + ann_dir = os.path.join(filepath, "annotations") + img_dir = os.path.join(filepath, "images") + for guid, file in enumerate(sorted(os.listdir(ann_dir))): + tokens = [] + bboxes = [] + ner_tags = [] + + file_path = os.path.join(ann_dir, file) + with open(file_path, "r", encoding="utf8") as f: + data = json.load(f) + image_path = os.path.join(img_dir, file) + image_path = image_path.replace("json", "png") + image, size = load_image(image_path) + for item in data["form"]: + words, label = item["words"], item["label"] + words = [w for w in words if w["text"].strip() != ""] + if len(words) == 0: + continue + if label == "other": + for w in words: + tokens.append(w["text"]) + ner_tags.append("O") + bboxes.append(normalize_bbox(w["box"], size)) + else: + tokens.append(words[0]["text"]) + ner_tags.append("B-" + label.upper()) + bboxes.append(normalize_bbox(words[0]["box"], size)) + for w in words[1:]: + tokens.append(w["text"]) + ner_tags.append("I-" + label.upper()) + bboxes.append(normalize_bbox(w["box"], size)) + + yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags, "image": image} \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/install_layoutlmft.sh b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/install_layoutlmft.sh deleted file mode 100644 index 7fb2814537b..00000000000 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/install_layoutlmft.sh +++ /dev/null @@ -1,3 +0,0 @@ -git clone https://github.com/microsoft/unilm.git -cd unilm/layoutlmft -pip install -e . 
\ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/main.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/main.py index 85f32a67518..80074d4dd7e 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/main.py +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/main.py @@ -10,11 +10,10 @@ import numpy as np from datasets import ClassLabel, load_dataset, load_metric -import layoutlmft.data.datasets.funsd +import funsd import transformers -from layoutlmft.data import DataCollatorForKeyValueExtraction -from layoutlmft.data.data_args import DataTrainingArguments -from layoutlmft.trainers import FunsdTrainer as Trainer +from data_utils import DataCollatorForKeyValueExtraction, DataTrainingArguments +from trainer import FunsdTrainer as Trainer from transformers import ( AutoConfig, AutoModelForTokenClassification, @@ -184,7 +183,7 @@ def main(): # Set seed before initializing model. set_seed(training_args.seed) - datasets = load_dataset(os.path.abspath(layoutlmft.data.datasets.funsd.__file__)) + datasets = load_dataset(os.path.abspath(funsd.__file__)) if training_args.do_train: column_names = datasets["train"].column_names features = datasets["train"].features diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/requirements.txt b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/requirements.txt index 505a5d4aa80..864fa3eaada 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/requirements.txt +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/requirements.txt @@ -1,10 +1,11 @@ -datasets==1.6.2 -transformers==4.6 -huggingface-hub==0.0.8 -seqeval==1.2.2 -tensorboard==2.7.0 +accelerate +datasets +transformers +huggingface-hub +seqeval +tensorboard sentencepiece -timm==0.4.12 +timm Pillow einops textdistance @@ -17,4 +18,4 @@ onnx onnxruntime onnxruntime-extensions; python_version < '3.10' -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html -detectron2 +detectron2 \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/trainer.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/trainer.py new file mode 100644 index 00000000000..0a2f88390f8 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_dynamic/trainer.py @@ -0,0 +1,21 @@ +from typing import Any, Dict, Union + +import torch + +from transformers import Trainer + + +class FunsdTrainer(Trainer): + def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]: + """ + Prepare :obj:`inputs` before feeding them to the model, converting them to tensors if they are not already and + handling potential state. 
+ """ + for k, v in inputs.items(): + if hasattr(v, "to") and hasattr(v, "device"): + inputs[k] = v.to(self.args.device) + + if self.args.past_index >= 0 and self._past is not None: + inputs["mems"] = self._past + + return inputs \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/README.md b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/README.md index 58232f6485e..fc05e9cd229 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/README.md +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/README.md @@ -9,7 +9,6 @@ This example load LayoutLMv3 model and confirm its accuracy and speed based on [ ```shell pip install neural-compressor pip install -r requirements.txt -bash install_layoutlmft.sh ``` > Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment). diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/data_utils.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/data_utils.py new file mode 100644 index 00000000000..ef83f822b1f --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/data_utils.py @@ -0,0 +1,179 @@ +from dataclasses import dataclass, field +from typing import Optional, Union + +import torch + +from detectron2.structures import ImageList +from detectron2.data.detection_utils import read_image +from detectron2.data.transforms import ResizeTransform, TransformList + +from transformers import PreTrainedTokenizerBase +from transformers.file_utils import PaddingStrategy + + +@dataclass +class DataCollatorForKeyValueExtraction: + """ + Data collator that will dynamically pad the inputs received, as well as the labels. + + Args: + tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): + The tokenizer used for encoding the data. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding index) + among: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.5 (Volta). + label_pad_token_id (:obj:`int`, `optional`, defaults to -100): + The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions). 
+ """ + + tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + label_pad_token_id: int = -100 + + def __call__(self, features): + label_name = "label" if "label" in features[0].keys() else "labels" + labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None + + has_image_input = "image" in features[0] + has_bbox_input = "bbox" in features[0] + if has_image_input: + image = ImageList.from_tensors([torch.tensor(feature["image"]) for feature in features], 32) + for feature in features: + del feature["image"] + batch = self.tokenizer.pad( + features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + # Conversion to tensors will fail if we have labels as they are not of the same length yet. + return_tensors="pt" if labels is None else None, + ) + + if labels is None: + return batch + + sequence_length = torch.tensor(batch["input_ids"]).shape[1] + padding_side = self.tokenizer.padding_side + if padding_side == "right": + batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels] + if has_bbox_input: + batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]] + else: + batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels] + if has_bbox_input: + batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]] + + batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()} + if has_image_input: + batch["image"] = image + return batch + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."}) + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "The input training data file (a csv or JSON file)."} + ) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."}, + ) + test_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." 
+ }, + ) + max_val_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of validation examples to this " + "value if set." + }, + ) + max_test_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of test examples to this " + "value if set." + }, + ) + label_all_tokens: bool = field( + default=False, + metadata={ + "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " + "one (in which case the other tokens will have a padding index)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, + metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."}, + ) + + +@dataclass +class XFUNDataTrainingArguments(DataTrainingArguments): + lang: Optional[str] = field(default="en") + additional_langs: Optional[str] = field(default=None) + +def normalize_bbox(bbox, size): + return [ + int(1000 * bbox[0] / size[0]), + int(1000 * bbox[1] / size[1]), + int(1000 * bbox[2] / size[0]), + int(1000 * bbox[3] / size[1]), + ] + +def load_image(image_path): + image = read_image(image_path, format="BGR") + h = image.shape[0] + w = image.shape[1] + img_trans = TransformList([ResizeTransform(h=h, w=w, new_h=224, new_w=224)]) + image = torch.tensor(img_trans.apply_image(image).copy()).permute(2, 0, 1) # copy to make it writeable + return image, (w, h) \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/funsd.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/funsd.py new file mode 100644 index 00000000000..f91f4fd3dd8 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/funsd.py @@ -0,0 +1,116 @@ +# coding=utf-8 + +import json +import os + +import datasets + +from data_utils import load_image, normalize_bbox + + +logger = datasets.logging.get_logger(__name__) + + +_CITATION = """\ +@article{Jaume2019FUNSDAD, + title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, + author={Guillaume Jaume and H. K. Ekenel and J. Thiran}, + journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, + year={2019}, + volume={2}, + pages={1-6} +} +""" + +_DESCRIPTION = """\ +https://guillaumejaume.github.io/FUNSD/ +""" + + +class FunsdConfig(datasets.BuilderConfig): + """BuilderConfig for FUNSD""" + + def __init__(self, **kwargs): + """BuilderConfig for FUNSD. + + Args: + **kwargs: keyword arguments forwarded to super. 
+ """ + super(FunsdConfig, self).__init__(**kwargs) + + +class Funsd(datasets.GeneratorBasedBuilder): + """Conll2003 dataset.""" + + BUILDER_CONFIGS = [ + FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), + "ner_tags": datasets.Sequence( + datasets.features.ClassLabel( + names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"] + ) + ), + "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), + } + ), + supervised_keys=None, + homepage="https://guillaumejaume.github.io/FUNSD/", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip") + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"} + ), + ] + + def _generate_examples(self, filepath): + logger.info("⏳ Generating examples from = %s", filepath) + ann_dir = os.path.join(filepath, "annotations") + img_dir = os.path.join(filepath, "images") + for guid, file in enumerate(sorted(os.listdir(ann_dir))): + tokens = [] + bboxes = [] + ner_tags = [] + + file_path = os.path.join(ann_dir, file) + with open(file_path, "r", encoding="utf8") as f: + data = json.load(f) + image_path = os.path.join(img_dir, file) + image_path = image_path.replace("json", "png") + image, size = load_image(image_path) + for item in data["form"]: + words, label = item["words"], item["label"] + words = [w for w in words if w["text"].strip() != ""] + if len(words) == 0: + continue + if label == "other": + for w in words: + tokens.append(w["text"]) + ner_tags.append("O") + bboxes.append(normalize_bbox(w["box"], size)) + else: + tokens.append(words[0]["text"]) + ner_tags.append("B-" + label.upper()) + bboxes.append(normalize_bbox(words[0]["box"], size)) + for w in words[1:]: + tokens.append(w["text"]) + ner_tags.append("I-" + label.upper()) + bboxes.append(normalize_bbox(w["box"], size)) + + yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags, "image": image} \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/install_layoutlmft.sh b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/install_layoutlmft.sh deleted file mode 100644 index 7fb2814537b..00000000000 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/install_layoutlmft.sh +++ /dev/null @@ -1,3 +0,0 @@ -git clone https://github.com/microsoft/unilm.git -cd unilm/layoutlmft -pip install -e . 
\ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/main.py index 26d2517d623..4bd4b63d870 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/main.py +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/main.py @@ -10,11 +10,10 @@ import numpy as np from datasets import ClassLabel, load_dataset, load_metric -import layoutlmft.data.datasets.funsd +import funsd import transformers -from layoutlmft.data import DataCollatorForKeyValueExtraction -from layoutlmft.data.data_args import DataTrainingArguments -from layoutlmft.trainers import FunsdTrainer as Trainer +from data_utils import DataCollatorForKeyValueExtraction, DataTrainingArguments +from trainer import FunsdTrainer as Trainer from transformers import ( AutoConfig, AutoModelForTokenClassification, @@ -188,7 +187,7 @@ def main(): # Set seed before initializing model. set_seed(training_args.seed) - datasets = load_dataset(os.path.abspath(layoutlmft.data.datasets.funsd.__file__)) + datasets = load_dataset(os.path.abspath(funsd.__file__)) if training_args.do_train: column_names = datasets["train"].column_names features = datasets["train"].features diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/requirements.txt b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/requirements.txt index 505a5d4aa80..864fa3eaada 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/requirements.txt +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/requirements.txt @@ -1,10 +1,11 @@ -datasets==1.6.2 -transformers==4.6 -huggingface-hub==0.0.8 -seqeval==1.2.2 -tensorboard==2.7.0 +accelerate +datasets +transformers +huggingface-hub +seqeval +tensorboard sentencepiece -timm==0.4.12 +timm Pillow einops textdistance @@ -17,4 +18,4 @@ onnx onnxruntime onnxruntime-extensions; python_version < '3.10' -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html -detectron2 +detectron2 \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/trainer.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/trainer.py new file mode 100644 index 00000000000..0a2f88390f8 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmft/quantization/ptq_static/trainer.py @@ -0,0 +1,21 @@ +from typing import Any, Dict, Union + +import torch + +from transformers import Trainer + + +class FunsdTrainer(Trainer): + def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]: + """ + Prepare :obj:`inputs` before feeding them to the model, converting them to tensors if they are not already and + handling potential state. 
+ """ + for k, v in inputs.items(): + if hasattr(v, "to") and hasattr(v, "device"): + inputs[k] = v.to(self.args.device) + + if self.args.past_index >= 0 and self._past is not None: + inputs["mems"] = self._past + + return inputs \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/README.md b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/README.md index b719d09e0b6..ee3c232b1af 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/README.md +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/README.md @@ -9,7 +9,6 @@ This example load LayoutLMv3 model and confirm its accuracy and speed based on [ ```shell pip install neural-compressor pip install -r requirements.txt -bash install_layoutlmft.sh ``` > Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment). diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/funsd.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/funsd.py new file mode 100644 index 00000000000..dbadb70ba4c --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/funsd.py @@ -0,0 +1,136 @@ +# coding=utf-8 +''' +Reference: https://huggingface.co/datasets/nielsr/funsd/blob/main/funsd.py +''' +import json +import os + +import datasets + +from image_utils import load_image, normalize_bbox + + +logger = datasets.logging.get_logger(__name__) + + +_CITATION = """\ +@article{Jaume2019FUNSDAD, + title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, + author={Guillaume Jaume and H. K. Ekenel and J. Thiran}, + journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, + year={2019}, + volume={2}, + pages={1-6} +} +""" + +_DESCRIPTION = """\ +https://guillaumejaume.github.io/FUNSD/ +""" + + +class FunsdConfig(datasets.BuilderConfig): + """BuilderConfig for FUNSD""" + + def __init__(self, **kwargs): + """BuilderConfig for FUNSD. + + Args: + **kwargs: keyword arguments forwarded to super. 
+ """ + super(FunsdConfig, self).__init__(**kwargs) + + +class Funsd(datasets.GeneratorBasedBuilder): + """Conll2003 dataset.""" + + BUILDER_CONFIGS = [ + FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), + "ner_tags": datasets.Sequence( + datasets.features.ClassLabel( + names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"] + ) + ), + "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), + "image_path": datasets.Value("string"), + } + ), + supervised_keys=None, + homepage="https://guillaumejaume.github.io/FUNSD/", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip") + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"} + ), + ] + + def get_line_bbox(self, bboxs): + x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] + y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, len(bboxs[i]), 2)] + + x0, y0, x1, y1 = min(x), min(y), max(x), max(y) + + assert x1 >= x0 and y1 >= y0 + bbox = [[x0, y0, x1, y1] for _ in range(len(bboxs))] + return bbox + + def _generate_examples(self, filepath): + logger.info("⏳ Generating examples from = %s", filepath) + ann_dir = os.path.join(filepath, "annotations") + img_dir = os.path.join(filepath, "images") + for guid, file in enumerate(sorted(os.listdir(ann_dir))): + tokens = [] + bboxes = [] + ner_tags = [] + + file_path = os.path.join(ann_dir, file) + with open(file_path, "r", encoding="utf8") as f: + data = json.load(f) + image_path = os.path.join(img_dir, file) + image_path = image_path.replace("json", "png") + image, size = load_image(image_path) + for item in data["form"]: + cur_line_bboxes = [] + words, label = item["words"], item["label"] + words = [w for w in words if w["text"].strip() != ""] + if len(words) == 0: + continue + if label == "other": + for w in words: + tokens.append(w["text"]) + ner_tags.append("O") + cur_line_bboxes.append(normalize_bbox(w["box"], size)) + else: + tokens.append(words[0]["text"]) + ner_tags.append("B-" + label.upper()) + cur_line_bboxes.append(normalize_bbox(words[0]["box"], size)) + for w in words[1:]: + tokens.append(w["text"]) + ner_tags.append("I-" + label.upper()) + cur_line_bboxes.append(normalize_bbox(w["box"], size)) + # by default: --segment_level_layout 1 + # if do not want to use segment_level_layout, comment the following line + cur_line_bboxes = self.get_line_bbox(cur_line_bboxes) + # box = normalize_bbox(item["box"], size) + # cur_line_bboxes = [box for _ in range(len(words))] + bboxes.extend(cur_line_bboxes) + yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags, + "image": image, "image_path": image_path} \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/image_utils.py 
b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/image_utils.py new file mode 100644 index 00000000000..beeeb5b6ba9 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/image_utils.py @@ -0,0 +1,284 @@ +import torchvision.transforms.functional as F +import warnings +import math +import random +import numpy as np +from PIL import Image +import torch + +from detectron2.data.detection_utils import read_image +from detectron2.data.transforms import ResizeTransform, TransformList + +def normalize_bbox(bbox, size): + return [ + int(1000 * bbox[0] / size[0]), + int(1000 * bbox[1] / size[1]), + int(1000 * bbox[2] / size[0]), + int(1000 * bbox[3] / size[1]), + ] + + +def load_image(image_path): + image = read_image(image_path, format="BGR") + h = image.shape[0] + w = image.shape[1] + img_trans = TransformList([ResizeTransform(h=h, w=w, new_h=224, new_w=224)]) + image = torch.tensor(img_trans.apply_image(image).copy()).permute(2, 0, 1) # copy to make it writeable + return image, (w, h) + + +def crop(image, i, j, h, w, boxes=None): + cropped_image = F.crop(image, i, j, h, w) + + if boxes is not None: + # Currently we cannot use this case since when some boxes is out of the cropped image, + # it may be better to drop out these boxes along with their text input (instead of min or clamp) + # which haven't been implemented here + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = torch.as_tensor(boxes) - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + boxes = cropped_boxes.reshape(-1, 4) + + return cropped_image, boxes + + +def resize(image, size, interpolation, boxes=None): + # It seems that we do not need to resize boxes here, since the boxes will be resized to 1000x1000 finally, + # which is compatible with a square image size of 224x224 + rescaled_image = F.resize(image, size, interpolation) + + if boxes is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + # boxes = boxes.copy() + scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + + return rescaled_image, scaled_boxes + + +def clamp(num, min_value, max_value): + return max(min(num, max_value), min_value) + + +def get_bb(bb, page_size): + bbs = [float(j) for j in bb] + xs, ys = [], [] + for i, b in enumerate(bbs): + if i % 2 == 0: + xs.append(b) + else: + ys.append(b) + (width, height) = page_size + return_bb = [ + clamp(min(xs), 0, width - 1), + clamp(min(ys), 0, height - 1), + clamp(max(xs), 0, width - 1), + clamp(max(ys), 0, height - 1), + ] + return_bb = [ + int(1000 * return_bb[0] / width), + int(1000 * return_bb[1] / height), + int(1000 * return_bb[2] / width), + int(1000 * return_bb[3] / height), + ] + return return_bb + + +class ToNumpy: + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return np_img + + +class ToTensor: + + def __init__(self, dtype=torch.float32): + self.dtype = dtype + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return 
torch.from_numpy(np_img).to(dtype=self.dtype) + + +_pil_interpolation_to_str = { + F.InterpolationMode.NEAREST: 'F.InterpolationMode.NEAREST', + F.InterpolationMode.BILINEAR: 'F.InterpolationMode.BILINEAR', + F.InterpolationMode.BICUBIC: 'F.InterpolationMode.BICUBIC', + F.InterpolationMode.LANCZOS: 'F.InterpolationMode.LANCZOS', + F.InterpolationMode.HAMMING: 'F.InterpolationMode.HAMMING', + F.InterpolationMode.BOX: 'F.InterpolationMode.BOX', +} + + +def _pil_interp(method): + if method == 'bicubic': + return F.InterpolationMode.BICUBIC + elif method == 'lanczos': + return F.InterpolationMode.LANCZOS + elif method == 'hamming': + return F.InterpolationMode.HAMMING + else: + # default bilinear, do we want to allow nearest? + return F.InterpolationMode.BILINEAR + + +class Compose: + """Composes several transforms together. This transform does not support torchscript. + Please, see the note below. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> ]) + + .. note:: + In order to script the transformations, please use ``torch.nn.Sequential`` as below. + + >>> transforms = torch.nn.Sequential( + >>> transforms.CenterCrop(10), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> ) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img, augmentation=False, box=None): + for t in self.transforms: + img = t(img, augmentation, box) + return img + + +class RandomResizedCropAndInterpolationWithTwoPic: + """Crop the given PIL Image to random size and aspect ratio with random interpolation. + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop + is finally resized to given size. + This is popularly used to train the Inception networks. + Args: + size: expected output size of each edge + scale: range of size of the origin size cropped + ratio: range of aspect ratio of the origin aspect ratio cropped + interpolation: Default: PIL.Image.BILINEAR + """ + + def __init__(self, size, second_size=None, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), + interpolation='bilinear', second_interpolation='lanczos'): + if isinstance(size, tuple): + self.size = size + else: + self.size = (size, size) + if second_size is not None: + if isinstance(second_size, tuple): + self.second_size = second_size + else: + self.second_size = (second_size, second_size) + else: + self.second_size = None + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + warnings.warn("range should be of kind (min, max)") + + self.interpolation = _pil_interp(interpolation) + self.second_interpolation = _pil_interp(second_interpolation) + self.scale = scale + self.ratio = ratio + + @staticmethod + def get_params(img, scale, ratio): + """Get parameters for ``crop`` for a random sized crop. + Args: + img (PIL Image): Image to be cropped. + scale (tuple): range of size of the origin size cropped + ratio (tuple): range of aspect ratio of the origin aspect ratio cropped + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. 
+ """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def __call__(self, img, augmentation=False, box=None): + """ + Args: + img (PIL Image): Image to be cropped and resized. + Returns: + PIL Image: Randomly cropped and resized image. + """ + if augmentation: + i, j, h, w = self.get_params(img, self.scale, self.ratio) + img = F.crop(img, i, j, h, w) + # img, box = crop(img, i, j, h, w, box) + img = F.resize(img, self.size, self.interpolation) + second_img = F.resize(img, self.second_size, self.second_interpolation) \ + if self.second_size is not None else None + return img, second_img + + def __repr__(self): + if isinstance(self.interpolation, (tuple, list)): + interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation]) + else: + interpolate_str = _pil_interpolation_to_str[self.interpolation] + format_string = self.__class__.__name__ + '(size={0}'.format(self.size) + format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale)) + format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio)) + format_string += ', interpolation={0}'.format(interpolate_str) + if self.second_size is not None: + format_string += ', second_size={0}'.format(self.second_size) + format_string += ', second_interpolation={0}'.format(_pil_interpolation_to_str[self.second_interpolation]) + format_string += ')' + return format_string + + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/install_layoutlmft.sh b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/install_layoutlmft.sh deleted file mode 100644 index fd29421017d..00000000000 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/install_layoutlmft.sh +++ /dev/null @@ -1,3 +0,0 @@ -git clone https://github.com/microsoft/unilm.git -cd unilm/layoutlmv3 -pip install -e . 
\ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/main.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/main.py index fee932cf5ca..d300f3e2fdc 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/main.py +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/main.py @@ -10,8 +10,6 @@ from datasets import ClassLabel, load_dataset, load_metric import transformers - -from layoutlmft.data import DataCollatorForKeyValueExtraction from transformers import ( AutoConfig, AutoModelForTokenClassification, @@ -28,7 +26,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.5.0") -from layoutlmft.data.image_utils import RandomResizedCropAndInterpolationWithTwoPic, pil_loader, Compose +from image_utils import RandomResizedCropAndInterpolationWithTwoPic, pil_loader, Compose from timm.data.constants import \ IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD @@ -263,11 +261,8 @@ def main(): if data_args.dataset_name == 'funsd': # datasets = load_dataset("nielsr/funsd") - import layoutlmft.data.funsd - datasets = load_dataset(os.path.abspath(layoutlmft.data.funsd.__file__), cache_dir=model_args.cache_dir) - elif data_args.dataset_name == 'cord': - import layoutlmft.data.cord - datasets = load_dataset(os.path.abspath(layoutlmft.data.cord.__file__), cache_dir=model_args.cache_dir) + import funsd + datasets = load_dataset(os.path.abspath(funsd.__file__), cache_dir=model_args.cache_dir) else: raise NotImplementedError() @@ -275,6 +270,7 @@ def main(): features = datasets["test"].features text_column_name = "words" if "words" in column_names else "tokens" + boxes_column_name = "bboxes" label_column_name = ( f"{data_args.task_name}_tags" if f"{data_args.task_name}_tags" in column_names else column_names[1] @@ -362,8 +358,7 @@ def tokenize_and_align_labels(examples, augmentation=False): padding=False, truncation=True, return_overflowing_tokens=True, - # We use this argument because the texts in our dataset are lists of words (with a label for each word). 
- is_split_into_words=True, + boxes=examples[boxes_column_name], ) labels = [] diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/requirements.txt b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/requirements.txt index 92ae07d8600..0f7947499f6 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/requirements.txt +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_dynamic/requirements.txt @@ -1,10 +1,10 @@ +accelerate datasets -transformers==4.12.5 -seqeval==1.2.2 -tensorboard==2.7.0 -seqeval==1.2.2 +transformers +seqeval +tensorboard sentencepiece -timm==0.4.12 +timm Pillow einops textdistance diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/README.md b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/README.md index fbc132a921a..3e05e57d35a 100644 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/README.md +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/README.md @@ -9,7 +9,6 @@ This example load LayoutLMv3 model and confirm its accuracy and speed based on [ ```shell pip install neural-compressor pip install -r requirements.txt -bash install_layoutlmft.sh ``` > Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment). diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/funsd.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/funsd.py new file mode 100644 index 00000000000..dbadb70ba4c --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/funsd.py @@ -0,0 +1,136 @@ +# coding=utf-8 +''' +Reference: https://huggingface.co/datasets/nielsr/funsd/blob/main/funsd.py +''' +import json +import os + +import datasets + +from image_utils import load_image, normalize_bbox + + +logger = datasets.logging.get_logger(__name__) + + +_CITATION = """\ +@article{Jaume2019FUNSDAD, + title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, + author={Guillaume Jaume and H. K. Ekenel and J. Thiran}, + journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, + year={2019}, + volume={2}, + pages={1-6} +} +""" + +_DESCRIPTION = """\ +https://guillaumejaume.github.io/FUNSD/ +""" + + +class FunsdConfig(datasets.BuilderConfig): + """BuilderConfig for FUNSD""" + + def __init__(self, **kwargs): + """BuilderConfig for FUNSD. + + Args: + **kwargs: keyword arguments forwarded to super. 
+ """ + super(FunsdConfig, self).__init__(**kwargs) + + +class Funsd(datasets.GeneratorBasedBuilder): + """Conll2003 dataset.""" + + BUILDER_CONFIGS = [ + FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), + "ner_tags": datasets.Sequence( + datasets.features.ClassLabel( + names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"] + ) + ), + "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), + "image_path": datasets.Value("string"), + } + ), + supervised_keys=None, + homepage="https://guillaumejaume.github.io/FUNSD/", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip") + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"} + ), + ] + + def get_line_bbox(self, bboxs): + x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] + y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, len(bboxs[i]), 2)] + + x0, y0, x1, y1 = min(x), min(y), max(x), max(y) + + assert x1 >= x0 and y1 >= y0 + bbox = [[x0, y0, x1, y1] for _ in range(len(bboxs))] + return bbox + + def _generate_examples(self, filepath): + logger.info("⏳ Generating examples from = %s", filepath) + ann_dir = os.path.join(filepath, "annotations") + img_dir = os.path.join(filepath, "images") + for guid, file in enumerate(sorted(os.listdir(ann_dir))): + tokens = [] + bboxes = [] + ner_tags = [] + + file_path = os.path.join(ann_dir, file) + with open(file_path, "r", encoding="utf8") as f: + data = json.load(f) + image_path = os.path.join(img_dir, file) + image_path = image_path.replace("json", "png") + image, size = load_image(image_path) + for item in data["form"]: + cur_line_bboxes = [] + words, label = item["words"], item["label"] + words = [w for w in words if w["text"].strip() != ""] + if len(words) == 0: + continue + if label == "other": + for w in words: + tokens.append(w["text"]) + ner_tags.append("O") + cur_line_bboxes.append(normalize_bbox(w["box"], size)) + else: + tokens.append(words[0]["text"]) + ner_tags.append("B-" + label.upper()) + cur_line_bboxes.append(normalize_bbox(words[0]["box"], size)) + for w in words[1:]: + tokens.append(w["text"]) + ner_tags.append("I-" + label.upper()) + cur_line_bboxes.append(normalize_bbox(w["box"], size)) + # by default: --segment_level_layout 1 + # if do not want to use segment_level_layout, comment the following line + cur_line_bboxes = self.get_line_bbox(cur_line_bboxes) + # box = normalize_bbox(item["box"], size) + # cur_line_bboxes = [box for _ in range(len(words))] + bboxes.extend(cur_line_bboxes) + yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags, + "image": image, "image_path": image_path} \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/image_utils.py 
b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/image_utils.py new file mode 100644 index 00000000000..beeeb5b6ba9 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/image_utils.py @@ -0,0 +1,284 @@ +import torchvision.transforms.functional as F +import warnings +import math +import random +import numpy as np +from PIL import Image +import torch + +from detectron2.data.detection_utils import read_image +from detectron2.data.transforms import ResizeTransform, TransformList + +def normalize_bbox(bbox, size): + return [ + int(1000 * bbox[0] / size[0]), + int(1000 * bbox[1] / size[1]), + int(1000 * bbox[2] / size[0]), + int(1000 * bbox[3] / size[1]), + ] + + +def load_image(image_path): + image = read_image(image_path, format="BGR") + h = image.shape[0] + w = image.shape[1] + img_trans = TransformList([ResizeTransform(h=h, w=w, new_h=224, new_w=224)]) + image = torch.tensor(img_trans.apply_image(image).copy()).permute(2, 0, 1) # copy to make it writeable + return image, (w, h) + + +def crop(image, i, j, h, w, boxes=None): + cropped_image = F.crop(image, i, j, h, w) + + if boxes is not None: + # Currently we cannot use this case since when some boxes is out of the cropped image, + # it may be better to drop out these boxes along with their text input (instead of min or clamp) + # which haven't been implemented here + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = torch.as_tensor(boxes) - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + boxes = cropped_boxes.reshape(-1, 4) + + return cropped_image, boxes + + +def resize(image, size, interpolation, boxes=None): + # It seems that we do not need to resize boxes here, since the boxes will be resized to 1000x1000 finally, + # which is compatible with a square image size of 224x224 + rescaled_image = F.resize(image, size, interpolation) + + if boxes is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + # boxes = boxes.copy() + scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + + return rescaled_image, scaled_boxes + + +def clamp(num, min_value, max_value): + return max(min(num, max_value), min_value) + + +def get_bb(bb, page_size): + bbs = [float(j) for j in bb] + xs, ys = [], [] + for i, b in enumerate(bbs): + if i % 2 == 0: + xs.append(b) + else: + ys.append(b) + (width, height) = page_size + return_bb = [ + clamp(min(xs), 0, width - 1), + clamp(min(ys), 0, height - 1), + clamp(max(xs), 0, width - 1), + clamp(max(ys), 0, height - 1), + ] + return_bb = [ + int(1000 * return_bb[0] / width), + int(1000 * return_bb[1] / height), + int(1000 * return_bb[2] / width), + int(1000 * return_bb[3] / height), + ] + return return_bb + + +class ToNumpy: + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return np_img + + +class ToTensor: + + def __init__(self, dtype=torch.float32): + self.dtype = dtype + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return 
+
+
+_pil_interpolation_to_str = {
+    F.InterpolationMode.NEAREST: 'F.InterpolationMode.NEAREST',
+    F.InterpolationMode.BILINEAR: 'F.InterpolationMode.BILINEAR',
+    F.InterpolationMode.BICUBIC: 'F.InterpolationMode.BICUBIC',
+    F.InterpolationMode.LANCZOS: 'F.InterpolationMode.LANCZOS',
+    F.InterpolationMode.HAMMING: 'F.InterpolationMode.HAMMING',
+    F.InterpolationMode.BOX: 'F.InterpolationMode.BOX',
+}
+
+
+def _pil_interp(method):
+    if method == 'bicubic':
+        return F.InterpolationMode.BICUBIC
+    elif method == 'lanczos':
+        return F.InterpolationMode.LANCZOS
+    elif method == 'hamming':
+        return F.InterpolationMode.HAMMING
+    else:
+        # default bilinear, do we want to allow nearest?
+        return F.InterpolationMode.BILINEAR
+
+
+class Compose:
+    """Composes several transforms together. This transform does not support torchscript.
+    Please, see the note below.
+
+    Args:
+        transforms (list of ``Transform`` objects): list of transforms to compose.
+
+    Example:
+        >>> transforms.Compose([
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.PILToTensor(),
+        >>>     transforms.ConvertImageDtype(torch.float),
+        >>> ])
+
+    .. note::
+        In order to script the transformations, please use ``torch.nn.Sequential`` as below.
+
+        >>> transforms = torch.nn.Sequential(
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        >>> )
+        >>> scripted_transforms = torch.jit.script(transforms)
+
+        Make sure to use only scriptable transformations, i.e. ones that work with ``torch.Tensor`` and do not
+        require `lambda` functions or ``PIL.Image``.
+
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, img, augmentation=False, box=None):
+        for t in self.transforms:
+            img = t(img, augmentation, box)
+        return img
+
+
+class RandomResizedCropAndInterpolationWithTwoPic:
+    """Crop the given PIL Image to random size and aspect ratio with random interpolation.
+    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
+    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
+    is finally resized to the given size.
+    This is popularly used to train the Inception networks.
+    Args:
+        size: expected output size of each edge
+        scale: range of size of the origin size cropped
+        ratio: range of aspect ratio of the origin aspect ratio cropped
+        interpolation: Default: PIL.Image.BILINEAR
+    """
+
+    def __init__(self, size, second_size=None, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.),
+                 interpolation='bilinear', second_interpolation='lanczos'):
+        if isinstance(size, tuple):
+            self.size = size
+        else:
+            self.size = (size, size)
+        if second_size is not None:
+            if isinstance(second_size, tuple):
+                self.second_size = second_size
+            else:
+                self.second_size = (second_size, second_size)
+        else:
+            self.second_size = None
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+            warnings.warn("range should be of kind (min, max)")
+
+        self.interpolation = _pil_interp(interpolation)
+        self.second_interpolation = _pil_interp(second_interpolation)
+        self.scale = scale
+        self.ratio = ratio
+
+    @staticmethod
+    def get_params(img, scale, ratio):
+        """Get parameters for ``crop`` for a random sized crop.
+        Args:
+            img (PIL Image): Image to be cropped.
+            scale (tuple): range of size of the origin size cropped
+            ratio (tuple): range of aspect ratio of the origin aspect ratio cropped
+        Returns:
+            tuple: params (i, j, h, w) to be passed to ``crop`` for a random
+                sized crop.
+ """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def __call__(self, img, augmentation=False, box=None): + """ + Args: + img (PIL Image): Image to be cropped and resized. + Returns: + PIL Image: Randomly cropped and resized image. + """ + if augmentation: + i, j, h, w = self.get_params(img, self.scale, self.ratio) + img = F.crop(img, i, j, h, w) + # img, box = crop(img, i, j, h, w, box) + img = F.resize(img, self.size, self.interpolation) + second_img = F.resize(img, self.second_size, self.second_interpolation) \ + if self.second_size is not None else None + return img, second_img + + def __repr__(self): + if isinstance(self.interpolation, (tuple, list)): + interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation]) + else: + interpolate_str = _pil_interpolation_to_str[self.interpolation] + format_string = self.__class__.__name__ + '(size={0}'.format(self.size) + format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale)) + format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio)) + format_string += ', interpolation={0}'.format(interpolate_str) + if self.second_size is not None: + format_string += ', second_size={0}'.format(self.second_size) + format_string += ', second_interpolation={0}'.format(_pil_interpolation_to_str[self.second_interpolation]) + format_string += ')' + return format_string + + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/install_layoutlmft.sh b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/install_layoutlmft.sh deleted file mode 100644 index fd29421017d..00000000000 --- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/install_layoutlmft.sh +++ /dev/null @@ -1,3 +0,0 @@ -git clone https://github.com/microsoft/unilm.git -cd unilm/layoutlmv3 -pip install -e . 
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/main.py
index a9496b16fbb..b9462cfe067 100644
--- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/main.py
@@ -10,8 +10,6 @@
 from datasets import ClassLabel, load_dataset, load_metric
 
 import transformers
-
-from layoutlmft.data import DataCollatorForKeyValueExtraction
 from transformers import (
     AutoConfig,
     AutoModelForTokenClassification,
@@ -28,7 +26,7 @@
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.5.0")
 
-from layoutlmft.data.image_utils import RandomResizedCropAndInterpolationWithTwoPic, pil_loader, Compose
+from image_utils import RandomResizedCropAndInterpolationWithTwoPic, pil_loader, Compose
 
 from timm.data.constants import \
     IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
@@ -267,11 +265,8 @@ def main():
 
     if data_args.dataset_name == 'funsd':
         # datasets = load_dataset("nielsr/funsd")
-        import layoutlmft.data.funsd
-        datasets = load_dataset(os.path.abspath(layoutlmft.data.funsd.__file__), cache_dir=model_args.cache_dir)
-    elif data_args.dataset_name == 'cord':
-        import layoutlmft.data.cord
-        datasets = load_dataset(os.path.abspath(layoutlmft.data.cord.__file__), cache_dir=model_args.cache_dir)
+        import funsd
+        datasets = load_dataset(os.path.abspath(funsd.__file__), cache_dir=model_args.cache_dir)
     else:
         raise NotImplementedError()
 
@@ -279,6 +274,7 @@ def main():
     features = datasets["test"].features
 
     text_column_name = "words" if "words" in column_names else "tokens"
+    boxes_column_name = "bboxes"
     label_column_name = (
         f"{data_args.task_name}_tags" if f"{data_args.task_name}_tags" in column_names else column_names[1]
     )
@@ -366,8 +362,7 @@ def tokenize_and_align_labels(examples, augmentation=False):
         padding=False,
         truncation=True,
         return_overflowing_tokens=True,
-        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        boxes=examples[boxes_column_name],
     )
 
     labels = []
diff --git a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/requirements.txt b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/requirements.txt
index 92ae07d8600..0f7947499f6 100644
--- a/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/requirements.txt
+++ b/examples/onnxrt/nlp/huggingface_model/token_classification/layoutlmv3/quantization/ptq_static/requirements.txt
@@ -1,10 +1,10 @@
+accelerate
 datasets
-transformers==4.12.5
-seqeval==1.2.2
-tensorboard==2.7.0
-seqeval==1.2.2
+transformers
+seqeval
+tensorboard
 sentencepiece
-timm==0.4.12
+timm
 Pillow
 einops
 textdistance
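For reference, a minimal sketch of how main.py consumes the relocated `funsd.py` loading script and the local `image_utils.py` after this change; the standalone snippet below is illustrative only and is not part of the patch (it assumes the files sit in the same working directory and that your `datasets` version accepts a local loading-script path, as main.py already does).

```python
# Hypothetical standalone check, assuming funsd.py and image_utils.py from this patch
# are importable from the current directory.
import os

from datasets import load_dataset

import funsd  # local dataset loading script that replaces layoutlmft.data.funsd

# Mirrors the load_dataset call in main.py for --dataset_name funsd.
datasets = load_dataset(os.path.abspath(funsd.__file__))

sample = datasets["test"][0]
# Each example carries word-level tokens, segment-level bboxes already normalized
# to the 0-1000 range, BIO ner_tags encoded as ClassLabel ids, and a 3x224x224
# uint8 page image plus its path (produced by load_image in funsd.py).
print(sample["tokens"][:5], sample["bboxes"][:5], sample["ner_tags"][:5])
```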