diff --git a/docs/3x/PT_MXQuant.md b/docs/3x/PT_MXQuant.md index 1cfb17ff30b..42e12d039a6 100644 --- a/docs/3x/PT_MXQuant.md +++ b/docs/3x/PT_MXQuant.md @@ -95,7 +95,7 @@ user_model = convert(model=user_model) ## Examples -- PyTorch [huggingface models](/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx) +- PyTorch [huggingface models](/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant) ## Reference diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 8520a9545b0..3a21f42bd20 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -1,46 +1,53 @@ -{ - "pytorch": { - "gpt_j_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "gpt_j_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "opt_125m_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - } - } -} \ No newline at end of file +{ + "pytorch": { + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "dlrm_ipex": { + "model_src_dir": "recommendation/dlrm/static_quant/ipex", + 
"dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", + "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", + "main_script": "dlrm_s_pytorch.py", + "batch_size": 16384 + } + } +} diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/README.md similarity index 97% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/README.md rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/README.md index 6608cbcf726..e61d5a64ade 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/README.md @@ -1,6 +1,7 @@ # Run ## Run WOQ MX FP4 model + ``` python python run_clm_no_trainer.py --model [model_name_or_id] --quantize --accuracy --tasks lambada_openai --w_dtype fp4 --woq -``` \ No newline at end of file +``` diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt similarity index 100% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/requirements.txt rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py similarity index 100% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/run_clm_no_trainer.py rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CODE_OF_CONDUCT.md b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CODE_OF_CONDUCT.md new file mode 100644 index 00000000000..0f7ad8bfc17 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Code of Conduct + +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. +Please read the [full text](https://code.fb.com/codeofconduct/) +so that you can understand what actions will and will not be tolerated. diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CONTRIBUTING.md b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CONTRIBUTING.md new file mode 100644 index 00000000000..cc013a17ec8 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CONTRIBUTING.md @@ -0,0 +1,36 @@ +# Contributing to DLRM +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). 
+ +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## Coding Style +* 4 spaces for indentation rather than tabs +* 80 character line length +* in general, please maintain a consistent style with the rest of the code + +## License +By contributing to DLRM, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/LICENSE b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/LICENSE new file mode 100644 index 00000000000..b96dcb0480a --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Facebook, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md new file mode 100644 index 00000000000..918cc1edc23 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md @@ -0,0 +1,90 @@ +Step-by-Step +============ + +This document is used to list steps of reproducing PyTorch DLRM tuning zoo result. and original DLRM README is in [DLRM README](https://github.com/facebookresearch/dlrm/blob/master/README.md) + +> **Note** +> +> Please ensure your PC have >370G memory to run DLRM +> IPEX version >= 1.11 + +# Prerequisite + +### 1. Environment + +PyTorch 1.11 or higher version is needed with pytorch_fx backend. + + ```shell + # Install dependency + cd examples/pytorch/recommendation/dlrm/quantization/ptq/ipex + pip install -r requirements.txt + ``` +> Note: Validated PyTorch [Version](/docs/source/installation_guide.md#validated-software-environment). + +### 2. Prepare Dataset + + The code supports interface with the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) + + 1. 
download the raw data files day_0.gz, ...,day_23.gz and unzip them. + 2. Specify the location of the unzipped text files day_0, ...,day_23, using --raw-data-file= (the day number will be appended automatically), please refer "Run" command. + +### 3. Prepare pretrained model + + Download the DLRM PyTorch weights (`tb00_40M.pt`, 90GB) from the +[MLPerf repo](https://github.com/mlcommons/inference/tree/master/recommendation/dlrm/pytorch#more-information-about-the-model-weights) + +# Run +### tune with INC + ```shell + cd examples/pytorch/recommendation/dlrm/quantization/ptq/ipex + bash run_quant.sh --input_model="/path/of/pretrained/model" --dataset_location="/path/of/dataset" + ``` + +### benchmark +```shell +bash run_benchmark.sh --input_model="/path/of/pretrained/model" --dataset_location="/path/of/dataset" --mode=accuracy --int8=true +``` + + +Examples of enabling IntelĀ® Neural Compressor +========================= + +This is a tutorial of how to enable DLRM model with IntelĀ® Neural Compressor. + + +### Code update + +We need update dlrm_s_pytorch.py like below + +```python +# evaluation +def eval_func(model): + args.int8 = model.is_quantized + with torch.no_grad(): + return inference( + args, + model, + best_acc_test, + best_auc_test, + test_ld, + trace=args.int8 + ) + +# calibration +def calib_fn(model): + calib_number = 0 + for X_test, lS_o_test, lS_i_test, T in train_ld: + if calib_number < 102400: + model(X_test, lS_o_test, lS_i_test) + calib_number += 1 + +from neural_compressor.torch.quantization import SmoothQuantConfig, autotune, TuningConfig +tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning()) +dlrm = autotune( + dlrm, + tune_config=tune_config, + eval_fn=eval_func, + run_fn=calib_fn, +) +dlrm.save("saved_results") +``` diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_loader_terabyte.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_loader_terabyte.py new file mode 100644 index 00000000000..5bc0c4d3aab --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_loader_terabyte.py @@ -0,0 +1,388 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
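+# Module overview (descriptive comment added for readability): this file
+# provides the Criteo Terabyte readers used by the DLRM example -- a streaming
+# `DataLoader` over the per-day "*_reordered.npz" files, a binary-file
+# `CriteoBinDataset`, and `numpy_to_binary`, which packs the per-day .npz
+# files into the single binary file that `CriteoBinDataset` consumes.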
+ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import numpy as np +from torch.utils.data import Dataset +import torch +import time +import math +from tqdm import tqdm +import argparse +import extend_distributed as ext_dist + + +class DataLoader: + """ + DataLoader dedicated for the Criteo Terabyte Click Logs dataset + """ + + def __init__( + self, + data_filename, + data_directory, + days, + batch_size, + max_ind_range=-1, + split="train", + drop_last_batch=False + ): + self.data_filename = data_filename + self.data_directory = data_directory + self.days = days + self.batch_size = batch_size + self.max_ind_range = max_ind_range + + total_file = os.path.join( + data_directory, + data_filename + "_day_count.npz" + ) + with np.load(total_file) as data: + total_per_file = data["total_per_file"][np.array(days)] + + self.length = sum(total_per_file) + if split == "test" or split == "val": + self.length = int(np.ceil(self.length / 2.)) + self.split = split + self.drop_last_batch = drop_last_batch + + def __iter__(self): + return iter( + _batch_generator( + self.data_filename, self.data_directory, self.days, + self.batch_size, self.split, self.drop_last_batch, self.max_ind_range + ) + ) + + def __len__(self): + if self.drop_last_batch: + return self.length // self.batch_size + else: + return math.ceil(self.length / self.batch_size) + + +def _transform_features( + x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False +): + if max_ind_range > 0: + x_cat_batch = x_cat_batch % max_ind_range + + if flag_input_torch_tensor: + x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1) + x_cat_batch = x_cat_batch.clone().detach().type(torch.long) + y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1) + else: + x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1) + x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long) + y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1) + + batch_size = x_cat_batch.shape[0] + feature_count = x_cat_batch.shape[1] + lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1) + + return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1) + + +def _batch_generator( + data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range +): + previous_file = None + for day in days: + filepath = os.path.join( + data_directory, + data_filename + "_{}_reordered.npz".format(day) + ) + + # print('Loading file: ', filepath) + with np.load(filepath) as data: + x_int = data["X_int"] + x_cat = data["X_cat"] + y = data["y"] + + samples_in_file = y.shape[0] + batch_start_idx = 0 + if split == "test" or split == "val": + length = int(np.ceil(samples_in_file / 2.)) + if split == "test": + samples_in_file = length + elif split == "val": + batch_start_idx = samples_in_file - length + + while batch_start_idx < samples_in_file - batch_size: + + missing_samples = batch_size + if previous_file is not None: + missing_samples -= previous_file['y'].shape[0] + + current_slice = slice(batch_start_idx, batch_start_idx + missing_samples) + + x_int_batch = x_int[current_slice] + x_cat_batch = x_cat[current_slice] + y_batch = y[current_slice] + + if previous_file is not None: + x_int_batch = np.concatenate( + [previous_file['x_int'], x_int_batch], + axis=0 + ) + x_cat_batch = np.concatenate( + [previous_file['x_cat'], x_cat_batch], + axis=0 + ) + y_batch = np.concatenate([previous_file['y'], y_batch], axis=0) + previous_file = 
None + + if x_int_batch.shape[0] != batch_size: + raise ValueError('should not happen') + + yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range) + + batch_start_idx += missing_samples + if batch_start_idx != samples_in_file: + current_slice = slice(batch_start_idx, samples_in_file) + if previous_file is not None: + previous_file = { + 'x_int' : np.concatenate( + [previous_file['x_int'], x_int[current_slice]], + axis=0 + ), + 'x_cat' : np.concatenate( + [previous_file['x_cat'], x_cat[current_slice]], + axis=0 + ), + 'y' : np.concatenate([previous_file['y'], y[current_slice]], axis=0) + } + else: + previous_file = { + 'x_int' : x_int[current_slice], + 'x_cat' : x_cat[current_slice], + 'y' : y[current_slice] + } + + if not drop_last: + yield _transform_features( + previous_file['x_int'], + previous_file['x_cat'], + previous_file['y'], + max_ind_range + ) + + +def _test(): + generator = _batch_generator( + data_filename='day', + data_directory='./input', + days=range(23), + split="train", + batch_size=2048, + drop_last=True, + max_ind_range=-1 + ) + t1 = time.time() + for x_int, lS_o, x_cat, y in generator: + t2 = time.time() + time_diff = t2 - t1 + t1 = t2 + print( + "time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format( + time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape + ) + ) + + +class CriteoBinDataset(Dataset): + """Binary version of criteo dataset.""" + + def __init__(self, data_file, counts_file, + batch_size=1, max_ind_range=-1, bytes_per_feature=4): + # dataset + self.tar_fea = 1 # single target + self.den_fea = 13 # 13 dense features + self.spa_fea = 26 # 26 sparse features + self.tad_fea = self.tar_fea + self.den_fea + self.tot_fea = self.tad_fea + self.spa_fea + + self.batch_size = batch_size + self.max_ind_range = max_ind_range + self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size) + + self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry) + + data_file_size = os.path.getsize(data_file) + bytes_per_sample = bytes_per_feature * self.tot_fea + if ext_dist.my_size > 1: + self.bytes_per_rank = self.bytes_per_entry // ext_dist.my_size + else: + self.bytes_per_rank = self.bytes_per_entry + + if ext_dist.my_size > 1 and self.num_entries * self.bytes_per_entry > data_file_size: + last_batch = (data_file_size % self.bytes_per_entry) // bytes_per_sample + self.bytes_last_batch = last_batch // ext_dist.my_size * bytes_per_sample + else: + self.bytes_last_batch = self.bytes_per_rank + + if self.bytes_last_batch == 0: + self.num_entries = self.num_entries - 1 + self.bytes_last_batch = self.bytes_per_rank + + print('data file:', data_file, 'number of batches:', self.num_entries) + self.file = open(data_file, 'rb') + + with np.load(counts_file) as data: + self.counts = data["counts"] + + # hardcoded for now + self.m_den = 13 + + def __len__(self): + return self.num_entries + + def __getitem__(self, idx): + my_rank = ext_dist.dist.get_rank() if ext_dist.my_size > 1 else 0 + rank_size = self.bytes_last_batch if idx == (self.num_entries - 1) else self.bytes_per_rank + self.file.seek(idx * self.bytes_per_entry + rank_size * my_rank, 0) + raw_data = self.file.read(rank_size) + array = np.frombuffer(raw_data, dtype=np.int32) + tensor = torch.from_numpy(array).view((-1, self.tot_fea)) + + return _transform_features(x_int_batch=tensor[:, 1:14], + x_cat_batch=tensor[:, 14:], + y_batch=tensor[:, 0], + max_ind_range=self.max_ind_range, + flag_input_torch_tensor=True) + + def __del__(self): + self.file.close() + 
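+# Usage sketch for the binary reader above (illustrative only; the file paths,
+# batch size, and helper name below are placeholder assumptions, not part of
+# the DLRM benchmark interface). Because `CriteoBinDataset.__getitem__` already
+# returns a full batch, it is wrapped in a DataLoader with batch_size=None,
+# mirroring the `_test_bin` routine further down.
+def _example_binary_loader(data_file, counts_file, batch_size=2048):
+    dataset = CriteoBinDataset(data_file=data_file,
+                               counts_file=counts_file,
+                               batch_size=batch_size)
+    loader = torch.utils.data.DataLoader(dataset,
+                                         batch_size=None,  # batches are pre-formed
+                                         shuffle=False,
+                                         num_workers=0)
+    for x_int, lS_o, x_cat, y in loader:
+        # dense features, sparse offsets, sparse indices, click labels
+        return x_int, lS_o, x_cat, y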
+ +def numpy_to_binary(input_files, output_file_path, split='train'): + """Convert the data to a binary format to be read with CriteoBinDataset.""" + + # WARNING - both categorical and numerical data must fit into int32 for + # the following code to work correctly + + with open(output_file_path, 'wb') as output_file: + if split == 'train': + for input_file in input_files: + print('Processing file: ', input_file) + + np_data = np.load(input_file) + np_data = np.concatenate([np_data['y'].reshape(-1, 1), + np_data['X_int'], + np_data['X_cat']], axis=1) + np_data = np_data.astype(np.int32) + + output_file.write(np_data.tobytes()) + else: + assert len(input_files) == 1 + np_data = np.load(input_files[0]) + np_data = np.concatenate([np_data['y'].reshape(-1, 1), + np_data['X_int'], + np_data['X_cat']], axis=1) + np_data = np_data.astype(np.int32) + + samples_in_file = np_data.shape[0] + midpoint = int(np.ceil(samples_in_file / 2.)) + if split == "test": + begin = 0 + end = midpoint + elif split == "val": + begin = midpoint + end = samples_in_file + else: + raise ValueError('Unknown split value: ', split) + + output_file.write(np_data[begin:end].tobytes()) + + +def _preprocess(args): + train_files = ['{}_{}_reordered.npz'.format(args.input_data_prefix, day) for + day in range(0, 23)] + + test_valid_file = args.input_data_prefix + '_23_reordered.npz' + + os.makedirs(args.output_directory, exist_ok=True) + for split in ['train', 'val', 'test']: + print('Running preprocessing for split =', split) + + output_file = os.path.join(args.output_directory, + '{}_data.bin'.format(split)) + + input_files = train_files if split == 'train' else [test_valid_file] + numpy_to_binary(input_files=input_files, + output_file_path=output_file, + split=split) + + +def _test_bin(): + parser = argparse.ArgumentParser() + parser.add_argument('--output_directory', required=True) + parser.add_argument('--input_data_prefix', required=True) + parser.add_argument('--split', choices=['train', 'test', 'val'], + required=True) + args = parser.parse_args() + + _preprocess(args) + + binary_data_file = os.path.join(args.output_directory, + '{}_data.bin'.format(args.split)) + + counts_file = os.path.join(args.output_directory, 'day_fea_count.npz') + dataset_binary = CriteoBinDataset(data_file=binary_data_file, + counts_file=counts_file, + batch_size=2048,) + from dlrm_data_pytorch import CriteoDataset + from dlrm_data_pytorch import collate_wrapper_criteo_offset as collate_wrapper_criteo + + binary_loader = torch.utils.data.DataLoader( + dataset_binary, + batch_size=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + ) + + original_dataset = CriteoDataset( + dataset='terabyte', + max_ind_range=10 * 1000 * 1000, + sub_sample_rate=1, + randomize=True, + split=args.split, + raw_path=args.input_data_prefix, + pro_data='dummy_string', + memory_map=True + ) + + original_loader = torch.utils.data.DataLoader( + original_dataset, + batch_size=2048, + shuffle=False, + num_workers=0, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, + ) + + assert len(dataset_binary) == len(original_loader) + for i, (old_batch, new_batch) in tqdm(enumerate(zip(original_loader, + binary_loader)), + total=len(dataset_binary)): + + for j in range(len(new_batch)): + if not np.array_equal(old_batch[j], new_batch[j]): + raise ValueError('FAILED: Datasets not equal') + if i > len(dataset_binary): + break + print('PASSED') + + +if __name__ == '__main__': + _test() + _test_bin() diff --git 
a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_utils.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_utils.py new file mode 100644 index 00000000000..6ceef9517df --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_utils.py @@ -0,0 +1,1292 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: generate inputs and targets for the DLRM benchmark +# +# Utility function(s) to download and pre-process public data sets +# - Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# - Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs +# +# After downloading dataset, run: +# getCriteoAdData( +# datafile="", +# o_filename=kaggleAdDisplayChallenge_processed.npz, +# max_ind_range=-1, +# sub_sample_rate=0.0, +# days=7, +# data_split='train', +# randomize='total', +# criteo_kaggle=True, +# memory_map=False +# ) +# getCriteoAdData( +# datafile="", +# o_filename=terabyte_processed.npz, +# max_ind_range=-1, +# sub_sample_rate=0.0, +# days=24, +# data_split='train', +# randomize='total', +# criteo_kaggle=False, +# memory_map=False +# ) + +from __future__ import absolute_import, division, print_function, unicode_literals + +import sys +# import os +from os import path +from multiprocessing import Process, Manager +# import io +# from io import StringIO +# import collections as coll + +import numpy as np + + +def convertUStringToDistinctIntsDict(mat, convertDicts, counts): + # Converts matrix of unicode strings into distinct integers. 
+ # + # Inputs: + # mat (np.array): array of unicode strings to convert + # convertDicts (list): dictionary for each column + # counts (list): number of different categories in each column + # + # Outputs: + # out (np.array): array of output integers + # convertDicts (list): dictionary for each column + # counts (list): number of different categories in each column + + # check if convertDicts and counts match correct length of mat + if len(convertDicts) != mat.shape[1] or len(counts) != mat.shape[1]: + print("Length of convertDicts or counts does not match input shape") + print("Generating convertDicts and counts...") + + convertDicts = [{} for _ in range(mat.shape[1])] + counts = [0 for _ in range(mat.shape[1])] + + # initialize output + out = np.zeros(mat.shape) + + for j in range(mat.shape[1]): + for i in range(mat.shape[0]): + # add to convertDict and increment count + if mat[i, j] not in convertDicts[j]: + convertDicts[j][mat[i, j]] = counts[j] + counts[j] += 1 + out[i, j] = convertDicts[j][mat[i, j]] + + return out, convertDicts, counts + + +def convertUStringToDistinctIntsUnique(mat, mat_uni, counts): + # mat is an array of 0,...,# samples, with each being 26 categorical features + + # check if mat_unique and counts match correct length of mat + if len(mat_uni) != mat.shape[1] or len(counts) != mat.shape[1]: + print("Length of mat_unique or counts does not match input shape") + print("Generating mat_unique and counts...") + + mat_uni = [np.array([]) for _ in range(mat.shape[1])] + counts = [0 for _ in range(mat.shape[1])] + + # initialize output + out = np.zeros(mat.shape) + ind_map = [np.array([]) for _ in range(mat.shape[1])] + + # find out and assign unique ids to features + for j in range(mat.shape[1]): + m = mat_uni[j].size + mat_concat = np.concatenate((mat_uni[j], mat[:, j])) + mat_uni[j], ind_map[j] = np.unique(mat_concat, return_inverse=True) + out[:, j] = ind_map[j][m:] + counts[j] = mat_uni[j].size + + return out, mat_uni, counts + + +def processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, pre_comp_counts): + # Process Kaggle Display Advertising Challenge or Terabyte Dataset + # by converting unicode strings in X_cat to integers and + # converting negative integer values in X_int. + # + # Loads data in the form "{kaggle|terabyte}_day_i.npz" where i is the day. 
+ # + # Inputs: + # d_path (str): path for {kaggle|terabyte}_day_i.npz files + # i (int): splits in the dataset (typically 0 to 7 or 0 to 24) + + # process data if not all files exist + filename_i = npzfile + "_{0}_processed.npz".format(i) + + if path.exists(filename_i): + print("Using existing " + filename_i, end="\n") + else: + print("Not existing " + filename_i) + with np.load(npzfile + "_{0}.npz".format(i)) as data: + # categorical features + ''' + # Approach 1a: using empty dictionaries + X_cat, convertDicts, counts = convertUStringToDistinctIntsDict( + data["X_cat"], convertDicts, counts + ) + ''' + ''' + # Approach 1b: using empty np.unique + X_cat, convertDicts, counts = convertUStringToDistinctIntsUnique( + data["X_cat"], convertDicts, counts + ) + ''' + # Approach 2a: using pre-computed dictionaries + X_cat_t = np.zeros(data["X_cat_t"].shape) + for j in range(26): + for k, x in enumerate(data["X_cat_t"][j, :]): + X_cat_t[j, k] = convertDicts[j][x] + # continuous features + X_int = data["X_int"] + X_int[X_int < 0] = 0 + # targets + y = data["y"] + + np.savez_compressed( + filename_i, + # X_cat = X_cat, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=X_int, + y=y, + ) + print("Processed " + filename_i, end="\n") + # sanity check (applicable only if counts have been pre-computed & are re-computed) + # for j in range(26): + # if pre_comp_counts[j] != counts[j]: + # sys.exit("ERROR: Sanity check on counts has failed") + # print("\nSanity check on counts passed") + + return + + +def concatCriteoAdData( + d_path, + d_file, + npzfile, + trafile, + days, + data_split, + randomize, + total_per_file, + total_count, + memory_map, + o_filename +): + # Concatenates different days and saves the result. + # + # Inputs: + # days (int): total number of days in the dataset (typically 7 or 24) + # d_path (str): path for {kaggle|terabyte}_day_i.npz files + # o_filename (str): output file name + # + # Output: + # o_file (str): output file path + + if memory_map: + # dataset break up per fea + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + # create offset per file + offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + offset_per_file[i + 1] += offset_per_file[i] + + ''' + # Approach 1, 2 and 3 use indices, while Approach 4 does not use them + # create indices + indices = np.arange(total_count) + if data_split == "none": + if randomize == "total": + indices = np.random.permutation(indices) + else: + indices = np.array_split(indices, offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + indices = np.concatenate((train_indices, test_indices)) + # no reordering + # indices = np.arange(total_count) + ''' + ''' + # Approach 1: simple and slow (no grouping is used) + # check if data already exists + recreate_flag = False + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, 
reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((total_count)) + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered".format(j) + np.save(filename_j, z) + print("Creating " + filename_j) + + for i in range(days): + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat_t = np.transpose(data["X_cat"]) + X_int_t = np.transpose(data["X_int"]) + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + # print(filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r+') + if j < tar_fea: + fj[indices[start:end]] = y + elif tar_fea <= j and j < tad_fea: + fj[indices[start:end]] = X_int_t[j - tar_fea, :] + else: + fj[indices[start:end]] = X_cat_t[j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing " + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + size = total_per_file[i] + X_int_t = np.zeros((den_fea, size)) + X_cat_t = np.zeros((spa_fea, size)) + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + print("Creating " + filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r') + if j < tar_fea: + y = fj[start:end] + elif tar_fea <= j and j < tad_fea: + X_int_t[j - tar_fea, :] = fj[start:end] + else: + X_cat_t[j - tad_fea, :] = fj[start:end] + del fj + + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=np.transpose(X_int_t), # transpose of the data + y=y, + ) + else: + print("Reordered day files already exist, skipping ...") + ''' + ''' + # Approach 2: group days + # check if data already exists + recreate_flag = False + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((total_count)) + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered".format(j) + np.save(filename_j, z) + print("Creating " + filename_j) + + group_day = 3 # e.g. 
8, 4 or 3 + group_num = days // group_day + file_group = [i*group_day for i in range(group_num)] + [days] + for ii in range(group_num): + # for last may be group_size != group_num, therefore reset it below + group_size = file_group[ii + 1] - file_group[ii] + X_cat_t = [0]*group_size + X_int_t = [0]*group_size + y = [0]*group_size + start = [0]*group_size + end = [0]*group_size + for ig in range(group_size): + i = file_group[ii] + ig + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + # setup start and end ranges + start[ig] = offset_per_file[i] + end[ig] = offset_per_file[i + 1] + # print(filename_i) + # load a group of files + with np.load(filename_i) as data: + X_cat_t[ig] = np.transpose(data["X_cat"]) + X_int_t[ig] = np.transpose(data["X_int"]) + y[ig] = data["y"] + # sanity check + if total_per_file[i] != len(y[ig]): + sys.exit("ERROR: sanity check on number of samples failed") + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end[ig]-start[ig]) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r+') + for ig in range(group_size): + if j < tar_fea: + fj[indices[start[ig]:end[ig]]] = y[ig] + elif tar_fea <= j and j < tad_fea: + fj[indices[start[ig]:end[ig]]] = X_int_t[ig][j - tar_fea, :] + else: + fj[indices[start[ig]:end[ig]]] = X_cat_t[ig][j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing " + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for ii in range(group_num): + # for last may be group_size != group_num, therefore reset it below + group_size = file_group[ii + 1] - file_group[ii] + X_cat_t= []; X_int_t = [] + for ig in range(group_size): + i = file_group[ii] + ig + X_int_t.append(np.zeros((den_fea, total_per_file[i]))) + X_cat_t.append(np.zeros((spa_fea, total_per_file[i]))) + y = [0]*group_size + start = [0]*group_size + end = [0]*group_size + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r') + # load a group of files + for ig in range(group_size): + i = file_group[ii] + ig + # setup start and end ranges + start[ig] = offset_per_file[i] + end[ig] = offset_per_file[i + 1] + # load data for the group of files + if j < tar_fea: + y[ig] = fj[start[ig]:end[ig]] + elif tar_fea <= j and j < tad_fea: + X_int_t[ig][j - tar_fea, :] = fj[start[ig]:end[ig]] + else: + X_cat_t[ig][j - tad_fea, :] = fj[start[ig]:end[ig]] + del fj + + for ig in range(group_size): + i = file_group[ii] + ig + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + print("Creating " + filename_i) + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t[ig]), # transpose of the data + X_int=np.transpose(X_int_t[ig]), # transpose of the data + y=y[ig], + ) + else: + print("Reordered day files already exist, skipping ...") + ''' + ''' + # Approach 3: group features + # check if data already exists + group_fea = 5 # e.g. 
8, 5 or 4 + group_num = tot_fea // group_fea + if tot_fea % group_fea != 0: # sanity check + sys.exit("ERROR: the group_fea must divided tot_fea evenly.") + recreate_flag = False + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((group_fea, total_count)) + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}".format( + jn, group_fea + ) + np.save(filename_j, z) + print("Creating " + filename_j) + + for i in range(days): + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat_t = np.transpose(data["X_cat"]) + X_int_t = np.transpose(data["X_int"]) + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + # print(filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + fj = np.load(filename_j, mmap_mode='r+') + for jg in range(group_fea): + j = jn * group_fea + jg + # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg)) + if j < tar_fea: + fj[jg, indices[start:end]] = y + elif tar_fea <= j and j < tad_fea: + fj[jg, indices[start:end]] = X_int_t[j - tar_fea, :] + else: + fj[jg, indices[start:end]] = X_cat_t[j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing" + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + size = total_per_file[i] + X_int_t = np.zeros((den_fea, size)) + X_cat_t = np.zeros((spa_fea, size)) + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + print("Creating " + filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + fj = np.load(filename_j, mmap_mode='r') + for jg in range(group_fea): + j = jn * group_fea + jg + # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg)) + if j < tar_fea: + y = fj[jg, start:end] + elif tar_fea <= j and j < tad_fea: + X_int_t[j - tar_fea, :] = fj[jg, start:end] + else: + X_cat_t[j - tad_fea, :] = fj[jg, start:end] + del fj + + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=np.transpose(X_int_t), # transpose of the data + y=y, + ) + + else: + print("Reordered day files already exist, skipping ...") + ''' + + # Approach 4: Fisher-Yates-Rao (FYR) shuffle algorithm + # 1st pass of FYR shuffle + # check if data already exists + recreate_flag = 
False + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + if ( + path.exists(filename_j_y) + and path.exists(filename_j_d) + and path.exists(filename_j_s) + ): + print( + "Using existing\n" + + filename_j_y + "\n" + + filename_j_d + "\n" + + filename_j_s + ) + else: + recreate_flag = True + # reorder across buckets using sampling + if recreate_flag: + # init intermediate files (.npy appended automatically) + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s".format(j) + np.save(filename_j_y, np.zeros((total_per_file[j]))) + np.save(filename_j_d, np.zeros((total_per_file[j], den_fea))) + np.save(filename_j_s, np.zeros((total_per_file[j], spa_fea))) + # start processing files + total_counter = [0] * days + for i in range(days): + filename_i = npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat = data["X_cat"] + X_int = data["X_int"] + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # debug prints + print("Reordering (1st pass) " + filename_i) + + # create buckets using sampling of random ints + # from (discrete) uniform distribution + buckets = [] + for _j in range(days): + buckets.append([]) + counter = [0] * days + days_to_sample = days if data_split == "none" else days - 1 + if randomize == "total": + rand_u = np.random.randint(low=0, high=days_to_sample, size=size) + for k in range(size): + # sample and make sure elements per buckets do not overflow + if data_split == "none" or i < days - 1: + # choose bucket + p = rand_u[k] + # retry of the bucket is full + while total_counter[p] + counter[p] >= total_per_file[p]: + p = np.random.randint(low=0, high=days_to_sample) + else: # preserve the last day/bucket if needed + p = i + buckets[p].append(k) + counter[p] += 1 + else: # randomize is day or none + for k in range(size): + # do not sample, preserve the data in this bucket + p = i + buckets[p].append(k) + counter[p] += 1 + + # sanity check + if np.sum(counter) != size: + sys.exit("ERROR: sanity check on number of samples failed") + # debug prints + # print(counter) + # print(str(np.sum(counter)) + " = " + str(size)) + # print([len(x) for x in buckets]) + # print(total_counter) + + # partially feel the buckets + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + start = total_counter[j] + end = total_counter[j] + counter[j] + # target buckets + fj_y = np.load(filename_j_y, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_y[start:end].shape) + " " + # + str(len(buckets[j]))) + fj_y[start:end] = y[buckets[j]] + del fj_y + # dense buckets + fj_d = np.load(filename_j_d, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_d[start:end, :].shape) + " " + # + str(len(buckets[j]))) + fj_d[start:end, :] = X_int[buckets[j], :] + del fj_d + # sparse buckets + fj_s = np.load(filename_j_s, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - 
start=" + str(end - start) + " " + # + str(fj_s[start:end, :].shape) + " " + # + str(len(buckets[j]))) + fj_s[start:end, :] = X_cat[buckets[j], :] + del fj_s + # update counters for next step + total_counter[j] += counter[j] + + # 2nd pass of FYR shuffle + # check if data already exists + for j in range(days): + filename_j = npzfile + "_{0}_reordered.npz".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # reorder within buckets + if recreate_flag: + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + fj_y = np.load(filename_j_y) + fj_d = np.load(filename_j_d) + fj_s = np.load(filename_j_s) + + indices = range(total_per_file[j]) + if randomize == "day" or randomize == "total": + if data_split == "none" or j < days - 1: + indices = np.random.permutation(range(total_per_file[j])) + + filename_r = npzfile + "_{0}_reordered.npz".format(j) + print("Reordering (2nd pass) " + filename_r) + np.savez_compressed( + filename_r, + X_cat=fj_s[indices, :], + X_int=fj_d[indices, :], + y=fj_y[indices], + ) + + ''' + # sanity check (under no reordering norms should be zero) + for i in range(days): + filename_i_o = npzfile + "_{0}_processed.npz".format(i) + print(filename_i_o) + with np.load(filename_i_o) as data_original: + X_cat_o = data_original["X_cat"] + X_int_o = data_original["X_int"] + y_o = data_original["y"] + filename_i_r = npzfile + "_{0}_reordered.npz".format(i) + print(filename_i_r) + with np.load(filename_i_r) as data_reordered: + X_cat_r = data_reordered["X_cat"] + X_int_r = data_reordered["X_int"] + y_r = data_reordered["y"] + print(np.linalg.norm(y_o - y_r)) + print(np.linalg.norm(X_int_o - X_int_r)) + print(np.linalg.norm(X_cat_o - X_cat_r)) + ''' + + else: + print("Concatenating multiple days into %s.npz file" % str(d_path + o_filename)) + + # load and concatenate data + for i in range(days): + filename_i = npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + if i == 0: + X_cat = data["X_cat"] + X_int = data["X_int"] + y = data["y"] + else: + X_cat = np.concatenate((X_cat, data["X_cat"])) + X_int = np.concatenate((X_int, data["X_int"])) + y = np.concatenate((y, data["y"])) + print("Loaded day:", i, "y = 1:", len(y[y == 1]), "y = 0:", len(y[y == 0])) + + with np.load(d_path + d_file + "_fea_count.npz") as data: + counts = data["counts"] + print("Loaded counts!") + + np.savez_compressed( + d_path + o_filename + ".npz", + X_cat=X_cat, + X_int=X_int, + y=y, + counts=counts, + ) + + return d_path + o_filename + ".npz" + + +def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file): + # Transforms Criteo Kaggle or terabyte data by applying log transformation + # on dense features and converting everything to appropriate tensors. 
+ # + # Inputs: + # X_cat (ndarray): array of integers corresponding to preprocessed + # categorical features + # X_int (ndarray): array of integers corresponding to dense features + # y (ndarray): array of bool corresponding to labels + # data_split(str): flag for splitting dataset into training/validation/test + # sets + # randomize (str): determines randomization scheme + # "none": no randomization + # "day": randomizes each day"s data (only works if split = True) + # "total": randomizes total dataset + # + # Outputs: + # if split: + # X_cat_train (tensor): sparse features for training set + # X_int_train (tensor): dense features for training set + # y_train (tensor): labels for training set + # X_cat_val (tensor): sparse features for validation set + # X_int_val (tensor): dense features for validation set + # y_val (tensor): labels for validation set + # X_cat_test (tensor): sparse features for test set + # X_int_test (tensor): dense features for test set + # y_test (tensor): labels for test set + # else: + # X_cat (tensor): sparse features + # X_int (tensor): dense features + # y (tensor): label + + # define initial set of indices + indices = np.arange(len(y)) + + # create offset per file + offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + offset_per_file[i + 1] += offset_per_file[i] + + # split dataset + if data_split == 'train': + indices = np.array_split(indices, offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + test_indices, val_indices = np.array_split(test_indices, 2) + + print("Defined training and testing indices...") + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + # indices = np.concatenate((train_indices, test_indices)) + + # create training, validation, and test sets + X_cat_train = X_cat[train_indices] + X_int_train = X_int[train_indices] + y_train = y[train_indices] + + X_cat_val = X_cat[val_indices] + X_int_val = X_int[val_indices] + y_val = y[val_indices] + + X_cat_test = X_cat[test_indices] + X_int_test = X_int[test_indices] + y_test = y[test_indices] + + print("Split data according to indices...") + + X_cat_train = X_cat_train.astype(np.long) + X_int_train = np.log(X_int_train.astype(np.float32) + 1) + y_train = y_train.astype(np.float32) + + X_cat_val = X_cat_val.astype(np.long) + X_int_val = np.log(X_int_val.astype(np.float32) + 1) + y_val = y_val.astype(np.float32) + + X_cat_test = X_cat_test.astype(np.long) + X_int_test = np.log(X_int_test.astype(np.float32) + 1) + y_test = y_test.astype(np.float32) + + print("Converted to tensors...done!") + + return ( + X_cat_train, + X_int_train, + y_train, + X_cat_val, + X_int_val, + y_val, + X_cat_test, + X_int_test, + y_test, + ) + + else: + + # randomize data + if randomize == "total": + indices = np.random.permutation(indices) + print("Randomized indices...") + + X_cat = X_cat[indices].astype(np.long) + X_int = np.log(X_int[indices].astype(np.float32) + 1) + y = y[indices].astype(np.float32) + + print("Converted to tensors...done!") + + return (X_cat, X_int, y, [], [], [], [], [], []) + + +def getCriteoAdData( + datafile, + o_filename, + max_ind_range=-1, + sub_sample_rate=0.0, + days=7, + 
data_split='train', + randomize='total', + criteo_kaggle=True, + memory_map=False, + dataset_multiprocessing=False, +): + # Passes through entire dataset and defines dictionaries for categorical + # features and determines the number of total categories. + # + # Inputs: + # datafile : path to downloaded raw data file + # o_filename (str): saves results under o_filename if filename is not "" + # + # Output: + # o_file (str): output file path + + #split the datafile into path and filename + lstr = datafile.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if criteo_kaggle else lstr[-1] + npzfile = d_path + ((d_file + "_day") if criteo_kaggle else d_file) + trafile = d_path + ((d_file + "_fea") if criteo_kaggle else "fea") + + # count number of datapoints in training set + total_file = d_path + d_file + "_day_count.npz" + if path.exists(total_file): + with np.load(total_file) as data: + total_per_file = list(data["total_per_file"]) + total_count = np.sum(total_per_file) + print("Skipping counts per file (already exist)") + else: + total_count = 0 + total_per_file = [] + if criteo_kaggle: + # WARNING: The raw data consists of a single train.txt file + # Each line in the file is a sample, consisting of 13 continuous and + # 26 categorical features (an extra space indicates that feature is + # missing and will be interpreted as 0). + if path.exists(datafile): + print("Reading data from path=%s" % (datafile)) + with open(str(datafile)) as f: + for _ in f: + total_count += 1 + total_per_file.append(total_count) + # reset total per file due to split + num_data_per_split, extras = divmod(total_count, days) + total_per_file = [num_data_per_split] * days + for j in range(extras): + total_per_file[j] += 1 + # split into days (simplifies code later on) + file_id = 0 + boundary = total_per_file[file_id] + nf = open(npzfile + "_" + str(file_id), "w") + with open(str(datafile)) as f: + for j, line in enumerate(f): + if j == boundary: + nf.close() + file_id += 1 + nf = open(npzfile + "_" + str(file_id), "w") + boundary += total_per_file[file_id] + nf.write(line) + nf.close() + else: + sys.exit("ERROR: Criteo Kaggle Display Ad Challenge Dataset path is invalid; please download from https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset") + else: + # WARNING: The raw data consist of day_0.gz,... ,day_23.gz text files + # Each line in the file is a sample, consisting of 13 continuous and + # 26 categorical features (an extra space indicates that feature is + # missing and will be interpreted as 0). 
+ for i in range(days): + datafile_i = datafile + "_" + str(i) # + ".gz" + if path.exists(str(datafile_i)): + print("Reading data from path=%s" % (str(datafile_i))) + # file day_ + total_per_file_count = 0 + with open(str(datafile_i)) as f: + for _ in f: + total_per_file_count += 1 + total_per_file.append(total_per_file_count) + total_count += total_per_file_count + else: + sys.exit("ERROR: Criteo Terabyte Dataset path is invalid; please download from https://labs.criteo.com/2013/12/download-terabyte-click-logs") + + # process a file worth of data and reinitialize data + # note that a file main contain a single or multiple splits + def process_one_file( + datfile, + npzfile, + split, + num_data_in_split, + dataset_multiprocessing, + convertDictsDay=None, + resultDay=None + ): + if dataset_multiprocessing: + convertDicts_day = [{} for _ in range(26)] + + with open(str(datfile)) as f: + y = np.zeros(num_data_in_split, dtype="i4") # 4 byte int + X_int = np.zeros((num_data_in_split, 13), dtype="i4") # 4 byte int + X_cat = np.zeros((num_data_in_split, 26), dtype="i4") # 4 byte int + if sub_sample_rate == 0.0: + rand_u = 1.0 + else: + rand_u = np.random.uniform(low=0.0, high=1.0, size=num_data_in_split) + + i = 0 + percent = 0 + for k, line in enumerate(f): + # process a line (data point) + line = line.split('\t') + # set missing values to zero + for j in range(len(line)): + if (line[j] == '') or (line[j] == '\n'): + line[j] = '0' + # sub-sample data by dropping zero targets, if needed + target = np.int32(line[0]) + if target == 0 and \ + (rand_u if sub_sample_rate == 0.0 else rand_u[k]) < sub_sample_rate: + continue + + y[i] = target + X_int[i] = np.array(line[1:14], dtype=np.int32) + if max_ind_range > 0: + X_cat[i] = np.array( + list(map(lambda x: int(x, 16) % max_ind_range, line[14:])), + dtype=np.int32 + ) + else: + X_cat[i] = np.array( + list(map(lambda x: int(x, 16), line[14:])), + dtype=np.int32 + ) + + # count uniques + if dataset_multiprocessing: + for j in range(26): + convertDicts_day[j][X_cat[i][j]] = 1 + # debug prints + if float(i)/num_data_in_split*100 > percent+1: + percent = int(float(i)/num_data_in_split*100) + print( + "Load %d/%d (%d%%) Split: %d Label True: %d Stored: %d" + % ( + i, + num_data_in_split, + percent, + split, + target, + y[i], + ), + end="\n", + ) + else: + for j in range(26): + convertDicts[j][X_cat[i][j]] = 1 + # debug prints + print( + "Load %d/%d Split: %d Label True: %d Stored: %d" + % ( + i, + num_data_in_split, + split, + target, + y[i], + ), + end="\r", + ) + i += 1 + + # store num_data_in_split samples or extras at the end of file + # count uniques + # X_cat_t = np.transpose(X_cat) + # for j in range(26): + # for x in X_cat_t[j,:]: + # convertDicts[j][x] = 1 + # store parsed + filename_s = npzfile + "_{0}.npz".format(split) + if path.exists(filename_s): + print("\nSkip existing " + filename_s) + else: + np.savez_compressed( + filename_s, + X_int=X_int[0:i, :], + # X_cat=X_cat[0:i, :], + X_cat_t=np.transpose(X_cat[0:i, :]), # transpose of the data + y=y[0:i], + ) + print("\nSaved " + npzfile + "_{0}.npz!".format(split)) + + if dataset_multiprocessing: + resultDay[split] = i + convertDictsDay[split] = convertDicts_day + return + else: + return i + + # create all splits (reuse existing files if possible) + recreate_flag = False + convertDicts = [{} for _ in range(26)] + # WARNING: to get reproducible sub-sampling results you must reset the seed below + # np.random.seed(123) + # in this case there is a single split in each day + for i in range(days): + 
npzfile_i = npzfile + "_{0}.npz".format(i) + npzfile_p = npzfile + "_{0}_processed.npz".format(i) + if path.exists(npzfile_i): + print("Skip existing " + npzfile_i) + elif path.exists(npzfile_p): + print("Skip existing " + npzfile_p) + else: + recreate_flag = True + + if recreate_flag: + if dataset_multiprocessing: + resultDay = Manager().dict() + convertDictsDay = Manager().dict() + processes = [Process(target=process_one_file, + name="process_one_file:%i" % i, + args=(npzfile + "_{0}".format(i), + npzfile, + i, + total_per_file[i], + dataset_multiprocessing, + convertDictsDay, + resultDay, + ) + ) for i in range(0, days)] + for process in processes: + process.start() + for process in processes: + process.join() + for day in range(days): + total_per_file[day] = resultDay[day] + print("Constructing convertDicts Split: {}".format(day)) + convertDicts_tmp = convertDictsDay[day] + for i in range(26): + for j in convertDicts_tmp[i]: + convertDicts[i][j] = 1 + else: + for i in range(days): + total_per_file[i] = process_one_file( + npzfile + "_{0}".format(i), + npzfile, + i, + total_per_file[i], + dataset_multiprocessing, + ) + + # report and save total into a file + total_count = np.sum(total_per_file) + if not path.exists(total_file): + np.savez_compressed(total_file, total_per_file=total_per_file) + print("Total number of samples:", total_count) + print("Divided into days/splits:\n", total_per_file) + + # dictionary files + counts = np.zeros(26, dtype=np.int32) + if recreate_flag: + # create dictionaries + for j in range(26): + for i, x in enumerate(convertDicts[j]): + convertDicts[j][x] = i + dict_file_j = d_path + d_file + "_fea_dict_{0}.npz".format(j) + if not path.exists(dict_file_j): + np.savez_compressed( + dict_file_j, + unique=np.array(list(convertDicts[j]), dtype=np.int32) + ) + counts[j] = len(convertDicts[j]) + # store (uniques and) counts + count_file = d_path + d_file + "_fea_count.npz" + if not path.exists(count_file): + np.savez_compressed(count_file, counts=counts) + else: + # create dictionaries (from existing files) + for j in range(26): + with np.load(d_path + d_file + "_fea_dict_{0}.npz".format(j)) as data: + unique = data["unique"] + for i, x in enumerate(unique): + convertDicts[j][x] = i + # load (uniques and) counts + with np.load(d_path + d_file + "_fea_count.npz") as data: + counts = data["counts"] + + # process all splits + if dataset_multiprocessing: + processes = [Process(target=processCriteoAdData, + name="processCriteoAdData:%i" % i, + args=(d_path, + d_file, + npzfile, + i, + convertDicts, + counts, + ) + ) for i in range(0, days)] + for process in processes: + process.start() + for process in processes: + process.join() + + else: + for i in range(days): + processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, counts) + + o_file = concatCriteoAdData( + d_path, + d_file, + npzfile, + trafile, + days, + data_split, + randomize, + total_per_file, + total_count, + memory_map, + o_filename + ) + + return o_file + + +def loadDataset( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + data_split, + raw_path="", + pro_data="", + memory_map=False +): + # dataset + if dataset == "kaggle": + days = 7 + o_filename = "kaggleAdDisplayChallenge_processed" + elif dataset == "terabyte": + days = 24 + o_filename = "terabyte_processed" + else: + raise(ValueError("Data set option is not supported")) + + # split the datafile into path and filename + lstr = raw_path.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if dataset == 
"kaggle" else lstr[-1] + npzfile = (d_file + "_day") if dataset == "kaggle" else d_file + # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea") + + # check if pre-processed data is available + data_ready = True + if memory_map: + for i in range(days): + reo_data = d_path + npzfile + "_{0}_reordered.npz".format(i) + if not path.exists(str(reo_data)): + data_ready = False + else: + if not path.exists(str(pro_data)): + data_ready = False + + # pre-process data if needed + # WARNNING: when memory mapping is used we get a collection of files + if data_ready: + print("Reading pre-processed data=%s" % (str(pro_data))) + file = str(pro_data) + else: + print("Reading raw data=%s" % (str(raw_path))) + file = getCriteoAdData( + raw_path, + o_filename, + max_ind_range, + sub_sample_rate, + days, + data_split, + randomize, + dataset == "kaggle", + memory_map + ) + + return file, days + + +if __name__ == "__main__": + ### import packages ### + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Preprocess Criteo dataset" + ) + # model related parameters + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + args = parser.parse_args() + + loadDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map + ) diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_data_pytorch.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_data_pytorch.py new file mode 100644 index 00000000000..f6f30f8e663 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_data_pytorch.py @@ -0,0 +1,575 @@ +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: generate inputs and targets for the dlrm benchmark +# The inpts and outputs are generated according to the following three option(s) +# 1) random distribution +# 2) synthetic distribution, based on unique accesses and distances between them +# i) R. Hassan, A. Harris, N. Topham and A. 
Efthymiou "Synthetic Trace-Driven +# Simulation of Cache Memory", IEEE AINAM'07 +# 3) public data set +# i) Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# ii) Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs + + +from __future__ import absolute_import, division, print_function, unicode_literals + +# others +from os import path +import sys + +import data_utils + +# numpy +import numpy as np +from numpy import random as ra + + +# pytorch +import torch +from torch.utils.data import Dataset, RandomSampler + +import data_loader_terabyte + + +# Kaggle Display Advertising Challenge Dataset +# dataset (str): name of dataset (Kaggle or Terabyte) +# randomize (str): determines randomization scheme +# "none": no randomization +# "day": randomizes each day"s data (only works if split = True) +# "total": randomizes total dataset +# split (bool) : to split into train, test, validation data-sets +class CriteoDataset(Dataset): + + def __init__( + self, + dataset, + max_ind_range, + sub_sample_rate, + randomize, + split="train", + raw_path="", + pro_data="", + memory_map=False, + dataset_multiprocessing=False, + ): + # dataset + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + # spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + if dataset == "kaggle": + days = 7 + out_file = "kaggleAdDisplayChallenge_processed" + elif dataset == "terabyte": + days = 24 + out_file = "terabyte_processed" + else: + raise(ValueError("Data set option is not supported")) + self.max_ind_range = max_ind_range + self.memory_map = memory_map + + # split the datafile into path and filename + lstr = raw_path.split("/") + self.d_path = "/".join(lstr[0:-1]) + "/" + self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] + self.npzfile = self.d_path + ( + (self.d_file + "_day") if dataset == "kaggle" else self.d_file + ) + self.trafile = self.d_path + ( + (self.d_file + "_fea") if dataset == "kaggle" else "fea" + ) + + # check if pre-processed data is available + data_ready = True + if memory_map: + for i in range(days): + reo_data = self.npzfile + "_{0}_reordered.npz".format(i) + if not path.exists(str(reo_data)): + data_ready = False + else: + if not path.exists(str(pro_data)): + data_ready = False + + # pre-process data if needed + # WARNNING: when memory mapping is used we get a collection of files + if data_ready: + print("Reading pre-processed data=%s" % (str(pro_data))) + file = str(pro_data) + else: + print("Reading raw data=%s" % (str(raw_path))) + file = data_utils.getCriteoAdData( + raw_path, + out_file, + max_ind_range, + sub_sample_rate, + days, + split, + randomize, + dataset == "kaggle", + memory_map, + dataset_multiprocessing, + ) + + # get a number of samples per day + total_file = self.d_path + self.d_file + "_day_count.npz" + with np.load(total_file) as data: + total_per_file = data["total_per_file"] + # compute offsets per file + self.offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + self.offset_per_file[i + 1] += self.offset_per_file[i] + # print(self.offset_per_file) + + # setup data + if memory_map: + # setup the training/testing split + self.split = split + if split == 'none' or split == 'train': + self.day = 0 + self.max_day_range = days if split == 'none' else days - 1 + elif split == 'test' or split == 'val': + self.day = days - 1 + num_samples = 
self.offset_per_file[days] - \ + self.offset_per_file[days - 1] + self.test_size = int(np.ceil(num_samples / 2.)) + self.val_size = num_samples - self.test_size + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + + ''' + # text + print("text") + for i in range(days): + fi = self.npzfile + "_{0}".format(i) + with open(fi) as data: + ttt = 0; nnn = 0 + for _j, line in enumerate(data): + ttt +=1 + if np.int32(line[0]) > 0: + nnn +=1 + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + # processed + print("processed") + for i in range(days): + fi = self.npzfile + "_{0}_processed.npz".format(i) + with np.load(fi) as data: + yyy = data["y"] + ttt = len(yyy) + nnn = np.count_nonzero(yyy) + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + # reordered + print("reordered") + for i in range(days): + fi = self.npzfile + "_{0}_reordered.npz".format(i) + with np.load(fi) as data: + yyy = data["y"] + ttt = len(yyy) + nnn = np.count_nonzero(yyy) + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + ''' + + # load unique counts + with np.load(self.d_path + self.d_file + "_fea_count.npz") as data: + self.counts = data["counts"] + self.m_den = den_fea # X_int.shape[1] + self.n_emb = len(self.counts) + print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den)) + + # Load the test data + # Only a single day is used for testing + if self.split == 'test' or self.split == 'val': + # only a single day is used for testing + fi = self.npzfile + "_{0}_reordered.npz".format( + self.day + ) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + + else: + # load and preprocess data + with np.load(file) as data: + X_int = data["X_int"] # continuous feature + X_cat = data["X_cat"] # categorical feature + y = data["y"] # target + self.counts = data["counts"] + self.m_den = X_int.shape[1] # den_fea + self.n_emb = len(self.counts) + print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den)) + + # create reordering + indices = np.arange(len(y)) + + if split == "none": + # randomize all data + if randomize == "total": + indices = np.random.permutation(indices) + print("Randomized indices...") + + X_int[indices] = X_int + X_cat[indices] = X_cat + y[indices] = y + + else: + indices = np.array_split(indices, self.offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + test_indices, val_indices = np.array_split(test_indices, 2) + + print("Defined %s indices..." 
% (split)) + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + # create training, validation, and test sets + if split == 'train': + self.X_int = [X_int[i] for i in train_indices] + self.X_cat = [X_cat[i] for i in train_indices] + self.y = [y[i] for i in train_indices] + elif split == 'val': + self.X_int = [X_int[i] for i in val_indices] + self.X_cat = [X_cat[i] for i in val_indices] + self.y = [y[i] for i in val_indices] + elif split == 'test': + self.X_int = [X_int[i] for i in test_indices] + self.X_cat = [X_cat[i] for i in test_indices] + self.y = [y[i] for i in test_indices] + + print("Split data according to indices...") + + def __getitem__(self, index): + + if isinstance(index, slice): + return [ + self[idx] for idx in range( + index.start or 0, index.stop or len(self), index.step or 1 + ) + ] + + if self.memory_map: + if self.split == 'none' or self.split == 'train': + # check if need to switch to next day and load data + if index == self.offset_per_file[self.day]: + # print("day_boundary switch", index) + self.day_boundary = self.offset_per_file[self.day] + fi = self.npzfile + "_{0}_reordered.npz".format( + self.day + ) + # print('Loading file: ', fi) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + self.day = (self.day + 1) % self.max_day_range + + i = index - self.day_boundary + elif self.split == 'test' or self.split == 'val': + # only a single day is used for testing + i = index + (0 if self.split == 'test' else self.test_size) + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + else: + i = index + + if self.max_ind_range > 0: + return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i] + else: + return self.X_int[i], self.X_cat[i], self.y[i] + + def _default_preprocess(self, X_int, X_cat, y): + X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1) + if self.max_ind_range > 0: + X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long) + else: + X_cat = torch.tensor(X_cat, dtype=torch.long) + y = torch.tensor(y.astype(np.float32)) + + return X_int, X_cat, y + + def __len__(self): + if self.memory_map: + if self.split == 'none': + return self.offset_per_file[-1] + elif self.split == 'train': + return self.offset_per_file[-2] + elif self.split == 'test': + return self.test_size + elif self.split == 'val': + return self.val_size + else: + sys.exit("ERROR: dataset split is neither none, nor train nor test.") + else: + return len(self.y) + + +def collate_wrapper_criteo_offset(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = [X_cat[:, i] for i in range(featureCnt)] + lS_o = [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + + return X_int, torch.stack(lS_o), torch.stack(lS_i), T + + +def ensure_dataset_preprocessed(args, d_path): + _ = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + _ 
= CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + for split in ['train', 'val', 'test']: + print('Running preprocessing for split =', split) + + train_files = ['{}_{}_reordered.npz'.format(args.raw_data_file, day) + for + day in range(0, 23)] + + test_valid_file = args.raw_data_file + '_23_reordered.npz' + + output_file = d_path + '_{}.bin'.format(split) + + input_files = train_files if split == 'train' else [test_valid_file] + data_loader_terabyte.numpy_to_binary(input_files=input_files, + output_file_path=output_file, + split=split) + + +# Conversion from offset to length +def offset_to_length_converter(lS_o, lS_i): + def diff(tensor): + return tensor[1:] - tensor[:-1] + + return torch.stack( + [ + diff(torch.cat((S_o, torch.tensor(lS_i[ind].shape))).int()) + for ind, S_o in enumerate(lS_o) + ] + ) + + +def collate_wrapper_criteo_length(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = torch.stack([X_cat[:, i] for i in range(featureCnt)]) + lS_o = torch.stack( + [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + ) + + lS_l = offset_to_length_converter(lS_o, lS_i) + + return X_int, lS_l, lS_i, T + + +def make_criteo_data_and_loaders(args, offset_to_length_converter=False): + if args.memory_map and args.data_set == "terabyte": + # more efficient for larger batches + data_directory = path.dirname(args.raw_data_file) + + if args.mlperf_bin_loader: + lstr = args.processed_data_file.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0] + train_file = d_path + "_train.bin" + test_file = d_path + "_test.bin" + # val_file = d_path + "_val.bin" + counts_file = args.raw_data_file + '_fea_count.npz' + if any(not path.exists(p) for p in [train_file, + test_file, + counts_file]): + ensure_dataset_preprocessed(args, d_path) + + train_data = data_loader_terabyte.CriteoBinDataset( + data_file=train_file, + counts_file=counts_file, + batch_size=args.mini_batch_size, + max_ind_range=args.max_ind_range + ) + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=None, + batch_sampler=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None + ) + + test_data = data_loader_terabyte.CriteoBinDataset( + data_file=test_file, + counts_file=counts_file, + batch_size=args.test_mini_batch_size, + max_ind_range=args.max_ind_range + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=None, + batch_sampler=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + ) + else: + data_filename = args.raw_data_file.split("/")[-1] + + train_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + test_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + 
args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + train_loader = data_loader_terabyte.DataLoader( + data_directory=data_directory, + data_filename=data_filename, + days=list(range(23)), + batch_size=args.mini_batch_size, + max_ind_range=args.max_ind_range, + split="train" + ) + + test_loader = data_loader_terabyte.DataLoader( + data_directory=data_directory, + data_filename=data_filename, + days=[23], + batch_size=args.test_mini_batch_size, + max_ind_range=args.max_ind_range, + split="test" + ) + else: + train_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing, + ) + + test_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing, + ) + + collate_wrapper_criteo = collate_wrapper_criteo_offset + if offset_to_length_converter: + collate_wrapper_criteo = collate_wrapper_criteo_length + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=args.mini_batch_size, + shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=args.test_mini_batch_size, + shuffle=False, + num_workers=args.test_num_workers, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + return train_data, train_loader, test_data, test_loader \ No newline at end of file diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py new file mode 100644 index 00000000000..12936c64165 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py @@ -0,0 +1,1140 @@ +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: an implementation of a deep learning recommendation model (DLRM) +# The model input consists of dense and sparse features. The former is a vector +# of floating point values. The latter is a list of sparse indices into +# embedding tables, which consist of vectors of floating point values. +# The selected vectors are passed to mlp networks denoted by triangles, +# in some cases the vectors are interacted through operators (Ops). +# +# output: +# vector of values +# model: | +# /\ +# /__\ +# | +# _____________________> Op <___________________ +# / | \ +# /\ /\ /\ +# /__\ /__\ ... 
/__\ +# | | | +# | Op Op +# | ____/__\_____ ____/__\____ +# | |_Emb_|____|__| ... |_Emb_|__|___| +# input: +# [ dense features ] [sparse indices] , ..., [sparse indices] +# +# More precise definition of model layers: +# 1) fully connected layers of an mlp +# z = f(y) +# y = Wx + b +# +# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) +# z = Op(e1,...,ek) +# obtain vectors e1=E[:,p1], ..., ek=E[:,pk] +# +# 3) Operator Op can be one of the following +# Sum(e1,...,ek) = e1 + ... + ek +# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] +# Cat(e1,...,ek) = [e1', ..., ek']' +# where ' denotes transpose operation +# +# References: +# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, +# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, +# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, +# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, +# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, +# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and +# Recommendation Systems", CoRR, arXiv:1906.00091, 2019 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import argparse + +# miscellaneous +import builtins +import datetime +import sys +import time + + +# data generation +import dlrm_data_pytorch as dp + +# numpy +import numpy as np +import sklearn.metrics + +# pytorch +import torch +import torch.nn as nn +from torch._ops import ops +from torch.autograd.profiler import record_function +from torch.nn.parallel.parallel_apply import parallel_apply +from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.scatter_gather import gather, scatter +from torch.nn.parameter import Parameter +from torch.optim.lr_scheduler import _LRScheduler +from torch.utils import ThroughputBenchmark +# For distributed run +import extend_distributed as ext_dist + + +try: + import intel_extension_for_pytorch as ipex +except: + assert False, "please install intel-extension-for-pytorch, support version higher than 1.10" + + +exc = getattr(builtins, "IOError", "FileNotFoundError") + +def freeze(model): + return torch.jit._recursive.wrap_cpp_module(torch._C._freeze_module(model._c, preserveParameters=True)) + + +def time_wrap(): + return time.time() + + +def dlrm_wrap(X, *emb_args): + with record_function("DLRM forward"): + return dlrm(X, *emb_args) + + +def loss_fn_wrap(Z, T): + with record_function("DLRM loss compute"): + return dlrm.loss_fn(Z, T) + +# The following function is a wrapper to avoid checking this multiple times in th +# loop below. 
+def unpack_batch(b): + # Experiment with unweighted samples + return b[0], b[1], b[2], b[3], torch.ones(b[3].size()), None + + +class LRPolicyScheduler(_LRScheduler): + def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps): + self.num_warmup_steps = num_warmup_steps + self.decay_start_step = decay_start_step + self.decay_end_step = decay_start_step + num_decay_steps + self.num_decay_steps = num_decay_steps + + if self.decay_start_step < self.num_warmup_steps: + sys.exit("Learning rate warmup must finish before the decay starts") + + super(LRPolicyScheduler, self).__init__(optimizer) + + def get_lr(self): + step_count = self._step_count + if step_count < self.num_warmup_steps: + # warmup + scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps + lr = [base_lr * scale for base_lr in self.base_lrs] + self.last_lr = lr + elif self.decay_start_step <= step_count and step_count < self.decay_end_step: + # decay + decayed_steps = step_count - self.decay_start_step + scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2 + min_lr = 0.0000001 + lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs] + self.last_lr = lr + else: + if self.num_decay_steps > 0: + # freeze at last, either because we're after decay + # or because we're between warmup and decay + lr = self.last_lr + else: + # do not adjust + lr = self.base_lrs + return lr + + +### define dlrm in PyTorch ### +class DLRM_Net(nn.Module): + def create_mlp(self, ln, sigmoid_layer): + # build MLP layer by layer + layers = nn.ModuleList() + for i in range(0, ln.size - 1): + n = ln[i] + m = ln[i + 1] + + # construct fully connected operator + LL = nn.Linear(int(n), int(m), bias=True) + + # initialize the weights + # with torch.no_grad(): + # custom Xavier input, output or two-sided fill + mean = 0.0 # std_dev = np.sqrt(variance) + std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n) + W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32) + std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1)) + bt = np.random.normal(mean, std_dev, size=m).astype(np.float32) + # approach 1 + LL.weight.data = torch.tensor(W, requires_grad=True) + LL.bias.data = torch.tensor(bt, requires_grad=True) + # approach 2 + # LL.weight.data.copy_(torch.tensor(W)) + # LL.bias.data.copy_(torch.tensor(bt)) + # approach 3 + # LL.weight = Parameter(torch.tensor(W),requires_grad=True) + # LL.bias = Parameter(torch.tensor(bt),requires_grad=True) + layers.append(LL) + + # construct sigmoid or relu operator + if i == sigmoid_layer: + layers.append(nn.Sigmoid()) + else: + layers.append(nn.ReLU()) + + # approach 1: use ModuleList + # return layers + # approach 2: use Sequential container to wrap all layers + return torch.nn.Sequential(*layers) + + def create_emb(self, m, ln, local_ln_emb=None): + emb_l = nn.ModuleList() + n_embs = ln.size if local_ln_emb is None else len(local_ln_emb) + for i in range(n_embs): + if local_ln_emb is None: + n = ln[i] + else: + n = ln[local_ln_emb[i]] + EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True) + # initialize embeddings + if not args.inference_only: + nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n)) + emb_l.append(EE) + return emb_l + + def __init__( + self, + m_spa=None, + ln_emb=None, + ln_bot=None, + ln_top=None, + sigmoid_bot=-1, + sigmoid_top=-1, + weighted_pooling=None, + loss_threshold=0.0, + ): + super(DLRM_Net, self).__init__() + self.loss_threshold = loss_threshold + #If running distributed, get local slice of embedding 
tables + if ext_dist.my_size > 1: + n_emb = len(ln_emb) + self.n_global_emb = n_emb + self.rank = ext_dist.dist.get_rank() + self.ln_emb = [i for i in range(n_emb)] + self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths(n_emb) + self.local_ln_emb_slice = ext_dist.get_my_slice(n_emb) + self.local_ln_emb = self.ln_emb[self.local_ln_emb_slice] + else: + self.local_ln_emb = None + self.emb_l = self.create_emb(m_spa, ln_emb, self.local_ln_emb) + self.bot_l = self.create_mlp(ln_bot, sigmoid_bot) + self.top_l = self.create_mlp(ln_top, sigmoid_top) + self.loss_fn = torch.nn.BCELoss(reduction="mean") + + + def apply_mlp(self, x, layers): + # approach 1: use ModuleList + # for layer in layers: + # x = layer(x) + # return x + # approach 2: use Sequential container to wrap all layers + return layers(x) + + def apply_emb(self, emb_l, *emb_args): + # WARNING: notice that we are processing the batch at once. We implicitly + # assume that the data is laid out such that: + # 1. each embedding is indexed with a group of sparse indices, + # corresponding to a single lookup + # 2. for each embedding the lookups are further organized into a batch + # 3. for a list of embedding tables there is a list of batched lookups + if isinstance(emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + return emb_l(emb_args, self.need_linearize_indices_and_offsets) + lS_o, lS_i = emb_args + ly = [] + for k, sparse_index_group_batch in enumerate(lS_i): + sparse_offset_group_batch = lS_o[k] + + # embedding lookup + # We are using EmbeddingBag, which implicitly uses sum operator. + # The embeddings are represented as tall matrices, with sum + # happening vertically across 0 axis, resulting in a row vector + E = emb_l[k] + V = E( + sparse_index_group_batch, + sparse_offset_group_batch, + ) + + ly.append(V) + + return ly + + def interact_features(self, x, ly): + if args.ipex_interaction: + T = [x] + list(ly) + R = ipex.nn.functional.interaction(*T) + else: + # concatenate dense and sparse features + (batch_size, d) = x.shape + T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d)) + # perform a dot product + Z = torch.bmm(T, torch.transpose(T, 1, 2)) + # append dense feature with the interactions (into a row vector) + # approach 1: all + # Zflat = Z.view((batch_size, -1)) + # approach 2: unique + _, ni, nj = Z.shape + # approach 1: tril_indices + # offset = -1 + # li, lj = torch.tril_indices(ni, nj, offset=offset) + # approach 2: custom + offset = 0 + li = torch.tensor([i for i in range(ni) for j in range(i + offset)]) + lj = torch.tensor([j for i in range(nj) for j in range(i + offset)]) + Zflat = Z[:, li, lj] + # concatenate dense features and interactions + R = torch.cat([x] + [Zflat], dim=1) + return R + + def forward(self, dense_x, *emb_args): + if ext_dist.my_size > 1: + return self.distributed_forward(dense_x, *emb_args) + else: + return self.sequential_forward(dense_x, *emb_args) + + def distributed_forward(self, dense_x, *emb_args): + batch_size = dense_x.size()[0] + vector_lenght = self.emb_l.weights[0].size()[1] + # WARNING: # of ranks must be <= batch size in distributed_forward call + if batch_size < ext_dist.my_size: + sys.exit("ERROR: batch_size (%d) must be larger than number of ranks (%d)" % (batch_size, ext_dist.my_size)) + + # embeddings + ly = self.apply_emb(self.emb_l, *emb_args) + a2a_req = ext_dist.alltoall(ly, self.n_emb_per_rank) + # bottom mlp + x = self.apply_mlp(dense_x, self.bot_l) + ly = a2a_req.wait() + _ly = [] + for item in ly: + _ly += [item[:, emb_id * vector_lenght: (emb_id + 1) * 
vector_lenght] for emb_id in range(self.emb_l.n_tables)] + # interactions + z = self.interact_features(x, _ly) + # top mlp + p = self.apply_mlp(z, self.top_l) + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z = torch.clamp( + p, min=self.loss_threshold, max=(1.0 - self.loss_threshold) + ) + else: + z = p + return z + + + def sequential_forward(self, dense_x, *emb_args): + # process dense features (using bottom mlp), resulting in a row vector + x = self.apply_mlp(dense_x, self.bot_l) + # debug prints + # print("intermediate") + # print(x.detach().cpu().numpy()) + + # process sparse features(using embeddings), resulting in a list of row vectors + ly = self.apply_emb(self.emb_l, *emb_args) + # for y in ly: + # print(y.detach().cpu().numpy()) + + # interact features (dense and sparse) + z = self.interact_features(x, ly) + # print(z.detach().cpu().numpy()) + + # obtain probability of a click (using top mlp) + p = self.apply_mlp(z, self.top_l) + + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)) + else: + z = p + + return z + + +def dash_separated_ints(value): + vals = value.split("-") + for val in vals: + try: + int(val) + except ValueError: + raise argparse.ArgumentTypeError( + "%s is not a valid dash separated list of ints" % value + ) + + return value + + +def trace_model(args, dlrm, test_ld, inplace=True): + dlrm.eval() + for j, inputBatch in enumerate(test_ld): + X, lS_o, lS_i, _, _, _ = unpack_batch(inputBatch) + if args.bf16: + # at::GradMode::is_enabled() will query a threadlocal flag + # but new thread generate from throughputbench mark will + # init this flag to true, so we temporal cast embedding's + # weight to bfloat16 for now + if args.inference_only: + dlrm.emb_l.bfloat16() + dlrm = ipex.optimize(dlrm, dtype=torch.bfloat16, inplace=inplace) + elif args.int8 and not args.tune: + if args.num_cpu_cores != 0: + torch.set_num_threads(args.num_cpu_cores) + from neural_compressor.torch.quantization import load + dlrm = load(args.save_model) + elif args.int8 and args.tune: + dlrm = dlrm + else: + dlrm = ipex.optimize(dlrm, dtype=torch.float, inplace=True, auto_kernel_selection=True) + with torch.cpu.amp.autocast(enabled=args.bf16): + dlrm = torch.jit.trace(dlrm, (X, lS_o, lS_i), check_trace=True) + dlrm = torch.jit.freeze(dlrm) + dlrm(X, lS_o, lS_i) + dlrm(X, lS_o, lS_i) + return dlrm + + +def run_throughput_benchmark(args, dlrm, test_ld): + bench = ThroughputBenchmark(dlrm) + for j, inputBatch in enumerate(test_ld): + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + bench.add_input(X, lS_o, lS_i) + if args.num_batches > 0 and j == args.num_batches: + break + args.num_batches = args.num_batches if args.num_batches > 0 else j + stats = bench.benchmark( + num_calling_threads=args.share_weight_instance, + num_warmup_iters=100, + num_iters=args.num_batches * args.share_weight_instance, + ) + print(stats) + latency = stats.latency_avg_ms + throughput = (1 / latency) * 1000 * test_ld.dataset.batch_size * args.share_weight_instance + print("throughput: {:.3f} fps".format(throughput)) + print("latency: {:.5f} ms".format(1/throughput * 1000)) + exit(0) + + +def inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + trace=True +): + test_accu = 0 + test_samp = 0 + + if args.print_auc: + scores = [] + targets = [] + + total_time = 0 + total_iter = 0 + if args.inference_only and trace: + dlrm = trace_model(args, dlrm, 
test_ld) + if args.share_weight_instance != 0: + run_throughput_benchmark(args, dlrm, test_ld) + with torch.cpu.amp.autocast(enabled=args.bf16): + for i, testBatch in enumerate(test_ld): + should_print = ((i + 1) % args.print_freq == 0 or i + 1 == len(test_ld)) and args.inference_only + if should_print: + gT = 1000.0 * total_time / total_iter + print( + "Finished {} it {}/{}, {:.2f} ms/it,".format( + "inference", i + 1, len(test_ld), gT + ), + flush=True, + ) + total_time = 0 + total_iter = 0 + # early exit if nbatches was set by the user and was exceeded + if args.inference_only and nbatches > 0 and i >= nbatches: + break + + X_test, lS_o_test, lS_i_test, T_test, W_test, CBPP_test = unpack_batch( + testBatch + ) + + # forward pass + + if not args.inference_only and isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + n_tables = lS_i_test.shape[0] + idx = [lS_i_test[i] for i in range(n_tables)] + offset = [lS_o_test[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + + start = time_wrap() + if not args.inference_only and isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + Z_test = dlrm(X_test, indices, offsets, indices_with_row_offsets) + else: + Z_test = dlrm(X_test, lS_o_test, lS_i_test) + + + total_time += (time_wrap() - start) + total_iter += 1 + + if args.print_auc: + S_test = Z_test.detach().cpu().float().numpy() # numpy array + T_test = T_test.detach().cpu().float().numpy() # numpy array + scores.append(S_test) + targets.append(T_test) + elif not args.inference_only: + with record_function("DLRM accuracy compute"): + # compute loss and accuracy + S_test = Z_test.detach().cpu().float().numpy() # numpy array + T_test = T_test.detach().cpu().float().numpy() # numpy array + + mbs_test = T_test.shape[0] # = mini_batch_size except last + A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8)) + + test_accu += A_test + test_samp += mbs_test + else: + # do nothing to save time + pass + + if args.print_auc: + with record_function("DLRM mlperf sklearn metrics compute"): + scores = np.concatenate(scores, axis=0) + targets = np.concatenate(targets, axis=0) + + metrics = { + "recall": lambda y_true, y_score: sklearn.metrics.recall_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "precision": lambda y_true, y_score: sklearn.metrics.precision_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "f1": lambda y_true, y_score: sklearn.metrics.f1_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "ap": sklearn.metrics.average_precision_score, + "roc_auc": sklearn.metrics.roc_auc_score, + "accuracy": lambda y_true, y_score: sklearn.metrics.accuracy_score( + y_true=y_true, y_pred=np.round(y_score) + ), + } + + validation_results = {} + for metric_name, metric_function in metrics.items(): + validation_results[metric_name] = metric_function(targets, scores) + acc_test = validation_results["accuracy"] + elif not args.inference_only: + acc_test = test_accu / test_samp + else: + pass + + model_metrics_dict = { + "nepochs": args.nepochs, + "nbatches": nbatches, + "nbatches_test": nbatches_test, + } + if not args.inference_only: + model_metrics_dict["test_acc"] = acc_test + + if args.print_auc: + is_best = validation_results["roc_auc"] > best_auc_test + if is_best: + best_auc_test = validation_results["roc_auc"] + model_metrics_dict["test_auc"] = best_auc_test + print( + "recall {:.4f}, precision 
{:.4f},".format( + validation_results["recall"], + validation_results["precision"], + ) + + " f1 {:.4f}, ap {:.4f},".format( + validation_results["f1"], validation_results["ap"] + ) + + " auc {:.4f}, best auc {:.4f},".format( + validation_results["roc_auc"], best_auc_test + ) + + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format( + validation_results["accuracy"] * 100, best_acc_test * 100 + ), + flush=True, + ) + print("Accuracy: {:.34} ".format(validation_results["roc_auc"])) + elif not args.inference_only: + is_best = acc_test > best_acc_test + if is_best: + best_acc_test = acc_test + print( + " accuracy {:3.3f} %, best {:3.3f} %".format( + acc_test * 100, best_acc_test * 100 + ), + flush=True, + ) + else: + pass + if not args.inference_only: + return model_metrics_dict, is_best + else: + return validation_results["roc_auc"] + + +def run(): + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Train Deep Learning Recommendation Model (DLRM)" + ) + # model related parameters + parser.add_argument("--arch-sparse-feature-size", type=int, default=2) + parser.add_argument( + "--arch-embedding-size", type=dash_separated_ints, default="4-3-2" + ) + # j will be replaced with the table number + parser.add_argument("--arch-mlp-bot", type=dash_separated_ints, default="4-3-2") + parser.add_argument("--arch-mlp-top", type=dash_separated_ints, default="4-2-1") + # activations and loss + parser.add_argument("--activation-function", type=str, default="relu") + parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 + parser.add_argument("--round-targets", type=bool, default=False) + # data + parser.add_argument("--num-batches", type=int, default=0) + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument( + "--dataset-multiprocessing", + action="store_true", + default=False, + help="The Kaggle dataset can be multiprocessed in an environment \ + with more than 7 CPU cores and more than 20 GB of memory. 
\n \ + The Terabyte dataset can be multiprocessed in an environment \ + with more than 24 CPU cores and at least 1 TB of memory.", + ) + # training + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--nepochs", type=int, default=1) + parser.add_argument("--learning-rate", type=float, default=0.01) + parser.add_argument("--print-precision", type=int, default=5) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + # inference + parser.add_argument("--inference-only", action="store_true", default=False) + # store/load model + parser.add_argument("--save-model", type=str, default="") + parser.add_argument("--load-model", type=str, default="") + # debugging and profiling + parser.add_argument("--print-freq", type=int, default=1) + parser.add_argument("--test-freq", type=int, default=-1) + parser.add_argument("--test-mini-batch-size", type=int, default=-1) + parser.add_argument("--print-time", action="store_true", default=False) + parser.add_argument("--print-wall-time", action="store_true", default=False) + parser.add_argument("--enable-profiling", action="store_true", default=False) + # stop at target AUC Terabyte (no subsampling) 0.8025 + parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) + parser.add_argument("--mlperf-bin-loader", action="store_true", default=False) + parser.add_argument("--mlperf-bin-shuffle", action="store_true", default=False) + # LR policy + parser.add_argument("--lr-num-warmup-steps", type=int, default=0) + parser.add_argument("--lr-decay-start-step", type=int, default=0) + parser.add_argument("--lr-num-decay-steps", type=int, default=0) + # intel + parser.add_argument("--print-auc", action="store_true", default=False) + parser.add_argument("--should-test", action="store_true", default=False) + parser.add_argument("--bf16", action="store_true", default=False) + parser.add_argument("--share-weight-instance", type=int, default=0) + parser.add_argument("--num-cpu-cores", type=int, default=0) + parser.add_argument("--ipex-interaction", action="store_true", default=False) + parser.add_argument("--ipex-merged-emb", action="store_true", default=False) + parser.add_argument("--num-warmup-iters", type=int, default=1000) + parser.add_argument("--int8", action="store_true", default=False) + parser.add_argument("--dist-backend", type=str, default="ccl") + parser.add_argument("--tune", action="store_true", default=False) + parser.add_argument("--benchmark", action="store_true", default=False) + parser.add_argument("--accuracy_only", action="store_true", default=False) + + global args + global nbatches + global nbatches_test + args = parser.parse_args() + ext_dist.init_distributed(backend=args.dist_backend) + + + ### some basic setup ### + np.random.seed(args.numpy_rand_seed) + np.set_printoptions(precision=args.print_precision) + torch.set_printoptions(precision=args.print_precision) + torch.manual_seed(args.numpy_rand_seed) + + if args.test_mini_batch_size < 0: + # if the parameter is not set, use the training batch size + args.test_mini_batch_size = args.mini_batch_size + + device = torch.device("cpu") + print("Using CPU...") + + ### prepare training data ### + ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") + # input data + train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + + ln_emb = train_data.counts + # enforce maximum limit on number of vectors per embedding 
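+    # Cap each embedding table at max_ind_range rows; categorical indices are
+    # reduced modulo max_ind_range during preprocessing and lookup (see
+    # CriteoDataset.__getitem__), so no index can exceed the capped table size.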
+ if args.max_ind_range > 0: + ln_emb = np.array( + list( + map( + lambda x: x if x < args.max_ind_range else args.max_ind_range, + ln_emb, + ) + ) + ) + else: + ln_emb = np.array(ln_emb) + m_den = train_data.m_den + ln_bot[0] = m_den + + args.ln_emb = ln_emb.tolist() + + ### parse command line arguments ### + m_spa = args.arch_sparse_feature_size + ln_emb = np.asarray(ln_emb) + num_fea = ln_emb.size + 1 # num sparse + num dense features + + m_den_out = ln_bot[ln_bot.size - 1] + # approach 1: all + # num_int = num_fea * num_fea + m_den_out + # approach 2: unique + num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out + + arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top + ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") + + ### construct the neural network specified above ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. + # np.random.seed(args.numpy_rand_seed) + global dlrm + dlrm = DLRM_Net( + m_spa, + ln_emb, + ln_bot, + ln_top, + sigmoid_bot=-1, + sigmoid_top=ln_top.size - 2, + loss_threshold=args.loss_threshold, + ) + if args.ipex_merged_emb: + dlrm.emb_l = ipex.nn.modules.MergedEmbeddingBagWithSGD.from_embeddingbag_list(dlrm.emb_l, lr=args.learning_rate) + dlrm.need_linearize_indices_and_offsets = torch.BoolTensor([False]) + + if not args.inference_only: + optimizer = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate) + lr_scheduler = LRPolicyScheduler( + optimizer, + args.lr_num_warmup_steps, + args.lr_decay_start_step, + args.lr_num_decay_steps, + ) + + ### main loop ### + + # training or inference + best_acc_test = 0 + best_auc_test = 0 + skip_upto_epoch = 0 + skip_upto_batch = 0 + total_time = 0 + total_loss = 0 + total_iter = 0 + total_samp = 0 + + # Load model is specified + if not (args.load_model == ""): + print("Loading saved model {}".format(args.load_model)) + ld_model = torch.load(args.load_model, map_location=torch.device("cpu")) + dlrm.load_state_dict(ld_model["state_dict"]) + ld_j = ld_model["iter"] + ld_k = ld_model["epoch"] + ld_nepochs = ld_model["nepochs"] + ld_nbatches = ld_model["nbatches"] + ld_nbatches_test = ld_model["nbatches_test"] + ld_train_loss = ld_model["train_loss"] + ld_total_loss = ld_model["total_loss"] + ld_acc_test = ld_model["test_acc"] + if not args.inference_only: + optimizer.load_state_dict(ld_model["opt_state_dict"]) + best_acc_test = ld_acc_test + total_loss = ld_total_loss + skip_upto_epoch = ld_k # epochs + skip_upto_batch = ld_j # batches + else: + args.print_freq = ld_nbatches + args.test_freq = 0 + + print( + "Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format( + ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test + ) + ) + print( + "Training state: loss = {:.6f}".format( + ld_train_loss, + ) + ) + print("Testing state: accuracy = {:3.3f} %".format(ld_acc_test * 100)) + del(ld_model) + + ext_dist.barrier() + print("time/loss/accuracy (if enabled):") + + if args.tune: + # evaluation + def eval_func(model): + args.int8 = getattr(model, "is_quantized", False) + with torch.no_grad(): + return inference( + args, + model, + best_acc_test, + best_auc_test, + test_ld, + trace=args.int8 + ) + + # calibration + def calib_fn(model): + calib_number = 0 + for X_test, lS_o_test, lS_i_test, T in train_ld: + if calib_number < 100: + model(X_test, lS_o_test, lS_i_test) + calib_number += 1 + else: + break + + X_test, lS_o_test, lS_i_test, T = next(iter(train_ld)) + example_inputs = (X_test, lS_o_test, lS_i_test) + assert 
args.inference_only, "Please set inference_only in arguments" + from neural_compressor.torch.quantization import StaticQuantConfig, autotune, TuningConfig + tune_config = TuningConfig(config_set=StaticQuantConfig.get_config_set_for_tuning()) + + dlrm = autotune( + dlrm, + tune_config=tune_config, + eval_fn=eval_func, + run_fn=calib_fn, + example_inputs=example_inputs, + ) + dlrm.save(args.save_model) + exit(0) + if args.benchmark: + # To do + print('Not implemented yet') + exit(0) + + if args.accuracy_only: + with torch.no_grad(): + inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld + ) + exit(0) + + + if args.bf16 and not args.inference_only: + for j, inputBatch in enumerate(train_ld): + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + if ext_dist.my_size > 1: + local_bs = X.size()[0] // ext_dist.my_size + rank_id = dlrm.rank + X = X[rank_id * local_bs: (rank_id + 1) * local_bs] + T = T[rank_id * local_bs: (rank_id + 1) * local_bs] + global_bs = local_bs * ext_dist.my_size + lS_o = lS_o[:, :global_bs] + lS_i = lS_i[:, :global_bs] + + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + if ext_dist.my_size > 1: + batch_size = X.size()[0] + g_i = lS_i[dlrm.local_ln_emb] + g_o = lS_o[dlrm.local_ln_emb] + n_tables = g_i.shape[0] + idx = [g_i[i] for i in range(n_tables)] + offset = [g_o[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + else: + n_tables = lS_i.shape[0] + idx = [lS_i[i] for i in range(n_tables)] + offset = [lS_o[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + sample_input = (X, indices, offsets, indices_with_row_offsets) + else: + sample_input = (X, lS_o, lS_i) + break + dlrm, optimizer = ipex.optimize(dlrm, dtype=torch.bfloat16, optimizer=optimizer, inplace=True, sample_input=sample_input) + + if args.ipex_merged_emb: + dlrm.emb_l.to_bfloat16_train() + for i in range(len(dlrm.top_l)): + if isinstance(dlrm.top_l[i], ipex.nn.utils._weight_prepack._IPEXLinear): + if isinstance(dlrm.top_l[i+1], torch.nn.ReLU): + dlrm.top_l[i] = ipex.nn.modules.IPEXLinearEltwise(dlrm.top_l[i], 'relu') + else: + dlrm.top_l[i] = ipex.nn.modules.IPEXLinearEltwise(dlrm.top_l[i], 'sigmoid') + dlrm.top_l[i + 1] = torch.nn.Identity() + for i in range(len(dlrm.bot_l)): + if isinstance(dlrm.bot_l[i], ipex.nn.utils._weight_prepack._IPEXLinear): + if isinstance(dlrm.bot_l[i+1], torch.nn.ReLU): + dlrm.bot_l[i] = ipex.nn.modules.IPEXLinearEltwise(dlrm.bot_l[i], 'relu') + else: + dlrm.bot_l[i] = ipex.nn.modules.IPEXLinearEltwise(dlrm.bot_l[i], 'sigmoid') + dlrm.bot_l[i + 1] = torch.nn.Identity() + + if ext_dist.my_size > 1: + dlrm.bot_l = ext_dist.DDP(dlrm.bot_l) + dlrm.top_l = ext_dist.DDP(dlrm.top_l) + training_record = [0, 0] + def update_training_performance(time, iters, training_record=training_record): + if iters > args.num_warmup_iters: + training_record[0] += time + training_record[1] += 1 + + def print_training_performance( training_record=training_record): + if training_record[0] == 0: + print("num-batches larger than warm up iters, please increase num-batches or decrease warmup iters") + exit() + total_samples = training_record[1] * args.mini_batch_size + throughput = total_samples / 
training_record[0] * 1000 + print("throughput: {:.3f} fps".format(throughput)) + + test_freq = args.test_freq if args.test_freq != -1 else nbatches // 20 + with torch.autograd.profiler.profile( + enabled=args.enable_profiling, use_cuda=False, record_shapes=False + ) as prof: + if not args.inference_only: + k = 0 + while k < args.nepochs: + + if k < skip_upto_epoch: + continue + + for j, inputBatch in enumerate(train_ld): + + if j < skip_upto_batch: + continue + + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + if ext_dist.my_size > 1: + local_bs = X.size()[0] // ext_dist.my_size + rank_id = dlrm.rank + X = X[rank_id * local_bs: (rank_id + 1) * local_bs] + T = T[rank_id * local_bs: (rank_id + 1) * local_bs] + global_bs = local_bs * ext_dist.my_size + lS_o = lS_o[:, :global_bs] + lS_i = lS_i[:, :global_bs] + + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + if ext_dist.my_size > 1: + batch_size = X.size()[0] + g_i = lS_i[dlrm.local_ln_emb] + g_o = lS_o[dlrm.local_ln_emb] + n_tables = g_i.shape[0] + idx = [g_i[i] for i in range(n_tables)] + offset = [g_o[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + else: + n_tables = lS_i.shape[0] + idx = [lS_i[i] for i in range(n_tables)] + offset = [lS_o[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + + t1 = time_wrap() + + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + + mbs = T.shape[0] # = args.mini_batch_size except maybe for last + + # forward pass + with torch.cpu.amp.autocast(enabled=args.bf16): + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + Z = dlrm_wrap( + X, + indices, + offsets, + indices_with_row_offsets + ).float() + else: + Z = dlrm_wrap( + X, + lS_o, + lS_i, + ).float() + + # loss + E = loss_fn_wrap(Z, T) + + # compute loss and accuracy + L = E.detach().cpu().numpy() # numpy array + + with record_function("DLRM backward"): + # scaled error gradient propagation + # (where we do not accumulate gradients across mini-batches) + optimizer.zero_grad(set_to_none=True) + # backward pass + E.backward() + + with record_function("DLRM update"): + # optimizer + optimizer.step() + lr_scheduler.step() + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + dlrm.emb_l.sgd_args = dlrm.emb_l.sgd_args._replace(lr=lr_scheduler.get_last_lr()[0]) + + t2 = time_wrap() + total_time += t2 - t1 + + total_loss += L * mbs + total_iter += 1 + total_samp += mbs + + should_print = ((j + 1) % args.print_freq == 0) or ( + j + 1 == nbatches + ) + should_test = ( + (args.should_test) + and (((j + 1) % test_freq == 0) or (j + 1 == nbatches)) + ) + + # print time, loss and accuracy + if should_print or should_test: + gT = 1000.0 * total_time / total_iter if args.print_time else -1 + total_time = 0 + + train_loss = total_loss / total_samp + total_loss = 0 + + str_run_type = ( + "inference" if args.inference_only else "training" + ) + + wall_time = "" + if args.print_wall_time: + wall_time = " ({})".format(time.strftime("%H:%M")) + + print( + "Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format( + str_run_type, j + 1, nbatches, k, gT + ) + + " loss {:.6f}".format(train_loss) + + wall_time, + flush=True, + ) + update_training_performance(gT, 
j) + + total_iter = 0 + total_samp = 0 + + # testing + if should_test: + model_metrics_dict, is_best = inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + ) + + if ( + is_best + and not (args.save_model == "") + and not args.inference_only + ): + model_metrics_dict["epoch"] = k + model_metrics_dict["iter"] = j + 1 + model_metrics_dict["train_loss"] = train_loss + model_metrics_dict["total_loss"] = total_loss + model_metrics_dict[ + "opt_state_dict" + ] = optimizer.state_dict() + print("Saving model to {}".format(args.save_model)) + torch.save(model_metrics_dict, args.save_model) + + if ( + (args.mlperf_auc_threshold > 0) + and (best_auc_test > args.mlperf_auc_threshold) + ): + print( + "MLPerf testing auc threshold " + + str(args.mlperf_auc_threshold) + + " reached, stop training" + ) + k += 1 # nepochs + else: + print("Testing for inference only") + with torch.no_grad(): + inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld + ) + + # profiling + if not args.inference_only: + print_training_performance() + + if args.enable_profiling: + time_stamp = str(datetime.datetime.now()).replace(" ", "_") + with open("dlrm_s_pytorch" + time_stamp + "_shape.prof", "w") as prof_f: + prof_f.write( + prof.key_averages(group_by_input_shape=True).table( + sort_by="self_cpu_time_total" + ) + ) + with open("dlrm_s_pytorch" + time_stamp + "_total.prof", "w") as prof_f: + prof_f.write(prof.key_averages().table(sort_by="self_cpu_time_total")) + prof.export_chrome_trace("dlrm_s_pytorch" + time_stamp + ".json") + exit(0) + +if __name__ == "__main__": + run() diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/extend_distributed.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/extend_distributed.py new file mode 100644 index 00000000000..0b117975b25 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/extend_distributed.py @@ -0,0 +1,424 @@ +import os +import builtins +import numpy as np +import torch +from torch.autograd import Function +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist +try: + import torch_ccl +except ImportError as e: + #print(e) + torch_ccl = False + +my_rank = -1 +my_size = -1 +my_local_rank = -1 +my_local_size = -1 +alltoall_supported = False +allgatherv_supported = False +a2a_impl = os.environ.get('DLRM_ALLTOALL_IMPL', '') + +myreq = None + +def env2int(env_list, default = -1): + for e in env_list: + val = int(os.environ.get(e, -1)) + if val >= 0: return val + return default + +def get_my_slice(n): + my_size = dist.get_world_size() + my_rank = dist.get_rank() + k, m = divmod(n, my_size) + return slice(my_rank * k + min(my_rank, m), (my_rank+1) * k + min(my_rank+1, m), 1) + +def get_split_lengths(n): + my_size = dist.get_world_size() + k, m = divmod(n, my_size) + if m == 0: + splits = None + my_len = k + else: + my_rank = dist.get_rank() + splits = [(k+1) if i < m else k for i in range(my_size)] + my_len = splits[my_rank] + return (my_len, splits) + +def init_distributed(rank = -1, size = -1, backend=''): + global myreq + #global my_rank + global my_size + global my_local_rank + global my_local_size + global a2a_impl + global alltoall_supported + global allgatherv_supported + # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2) + num_mpi_ranks = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE']) + if backend == '' and num_mpi_ranks > 1: + if torch_ccl and env2int(['CCL_WORKER_COUNT']) > 0: + 
backend = 'ccl' + elif dist.is_mpi_available(): + backend = 'mpi' + else: + print("WARNING: MPI multi-process launch detected but PyTorch MPI backend not available.") + backend = 'gloo' + if backend != '': + #guess Rank and size + if rank == -1: + rank = env2int(['PMI_RANK', 'OMPI_COMM_WORLD_RANK', 'MV2_COMM_WORLD_RANK', 'RANK'], 0) + if size == -1: + size = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE'], 1) + if not os.environ.get('RANK', None) and rank != -1: os.environ['RANK'] = str(rank) + if not os.environ.get('WORLD_SIZE', None) and size != -1: os.environ['WORLD_SIZE'] = str(size) + if not os.environ.get('MASTER_PORT', None): os.environ['MASTER_PORT'] = '29500' + if not os.environ.get('MASTER_ADDR', None): + local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1) + if local_size != size and backend != 'mpi': + print("Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default") + print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR") + os.environ['MASTER_ADDR'] = '127.0.0.1' + if size > 1: + dist.init_process_group(backend, rank=rank, world_size=size) + my_rank = dist.get_rank() + my_size = dist.get_world_size() + my_local_rank = env2int(['MPI_LOCALRANKID', 'OMPI_COMM_WORLD_LOCAL_RANK', 'MV2_COMM_WORLD_LOCAL_RANK'], 0) + my_local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1) + if my_rank == 0: print("Running on %d ranks using %s backend" % (my_size, backend)) + if backend == 'ccl': + print("Using CCL_ATL_TRANSPORT=%s" % os.environ.get('CCL_ATL_TRANSPORT', '(default)')) + print("Using CCL_ATL_SHM=%s" % os.environ.get('CCL_ATL_SHM', '(default)')) + if hasattr(dist, 'all_to_all_single'): + try: + # dist.all_to_all_single(torch.empty([0]), torch.empty([0])) + alltoall_supported = True + except RuntimeError: + pass + if a2a_impl == 'alltoall' and alltoall_supported == False: + print("Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall" % (a2a_impl, backend)) + a2a_impl = 'scatter' + if a2a_impl != '': print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl) + try: + x = torch.ones([my_rank]) + y = torch.zeros([(my_size*(my_size-1))//2]) + y = list(y.split([r for r in range(my_size)])) + dist.all_gather(y, x) + allgatherv_supported = True + except RuntimeError: + pass + else: + my_rank = 0 + my_size = 1 + my_local_rank = 0 + my_local_size = 1 + myreq = Request() + +class Request(object): + def __init__(self): + self.req = None + self.tensor = None + self.WaitFunction = All2All_Scatter_Wait + + def wait(self): + ret = self.WaitFunction.apply(*self.tensor) + self.req = None + self.tensor = None + return ret + +class All2All_ScatterList_Req(Function): + @staticmethod + def forward(ctx, a2ai, *inputs): + global myreq + my_rank = dist.get_rank() + #print("All2All_ScatterList_Req:forward") + mb_split_lengths = a2ai.gNS if a2ai.gNS else a2ai.lN + emb_split_lengths = a2ai.gSS if a2ai.gSS else [a2ai.lS] * my_size + gather_list = [] + req_list = [] + for i in range(my_size): + for j in range(emb_split_lengths[i]): + out_tensor = inputs[0].new_empty([a2ai.lN, a2ai.E]) + scatter_list = list(inputs[j].split(mb_split_lengths, dim = 0)) if i == my_rank else [] + req = dist.scatter(out_tensor, scatter_list, src=i, async_op=True) + gather_list.append(out_tensor) + req_list.append(req) + myreq.req = req_list + myreq.tensor = tuple(gather_list) + myreq.a2ai = a2ai + return 
myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + #print("All2All_ScatterList_Req:backward") + for r in myreq.req: + r.wait() + myreq.req = None + grad_inputs = myreq.tensor + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_ScatterList_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + #print("All2All_Scatter_Wait:forward") + ctx.a2ai = myreq.a2ai + for r in myreq.req: + r.wait() + myreq.req = None + myreq.tensor = None + return output + + @staticmethod + def backward(ctx, *grad_output): + global myreq + my_rank = dist.get_rank() + a2ai = ctx.a2ai + grad_output = [t.contiguous() for t in grad_output] + mb_split_lengths = a2ai.gNS if a2ai.gNS else [a2ai.lN] * my_size + per_rank_split_lengths = a2ai.gSS if a2ai.gSS else [a2ai.lS] * my_size + grad_inputs = [grad_output[0].new_empty([ctx.a2ai.N, ctx.a2ai.E]) for _ in range(a2ai.lS)] + req_list = [] + ind = 0 + for i in range(my_size): + for j in range(per_rank_split_lengths[i]): + gather_list = list(grad_inputs[j].split(mb_split_lengths, dim = 0)) if i == my_rank else None + req = dist.gather(grad_output[ind], gather_list, dst = i, async_op=True) + req_list.append(req) + ind += 1 + myreq.req = req_list + myreq.tensor = grad_inputs + return tuple(grad_output) + + + +class All2All_Scatter_Req(Function): + @staticmethod + def forward(ctx, a2ai, *inputs): + global myreq + #print("All2All_Scatter_Req:forward") + my_rank = dist.get_rank() + mb_split_lengths = a2ai.gNS if a2ai.gNS else a2ai.lN + emb_split_lengths = a2ai.gSS if a2ai.gSS else [a2ai.lS] * my_size + input = torch.cat(inputs, dim=1) + scatter_list = list(input.split(mb_split_lengths, dim=0)) + gather_list = [] + req_list = [] + for i in range(my_size): + out_tensor = input.new_empty([a2ai.lN, emb_split_lengths[i] * a2ai.E]) + req = dist.scatter(out_tensor, scatter_list if i == my_rank else [], src=i, async_op=True) + gather_list.append(out_tensor) + req_list.append(req) + myreq.req = req_list + myreq.tensor = tuple(gather_list) + myreq.a2ai = a2ai + ctx.a2ai = a2ai + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + #print("All2All_Scatter_Req:backward") + for r in myreq.req: + r.wait() + myreq.req = None + grad_input = myreq.tensor + grad_inputs = grad_input.split(ctx.a2ai.E, dim=1) + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_Scatter_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + #print("All2All_Scatter_Wait:forward") + ctx.a2ai = myreq.a2ai + for r in myreq.req: + r.wait() + myreq.req = None + myreq.tensor = None + return output + + @staticmethod + def backward(ctx, *grad_output): + global myreq + my_rank = dist.get_rank() + #print("All2All_Scatter_Wait:backward") + assert len(grad_output) == my_size + scatter_list = [t.contiguous() for t in grad_output] + a2ai = ctx.a2ai + mb_split_lengths = a2ai.gNS if a2ai.gNS else a2ai.lN + emb_split_lengths = a2ai.gSS if a2ai.gSS else [a2ai.lS] * my_size + grad_input = grad_output[0].new_empty([a2ai.N, a2ai.E*a2ai.lS]) + gather_list = list(grad_input.split(mb_split_lengths, dim=0)) + req_list = [] + for i in range(my_size): + #req = dist.scatter(gather_list[i], scatter_list if i == my_rank else [], src=i, async_op=True) + req = dist.gather(scatter_list[i], gather_list if i == my_rank else [], dst=i, async_op=True) + req_list.append(req) + myreq.req = req_list + myreq.tensor = grad_input + return grad_output + + +class All2All_Req(Function): + @staticmethod + def 
forward(ctx, a2ai, *inputs): + global myreq + #print("All2All_Req:forward") + mb_split_lengths = a2ai.gNS + if mb_split_lengths: mb_split_lengths = [m * a2ai.lS * a2ai.E for m in mb_split_lengths] + emb_split_lengths = a2ai.gSS + if emb_split_lengths: emb_split_lengths = [a2ai.lN * e * a2ai.E for e in emb_split_lengths] + input = torch.cat(inputs, dim=1).view([-1]) + output = input.new_empty([a2ai.S*a2ai.lN*a2ai.E]) + req = dist.all_to_all_single(output, input, emb_split_lengths, mb_split_lengths, async_op=True) + myreq.req = req + myreq.tensor = [] + myreq.tensor.append(output) + myreq.tensor = tuple(myreq.tensor) + a2ai.mb_split_lengths = mb_split_lengths + a2ai.emb_split_lengths = emb_split_lengths + myreq.a2ai = a2ai + ctx.a2ai = a2ai + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + #print("All2All_Req:backward") + a2ai = ctx.a2ai + myreq.req.wait() + myreq.req = None + grad_input = myreq.tensor + grad_inputs = grad_input.view([a2ai.N, -1]).split(a2ai.E, dim=1) + grad_inputs = [gin.contiguous() for gin in grad_inputs] + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + #print("All2All_Wait:forward") + a2ai = myreq.a2ai + ctx.a2ai = a2ai + myreq.req.wait() + myreq.req = None + myreq.tensor = None + emb_split_lengths = a2ai.emb_split_lengths if a2ai.emb_split_lengths else a2ai.lS * a2ai.lN * a2ai.E + outputs = output[0].split(emb_split_lengths) + outputs = tuple([out.view([a2ai.lN, -1]) for out in outputs]) + return outputs + + @staticmethod + def backward(ctx, *grad_outputs): + global myreq + #print("All2All_Wait:backward") + a2ai = ctx.a2ai + grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs] + grad_output = torch.cat(grad_outputs) + grad_input = grad_output.new_empty([a2ai.N * a2ai.lS * a2ai.E]) + req = dist.all_to_all_single(grad_input, grad_output, a2ai.mb_split_lengths, a2ai.emb_split_lengths, async_op=True) + myreq.req = req + myreq.tensor = grad_input + return (grad_output,) + +class AllGather(Function): + + @staticmethod + def forward(ctx, input, global_lengths, dim=0): + if not isinstance(global_lengths, (list, tuple)): + global_lengths = [global_lengths] * my_size + my_rank = dist.get_rank() + assert(len(global_lengths) == my_size) + assert(global_lengths[my_rank] == input.size(dim)) + local_start = sum(global_lengths[:my_rank]) + + output_size = list(input.size()) + + ctx.dim = dim + ctx.local_start = local_start + ctx.local_length = global_lengths[my_rank] + + input = input.contiguous() + if dim == 0: + out_len = sum(global_lengths) + output_size[dim] = out_len + output = input.new_empty(output_size) + gather_list = list(output.split(global_lengths, dim=0)) + else: + gather_list = [torch.empty_like(input) for _ in range(my_size)] + gather_list = [] + for l in global_lengths: + output_size[dim] = l + gather_list.append(input.new_empty(output_size)) + + dist.all_gather(gather_list, input) + + if dim != 0: + output = torch.cat(gather_list, dim=dim) + + return output + + @staticmethod + def backward(ctx, grad_output): + # print("Inside All2AllBackward") + dim = ctx.dim + start = ctx.local_start + length = ctx.local_length + + grad_input = grad_output.narrow(dim, start, length) + + return (grad_input, None, None) + +class All2AllInfo(object): + pass + +def alltoall(inputs, per_rank_split_lengths): + global myreq + N, E = inputs[0].size() + a2ai = All2AllInfo() + a2ai.lS = len(inputs) + a2ai.gSS = per_rank_split_lengths + 
a2ai.lN, a2ai.gNS = get_split_lengths(N) + a2ai.E = E + a2ai.N = N + a2ai.S = sum(per_rank_split_lengths) if per_rank_split_lengths else a2ai.lS * my_size + if a2a_impl == '' and alltoall_supported or a2a_impl == 'alltoall': + output = All2All_Req.apply(a2ai, *inputs) + myreq.WaitFunction = All2All_Wait + elif a2a_impl == '' or a2a_impl == 'scatter': + #print("Using All2All_Scatter_Req") + output = All2All_Scatter_Req.apply(a2ai, *inputs) + myreq.WaitFunction = All2All_Scatter_Wait + elif a2a_impl == 'scatter_list': + #print("Using All2All_ScatterList_Req") + output = All2All_ScatterList_Req.apply(a2ai, *inputs) + myreq.WaitFunction = All2All_ScatterList_Wait + else: + print("Unknown value set for DLRM_ALLTOALL_IMPL (%s), please use one of [alltoall, scatter, scatter_list]" % a2a_impl) + return myreq + +def shuffle_data(inputs): + input = torch.cat(inputs) + output = input.new_empty(input.size()) + req = dist.all_to_all_single(output, input) + output = output.reshape(my_size, -1) + return output + + +def all_gather(input, lengths, dim=0): + #print("lengths: ", lengths) + if not lengths: lengths = [input.size(0)] * my_size + return AllGather.apply(input, lengths, dim) + +def barrier(): + if my_size > 1: + dist.barrier() + diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/requirements.txt new file mode 100644 index 00000000000..859bbfc346b --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/requirements.txt @@ -0,0 +1,8 @@ +future +numpy +pydot +neural-compressor +scikit-learn +tqdm +torch>=1.11.0 +intel_extension_for_pytorch>=1.11.0 \ No newline at end of file diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh new file mode 100755 index 00000000000..3089868c3a0 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh @@ -0,0 +1,98 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + tuned_checkpoint=saved_results + batch_size=16384 + iters=100 + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + MODEL_SCRIPT=dlrm_s_pytorch.py + + # Create the output directory in case it doesn't already exist + mkdir -p ${tuned_checkpoint}/dlrm_inference_accuracy_log + + LOG=${tuned_checkpoint}/dlrm_inference_accuracy_log + + CORES=`lscpu | grep Core | awk '{print $4}'` + + ARGS="" + if [[ ${int8} == "true" ]]; then + echo "running int8 path" + ARGS="$ARGS --int8" + else + echo "running fp32 path" + fi + + if [[ ${mode} == "accuracy" ]]; then + python -u $MODEL_SCRIPT \ + --raw-data-file=${dataset_location}/day --processed-data-file=${dataset_location}/terabyte_processed.npz \ + --data-set=terabyte \ + --memory-map --mlperf-bin-loader --round-targets=True 
--learning-rate=1.0 \ + --arch-mlp-bot=13-512-256-128 --arch-mlp-top=1024-1024-512-256-1 \ + --arch-sparse-feature-size=128 --max-ind-range=40000000 \ + --numpy-rand-seed=727 --inference-only --ipex-interaction \ + --print-freq=100 --print-time --mini-batch-size=2048 --test-mini-batch-size=16384 \ + --save-model ${tuned_checkpoint} --test-freq=2048 --print-auc $ARGS \ + --load-model=${input_model} --accuracy_only + elif [[ ${mode} == "performance" ]]; then + python -u $MODEL_SCRIPT \ + --raw-data-file=${dataset_location}/day --processed-data-file=${dataset_location}/terabyte_processed.npz \ + --data-set=terabyte --benchmark \ + --memory-map --mlperf-bin-loader --round-targets=True --learning-rate=1.0 \ + --arch-mlp-bot=13-512-256-128 --arch-mlp-top=1024-1024-512-256-1 \ + --arch-sparse-feature-size=128 --max-ind-range=40000000 --ipex-interaction \ + --numpy-rand-seed=727 --inference-only --num-batches=1000 \ + --print-freq=10 --print-time --mini-batch-size=128 --test-mini-batch-size=${batch_size} \ + --save-model ${tuned_checkpoint} + else + echo "Error: No such mode: ${mode}" + exit 1 + fi +} + +main "$@" diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_quant.sh b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_quant.sh new file mode 100755 index 00000000000..58d8b1fe491 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_quant.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + tuned_checkpoint=saved_results + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +CORES=`lscpu | grep Core | awk '{print $4}'` +# use first socket +numa_cmd="numactl -C 0-$((CORES-1)) " +echo "will run on core 0-$((CORES-1)) on socket 0" + +export OMP_NUM_THREADS=$CORES + +# run_tuning +function run_tuning { + MODEL_SCRIPT=dlrm_s_pytorch.py + + # Create the output directory in case it doesn't already exist + mkdir -p ${tuned_checkpoint}/dlrm_inference_accuracy_log + + LOG=${tuned_checkpoint}/dlrm_inference_accuracy_log + CORES=`lscpu | grep Core | awk '{print $4}'` + ARGS="" + + $numa_cmd python -u $MODEL_SCRIPT \ + --raw-data-file=${dataset_location}/day --processed-data-file=${dataset_location}/terabyte_processed.npz \ + --data-set=terabyte \ + --memory-map --mlperf-bin-loader --round-targets=True --learning-rate=1.0 \ + --arch-mlp-bot=13-512-256-128 --arch-mlp-top=1024-1024-512-256-1 \ + --arch-sparse-feature-size=128 --max-ind-range=40000000 \ + --numpy-rand-seed=727 --inference-only --ipex-interaction \ + --print-freq=100 --print-time --mini-batch-size=2048 --test-mini-batch-size=16384 \ + --test-freq=2048 --print-auc --tune --save-model=${tuned_checkpoint} $ARGS \ + --load-model=${input_model} --num-cpu-cores=${CORES} | tee $LOG +} + +main "$@" diff --git a/neural_compressor/torch/quantization/autotune.py b/neural_compressor/torch/quantization/autotune.py index 279b6be4633..e54a9d97748 100644 --- a/neural_compressor/torch/quantization/autotune.py +++ b/neural_compressor/torch/quantization/autotune.py @@ -93,17 +93,19 @@ def autotune( tuning_logger.trial_end(trial_index) if tuning_monitor.need_stop(): 
logger.info("Stopped tuning.") - del q_model # maybe gc.collect() is needed for memory release - best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() - # !!! Make sure to use deepcopy only when inplace is set to `True`. - q_model = quantize( - deepcopy(model), - quant_config=best_quant_config, - run_fn=run_fn, - run_args=run_args, - inplace=True, - example_inputs=example_inputs, - ) + if trial_index == 0: # recover the best q_model from previous results. + logger.info("Reconvering the best quantized model...") + del q_model # maybe gc.collect() is needed for memory release + best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() + # !!! Make sure to use deepcopy only when inplace is set to `True`. + q_model = quantize( + deepcopy(model), + quant_config=best_quant_config, + run_fn=run_fn, + run_args=run_args, + inplace=True, + example_inputs=example_inputs, + ) best_quant_model = q_model # quantize model inplace break tuning_logger.tuning_end() diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 57197a91972..ff8298dad88 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -97,6 +97,7 @@ def quantize( example_inputs=example_inputs, mode=Mode.QUANTIZE, ) + setattr(q_model, "is_quantized", True) return q_model @@ -152,7 +153,7 @@ def prepare( example_inputs=example_inputs, mode=Mode.PREPARE, ) - setattr(prepared_model, "prepared", True) + setattr(prepared_model, "is_prepared", True) setattr(prepared_model, "quant_config", quant_config) setattr(prepared_model, "example_inputs", example_inputs) return prepared_model @@ -177,12 +178,12 @@ def convert( q_model = model if inplace else copy.deepcopy(model) # TODO: Optimize the check for prepared flag after adding HQT FP8 Quant - assert getattr(model, "prepared", False), "Please run prepare function before convert." + assert getattr(model, "is_prepared", False), "Please run prepare function before convert." - if getattr(model, "prepared", False): + if getattr(model, "is_prepared", False): if quant_config is None: quant_config = model.quant_config - example_inputs = model.example_inputs if getattr(model, "prepared", False) else None + example_inputs = model.example_inputs if getattr(model, "is_prepared", False) else None registered_configs = config_registry.get_cls_configs() if isinstance(quant_config, dict): @@ -215,4 +216,5 @@ def convert( example_inputs=example_inputs, mode=Mode.CONVERT, ) + setattr(q_model, "is_quantized", True) return q_model