diff --git a/docs/3x/PT_MXQuant.md b/docs/3x/PT_MXQuant.md index 1cfb17ff30b..42e12d039a6 100644 --- a/docs/3x/PT_MXQuant.md +++ b/docs/3x/PT_MXQuant.md @@ -95,7 +95,7 @@ user_model = convert(model=user_model) ## Examples -- PyTorch [huggingface models](/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx) +- PyTorch [huggingface models](/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant) ## Reference diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 8520a9545b0..3a21f42bd20 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -1,46 +1,53 @@ -{ - "pytorch": { - "gpt_j_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "gpt_j_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "opt_125m_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - } - } -} \ No newline at end of file +{ + "pytorch": { + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "dlrm_ipex": { + "model_src_dir": "recommendation/dlrm/static_quant/ipex", + 
"dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", + "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", + "main_script": "dlrm_s_pytorch.py", + "batch_size": 16384 + } + } +} diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/README.md similarity index 97% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/README.md rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/README.md index 6608cbcf726..e61d5a64ade 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/README.md @@ -1,6 +1,7 @@ # Run ## Run WOQ MX FP4 model + ``` python python run_clm_no_trainer.py --model [model_name_or_id] --quantize --accuracy --tasks lambada_openai --w_dtype fp4 --woq -``` \ No newline at end of file +``` diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt similarity index 100% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/requirements.txt rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py similarity index 100% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx/run_clm_no_trainer.py rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CODE_OF_CONDUCT.md b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CODE_OF_CONDUCT.md new file mode 100644 index 00000000000..0f7ad8bfc17 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Code of Conduct + +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. +Please read the [full text](https://code.fb.com/codeofconduct/) +so that you can understand what actions will and will not be tolerated. diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CONTRIBUTING.md b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CONTRIBUTING.md new file mode 100644 index 00000000000..cc013a17ec8 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/CONTRIBUTING.md @@ -0,0 +1,36 @@ +# Contributing to DLRM +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). 
+ +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## Coding Style +* 4 spaces for indentation rather than tabs +* 80 character line length +* in general, please maintain a consistent style with the rest of the code + +## License +By contributing to DLRM, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/LICENSE b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/LICENSE new file mode 100644 index 00000000000..b96dcb0480a --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Facebook, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md new file mode 100644 index 00000000000..918cc1edc23 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md @@ -0,0 +1,90 @@ +Step-by-Step +============ + +This document is used to list steps of reproducing PyTorch DLRM tuning zoo result. and original DLRM README is in [DLRM README](https://github.com/facebookresearch/dlrm/blob/master/README.md) + +> **Note** +> +> Please ensure your PC have >370G memory to run DLRM +> IPEX version >= 1.11 + +# Prerequisite + +### 1. Environment + +PyTorch 1.11 or higher version is needed with pytorch_fx backend. + + ```shell + # Install dependency + cd examples/pytorch/recommendation/dlrm/quantization/ptq/ipex + pip install -r requirements.txt + ``` +> Note: Validated PyTorch [Version](/docs/source/installation_guide.md#validated-software-environment). + +### 2. Prepare Dataset + + The code supports interface with the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) + + 1. 
download the raw data files day_0.gz, ...,day_23.gz and unzip them. + 2. Specify the location of the unzipped text files day_0, ...,day_23, using --raw-data-file= (the day number will be appended automatically), please refer "Run" command. + +### 3. Prepare pretrained model + + Download the DLRM PyTorch weights (`tb00_40M.pt`, 90GB) from the +[MLPerf repo](https://github.com/mlcommons/inference/tree/master/recommendation/dlrm/pytorch#more-information-about-the-model-weights) + +# Run +### tune with INC + ```shell + cd examples/pytorch/recommendation/dlrm/quantization/ptq/ipex + bash run_quant.sh --input_model="/path/of/pretrained/model" --dataset_location="/path/of/dataset" + ``` + +### benchmark +```shell +bash run_benchmark.sh --input_model="/path/of/pretrained/model" --dataset_location="/path/of/dataset" --mode=accuracy --int8=true +``` + + +Examples of enabling IntelĀ® Neural Compressor +========================= + +This is a tutorial of how to enable DLRM model with IntelĀ® Neural Compressor. + + +### Code update + +We need update dlrm_s_pytorch.py like below + +```python +# evaluation +def eval_func(model): + args.int8 = model.is_quantized + with torch.no_grad(): + return inference( + args, + model, + best_acc_test, + best_auc_test, + test_ld, + trace=args.int8 + ) + +# calibration +def calib_fn(model): + calib_number = 0 + for X_test, lS_o_test, lS_i_test, T in train_ld: + if calib_number < 102400: + model(X_test, lS_o_test, lS_i_test) + calib_number += 1 + +from neural_compressor.torch.quantization import SmoothQuantConfig, autotune, TuningConfig +tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning()) +dlrm = autotune( + dlrm, + tune_config=tune_config, + eval_fn=eval_func, + run_fn=calib_fn, +) +dlrm.save("saved_results") +``` diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_loader_terabyte.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_loader_terabyte.py new file mode 100644 index 00000000000..5bc0c4d3aab --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_loader_terabyte.py @@ -0,0 +1,388 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
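+# Module overview (descriptive comment added for readability): this file
+# provides the Criteo Terabyte readers used by the DLRM example -- a streaming
+# `DataLoader` over the per-day "*_reordered.npz" files, a binary-file
+# `CriteoBinDataset`, and `numpy_to_binary`, which packs the per-day .npz
+# files into the single binary file that `CriteoBinDataset` consumes.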
+ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import numpy as np +from torch.utils.data import Dataset +import torch +import time +import math +from tqdm import tqdm +import argparse +import extend_distributed as ext_dist + + +class DataLoader: + """ + DataLoader dedicated for the Criteo Terabyte Click Logs dataset + """ + + def __init__( + self, + data_filename, + data_directory, + days, + batch_size, + max_ind_range=-1, + split="train", + drop_last_batch=False + ): + self.data_filename = data_filename + self.data_directory = data_directory + self.days = days + self.batch_size = batch_size + self.max_ind_range = max_ind_range + + total_file = os.path.join( + data_directory, + data_filename + "_day_count.npz" + ) + with np.load(total_file) as data: + total_per_file = data["total_per_file"][np.array(days)] + + self.length = sum(total_per_file) + if split == "test" or split == "val": + self.length = int(np.ceil(self.length / 2.)) + self.split = split + self.drop_last_batch = drop_last_batch + + def __iter__(self): + return iter( + _batch_generator( + self.data_filename, self.data_directory, self.days, + self.batch_size, self.split, self.drop_last_batch, self.max_ind_range + ) + ) + + def __len__(self): + if self.drop_last_batch: + return self.length // self.batch_size + else: + return math.ceil(self.length / self.batch_size) + + +def _transform_features( + x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False +): + if max_ind_range > 0: + x_cat_batch = x_cat_batch % max_ind_range + + if flag_input_torch_tensor: + x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1) + x_cat_batch = x_cat_batch.clone().detach().type(torch.long) + y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1) + else: + x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1) + x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long) + y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1) + + batch_size = x_cat_batch.shape[0] + feature_count = x_cat_batch.shape[1] + lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1) + + return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1) + + +def _batch_generator( + data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range +): + previous_file = None + for day in days: + filepath = os.path.join( + data_directory, + data_filename + "_{}_reordered.npz".format(day) + ) + + # print('Loading file: ', filepath) + with np.load(filepath) as data: + x_int = data["X_int"] + x_cat = data["X_cat"] + y = data["y"] + + samples_in_file = y.shape[0] + batch_start_idx = 0 + if split == "test" or split == "val": + length = int(np.ceil(samples_in_file / 2.)) + if split == "test": + samples_in_file = length + elif split == "val": + batch_start_idx = samples_in_file - length + + while batch_start_idx < samples_in_file - batch_size: + + missing_samples = batch_size + if previous_file is not None: + missing_samples -= previous_file['y'].shape[0] + + current_slice = slice(batch_start_idx, batch_start_idx + missing_samples) + + x_int_batch = x_int[current_slice] + x_cat_batch = x_cat[current_slice] + y_batch = y[current_slice] + + if previous_file is not None: + x_int_batch = np.concatenate( + [previous_file['x_int'], x_int_batch], + axis=0 + ) + x_cat_batch = np.concatenate( + [previous_file['x_cat'], x_cat_batch], + axis=0 + ) + y_batch = np.concatenate([previous_file['y'], y_batch], axis=0) + previous_file = 
None + + if x_int_batch.shape[0] != batch_size: + raise ValueError('should not happen') + + yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range) + + batch_start_idx += missing_samples + if batch_start_idx != samples_in_file: + current_slice = slice(batch_start_idx, samples_in_file) + if previous_file is not None: + previous_file = { + 'x_int' : np.concatenate( + [previous_file['x_int'], x_int[current_slice]], + axis=0 + ), + 'x_cat' : np.concatenate( + [previous_file['x_cat'], x_cat[current_slice]], + axis=0 + ), + 'y' : np.concatenate([previous_file['y'], y[current_slice]], axis=0) + } + else: + previous_file = { + 'x_int' : x_int[current_slice], + 'x_cat' : x_cat[current_slice], + 'y' : y[current_slice] + } + + if not drop_last: + yield _transform_features( + previous_file['x_int'], + previous_file['x_cat'], + previous_file['y'], + max_ind_range + ) + + +def _test(): + generator = _batch_generator( + data_filename='day', + data_directory='./input', + days=range(23), + split="train", + batch_size=2048, + drop_last=True, + max_ind_range=-1 + ) + t1 = time.time() + for x_int, lS_o, x_cat, y in generator: + t2 = time.time() + time_diff = t2 - t1 + t1 = t2 + print( + "time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format( + time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape + ) + ) + + +class CriteoBinDataset(Dataset): + """Binary version of criteo dataset.""" + + def __init__(self, data_file, counts_file, + batch_size=1, max_ind_range=-1, bytes_per_feature=4): + # dataset + self.tar_fea = 1 # single target + self.den_fea = 13 # 13 dense features + self.spa_fea = 26 # 26 sparse features + self.tad_fea = self.tar_fea + self.den_fea + self.tot_fea = self.tad_fea + self.spa_fea + + self.batch_size = batch_size + self.max_ind_range = max_ind_range + self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size) + + self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry) + + data_file_size = os.path.getsize(data_file) + bytes_per_sample = bytes_per_feature * self.tot_fea + if ext_dist.my_size > 1: + self.bytes_per_rank = self.bytes_per_entry // ext_dist.my_size + else: + self.bytes_per_rank = self.bytes_per_entry + + if ext_dist.my_size > 1 and self.num_entries * self.bytes_per_entry > data_file_size: + last_batch = (data_file_size % self.bytes_per_entry) // bytes_per_sample + self.bytes_last_batch = last_batch // ext_dist.my_size * bytes_per_sample + else: + self.bytes_last_batch = self.bytes_per_rank + + if self.bytes_last_batch == 0: + self.num_entries = self.num_entries - 1 + self.bytes_last_batch = self.bytes_per_rank + + print('data file:', data_file, 'number of batches:', self.num_entries) + self.file = open(data_file, 'rb') + + with np.load(counts_file) as data: + self.counts = data["counts"] + + # hardcoded for now + self.m_den = 13 + + def __len__(self): + return self.num_entries + + def __getitem__(self, idx): + my_rank = ext_dist.dist.get_rank() if ext_dist.my_size > 1 else 0 + rank_size = self.bytes_last_batch if idx == (self.num_entries - 1) else self.bytes_per_rank + self.file.seek(idx * self.bytes_per_entry + rank_size * my_rank, 0) + raw_data = self.file.read(rank_size) + array = np.frombuffer(raw_data, dtype=np.int32) + tensor = torch.from_numpy(array).view((-1, self.tot_fea)) + + return _transform_features(x_int_batch=tensor[:, 1:14], + x_cat_batch=tensor[:, 14:], + y_batch=tensor[:, 0], + max_ind_range=self.max_ind_range, + flag_input_torch_tensor=True) + + def __del__(self): + self.file.close() + 
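+# Usage sketch for the binary reader above (illustrative only; the file paths,
+# batch size, and helper name below are placeholder assumptions, not part of
+# the DLRM benchmark interface). Because `CriteoBinDataset.__getitem__` already
+# returns a full batch, it is wrapped in a DataLoader with batch_size=None,
+# mirroring the `_test_bin` routine further down.
+def _example_binary_loader(data_file, counts_file, batch_size=2048):
+    dataset = CriteoBinDataset(data_file=data_file,
+                               counts_file=counts_file,
+                               batch_size=batch_size)
+    loader = torch.utils.data.DataLoader(dataset,
+                                         batch_size=None,  # batches are pre-formed
+                                         shuffle=False,
+                                         num_workers=0)
+    for x_int, lS_o, x_cat, y in loader:
+        # dense features, sparse offsets, sparse indices, click labels
+        return x_int, lS_o, x_cat, y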
+ +def numpy_to_binary(input_files, output_file_path, split='train'): + """Convert the data to a binary format to be read with CriteoBinDataset.""" + + # WARNING - both categorical and numerical data must fit into int32 for + # the following code to work correctly + + with open(output_file_path, 'wb') as output_file: + if split == 'train': + for input_file in input_files: + print('Processing file: ', input_file) + + np_data = np.load(input_file) + np_data = np.concatenate([np_data['y'].reshape(-1, 1), + np_data['X_int'], + np_data['X_cat']], axis=1) + np_data = np_data.astype(np.int32) + + output_file.write(np_data.tobytes()) + else: + assert len(input_files) == 1 + np_data = np.load(input_files[0]) + np_data = np.concatenate([np_data['y'].reshape(-1, 1), + np_data['X_int'], + np_data['X_cat']], axis=1) + np_data = np_data.astype(np.int32) + + samples_in_file = np_data.shape[0] + midpoint = int(np.ceil(samples_in_file / 2.)) + if split == "test": + begin = 0 + end = midpoint + elif split == "val": + begin = midpoint + end = samples_in_file + else: + raise ValueError('Unknown split value: ', split) + + output_file.write(np_data[begin:end].tobytes()) + + +def _preprocess(args): + train_files = ['{}_{}_reordered.npz'.format(args.input_data_prefix, day) for + day in range(0, 23)] + + test_valid_file = args.input_data_prefix + '_23_reordered.npz' + + os.makedirs(args.output_directory, exist_ok=True) + for split in ['train', 'val', 'test']: + print('Running preprocessing for split =', split) + + output_file = os.path.join(args.output_directory, + '{}_data.bin'.format(split)) + + input_files = train_files if split == 'train' else [test_valid_file] + numpy_to_binary(input_files=input_files, + output_file_path=output_file, + split=split) + + +def _test_bin(): + parser = argparse.ArgumentParser() + parser.add_argument('--output_directory', required=True) + parser.add_argument('--input_data_prefix', required=True) + parser.add_argument('--split', choices=['train', 'test', 'val'], + required=True) + args = parser.parse_args() + + _preprocess(args) + + binary_data_file = os.path.join(args.output_directory, + '{}_data.bin'.format(args.split)) + + counts_file = os.path.join(args.output_directory, 'day_fea_count.npz') + dataset_binary = CriteoBinDataset(data_file=binary_data_file, + counts_file=counts_file, + batch_size=2048,) + from dlrm_data_pytorch import CriteoDataset + from dlrm_data_pytorch import collate_wrapper_criteo_offset as collate_wrapper_criteo + + binary_loader = torch.utils.data.DataLoader( + dataset_binary, + batch_size=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + ) + + original_dataset = CriteoDataset( + dataset='terabyte', + max_ind_range=10 * 1000 * 1000, + sub_sample_rate=1, + randomize=True, + split=args.split, + raw_path=args.input_data_prefix, + pro_data='dummy_string', + memory_map=True + ) + + original_loader = torch.utils.data.DataLoader( + original_dataset, + batch_size=2048, + shuffle=False, + num_workers=0, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, + ) + + assert len(dataset_binary) == len(original_loader) + for i, (old_batch, new_batch) in tqdm(enumerate(zip(original_loader, + binary_loader)), + total=len(dataset_binary)): + + for j in range(len(new_batch)): + if not np.array_equal(old_batch[j], new_batch[j]): + raise ValueError('FAILED: Datasets not equal') + if i > len(dataset_binary): + break + print('PASSED') + + +if __name__ == '__main__': + _test() + _test_bin() diff --git 
a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_utils.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_utils.py new file mode 100644 index 00000000000..6ceef9517df --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/data_utils.py @@ -0,0 +1,1292 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: generate inputs and targets for the DLRM benchmark +# +# Utility function(s) to download and pre-process public data sets +# - Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# - Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs +# +# After downloading dataset, run: +# getCriteoAdData( +# datafile="", +# o_filename=kaggleAdDisplayChallenge_processed.npz, +# max_ind_range=-1, +# sub_sample_rate=0.0, +# days=7, +# data_split='train', +# randomize='total', +# criteo_kaggle=True, +# memory_map=False +# ) +# getCriteoAdData( +# datafile="", +# o_filename=terabyte_processed.npz, +# max_ind_range=-1, +# sub_sample_rate=0.0, +# days=24, +# data_split='train', +# randomize='total', +# criteo_kaggle=False, +# memory_map=False +# ) + +from __future__ import absolute_import, division, print_function, unicode_literals + +import sys +# import os +from os import path +from multiprocessing import Process, Manager +# import io +# from io import StringIO +# import collections as coll + +import numpy as np + + +def convertUStringToDistinctIntsDict(mat, convertDicts, counts): + # Converts matrix of unicode strings into distinct integers. 
+ # + # Inputs: + # mat (np.array): array of unicode strings to convert + # convertDicts (list): dictionary for each column + # counts (list): number of different categories in each column + # + # Outputs: + # out (np.array): array of output integers + # convertDicts (list): dictionary for each column + # counts (list): number of different categories in each column + + # check if convertDicts and counts match correct length of mat + if len(convertDicts) != mat.shape[1] or len(counts) != mat.shape[1]: + print("Length of convertDicts or counts does not match input shape") + print("Generating convertDicts and counts...") + + convertDicts = [{} for _ in range(mat.shape[1])] + counts = [0 for _ in range(mat.shape[1])] + + # initialize output + out = np.zeros(mat.shape) + + for j in range(mat.shape[1]): + for i in range(mat.shape[0]): + # add to convertDict and increment count + if mat[i, j] not in convertDicts[j]: + convertDicts[j][mat[i, j]] = counts[j] + counts[j] += 1 + out[i, j] = convertDicts[j][mat[i, j]] + + return out, convertDicts, counts + + +def convertUStringToDistinctIntsUnique(mat, mat_uni, counts): + # mat is an array of 0,...,# samples, with each being 26 categorical features + + # check if mat_unique and counts match correct length of mat + if len(mat_uni) != mat.shape[1] or len(counts) != mat.shape[1]: + print("Length of mat_unique or counts does not match input shape") + print("Generating mat_unique and counts...") + + mat_uni = [np.array([]) for _ in range(mat.shape[1])] + counts = [0 for _ in range(mat.shape[1])] + + # initialize output + out = np.zeros(mat.shape) + ind_map = [np.array([]) for _ in range(mat.shape[1])] + + # find out and assign unique ids to features + for j in range(mat.shape[1]): + m = mat_uni[j].size + mat_concat = np.concatenate((mat_uni[j], mat[:, j])) + mat_uni[j], ind_map[j] = np.unique(mat_concat, return_inverse=True) + out[:, j] = ind_map[j][m:] + counts[j] = mat_uni[j].size + + return out, mat_uni, counts + + +def processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, pre_comp_counts): + # Process Kaggle Display Advertising Challenge or Terabyte Dataset + # by converting unicode strings in X_cat to integers and + # converting negative integer values in X_int. + # + # Loads data in the form "{kaggle|terabyte}_day_i.npz" where i is the day. 
+ # + # Inputs: + # d_path (str): path for {kaggle|terabyte}_day_i.npz files + # i (int): splits in the dataset (typically 0 to 7 or 0 to 24) + + # process data if not all files exist + filename_i = npzfile + "_{0}_processed.npz".format(i) + + if path.exists(filename_i): + print("Using existing " + filename_i, end="\n") + else: + print("Not existing " + filename_i) + with np.load(npzfile + "_{0}.npz".format(i)) as data: + # categorical features + ''' + # Approach 1a: using empty dictionaries + X_cat, convertDicts, counts = convertUStringToDistinctIntsDict( + data["X_cat"], convertDicts, counts + ) + ''' + ''' + # Approach 1b: using empty np.unique + X_cat, convertDicts, counts = convertUStringToDistinctIntsUnique( + data["X_cat"], convertDicts, counts + ) + ''' + # Approach 2a: using pre-computed dictionaries + X_cat_t = np.zeros(data["X_cat_t"].shape) + for j in range(26): + for k, x in enumerate(data["X_cat_t"][j, :]): + X_cat_t[j, k] = convertDicts[j][x] + # continuous features + X_int = data["X_int"] + X_int[X_int < 0] = 0 + # targets + y = data["y"] + + np.savez_compressed( + filename_i, + # X_cat = X_cat, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=X_int, + y=y, + ) + print("Processed " + filename_i, end="\n") + # sanity check (applicable only if counts have been pre-computed & are re-computed) + # for j in range(26): + # if pre_comp_counts[j] != counts[j]: + # sys.exit("ERROR: Sanity check on counts has failed") + # print("\nSanity check on counts passed") + + return + + +def concatCriteoAdData( + d_path, + d_file, + npzfile, + trafile, + days, + data_split, + randomize, + total_per_file, + total_count, + memory_map, + o_filename +): + # Concatenates different days and saves the result. + # + # Inputs: + # days (int): total number of days in the dataset (typically 7 or 24) + # d_path (str): path for {kaggle|terabyte}_day_i.npz files + # o_filename (str): output file name + # + # Output: + # o_file (str): output file path + + if memory_map: + # dataset break up per fea + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + # create offset per file + offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + offset_per_file[i + 1] += offset_per_file[i] + + ''' + # Approach 1, 2 and 3 use indices, while Approach 4 does not use them + # create indices + indices = np.arange(total_count) + if data_split == "none": + if randomize == "total": + indices = np.random.permutation(indices) + else: + indices = np.array_split(indices, offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + indices = np.concatenate((train_indices, test_indices)) + # no reordering + # indices = np.arange(total_count) + ''' + ''' + # Approach 1: simple and slow (no grouping is used) + # check if data already exists + recreate_flag = False + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, 
reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((total_count)) + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered".format(j) + np.save(filename_j, z) + print("Creating " + filename_j) + + for i in range(days): + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat_t = np.transpose(data["X_cat"]) + X_int_t = np.transpose(data["X_int"]) + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + # print(filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r+') + if j < tar_fea: + fj[indices[start:end]] = y + elif tar_fea <= j and j < tad_fea: + fj[indices[start:end]] = X_int_t[j - tar_fea, :] + else: + fj[indices[start:end]] = X_cat_t[j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing " + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + size = total_per_file[i] + X_int_t = np.zeros((den_fea, size)) + X_cat_t = np.zeros((spa_fea, size)) + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + print("Creating " + filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r') + if j < tar_fea: + y = fj[start:end] + elif tar_fea <= j and j < tad_fea: + X_int_t[j - tar_fea, :] = fj[start:end] + else: + X_cat_t[j - tad_fea, :] = fj[start:end] + del fj + + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=np.transpose(X_int_t), # transpose of the data + y=y, + ) + else: + print("Reordered day files already exist, skipping ...") + ''' + ''' + # Approach 2: group days + # check if data already exists + recreate_flag = False + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((total_count)) + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered".format(j) + np.save(filename_j, z) + print("Creating " + filename_j) + + group_day = 3 # e.g. 
8, 4 or 3 + group_num = days // group_day + file_group = [i*group_day for i in range(group_num)] + [days] + for ii in range(group_num): + # for last may be group_size != group_num, therefore reset it below + group_size = file_group[ii + 1] - file_group[ii] + X_cat_t = [0]*group_size + X_int_t = [0]*group_size + y = [0]*group_size + start = [0]*group_size + end = [0]*group_size + for ig in range(group_size): + i = file_group[ii] + ig + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + # setup start and end ranges + start[ig] = offset_per_file[i] + end[ig] = offset_per_file[i + 1] + # print(filename_i) + # load a group of files + with np.load(filename_i) as data: + X_cat_t[ig] = np.transpose(data["X_cat"]) + X_int_t[ig] = np.transpose(data["X_int"]) + y[ig] = data["y"] + # sanity check + if total_per_file[i] != len(y[ig]): + sys.exit("ERROR: sanity check on number of samples failed") + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end[ig]-start[ig]) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r+') + for ig in range(group_size): + if j < tar_fea: + fj[indices[start[ig]:end[ig]]] = y[ig] + elif tar_fea <= j and j < tad_fea: + fj[indices[start[ig]:end[ig]]] = X_int_t[ig][j - tar_fea, :] + else: + fj[indices[start[ig]:end[ig]]] = X_cat_t[ig][j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing " + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for ii in range(group_num): + # for last may be group_size != group_num, therefore reset it below + group_size = file_group[ii + 1] - file_group[ii] + X_cat_t= []; X_int_t = [] + for ig in range(group_size): + i = file_group[ii] + ig + X_int_t.append(np.zeros((den_fea, total_per_file[i]))) + X_cat_t.append(np.zeros((spa_fea, total_per_file[i]))) + y = [0]*group_size + start = [0]*group_size + end = [0]*group_size + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r') + # load a group of files + for ig in range(group_size): + i = file_group[ii] + ig + # setup start and end ranges + start[ig] = offset_per_file[i] + end[ig] = offset_per_file[i + 1] + # load data for the group of files + if j < tar_fea: + y[ig] = fj[start[ig]:end[ig]] + elif tar_fea <= j and j < tad_fea: + X_int_t[ig][j - tar_fea, :] = fj[start[ig]:end[ig]] + else: + X_cat_t[ig][j - tad_fea, :] = fj[start[ig]:end[ig]] + del fj + + for ig in range(group_size): + i = file_group[ii] + ig + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + print("Creating " + filename_i) + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t[ig]), # transpose of the data + X_int=np.transpose(X_int_t[ig]), # transpose of the data + y=y[ig], + ) + else: + print("Reordered day files already exist, skipping ...") + ''' + ''' + # Approach 3: group features + # check if data already exists + group_fea = 5 # e.g. 
8, 5 or 4 + group_num = tot_fea // group_fea + if tot_fea % group_fea != 0: # sanity check + sys.exit("ERROR: the group_fea must divided tot_fea evenly.") + recreate_flag = False + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((group_fea, total_count)) + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}".format( + jn, group_fea + ) + np.save(filename_j, z) + print("Creating " + filename_j) + + for i in range(days): + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat_t = np.transpose(data["X_cat"]) + X_int_t = np.transpose(data["X_int"]) + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + # print(filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + fj = np.load(filename_j, mmap_mode='r+') + for jg in range(group_fea): + j = jn * group_fea + jg + # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg)) + if j < tar_fea: + fj[jg, indices[start:end]] = y + elif tar_fea <= j and j < tad_fea: + fj[jg, indices[start:end]] = X_int_t[j - tar_fea, :] + else: + fj[jg, indices[start:end]] = X_cat_t[j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing" + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + size = total_per_file[i] + X_int_t = np.zeros((den_fea, size)) + X_cat_t = np.zeros((spa_fea, size)) + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + print("Creating " + filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + fj = np.load(filename_j, mmap_mode='r') + for jg in range(group_fea): + j = jn * group_fea + jg + # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg)) + if j < tar_fea: + y = fj[jg, start:end] + elif tar_fea <= j and j < tad_fea: + X_int_t[j - tar_fea, :] = fj[jg, start:end] + else: + X_cat_t[j - tad_fea, :] = fj[jg, start:end] + del fj + + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=np.transpose(X_int_t), # transpose of the data + y=y, + ) + + else: + print("Reordered day files already exist, skipping ...") + ''' + + # Approach 4: Fisher-Yates-Rao (FYR) shuffle algorithm + # 1st pass of FYR shuffle + # check if data already exists + recreate_flag = 
False + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + if ( + path.exists(filename_j_y) + and path.exists(filename_j_d) + and path.exists(filename_j_s) + ): + print( + "Using existing\n" + + filename_j_y + "\n" + + filename_j_d + "\n" + + filename_j_s + ) + else: + recreate_flag = True + # reorder across buckets using sampling + if recreate_flag: + # init intermediate files (.npy appended automatically) + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s".format(j) + np.save(filename_j_y, np.zeros((total_per_file[j]))) + np.save(filename_j_d, np.zeros((total_per_file[j], den_fea))) + np.save(filename_j_s, np.zeros((total_per_file[j], spa_fea))) + # start processing files + total_counter = [0] * days + for i in range(days): + filename_i = npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat = data["X_cat"] + X_int = data["X_int"] + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # debug prints + print("Reordering (1st pass) " + filename_i) + + # create buckets using sampling of random ints + # from (discrete) uniform distribution + buckets = [] + for _j in range(days): + buckets.append([]) + counter = [0] * days + days_to_sample = days if data_split == "none" else days - 1 + if randomize == "total": + rand_u = np.random.randint(low=0, high=days_to_sample, size=size) + for k in range(size): + # sample and make sure elements per buckets do not overflow + if data_split == "none" or i < days - 1: + # choose bucket + p = rand_u[k] + # retry of the bucket is full + while total_counter[p] + counter[p] >= total_per_file[p]: + p = np.random.randint(low=0, high=days_to_sample) + else: # preserve the last day/bucket if needed + p = i + buckets[p].append(k) + counter[p] += 1 + else: # randomize is day or none + for k in range(size): + # do not sample, preserve the data in this bucket + p = i + buckets[p].append(k) + counter[p] += 1 + + # sanity check + if np.sum(counter) != size: + sys.exit("ERROR: sanity check on number of samples failed") + # debug prints + # print(counter) + # print(str(np.sum(counter)) + " = " + str(size)) + # print([len(x) for x in buckets]) + # print(total_counter) + + # partially feel the buckets + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + start = total_counter[j] + end = total_counter[j] + counter[j] + # target buckets + fj_y = np.load(filename_j_y, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_y[start:end].shape) + " " + # + str(len(buckets[j]))) + fj_y[start:end] = y[buckets[j]] + del fj_y + # dense buckets + fj_d = np.load(filename_j_d, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_d[start:end, :].shape) + " " + # + str(len(buckets[j]))) + fj_d[start:end, :] = X_int[buckets[j], :] + del fj_d + # sparse buckets + fj_s = np.load(filename_j_s, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - 
start=" + str(end - start) + " " + # + str(fj_s[start:end, :].shape) + " " + # + str(len(buckets[j]))) + fj_s[start:end, :] = X_cat[buckets[j], :] + del fj_s + # update counters for next step + total_counter[j] += counter[j] + + # 2nd pass of FYR shuffle + # check if data already exists + for j in range(days): + filename_j = npzfile + "_{0}_reordered.npz".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # reorder within buckets + if recreate_flag: + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + fj_y = np.load(filename_j_y) + fj_d = np.load(filename_j_d) + fj_s = np.load(filename_j_s) + + indices = range(total_per_file[j]) + if randomize == "day" or randomize == "total": + if data_split == "none" or j < days - 1: + indices = np.random.permutation(range(total_per_file[j])) + + filename_r = npzfile + "_{0}_reordered.npz".format(j) + print("Reordering (2nd pass) " + filename_r) + np.savez_compressed( + filename_r, + X_cat=fj_s[indices, :], + X_int=fj_d[indices, :], + y=fj_y[indices], + ) + + ''' + # sanity check (under no reordering norms should be zero) + for i in range(days): + filename_i_o = npzfile + "_{0}_processed.npz".format(i) + print(filename_i_o) + with np.load(filename_i_o) as data_original: + X_cat_o = data_original["X_cat"] + X_int_o = data_original["X_int"] + y_o = data_original["y"] + filename_i_r = npzfile + "_{0}_reordered.npz".format(i) + print(filename_i_r) + with np.load(filename_i_r) as data_reordered: + X_cat_r = data_reordered["X_cat"] + X_int_r = data_reordered["X_int"] + y_r = data_reordered["y"] + print(np.linalg.norm(y_o - y_r)) + print(np.linalg.norm(X_int_o - X_int_r)) + print(np.linalg.norm(X_cat_o - X_cat_r)) + ''' + + else: + print("Concatenating multiple days into %s.npz file" % str(d_path + o_filename)) + + # load and concatenate data + for i in range(days): + filename_i = npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + if i == 0: + X_cat = data["X_cat"] + X_int = data["X_int"] + y = data["y"] + else: + X_cat = np.concatenate((X_cat, data["X_cat"])) + X_int = np.concatenate((X_int, data["X_int"])) + y = np.concatenate((y, data["y"])) + print("Loaded day:", i, "y = 1:", len(y[y == 1]), "y = 0:", len(y[y == 0])) + + with np.load(d_path + d_file + "_fea_count.npz") as data: + counts = data["counts"] + print("Loaded counts!") + + np.savez_compressed( + d_path + o_filename + ".npz", + X_cat=X_cat, + X_int=X_int, + y=y, + counts=counts, + ) + + return d_path + o_filename + ".npz" + + +def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file): + # Transforms Criteo Kaggle or terabyte data by applying log transformation + # on dense features and converting everything to appropriate tensors. 
+ # + # Inputs: + # X_cat (ndarray): array of integers corresponding to preprocessed + # categorical features + # X_int (ndarray): array of integers corresponding to dense features + # y (ndarray): array of bool corresponding to labels + # data_split(str): flag for splitting dataset into training/validation/test + # sets + # randomize (str): determines randomization scheme + # "none": no randomization + # "day": randomizes each day"s data (only works if split = True) + # "total": randomizes total dataset + # + # Outputs: + # if split: + # X_cat_train (tensor): sparse features for training set + # X_int_train (tensor): dense features for training set + # y_train (tensor): labels for training set + # X_cat_val (tensor): sparse features for validation set + # X_int_val (tensor): dense features for validation set + # y_val (tensor): labels for validation set + # X_cat_test (tensor): sparse features for test set + # X_int_test (tensor): dense features for test set + # y_test (tensor): labels for test set + # else: + # X_cat (tensor): sparse features + # X_int (tensor): dense features + # y (tensor): label + + # define initial set of indices + indices = np.arange(len(y)) + + # create offset per file + offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + offset_per_file[i + 1] += offset_per_file[i] + + # split dataset + if data_split == 'train': + indices = np.array_split(indices, offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + test_indices, val_indices = np.array_split(test_indices, 2) + + print("Defined training and testing indices...") + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + # indices = np.concatenate((train_indices, test_indices)) + + # create training, validation, and test sets + X_cat_train = X_cat[train_indices] + X_int_train = X_int[train_indices] + y_train = y[train_indices] + + X_cat_val = X_cat[val_indices] + X_int_val = X_int[val_indices] + y_val = y[val_indices] + + X_cat_test = X_cat[test_indices] + X_int_test = X_int[test_indices] + y_test = y[test_indices] + + print("Split data according to indices...") + + X_cat_train = X_cat_train.astype(np.long) + X_int_train = np.log(X_int_train.astype(np.float32) + 1) + y_train = y_train.astype(np.float32) + + X_cat_val = X_cat_val.astype(np.long) + X_int_val = np.log(X_int_val.astype(np.float32) + 1) + y_val = y_val.astype(np.float32) + + X_cat_test = X_cat_test.astype(np.long) + X_int_test = np.log(X_int_test.astype(np.float32) + 1) + y_test = y_test.astype(np.float32) + + print("Converted to tensors...done!") + + return ( + X_cat_train, + X_int_train, + y_train, + X_cat_val, + X_int_val, + y_val, + X_cat_test, + X_int_test, + y_test, + ) + + else: + + # randomize data + if randomize == "total": + indices = np.random.permutation(indices) + print("Randomized indices...") + + X_cat = X_cat[indices].astype(np.long) + X_int = np.log(X_int[indices].astype(np.float32) + 1) + y = y[indices].astype(np.float32) + + print("Converted to tensors...done!") + + return (X_cat, X_int, y, [], [], [], [], [], []) + + +def getCriteoAdData( + datafile, + o_filename, + max_ind_range=-1, + sub_sample_rate=0.0, + days=7, + 
data_split='train', + randomize='total', + criteo_kaggle=True, + memory_map=False, + dataset_multiprocessing=False, +): + # Passes through entire dataset and defines dictionaries for categorical + # features and determines the number of total categories. + # + # Inputs: + # datafile : path to downloaded raw data file + # o_filename (str): saves results under o_filename if filename is not "" + # + # Output: + # o_file (str): output file path + + #split the datafile into path and filename + lstr = datafile.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if criteo_kaggle else lstr[-1] + npzfile = d_path + ((d_file + "_day") if criteo_kaggle else d_file) + trafile = d_path + ((d_file + "_fea") if criteo_kaggle else "fea") + + # count number of datapoints in training set + total_file = d_path + d_file + "_day_count.npz" + if path.exists(total_file): + with np.load(total_file) as data: + total_per_file = list(data["total_per_file"]) + total_count = np.sum(total_per_file) + print("Skipping counts per file (already exist)") + else: + total_count = 0 + total_per_file = [] + if criteo_kaggle: + # WARNING: The raw data consists of a single train.txt file + # Each line in the file is a sample, consisting of 13 continuous and + # 26 categorical features (an extra space indicates that feature is + # missing and will be interpreted as 0). + if path.exists(datafile): + print("Reading data from path=%s" % (datafile)) + with open(str(datafile)) as f: + for _ in f: + total_count += 1 + total_per_file.append(total_count) + # reset total per file due to split + num_data_per_split, extras = divmod(total_count, days) + total_per_file = [num_data_per_split] * days + for j in range(extras): + total_per_file[j] += 1 + # split into days (simplifies code later on) + file_id = 0 + boundary = total_per_file[file_id] + nf = open(npzfile + "_" + str(file_id), "w") + with open(str(datafile)) as f: + for j, line in enumerate(f): + if j == boundary: + nf.close() + file_id += 1 + nf = open(npzfile + "_" + str(file_id), "w") + boundary += total_per_file[file_id] + nf.write(line) + nf.close() + else: + sys.exit("ERROR: Criteo Kaggle Display Ad Challenge Dataset path is invalid; please download from https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset") + else: + # WARNING: The raw data consist of day_0.gz,... ,day_23.gz text files + # Each line in the file is a sample, consisting of 13 continuous and + # 26 categorical features (an extra space indicates that feature is + # missing and will be interpreted as 0). 
+ for i in range(days): + datafile_i = datafile + "_" + str(i) # + ".gz" + if path.exists(str(datafile_i)): + print("Reading data from path=%s" % (str(datafile_i))) + # file day_ + total_per_file_count = 0 + with open(str(datafile_i)) as f: + for _ in f: + total_per_file_count += 1 + total_per_file.append(total_per_file_count) + total_count += total_per_file_count + else: + sys.exit("ERROR: Criteo Terabyte Dataset path is invalid; please download from https://labs.criteo.com/2013/12/download-terabyte-click-logs") + + # process a file worth of data and reinitialize data + # note that a file main contain a single or multiple splits + def process_one_file( + datfile, + npzfile, + split, + num_data_in_split, + dataset_multiprocessing, + convertDictsDay=None, + resultDay=None + ): + if dataset_multiprocessing: + convertDicts_day = [{} for _ in range(26)] + + with open(str(datfile)) as f: + y = np.zeros(num_data_in_split, dtype="i4") # 4 byte int + X_int = np.zeros((num_data_in_split, 13), dtype="i4") # 4 byte int + X_cat = np.zeros((num_data_in_split, 26), dtype="i4") # 4 byte int + if sub_sample_rate == 0.0: + rand_u = 1.0 + else: + rand_u = np.random.uniform(low=0.0, high=1.0, size=num_data_in_split) + + i = 0 + percent = 0 + for k, line in enumerate(f): + # process a line (data point) + line = line.split('\t') + # set missing values to zero + for j in range(len(line)): + if (line[j] == '') or (line[j] == '\n'): + line[j] = '0' + # sub-sample data by dropping zero targets, if needed + target = np.int32(line[0]) + if target == 0 and \ + (rand_u if sub_sample_rate == 0.0 else rand_u[k]) < sub_sample_rate: + continue + + y[i] = target + X_int[i] = np.array(line[1:14], dtype=np.int32) + if max_ind_range > 0: + X_cat[i] = np.array( + list(map(lambda x: int(x, 16) % max_ind_range, line[14:])), + dtype=np.int32 + ) + else: + X_cat[i] = np.array( + list(map(lambda x: int(x, 16), line[14:])), + dtype=np.int32 + ) + + # count uniques + if dataset_multiprocessing: + for j in range(26): + convertDicts_day[j][X_cat[i][j]] = 1 + # debug prints + if float(i)/num_data_in_split*100 > percent+1: + percent = int(float(i)/num_data_in_split*100) + print( + "Load %d/%d (%d%%) Split: %d Label True: %d Stored: %d" + % ( + i, + num_data_in_split, + percent, + split, + target, + y[i], + ), + end="\n", + ) + else: + for j in range(26): + convertDicts[j][X_cat[i][j]] = 1 + # debug prints + print( + "Load %d/%d Split: %d Label True: %d Stored: %d" + % ( + i, + num_data_in_split, + split, + target, + y[i], + ), + end="\r", + ) + i += 1 + + # store num_data_in_split samples or extras at the end of file + # count uniques + # X_cat_t = np.transpose(X_cat) + # for j in range(26): + # for x in X_cat_t[j,:]: + # convertDicts[j][x] = 1 + # store parsed + filename_s = npzfile + "_{0}.npz".format(split) + if path.exists(filename_s): + print("\nSkip existing " + filename_s) + else: + np.savez_compressed( + filename_s, + X_int=X_int[0:i, :], + # X_cat=X_cat[0:i, :], + X_cat_t=np.transpose(X_cat[0:i, :]), # transpose of the data + y=y[0:i], + ) + print("\nSaved " + npzfile + "_{0}.npz!".format(split)) + + if dataset_multiprocessing: + resultDay[split] = i + convertDictsDay[split] = convertDicts_day + return + else: + return i + + # create all splits (reuse existing files if possible) + recreate_flag = False + convertDicts = [{} for _ in range(26)] + # WARNING: to get reproducible sub-sampling results you must reset the seed below + # np.random.seed(123) + # in this case there is a single split in each day + for i in range(days): + 
npzfile_i = npzfile + "_{0}.npz".format(i) + npzfile_p = npzfile + "_{0}_processed.npz".format(i) + if path.exists(npzfile_i): + print("Skip existing " + npzfile_i) + elif path.exists(npzfile_p): + print("Skip existing " + npzfile_p) + else: + recreate_flag = True + + if recreate_flag: + if dataset_multiprocessing: + resultDay = Manager().dict() + convertDictsDay = Manager().dict() + processes = [Process(target=process_one_file, + name="process_one_file:%i" % i, + args=(npzfile + "_{0}".format(i), + npzfile, + i, + total_per_file[i], + dataset_multiprocessing, + convertDictsDay, + resultDay, + ) + ) for i in range(0, days)] + for process in processes: + process.start() + for process in processes: + process.join() + for day in range(days): + total_per_file[day] = resultDay[day] + print("Constructing convertDicts Split: {}".format(day)) + convertDicts_tmp = convertDictsDay[day] + for i in range(26): + for j in convertDicts_tmp[i]: + convertDicts[i][j] = 1 + else: + for i in range(days): + total_per_file[i] = process_one_file( + npzfile + "_{0}".format(i), + npzfile, + i, + total_per_file[i], + dataset_multiprocessing, + ) + + # report and save total into a file + total_count = np.sum(total_per_file) + if not path.exists(total_file): + np.savez_compressed(total_file, total_per_file=total_per_file) + print("Total number of samples:", total_count) + print("Divided into days/splits:\n", total_per_file) + + # dictionary files + counts = np.zeros(26, dtype=np.int32) + if recreate_flag: + # create dictionaries + for j in range(26): + for i, x in enumerate(convertDicts[j]): + convertDicts[j][x] = i + dict_file_j = d_path + d_file + "_fea_dict_{0}.npz".format(j) + if not path.exists(dict_file_j): + np.savez_compressed( + dict_file_j, + unique=np.array(list(convertDicts[j]), dtype=np.int32) + ) + counts[j] = len(convertDicts[j]) + # store (uniques and) counts + count_file = d_path + d_file + "_fea_count.npz" + if not path.exists(count_file): + np.savez_compressed(count_file, counts=counts) + else: + # create dictionaries (from existing files) + for j in range(26): + with np.load(d_path + d_file + "_fea_dict_{0}.npz".format(j)) as data: + unique = data["unique"] + for i, x in enumerate(unique): + convertDicts[j][x] = i + # load (uniques and) counts + with np.load(d_path + d_file + "_fea_count.npz") as data: + counts = data["counts"] + + # process all splits + if dataset_multiprocessing: + processes = [Process(target=processCriteoAdData, + name="processCriteoAdData:%i" % i, + args=(d_path, + d_file, + npzfile, + i, + convertDicts, + counts, + ) + ) for i in range(0, days)] + for process in processes: + process.start() + for process in processes: + process.join() + + else: + for i in range(days): + processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, counts) + + o_file = concatCriteoAdData( + d_path, + d_file, + npzfile, + trafile, + days, + data_split, + randomize, + total_per_file, + total_count, + memory_map, + o_filename + ) + + return o_file + + +def loadDataset( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + data_split, + raw_path="", + pro_data="", + memory_map=False +): + # dataset + if dataset == "kaggle": + days = 7 + o_filename = "kaggleAdDisplayChallenge_processed" + elif dataset == "terabyte": + days = 24 + o_filename = "terabyte_processed" + else: + raise(ValueError("Data set option is not supported")) + + # split the datafile into path and filename + lstr = raw_path.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if dataset == 
"kaggle" else lstr[-1] + npzfile = (d_file + "_day") if dataset == "kaggle" else d_file + # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea") + + # check if pre-processed data is available + data_ready = True + if memory_map: + for i in range(days): + reo_data = d_path + npzfile + "_{0}_reordered.npz".format(i) + if not path.exists(str(reo_data)): + data_ready = False + else: + if not path.exists(str(pro_data)): + data_ready = False + + # pre-process data if needed + # WARNNING: when memory mapping is used we get a collection of files + if data_ready: + print("Reading pre-processed data=%s" % (str(pro_data))) + file = str(pro_data) + else: + print("Reading raw data=%s" % (str(raw_path))) + file = getCriteoAdData( + raw_path, + o_filename, + max_ind_range, + sub_sample_rate, + days, + data_split, + randomize, + dataset == "kaggle", + memory_map + ) + + return file, days + + +if __name__ == "__main__": + ### import packages ### + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Preprocess Criteo dataset" + ) + # model related parameters + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + args = parser.parse_args() + + loadDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map + ) diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_data_pytorch.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_data_pytorch.py new file mode 100644 index 00000000000..f6f30f8e663 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_data_pytorch.py @@ -0,0 +1,575 @@ +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: generate inputs and targets for the dlrm benchmark +# The inpts and outputs are generated according to the following three option(s) +# 1) random distribution +# 2) synthetic distribution, based on unique accesses and distances between them +# i) R. Hassan, A. Harris, N. Topham and A. 
Efthymiou "Synthetic Trace-Driven +# Simulation of Cache Memory", IEEE AINAM'07 +# 3) public data set +# i) Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# ii) Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs + + +from __future__ import absolute_import, division, print_function, unicode_literals + +# others +from os import path +import sys + +import data_utils + +# numpy +import numpy as np +from numpy import random as ra + + +# pytorch +import torch +from torch.utils.data import Dataset, RandomSampler + +import data_loader_terabyte + + +# Kaggle Display Advertising Challenge Dataset +# dataset (str): name of dataset (Kaggle or Terabyte) +# randomize (str): determines randomization scheme +# "none": no randomization +# "day": randomizes each day"s data (only works if split = True) +# "total": randomizes total dataset +# split (bool) : to split into train, test, validation data-sets +class CriteoDataset(Dataset): + + def __init__( + self, + dataset, + max_ind_range, + sub_sample_rate, + randomize, + split="train", + raw_path="", + pro_data="", + memory_map=False, + dataset_multiprocessing=False, + ): + # dataset + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + # spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + if dataset == "kaggle": + days = 7 + out_file = "kaggleAdDisplayChallenge_processed" + elif dataset == "terabyte": + days = 24 + out_file = "terabyte_processed" + else: + raise(ValueError("Data set option is not supported")) + self.max_ind_range = max_ind_range + self.memory_map = memory_map + + # split the datafile into path and filename + lstr = raw_path.split("/") + self.d_path = "/".join(lstr[0:-1]) + "/" + self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] + self.npzfile = self.d_path + ( + (self.d_file + "_day") if dataset == "kaggle" else self.d_file + ) + self.trafile = self.d_path + ( + (self.d_file + "_fea") if dataset == "kaggle" else "fea" + ) + + # check if pre-processed data is available + data_ready = True + if memory_map: + for i in range(days): + reo_data = self.npzfile + "_{0}_reordered.npz".format(i) + if not path.exists(str(reo_data)): + data_ready = False + else: + if not path.exists(str(pro_data)): + data_ready = False + + # pre-process data if needed + # WARNNING: when memory mapping is used we get a collection of files + if data_ready: + print("Reading pre-processed data=%s" % (str(pro_data))) + file = str(pro_data) + else: + print("Reading raw data=%s" % (str(raw_path))) + file = data_utils.getCriteoAdData( + raw_path, + out_file, + max_ind_range, + sub_sample_rate, + days, + split, + randomize, + dataset == "kaggle", + memory_map, + dataset_multiprocessing, + ) + + # get a number of samples per day + total_file = self.d_path + self.d_file + "_day_count.npz" + with np.load(total_file) as data: + total_per_file = data["total_per_file"] + # compute offsets per file + self.offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + self.offset_per_file[i + 1] += self.offset_per_file[i] + # print(self.offset_per_file) + + # setup data + if memory_map: + # setup the training/testing split + self.split = split + if split == 'none' or split == 'train': + self.day = 0 + self.max_day_range = days if split == 'none' else days - 1 + elif split == 'test' or split == 'val': + self.day = days - 1 + num_samples = 
self.offset_per_file[days] - \ + self.offset_per_file[days - 1] + self.test_size = int(np.ceil(num_samples / 2.)) + self.val_size = num_samples - self.test_size + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + + ''' + # text + print("text") + for i in range(days): + fi = self.npzfile + "_{0}".format(i) + with open(fi) as data: + ttt = 0; nnn = 0 + for _j, line in enumerate(data): + ttt +=1 + if np.int32(line[0]) > 0: + nnn +=1 + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + # processed + print("processed") + for i in range(days): + fi = self.npzfile + "_{0}_processed.npz".format(i) + with np.load(fi) as data: + yyy = data["y"] + ttt = len(yyy) + nnn = np.count_nonzero(yyy) + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + # reordered + print("reordered") + for i in range(days): + fi = self.npzfile + "_{0}_reordered.npz".format(i) + with np.load(fi) as data: + yyy = data["y"] + ttt = len(yyy) + nnn = np.count_nonzero(yyy) + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + ''' + + # load unique counts + with np.load(self.d_path + self.d_file + "_fea_count.npz") as data: + self.counts = data["counts"] + self.m_den = den_fea # X_int.shape[1] + self.n_emb = len(self.counts) + print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den)) + + # Load the test data + # Only a single day is used for testing + if self.split == 'test' or self.split == 'val': + # only a single day is used for testing + fi = self.npzfile + "_{0}_reordered.npz".format( + self.day + ) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + + else: + # load and preprocess data + with np.load(file) as data: + X_int = data["X_int"] # continuous feature + X_cat = data["X_cat"] # categorical feature + y = data["y"] # target + self.counts = data["counts"] + self.m_den = X_int.shape[1] # den_fea + self.n_emb = len(self.counts) + print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den)) + + # create reordering + indices = np.arange(len(y)) + + if split == "none": + # randomize all data + if randomize == "total": + indices = np.random.permutation(indices) + print("Randomized indices...") + + X_int[indices] = X_int + X_cat[indices] = X_cat + y[indices] = y + + else: + indices = np.array_split(indices, self.offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + test_indices, val_indices = np.array_split(test_indices, 2) + + print("Defined %s indices..." 
% (split)) + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + # create training, validation, and test sets + if split == 'train': + self.X_int = [X_int[i] for i in train_indices] + self.X_cat = [X_cat[i] for i in train_indices] + self.y = [y[i] for i in train_indices] + elif split == 'val': + self.X_int = [X_int[i] for i in val_indices] + self.X_cat = [X_cat[i] for i in val_indices] + self.y = [y[i] for i in val_indices] + elif split == 'test': + self.X_int = [X_int[i] for i in test_indices] + self.X_cat = [X_cat[i] for i in test_indices] + self.y = [y[i] for i in test_indices] + + print("Split data according to indices...") + + def __getitem__(self, index): + + if isinstance(index, slice): + return [ + self[idx] for idx in range( + index.start or 0, index.stop or len(self), index.step or 1 + ) + ] + + if self.memory_map: + if self.split == 'none' or self.split == 'train': + # check if need to switch to next day and load data + if index == self.offset_per_file[self.day]: + # print("day_boundary switch", index) + self.day_boundary = self.offset_per_file[self.day] + fi = self.npzfile + "_{0}_reordered.npz".format( + self.day + ) + # print('Loading file: ', fi) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + self.day = (self.day + 1) % self.max_day_range + + i = index - self.day_boundary + elif self.split == 'test' or self.split == 'val': + # only a single day is used for testing + i = index + (0 if self.split == 'test' else self.test_size) + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + else: + i = index + + if self.max_ind_range > 0: + return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i] + else: + return self.X_int[i], self.X_cat[i], self.y[i] + + def _default_preprocess(self, X_int, X_cat, y): + X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1) + if self.max_ind_range > 0: + X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long) + else: + X_cat = torch.tensor(X_cat, dtype=torch.long) + y = torch.tensor(y.astype(np.float32)) + + return X_int, X_cat, y + + def __len__(self): + if self.memory_map: + if self.split == 'none': + return self.offset_per_file[-1] + elif self.split == 'train': + return self.offset_per_file[-2] + elif self.split == 'test': + return self.test_size + elif self.split == 'val': + return self.val_size + else: + sys.exit("ERROR: dataset split is neither none, nor train nor test.") + else: + return len(self.y) + + +def collate_wrapper_criteo_offset(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = [X_cat[:, i] for i in range(featureCnt)] + lS_o = [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + + return X_int, torch.stack(lS_o), torch.stack(lS_i), T + + +def ensure_dataset_preprocessed(args, d_path): + _ = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + _ 
= CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + for split in ['train', 'val', 'test']: + print('Running preprocessing for split =', split) + + train_files = ['{}_{}_reordered.npz'.format(args.raw_data_file, day) + for + day in range(0, 23)] + + test_valid_file = args.raw_data_file + '_23_reordered.npz' + + output_file = d_path + '_{}.bin'.format(split) + + input_files = train_files if split == 'train' else [test_valid_file] + data_loader_terabyte.numpy_to_binary(input_files=input_files, + output_file_path=output_file, + split=split) + + +# Conversion from offset to length +def offset_to_length_converter(lS_o, lS_i): + def diff(tensor): + return tensor[1:] - tensor[:-1] + + return torch.stack( + [ + diff(torch.cat((S_o, torch.tensor(lS_i[ind].shape))).int()) + for ind, S_o in enumerate(lS_o) + ] + ) + + +def collate_wrapper_criteo_length(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = torch.stack([X_cat[:, i] for i in range(featureCnt)]) + lS_o = torch.stack( + [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + ) + + lS_l = offset_to_length_converter(lS_o, lS_i) + + return X_int, lS_l, lS_i, T + + +def make_criteo_data_and_loaders(args, offset_to_length_converter=False): + if args.memory_map and args.data_set == "terabyte": + # more efficient for larger batches + data_directory = path.dirname(args.raw_data_file) + + if args.mlperf_bin_loader: + lstr = args.processed_data_file.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0] + train_file = d_path + "_train.bin" + test_file = d_path + "_test.bin" + # val_file = d_path + "_val.bin" + counts_file = args.raw_data_file + '_fea_count.npz' + if any(not path.exists(p) for p in [train_file, + test_file, + counts_file]): + ensure_dataset_preprocessed(args, d_path) + + train_data = data_loader_terabyte.CriteoBinDataset( + data_file=train_file, + counts_file=counts_file, + batch_size=args.mini_batch_size, + max_ind_range=args.max_ind_range + ) + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=None, + batch_sampler=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None + ) + + test_data = data_loader_terabyte.CriteoBinDataset( + data_file=test_file, + counts_file=counts_file, + batch_size=args.test_mini_batch_size, + max_ind_range=args.max_ind_range + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=None, + batch_sampler=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + ) + else: + data_filename = args.raw_data_file.split("/")[-1] + + train_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + test_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + 
args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + train_loader = data_loader_terabyte.DataLoader( + data_directory=data_directory, + data_filename=data_filename, + days=list(range(23)), + batch_size=args.mini_batch_size, + max_ind_range=args.max_ind_range, + split="train" + ) + + test_loader = data_loader_terabyte.DataLoader( + data_directory=data_directory, + data_filename=data_filename, + days=[23], + batch_size=args.test_mini_batch_size, + max_ind_range=args.max_ind_range, + split="test" + ) + else: + train_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing, + ) + + test_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing, + ) + + collate_wrapper_criteo = collate_wrapper_criteo_offset + if offset_to_length_converter: + collate_wrapper_criteo = collate_wrapper_criteo_length + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=args.mini_batch_size, + shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=args.test_mini_batch_size, + shuffle=False, + num_workers=args.test_num_workers, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + return train_data, train_loader, test_data, test_loader \ No newline at end of file diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py new file mode 100644 index 00000000000..12936c64165 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py @@ -0,0 +1,1140 @@ +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: an implementation of a deep learning recommendation model (DLRM) +# The model input consists of dense and sparse features. The former is a vector +# of floating point values. The latter is a list of sparse indices into +# embedding tables, which consist of vectors of floating point values. +# The selected vectors are passed to mlp networks denoted by triangles, +# in some cases the vectors are interacted through operators (Ops). +# +# output: +# vector of values +# model: | +# /\ +# /__\ +# | +# _____________________> Op <___________________ +# / | \ +# /\ /\ /\ +# /__\ /__\ ... 
/__\ +# | | | +# | Op Op +# | ____/__\_____ ____/__\____ +# | |_Emb_|____|__| ... |_Emb_|__|___| +# input: +# [ dense features ] [sparse indices] , ..., [sparse indices] +# +# More precise definition of model layers: +# 1) fully connected layers of an mlp +# z = f(y) +# y = Wx + b +# +# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) +# z = Op(e1,...,ek) +# obtain vectors e1=E[:,p1], ..., ek=E[:,pk] +# +# 3) Operator Op can be one of the following +# Sum(e1,...,ek) = e1 + ... + ek +# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] +# Cat(e1,...,ek) = [e1', ..., ek']' +# where ' denotes transpose operation +# +# References: +# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, +# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, +# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, +# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, +# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, +# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and +# Recommendation Systems", CoRR, arXiv:1906.00091, 2019 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import argparse + +# miscellaneous +import builtins +import datetime +import sys +import time + + +# data generation +import dlrm_data_pytorch as dp + +# numpy +import numpy as np +import sklearn.metrics + +# pytorch +import torch +import torch.nn as nn +from torch._ops import ops +from torch.autograd.profiler import record_function +from torch.nn.parallel.parallel_apply import parallel_apply +from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.scatter_gather import gather, scatter +from torch.nn.parameter import Parameter +from torch.optim.lr_scheduler import _LRScheduler +from torch.utils import ThroughputBenchmark +# For distributed run +import extend_distributed as ext_dist + + +try: + import intel_extension_for_pytorch as ipex +except: + assert False, "please install intel-extension-for-pytorch, support version higher than 1.10" + + +exc = getattr(builtins, "IOError", "FileNotFoundError") + +def freeze(model): + return torch.jit._recursive.wrap_cpp_module(torch._C._freeze_module(model._c, preserveParameters=True)) + + +def time_wrap(): + return time.time() + + +def dlrm_wrap(X, *emb_args): + with record_function("DLRM forward"): + return dlrm(X, *emb_args) + + +def loss_fn_wrap(Z, T): + with record_function("DLRM loss compute"): + return dlrm.loss_fn(Z, T) + +# The following function is a wrapper to avoid checking this multiple times in th +# loop below. 
+def unpack_batch(b): + # Experiment with unweighted samples + return b[0], b[1], b[2], b[3], torch.ones(b[3].size()), None + + +class LRPolicyScheduler(_LRScheduler): + def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps): + self.num_warmup_steps = num_warmup_steps + self.decay_start_step = decay_start_step + self.decay_end_step = decay_start_step + num_decay_steps + self.num_decay_steps = num_decay_steps + + if self.decay_start_step < self.num_warmup_steps: + sys.exit("Learning rate warmup must finish before the decay starts") + + super(LRPolicyScheduler, self).__init__(optimizer) + + def get_lr(self): + step_count = self._step_count + if step_count < self.num_warmup_steps: + # warmup + scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps + lr = [base_lr * scale for base_lr in self.base_lrs] + self.last_lr = lr + elif self.decay_start_step <= step_count and step_count < self.decay_end_step: + # decay + decayed_steps = step_count - self.decay_start_step + scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2 + min_lr = 0.0000001 + lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs] + self.last_lr = lr + else: + if self.num_decay_steps > 0: + # freeze at last, either because we're after decay + # or because we're between warmup and decay + lr = self.last_lr + else: + # do not adjust + lr = self.base_lrs + return lr + + +### define dlrm in PyTorch ### +class DLRM_Net(nn.Module): + def create_mlp(self, ln, sigmoid_layer): + # build MLP layer by layer + layers = nn.ModuleList() + for i in range(0, ln.size - 1): + n = ln[i] + m = ln[i + 1] + + # construct fully connected operator + LL = nn.Linear(int(n), int(m), bias=True) + + # initialize the weights + # with torch.no_grad(): + # custom Xavier input, output or two-sided fill + mean = 0.0 # std_dev = np.sqrt(variance) + std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n) + W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32) + std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1)) + bt = np.random.normal(mean, std_dev, size=m).astype(np.float32) + # approach 1 + LL.weight.data = torch.tensor(W, requires_grad=True) + LL.bias.data = torch.tensor(bt, requires_grad=True) + # approach 2 + # LL.weight.data.copy_(torch.tensor(W)) + # LL.bias.data.copy_(torch.tensor(bt)) + # approach 3 + # LL.weight = Parameter(torch.tensor(W),requires_grad=True) + # LL.bias = Parameter(torch.tensor(bt),requires_grad=True) + layers.append(LL) + + # construct sigmoid or relu operator + if i == sigmoid_layer: + layers.append(nn.Sigmoid()) + else: + layers.append(nn.ReLU()) + + # approach 1: use ModuleList + # return layers + # approach 2: use Sequential container to wrap all layers + return torch.nn.Sequential(*layers) + + def create_emb(self, m, ln, local_ln_emb=None): + emb_l = nn.ModuleList() + n_embs = ln.size if local_ln_emb is None else len(local_ln_emb) + for i in range(n_embs): + if local_ln_emb is None: + n = ln[i] + else: + n = ln[local_ln_emb[i]] + EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True) + # initialize embeddings + if not args.inference_only: + nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n)) + emb_l.append(EE) + return emb_l + + def __init__( + self, + m_spa=None, + ln_emb=None, + ln_bot=None, + ln_top=None, + sigmoid_bot=-1, + sigmoid_top=-1, + weighted_pooling=None, + loss_threshold=0.0, + ): + super(DLRM_Net, self).__init__() + self.loss_threshold = loss_threshold + #If running distributed, get local slice of embedding 
tables + if ext_dist.my_size > 1: + n_emb = len(ln_emb) + self.n_global_emb = n_emb + self.rank = ext_dist.dist.get_rank() + self.ln_emb = [i for i in range(n_emb)] + self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths(n_emb) + self.local_ln_emb_slice = ext_dist.get_my_slice(n_emb) + self.local_ln_emb = self.ln_emb[self.local_ln_emb_slice] + else: + self.local_ln_emb = None + self.emb_l = self.create_emb(m_spa, ln_emb, self.local_ln_emb) + self.bot_l = self.create_mlp(ln_bot, sigmoid_bot) + self.top_l = self.create_mlp(ln_top, sigmoid_top) + self.loss_fn = torch.nn.BCELoss(reduction="mean") + + + def apply_mlp(self, x, layers): + # approach 1: use ModuleList + # for layer in layers: + # x = layer(x) + # return x + # approach 2: use Sequential container to wrap all layers + return layers(x) + + def apply_emb(self, emb_l, *emb_args): + # WARNING: notice that we are processing the batch at once. We implicitly + # assume that the data is laid out such that: + # 1. each embedding is indexed with a group of sparse indices, + # corresponding to a single lookup + # 2. for each embedding the lookups are further organized into a batch + # 3. for a list of embedding tables there is a list of batched lookups + if isinstance(emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + return emb_l(emb_args, self.need_linearize_indices_and_offsets) + lS_o, lS_i = emb_args + ly = [] + for k, sparse_index_group_batch in enumerate(lS_i): + sparse_offset_group_batch = lS_o[k] + + # embedding lookup + # We are using EmbeddingBag, which implicitly uses sum operator. + # The embeddings are represented as tall matrices, with sum + # happening vertically across 0 axis, resulting in a row vector + E = emb_l[k] + V = E( + sparse_index_group_batch, + sparse_offset_group_batch, + ) + + ly.append(V) + + return ly + + def interact_features(self, x, ly): + if args.ipex_interaction: + T = [x] + list(ly) + R = ipex.nn.functional.interaction(*T) + else: + # concatenate dense and sparse features + (batch_size, d) = x.shape + T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d)) + # perform a dot product + Z = torch.bmm(T, torch.transpose(T, 1, 2)) + # append dense feature with the interactions (into a row vector) + # approach 1: all + # Zflat = Z.view((batch_size, -1)) + # approach 2: unique + _, ni, nj = Z.shape + # approach 1: tril_indices + # offset = -1 + # li, lj = torch.tril_indices(ni, nj, offset=offset) + # approach 2: custom + offset = 0 + li = torch.tensor([i for i in range(ni) for j in range(i + offset)]) + lj = torch.tensor([j for i in range(nj) for j in range(i + offset)]) + Zflat = Z[:, li, lj] + # concatenate dense features and interactions + R = torch.cat([x] + [Zflat], dim=1) + return R + + def forward(self, dense_x, *emb_args): + if ext_dist.my_size > 1: + return self.distributed_forward(dense_x, *emb_args) + else: + return self.sequential_forward(dense_x, *emb_args) + + def distributed_forward(self, dense_x, *emb_args): + batch_size = dense_x.size()[0] + vector_lenght = self.emb_l.weights[0].size()[1] + # WARNING: # of ranks must be <= batch size in distributed_forward call + if batch_size < ext_dist.my_size: + sys.exit("ERROR: batch_size (%d) must be larger than number of ranks (%d)" % (batch_size, ext_dist.my_size)) + + # embeddings + ly = self.apply_emb(self.emb_l, *emb_args) + a2a_req = ext_dist.alltoall(ly, self.n_emb_per_rank) + # bottom mlp + x = self.apply_mlp(dense_x, self.bot_l) + ly = a2a_req.wait() + _ly = [] + for item in ly: + _ly += [item[:, emb_id * vector_lenght: (emb_id + 1) * 
vector_lenght] for emb_id in range(self.emb_l.n_tables)] + # interactions + z = self.interact_features(x, _ly) + # top mlp + p = self.apply_mlp(z, self.top_l) + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z = torch.clamp( + p, min=self.loss_threshold, max=(1.0 - self.loss_threshold) + ) + else: + z = p + return z + + + def sequential_forward(self, dense_x, *emb_args): + # process dense features (using bottom mlp), resulting in a row vector + x = self.apply_mlp(dense_x, self.bot_l) + # debug prints + # print("intermediate") + # print(x.detach().cpu().numpy()) + + # process sparse features(using embeddings), resulting in a list of row vectors + ly = self.apply_emb(self.emb_l, *emb_args) + # for y in ly: + # print(y.detach().cpu().numpy()) + + # interact features (dense and sparse) + z = self.interact_features(x, ly) + # print(z.detach().cpu().numpy()) + + # obtain probability of a click (using top mlp) + p = self.apply_mlp(z, self.top_l) + + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)) + else: + z = p + + return z + + +def dash_separated_ints(value): + vals = value.split("-") + for val in vals: + try: + int(val) + except ValueError: + raise argparse.ArgumentTypeError( + "%s is not a valid dash separated list of ints" % value + ) + + return value + + +def trace_model(args, dlrm, test_ld, inplace=True): + dlrm.eval() + for j, inputBatch in enumerate(test_ld): + X, lS_o, lS_i, _, _, _ = unpack_batch(inputBatch) + if args.bf16: + # at::GradMode::is_enabled() will query a threadlocal flag + # but new thread generate from throughputbench mark will + # init this flag to true, so we temporal cast embedding's + # weight to bfloat16 for now + if args.inference_only: + dlrm.emb_l.bfloat16() + dlrm = ipex.optimize(dlrm, dtype=torch.bfloat16, inplace=inplace) + elif args.int8 and not args.tune: + if args.num_cpu_cores != 0: + torch.set_num_threads(args.num_cpu_cores) + from neural_compressor.torch.quantization import load + dlrm = load(args.save_model) + elif args.int8 and args.tune: + dlrm = dlrm + else: + dlrm = ipex.optimize(dlrm, dtype=torch.float, inplace=True, auto_kernel_selection=True) + with torch.cpu.amp.autocast(enabled=args.bf16): + dlrm = torch.jit.trace(dlrm, (X, lS_o, lS_i), check_trace=True) + dlrm = torch.jit.freeze(dlrm) + dlrm(X, lS_o, lS_i) + dlrm(X, lS_o, lS_i) + return dlrm + + +def run_throughput_benchmark(args, dlrm, test_ld): + bench = ThroughputBenchmark(dlrm) + for j, inputBatch in enumerate(test_ld): + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + bench.add_input(X, lS_o, lS_i) + if args.num_batches > 0 and j == args.num_batches: + break + args.num_batches = args.num_batches if args.num_batches > 0 else j + stats = bench.benchmark( + num_calling_threads=args.share_weight_instance, + num_warmup_iters=100, + num_iters=args.num_batches * args.share_weight_instance, + ) + print(stats) + latency = stats.latency_avg_ms + throughput = (1 / latency) * 1000 * test_ld.dataset.batch_size * args.share_weight_instance + print("throughput: {:.3f} fps".format(throughput)) + print("latency: {:.5f} ms".format(1/throughput * 1000)) + exit(0) + + +def inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + trace=True +): + test_accu = 0 + test_samp = 0 + + if args.print_auc: + scores = [] + targets = [] + + total_time = 0 + total_iter = 0 + if args.inference_only and trace: + dlrm = trace_model(args, dlrm, 
test_ld) + if args.share_weight_instance != 0: + run_throughput_benchmark(args, dlrm, test_ld) + with torch.cpu.amp.autocast(enabled=args.bf16): + for i, testBatch in enumerate(test_ld): + should_print = ((i + 1) % args.print_freq == 0 or i + 1 == len(test_ld)) and args.inference_only + if should_print: + gT = 1000.0 * total_time / total_iter + print( + "Finished {} it {}/{}, {:.2f} ms/it,".format( + "inference", i + 1, len(test_ld), gT + ), + flush=True, + ) + total_time = 0 + total_iter = 0 + # early exit if nbatches was set by the user and was exceeded + if args.inference_only and nbatches > 0 and i >= nbatches: + break + + X_test, lS_o_test, lS_i_test, T_test, W_test, CBPP_test = unpack_batch( + testBatch + ) + + # forward pass + + if not args.inference_only and isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + n_tables = lS_i_test.shape[0] + idx = [lS_i_test[i] for i in range(n_tables)] + offset = [lS_o_test[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + + start = time_wrap() + if not args.inference_only and isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + Z_test = dlrm(X_test, indices, offsets, indices_with_row_offsets) + else: + Z_test = dlrm(X_test, lS_o_test, lS_i_test) + + + total_time += (time_wrap() - start) + total_iter += 1 + + if args.print_auc: + S_test = Z_test.detach().cpu().float().numpy() # numpy array + T_test = T_test.detach().cpu().float().numpy() # numpy array + scores.append(S_test) + targets.append(T_test) + elif not args.inference_only: + with record_function("DLRM accuracy compute"): + # compute loss and accuracy + S_test = Z_test.detach().cpu().float().numpy() # numpy array + T_test = T_test.detach().cpu().float().numpy() # numpy array + + mbs_test = T_test.shape[0] # = mini_batch_size except last + A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8)) + + test_accu += A_test + test_samp += mbs_test + else: + # do nothing to save time + pass + + if args.print_auc: + with record_function("DLRM mlperf sklearn metrics compute"): + scores = np.concatenate(scores, axis=0) + targets = np.concatenate(targets, axis=0) + + metrics = { + "recall": lambda y_true, y_score: sklearn.metrics.recall_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "precision": lambda y_true, y_score: sklearn.metrics.precision_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "f1": lambda y_true, y_score: sklearn.metrics.f1_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "ap": sklearn.metrics.average_precision_score, + "roc_auc": sklearn.metrics.roc_auc_score, + "accuracy": lambda y_true, y_score: sklearn.metrics.accuracy_score( + y_true=y_true, y_pred=np.round(y_score) + ), + } + + validation_results = {} + for metric_name, metric_function in metrics.items(): + validation_results[metric_name] = metric_function(targets, scores) + acc_test = validation_results["accuracy"] + elif not args.inference_only: + acc_test = test_accu / test_samp + else: + pass + + model_metrics_dict = { + "nepochs": args.nepochs, + "nbatches": nbatches, + "nbatches_test": nbatches_test, + } + if not args.inference_only: + model_metrics_dict["test_acc"] = acc_test + + if args.print_auc: + is_best = validation_results["roc_auc"] > best_auc_test + if is_best: + best_auc_test = validation_results["roc_auc"] + model_metrics_dict["test_auc"] = best_auc_test + print( + "recall {:.4f}, precision 
{:.4f},".format( + validation_results["recall"], + validation_results["precision"], + ) + + " f1 {:.4f}, ap {:.4f},".format( + validation_results["f1"], validation_results["ap"] + ) + + " auc {:.4f}, best auc {:.4f},".format( + validation_results["roc_auc"], best_auc_test + ) + + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format( + validation_results["accuracy"] * 100, best_acc_test * 100 + ), + flush=True, + ) + print("Accuracy: {:.34} ".format(validation_results["roc_auc"])) + elif not args.inference_only: + is_best = acc_test > best_acc_test + if is_best: + best_acc_test = acc_test + print( + " accuracy {:3.3f} %, best {:3.3f} %".format( + acc_test * 100, best_acc_test * 100 + ), + flush=True, + ) + else: + pass + if not args.inference_only: + return model_metrics_dict, is_best + else: + return validation_results["roc_auc"] + + +def run(): + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Train Deep Learning Recommendation Model (DLRM)" + ) + # model related parameters + parser.add_argument("--arch-sparse-feature-size", type=int, default=2) + parser.add_argument( + "--arch-embedding-size", type=dash_separated_ints, default="4-3-2" + ) + # j will be replaced with the table number + parser.add_argument("--arch-mlp-bot", type=dash_separated_ints, default="4-3-2") + parser.add_argument("--arch-mlp-top", type=dash_separated_ints, default="4-2-1") + # activations and loss + parser.add_argument("--activation-function", type=str, default="relu") + parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 + parser.add_argument("--round-targets", type=bool, default=False) + # data + parser.add_argument("--num-batches", type=int, default=0) + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument( + "--dataset-multiprocessing", + action="store_true", + default=False, + help="The Kaggle dataset can be multiprocessed in an environment \ + with more than 7 CPU cores and more than 20 GB of memory. 
\n \ + The Terabyte dataset can be multiprocessed in an environment \ + with more than 24 CPU cores and at least 1 TB of memory.", + ) + # training + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--nepochs", type=int, default=1) + parser.add_argument("--learning-rate", type=float, default=0.01) + parser.add_argument("--print-precision", type=int, default=5) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + # inference + parser.add_argument("--inference-only", action="store_true", default=False) + # store/load model + parser.add_argument("--save-model", type=str, default="") + parser.add_argument("--load-model", type=str, default="") + # debugging and profiling + parser.add_argument("--print-freq", type=int, default=1) + parser.add_argument("--test-freq", type=int, default=-1) + parser.add_argument("--test-mini-batch-size", type=int, default=-1) + parser.add_argument("--print-time", action="store_true", default=False) + parser.add_argument("--print-wall-time", action="store_true", default=False) + parser.add_argument("--enable-profiling", action="store_true", default=False) + # stop at target AUC Terabyte (no subsampling) 0.8025 + parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) + parser.add_argument("--mlperf-bin-loader", action="store_true", default=False) + parser.add_argument("--mlperf-bin-shuffle", action="store_true", default=False) + # LR policy + parser.add_argument("--lr-num-warmup-steps", type=int, default=0) + parser.add_argument("--lr-decay-start-step", type=int, default=0) + parser.add_argument("--lr-num-decay-steps", type=int, default=0) + # intel + parser.add_argument("--print-auc", action="store_true", default=False) + parser.add_argument("--should-test", action="store_true", default=False) + parser.add_argument("--bf16", action="store_true", default=False) + parser.add_argument("--share-weight-instance", type=int, default=0) + parser.add_argument("--num-cpu-cores", type=int, default=0) + parser.add_argument("--ipex-interaction", action="store_true", default=False) + parser.add_argument("--ipex-merged-emb", action="store_true", default=False) + parser.add_argument("--num-warmup-iters", type=int, default=1000) + parser.add_argument("--int8", action="store_true", default=False) + parser.add_argument("--dist-backend", type=str, default="ccl") + parser.add_argument("--tune", action="store_true", default=False) + parser.add_argument("--benchmark", action="store_true", default=False) + parser.add_argument("--accuracy_only", action="store_true", default=False) + + global args + global nbatches + global nbatches_test + args = parser.parse_args() + ext_dist.init_distributed(backend=args.dist_backend) + + + ### some basic setup ### + np.random.seed(args.numpy_rand_seed) + np.set_printoptions(precision=args.print_precision) + torch.set_printoptions(precision=args.print_precision) + torch.manual_seed(args.numpy_rand_seed) + + if args.test_mini_batch_size < 0: + # if the parameter is not set, use the training batch size + args.test_mini_batch_size = args.mini_batch_size + + device = torch.device("cpu") + print("Using CPU...") + + ### prepare training data ### + ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") + # input data + train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + + ln_emb = train_data.counts + # enforce maximum limit on number of vectors per embedding 
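+    # Cap each embedding table at max_ind_range rows; categorical indices are
+    # reduced modulo max_ind_range during preprocessing and lookup (see
+    # CriteoDataset.__getitem__), so no index can exceed the capped table size.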
+ if args.max_ind_range > 0: + ln_emb = np.array( + list( + map( + lambda x: x if x < args.max_ind_range else args.max_ind_range, + ln_emb, + ) + ) + ) + else: + ln_emb = np.array(ln_emb) + m_den = train_data.m_den + ln_bot[0] = m_den + + args.ln_emb = ln_emb.tolist() + + ### parse command line arguments ### + m_spa = args.arch_sparse_feature_size + ln_emb = np.asarray(ln_emb) + num_fea = ln_emb.size + 1 # num sparse + num dense features + + m_den_out = ln_bot[ln_bot.size - 1] + # approach 1: all + # num_int = num_fea * num_fea + m_den_out + # approach 2: unique + num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out + + arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top + ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") + + ### construct the neural network specified above ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. + # np.random.seed(args.numpy_rand_seed) + global dlrm + dlrm = DLRM_Net( + m_spa, + ln_emb, + ln_bot, + ln_top, + sigmoid_bot=-1, + sigmoid_top=ln_top.size - 2, + loss_threshold=args.loss_threshold, + ) + if args.ipex_merged_emb: + dlrm.emb_l = ipex.nn.modules.MergedEmbeddingBagWithSGD.from_embeddingbag_list(dlrm.emb_l, lr=args.learning_rate) + dlrm.need_linearize_indices_and_offsets = torch.BoolTensor([False]) + + if not args.inference_only: + optimizer = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate) + lr_scheduler = LRPolicyScheduler( + optimizer, + args.lr_num_warmup_steps, + args.lr_decay_start_step, + args.lr_num_decay_steps, + ) + + ### main loop ### + + # training or inference + best_acc_test = 0 + best_auc_test = 0 + skip_upto_epoch = 0 + skip_upto_batch = 0 + total_time = 0 + total_loss = 0 + total_iter = 0 + total_samp = 0 + + # Load model is specified + if not (args.load_model == ""): + print("Loading saved model {}".format(args.load_model)) + ld_model = torch.load(args.load_model, map_location=torch.device("cpu")) + dlrm.load_state_dict(ld_model["state_dict"]) + ld_j = ld_model["iter"] + ld_k = ld_model["epoch"] + ld_nepochs = ld_model["nepochs"] + ld_nbatches = ld_model["nbatches"] + ld_nbatches_test = ld_model["nbatches_test"] + ld_train_loss = ld_model["train_loss"] + ld_total_loss = ld_model["total_loss"] + ld_acc_test = ld_model["test_acc"] + if not args.inference_only: + optimizer.load_state_dict(ld_model["opt_state_dict"]) + best_acc_test = ld_acc_test + total_loss = ld_total_loss + skip_upto_epoch = ld_k # epochs + skip_upto_batch = ld_j # batches + else: + args.print_freq = ld_nbatches + args.test_freq = 0 + + print( + "Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format( + ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test + ) + ) + print( + "Training state: loss = {:.6f}".format( + ld_train_loss, + ) + ) + print("Testing state: accuracy = {:3.3f} %".format(ld_acc_test * 100)) + del(ld_model) + + ext_dist.barrier() + print("time/loss/accuracy (if enabled):") + + if args.tune: + # evaluation + def eval_func(model): + args.int8 = getattr(model, "is_quantized", False) + with torch.no_grad(): + return inference( + args, + model, + best_acc_test, + best_auc_test, + test_ld, + trace=args.int8 + ) + + # calibration + def calib_fn(model): + calib_number = 0 + for X_test, lS_o_test, lS_i_test, T in train_ld: + if calib_number < 100: + model(X_test, lS_o_test, lS_i_test) + calib_number += 1 + else: + break + + X_test, lS_o_test, lS_i_test, T = next(iter(train_ld)) + example_inputs = (X_test, lS_o_test, lS_i_test) + assert 
args.inference_only, "Please set inference_only in arguments" + from neural_compressor.torch.quantization import StaticQuantConfig, autotune, TuningConfig + tune_config = TuningConfig(config_set=StaticQuantConfig.get_config_set_for_tuning()) + + dlrm = autotune( + dlrm, + tune_config=tune_config, + eval_fn=eval_func, + run_fn=calib_fn, + example_inputs=example_inputs, + ) + dlrm.save(args.save_model) + exit(0) + if args.benchmark: + # To do + print('Not implemented yet') + exit(0) + + if args.accuracy_only: + with torch.no_grad(): + inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld + ) + exit(0) + + + if args.bf16 and not args.inference_only: + for j, inputBatch in enumerate(train_ld): + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + if ext_dist.my_size > 1: + local_bs = X.size()[0] // ext_dist.my_size + rank_id = dlrm.rank + X = X[rank_id * local_bs: (rank_id + 1) * local_bs] + T = T[rank_id * local_bs: (rank_id + 1) * local_bs] + global_bs = local_bs * ext_dist.my_size + lS_o = lS_o[:, :global_bs] + lS_i = lS_i[:, :global_bs] + + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + if ext_dist.my_size > 1: + batch_size = X.size()[0] + g_i = lS_i[dlrm.local_ln_emb] + g_o = lS_o[dlrm.local_ln_emb] + n_tables = g_i.shape[0] + idx = [g_i[i] for i in range(n_tables)] + offset = [g_o[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + else: + n_tables = lS_i.shape[0] + idx = [lS_i[i] for i in range(n_tables)] + offset = [lS_o[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + sample_input = (X, indices, offsets, indices_with_row_offsets) + else: + sample_input = (X, lS_o, lS_i) + break + dlrm, optimizer = ipex.optimize(dlrm, dtype=torch.bfloat16, optimizer=optimizer, inplace=True, sample_input=sample_input) + + if args.ipex_merged_emb: + dlrm.emb_l.to_bfloat16_train() + for i in range(len(dlrm.top_l)): + if isinstance(dlrm.top_l[i], ipex.nn.utils._weight_prepack._IPEXLinear): + if isinstance(dlrm.top_l[i+1], torch.nn.ReLU): + dlrm.top_l[i] = ipex.nn.modules.IPEXLinearEltwise(dlrm.top_l[i], 'relu') + else: + dlrm.top_l[i] = ipex.nn.modules.IPEXLinearEltwise(dlrm.top_l[i], 'sigmoid') + dlrm.top_l[i + 1] = torch.nn.Identity() + for i in range(len(dlrm.bot_l)): + if isinstance(dlrm.bot_l[i], ipex.nn.utils._weight_prepack._IPEXLinear): + if isinstance(dlrm.bot_l[i+1], torch.nn.ReLU): + dlrm.bot_l[i] = ipex.nn.modules.IPEXLinearEltwise(dlrm.bot_l[i], 'relu') + else: + dlrm.bot_l[i] = ipex.nn.modules.IPEXLinearEltwise(dlrm.bot_l[i], 'sigmoid') + dlrm.bot_l[i + 1] = torch.nn.Identity() + + if ext_dist.my_size > 1: + dlrm.bot_l = ext_dist.DDP(dlrm.bot_l) + dlrm.top_l = ext_dist.DDP(dlrm.top_l) + training_record = [0, 0] + def update_training_performance(time, iters, training_record=training_record): + if iters > args.num_warmup_iters: + training_record[0] += time + training_record[1] += 1 + + def print_training_performance( training_record=training_record): + if training_record[0] == 0: + print("num-batches larger than warm up iters, please increase num-batches or decrease warmup iters") + exit() + total_samples = training_record[1] * args.mini_batch_size + throughput = total_samples / 
training_record[0] * 1000 + print("throughput: {:.3f} fps".format(throughput)) + + test_freq = args.test_freq if args.test_freq != -1 else nbatches // 20 + with torch.autograd.profiler.profile( + enabled=args.enable_profiling, use_cuda=False, record_shapes=False + ) as prof: + if not args.inference_only: + k = 0 + while k < args.nepochs: + + if k < skip_upto_epoch: + continue + + for j, inputBatch in enumerate(train_ld): + + if j < skip_upto_batch: + continue + + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + if ext_dist.my_size > 1: + local_bs = X.size()[0] // ext_dist.my_size + rank_id = dlrm.rank + X = X[rank_id * local_bs: (rank_id + 1) * local_bs] + T = T[rank_id * local_bs: (rank_id + 1) * local_bs] + global_bs = local_bs * ext_dist.my_size + lS_o = lS_o[:, :global_bs] + lS_i = lS_i[:, :global_bs] + + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + if ext_dist.my_size > 1: + batch_size = X.size()[0] + g_i = lS_i[dlrm.local_ln_emb] + g_o = lS_o[dlrm.local_ln_emb] + n_tables = g_i.shape[0] + idx = [g_i[i] for i in range(n_tables)] + offset = [g_o[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + else: + n_tables = lS_i.shape[0] + idx = [lS_i[i] for i in range(n_tables)] + offset = [lS_o[i] for i in range(n_tables)] + include_last = [False for i in range(n_tables)] + indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(idx, offset, include_last) + + t1 = time_wrap() + + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + + mbs = T.shape[0] # = args.mini_batch_size except maybe for last + + # forward pass + with torch.cpu.amp.autocast(enabled=args.bf16): + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + Z = dlrm_wrap( + X, + indices, + offsets, + indices_with_row_offsets + ).float() + else: + Z = dlrm_wrap( + X, + lS_o, + lS_i, + ).float() + + # loss + E = loss_fn_wrap(Z, T) + + # compute loss and accuracy + L = E.detach().cpu().numpy() # numpy array + + with record_function("DLRM backward"): + # scaled error gradient propagation + # (where we do not accumulate gradients across mini-batches) + optimizer.zero_grad(set_to_none=True) + # backward pass + E.backward() + + with record_function("DLRM update"): + # optimizer + optimizer.step() + lr_scheduler.step() + if isinstance(dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD): + dlrm.emb_l.sgd_args = dlrm.emb_l.sgd_args._replace(lr=lr_scheduler.get_last_lr()[0]) + + t2 = time_wrap() + total_time += t2 - t1 + + total_loss += L * mbs + total_iter += 1 + total_samp += mbs + + should_print = ((j + 1) % args.print_freq == 0) or ( + j + 1 == nbatches + ) + should_test = ( + (args.should_test) + and (((j + 1) % test_freq == 0) or (j + 1 == nbatches)) + ) + + # print time, loss and accuracy + if should_print or should_test: + gT = 1000.0 * total_time / total_iter if args.print_time else -1 + total_time = 0 + + train_loss = total_loss / total_samp + total_loss = 0 + + str_run_type = ( + "inference" if args.inference_only else "training" + ) + + wall_time = "" + if args.print_wall_time: + wall_time = " ({})".format(time.strftime("%H:%M")) + + print( + "Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format( + str_run_type, j + 1, nbatches, k, gT + ) + + " loss {:.6f}".format(train_loss) + + wall_time, + flush=True, + ) + update_training_performance(gT, 
j) + + total_iter = 0 + total_samp = 0 + + # testing + if should_test: + model_metrics_dict, is_best = inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + ) + + if ( + is_best + and not (args.save_model == "") + and not args.inference_only + ): + model_metrics_dict["epoch"] = k + model_metrics_dict["iter"] = j + 1 + model_metrics_dict["train_loss"] = train_loss + model_metrics_dict["total_loss"] = total_loss + model_metrics_dict[ + "opt_state_dict" + ] = optimizer.state_dict() + print("Saving model to {}".format(args.save_model)) + torch.save(model_metrics_dict, args.save_model) + + if ( + (args.mlperf_auc_threshold > 0) + and (best_auc_test > args.mlperf_auc_threshold) + ): + print( + "MLPerf testing auc threshold " + + str(args.mlperf_auc_threshold) + + " reached, stop training" + ) + k += 1 # nepochs + else: + print("Testing for inference only") + with torch.no_grad(): + inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld + ) + + # profiling + if not args.inference_only: + print_training_performance() + + if args.enable_profiling: + time_stamp = str(datetime.datetime.now()).replace(" ", "_") + with open("dlrm_s_pytorch" + time_stamp + "_shape.prof", "w") as prof_f: + prof_f.write( + prof.key_averages(group_by_input_shape=True).table( + sort_by="self_cpu_time_total" + ) + ) + with open("dlrm_s_pytorch" + time_stamp + "_total.prof", "w") as prof_f: + prof_f.write(prof.key_averages().table(sort_by="self_cpu_time_total")) + prof.export_chrome_trace("dlrm_s_pytorch" + time_stamp + ".json") + exit(0) + +if __name__ == "__main__": + run() diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/extend_distributed.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/extend_distributed.py new file mode 100644 index 00000000000..0b117975b25 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/extend_distributed.py @@ -0,0 +1,424 @@ +import os +import builtins +import numpy as np +import torch +from torch.autograd import Function +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist +try: + import torch_ccl +except ImportError as e: + #print(e) + torch_ccl = False + +my_rank = -1 +my_size = -1 +my_local_rank = -1 +my_local_size = -1 +alltoall_supported = False +allgatherv_supported = False +a2a_impl = os.environ.get('DLRM_ALLTOALL_IMPL', '') + +myreq = None + +def env2int(env_list, default = -1): + for e in env_list: + val = int(os.environ.get(e, -1)) + if val >= 0: return val + return default + +def get_my_slice(n): + my_size = dist.get_world_size() + my_rank = dist.get_rank() + k, m = divmod(n, my_size) + return slice(my_rank * k + min(my_rank, m), (my_rank+1) * k + min(my_rank+1, m), 1) + +def get_split_lengths(n): + my_size = dist.get_world_size() + k, m = divmod(n, my_size) + if m == 0: + splits = None + my_len = k + else: + my_rank = dist.get_rank() + splits = [(k+1) if i < m else k for i in range(my_size)] + my_len = splits[my_rank] + return (my_len, splits) + +def init_distributed(rank = -1, size = -1, backend=''): + global myreq + #global my_rank + global my_size + global my_local_rank + global my_local_size + global a2a_impl + global alltoall_supported + global allgatherv_supported + # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2) + num_mpi_ranks = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE']) + if backend == '' and num_mpi_ranks > 1: + if torch_ccl and env2int(['CCL_WORKER_COUNT']) > 0: + 
backend = 'ccl' + elif dist.is_mpi_available(): + backend = 'mpi' + else: + print("WARNING: MPI multi-process launch detected but PyTorch MPI backend not available.") + backend = 'gloo' + if backend != '': + #guess Rank and size + if rank == -1: + rank = env2int(['PMI_RANK', 'OMPI_COMM_WORLD_RANK', 'MV2_COMM_WORLD_RANK', 'RANK'], 0) + if size == -1: + size = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE'], 1) + if not os.environ.get('RANK', None) and rank != -1: os.environ['RANK'] = str(rank) + if not os.environ.get('WORLD_SIZE', None) and size != -1: os.environ['WORLD_SIZE'] = str(size) + if not os.environ.get('MASTER_PORT', None): os.environ['MASTER_PORT'] = '29500' + if not os.environ.get('MASTER_ADDR', None): + local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1) + if local_size != size and backend != 'mpi': + print("Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default") + print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR") + os.environ['MASTER_ADDR'] = '127.0.0.1' + if size > 1: + dist.init_process_group(backend, rank=rank, world_size=size) + my_rank = dist.get_rank() + my_size = dist.get_world_size() + my_local_rank = env2int(['MPI_LOCALRANKID', 'OMPI_COMM_WORLD_LOCAL_RANK', 'MV2_COMM_WORLD_LOCAL_RANK'], 0) + my_local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1) + if my_rank == 0: print("Running on %d ranks using %s backend" % (my_size, backend)) + if backend == 'ccl': + print("Using CCL_ATL_TRANSPORT=%s" % os.environ.get('CCL_ATL_TRANSPORT', '(default)')) + print("Using CCL_ATL_SHM=%s" % os.environ.get('CCL_ATL_SHM', '(default)')) + if hasattr(dist, 'all_to_all_single'): + try: + # dist.all_to_all_single(torch.empty([0]), torch.empty([0])) + alltoall_supported = True + except RuntimeError: + pass + if a2a_impl == 'alltoall' and alltoall_supported == False: + print("Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall" % (a2a_impl, backend)) + a2a_impl = 'scatter' + if a2a_impl != '': print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl) + try: + x = torch.ones([my_rank]) + y = torch.zeros([(my_size*(my_size-1))//2]) + y = list(y.split([r for r in range(my_size)])) + dist.all_gather(y, x) + allgatherv_supported = True + except RuntimeError: + pass + else: + my_rank = 0 + my_size = 1 + my_local_rank = 0 + my_local_size = 1 + myreq = Request() + +class Request(object): + def __init__(self): + self.req = None + self.tensor = None + self.WaitFunction = All2All_Scatter_Wait + + def wait(self): + ret = self.WaitFunction.apply(*self.tensor) + self.req = None + self.tensor = None + return ret + +class All2All_ScatterList_Req(Function): + @staticmethod + def forward(ctx, a2ai, *inputs): + global myreq + my_rank = dist.get_rank() + #print("All2All_ScatterList_Req:forward") + mb_split_lengths = a2ai.gNS if a2ai.gNS else a2ai.lN + emb_split_lengths = a2ai.gSS if a2ai.gSS else [a2ai.lS] * my_size + gather_list = [] + req_list = [] + for i in range(my_size): + for j in range(emb_split_lengths[i]): + out_tensor = inputs[0].new_empty([a2ai.lN, a2ai.E]) + scatter_list = list(inputs[j].split(mb_split_lengths, dim = 0)) if i == my_rank else [] + req = dist.scatter(out_tensor, scatter_list, src=i, async_op=True) + gather_list.append(out_tensor) + req_list.append(req) + myreq.req = req_list + myreq.tensor = tuple(gather_list) + myreq.a2ai = a2ai + return 
myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + #print("All2All_ScatterList_Req:backward") + for r in myreq.req: + r.wait() + myreq.req = None + grad_inputs = myreq.tensor + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_ScatterList_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + #print("All2All_Scatter_Wait:forward") + ctx.a2ai = myreq.a2ai + for r in myreq.req: + r.wait() + myreq.req = None + myreq.tensor = None + return output + + @staticmethod + def backward(ctx, *grad_output): + global myreq + my_rank = dist.get_rank() + a2ai = ctx.a2ai + grad_output = [t.contiguous() for t in grad_output] + mb_split_lengths = a2ai.gNS if a2ai.gNS else [a2ai.lN] * my_size + per_rank_split_lengths = a2ai.gSS if a2ai.gSS else [a2ai.lS] * my_size + grad_inputs = [grad_output[0].new_empty([ctx.a2ai.N, ctx.a2ai.E]) for _ in range(a2ai.lS)] + req_list = [] + ind = 0 + for i in range(my_size): + for j in range(per_rank_split_lengths[i]): + gather_list = list(grad_inputs[j].split(mb_split_lengths, dim = 0)) if i == my_rank else None + req = dist.gather(grad_output[ind], gather_list, dst = i, async_op=True) + req_list.append(req) + ind += 1 + myreq.req = req_list + myreq.tensor = grad_inputs + return tuple(grad_output) + + + +class All2All_Scatter_Req(Function): + @staticmethod + def forward(ctx, a2ai, *inputs): + global myreq + #print("All2All_Scatter_Req:forward") + my_rank = dist.get_rank() + mb_split_lengths = a2ai.gNS if a2ai.gNS else a2ai.lN + emb_split_lengths = a2ai.gSS if a2ai.gSS else [a2ai.lS] * my_size + input = torch.cat(inputs, dim=1) + scatter_list = list(input.split(mb_split_lengths, dim=0)) + gather_list = [] + req_list = [] + for i in range(my_size): + out_tensor = input.new_empty([a2ai.lN, emb_split_lengths[i] * a2ai.E]) + req = dist.scatter(out_tensor, scatter_list if i == my_rank else [], src=i, async_op=True) + gather_list.append(out_tensor) + req_list.append(req) + myreq.req = req_list + myreq.tensor = tuple(gather_list) + myreq.a2ai = a2ai + ctx.a2ai = a2ai + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + #print("All2All_Scatter_Req:backward") + for r in myreq.req: + r.wait() + myreq.req = None + grad_input = myreq.tensor + grad_inputs = grad_input.split(ctx.a2ai.E, dim=1) + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_Scatter_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + #print("All2All_Scatter_Wait:forward") + ctx.a2ai = myreq.a2ai + for r in myreq.req: + r.wait() + myreq.req = None + myreq.tensor = None + return output + + @staticmethod + def backward(ctx, *grad_output): + global myreq + my_rank = dist.get_rank() + #print("All2All_Scatter_Wait:backward") + assert len(grad_output) == my_size + scatter_list = [t.contiguous() for t in grad_output] + a2ai = ctx.a2ai + mb_split_lengths = a2ai.gNS if a2ai.gNS else a2ai.lN + emb_split_lengths = a2ai.gSS if a2ai.gSS else [a2ai.lS] * my_size + grad_input = grad_output[0].new_empty([a2ai.N, a2ai.E*a2ai.lS]) + gather_list = list(grad_input.split(mb_split_lengths, dim=0)) + req_list = [] + for i in range(my_size): + #req = dist.scatter(gather_list[i], scatter_list if i == my_rank else [], src=i, async_op=True) + req = dist.gather(scatter_list[i], gather_list if i == my_rank else [], dst=i, async_op=True) + req_list.append(req) + myreq.req = req_list + myreq.tensor = grad_input + return grad_output + + +class All2All_Req(Function): + @staticmethod + def 
forward(ctx, a2ai, *inputs): + global myreq + #print("All2All_Req:forward") + mb_split_lengths = a2ai.gNS + if mb_split_lengths: mb_split_lengths = [m * a2ai.lS * a2ai.E for m in mb_split_lengths] + emb_split_lengths = a2ai.gSS + if emb_split_lengths: emb_split_lengths = [a2ai.lN * e * a2ai.E for e in emb_split_lengths] + input = torch.cat(inputs, dim=1).view([-1]) + output = input.new_empty([a2ai.S*a2ai.lN*a2ai.E]) + req = dist.all_to_all_single(output, input, emb_split_lengths, mb_split_lengths, async_op=True) + myreq.req = req + myreq.tensor = [] + myreq.tensor.append(output) + myreq.tensor = tuple(myreq.tensor) + a2ai.mb_split_lengths = mb_split_lengths + a2ai.emb_split_lengths = emb_split_lengths + myreq.a2ai = a2ai + ctx.a2ai = a2ai + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + #print("All2All_Req:backward") + a2ai = ctx.a2ai + myreq.req.wait() + myreq.req = None + grad_input = myreq.tensor + grad_inputs = grad_input.view([a2ai.N, -1]).split(a2ai.E, dim=1) + grad_inputs = [gin.contiguous() for gin in grad_inputs] + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + #print("All2All_Wait:forward") + a2ai = myreq.a2ai + ctx.a2ai = a2ai + myreq.req.wait() + myreq.req = None + myreq.tensor = None + emb_split_lengths = a2ai.emb_split_lengths if a2ai.emb_split_lengths else a2ai.lS * a2ai.lN * a2ai.E + outputs = output[0].split(emb_split_lengths) + outputs = tuple([out.view([a2ai.lN, -1]) for out in outputs]) + return outputs + + @staticmethod + def backward(ctx, *grad_outputs): + global myreq + #print("All2All_Wait:backward") + a2ai = ctx.a2ai + grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs] + grad_output = torch.cat(grad_outputs) + grad_input = grad_output.new_empty([a2ai.N * a2ai.lS * a2ai.E]) + req = dist.all_to_all_single(grad_input, grad_output, a2ai.mb_split_lengths, a2ai.emb_split_lengths, async_op=True) + myreq.req = req + myreq.tensor = grad_input + return (grad_output,) + +class AllGather(Function): + + @staticmethod + def forward(ctx, input, global_lengths, dim=0): + if not isinstance(global_lengths, (list, tuple)): + global_lengths = [global_lengths] * my_size + my_rank = dist.get_rank() + assert(len(global_lengths) == my_size) + assert(global_lengths[my_rank] == input.size(dim)) + local_start = sum(global_lengths[:my_rank]) + + output_size = list(input.size()) + + ctx.dim = dim + ctx.local_start = local_start + ctx.local_length = global_lengths[my_rank] + + input = input.contiguous() + if dim == 0: + out_len = sum(global_lengths) + output_size[dim] = out_len + output = input.new_empty(output_size) + gather_list = list(output.split(global_lengths, dim=0)) + else: + gather_list = [torch.empty_like(input) for _ in range(my_size)] + gather_list = [] + for l in global_lengths: + output_size[dim] = l + gather_list.append(input.new_empty(output_size)) + + dist.all_gather(gather_list, input) + + if dim != 0: + output = torch.cat(gather_list, dim=dim) + + return output + + @staticmethod + def backward(ctx, grad_output): + # print("Inside All2AllBackward") + dim = ctx.dim + start = ctx.local_start + length = ctx.local_length + + grad_input = grad_output.narrow(dim, start, length) + + return (grad_input, None, None) + +class All2AllInfo(object): + pass + +def alltoall(inputs, per_rank_split_lengths): + global myreq + N, E = inputs[0].size() + a2ai = All2AllInfo() + a2ai.lS = len(inputs) + a2ai.gSS = per_rank_split_lengths + 
a2ai.lN, a2ai.gNS = get_split_lengths(N) + a2ai.E = E + a2ai.N = N + a2ai.S = sum(per_rank_split_lengths) if per_rank_split_lengths else a2ai.lS * my_size + if a2a_impl == '' and alltoall_supported or a2a_impl == 'alltoall': + output = All2All_Req.apply(a2ai, *inputs) + myreq.WaitFunction = All2All_Wait + elif a2a_impl == '' or a2a_impl == 'scatter': + #print("Using All2All_Scatter_Req") + output = All2All_Scatter_Req.apply(a2ai, *inputs) + myreq.WaitFunction = All2All_Scatter_Wait + elif a2a_impl == 'scatter_list': + #print("Using All2All_ScatterList_Req") + output = All2All_ScatterList_Req.apply(a2ai, *inputs) + myreq.WaitFunction = All2All_ScatterList_Wait + else: + print("Unknown value set for DLRM_ALLTOALL_IMPL (%s), please use one of [alltoall, scatter, scatter_list]" % a2a_impl) + return myreq + +def shuffle_data(inputs): + input = torch.cat(inputs) + output = input.new_empty(input.size()) + req = dist.all_to_all_single(output, input) + output = output.reshape(my_size, -1) + return output + + +def all_gather(input, lengths, dim=0): + #print("lengths: ", lengths) + if not lengths: lengths = [input.size(0)] * my_size + return AllGather.apply(input, lengths, dim) + +def barrier(): + if my_size > 1: + dist.barrier() + diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/requirements.txt new file mode 100644 index 00000000000..859bbfc346b --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/requirements.txt @@ -0,0 +1,8 @@ +future +numpy +pydot +neural-compressor +scikit-learn +tqdm +torch>=1.11.0 +intel_extension_for_pytorch>=1.11.0 \ No newline at end of file diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh new file mode 100755 index 00000000000..3089868c3a0 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh @@ -0,0 +1,98 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + tuned_checkpoint=saved_results + batch_size=16384 + iters=100 + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + MODEL_SCRIPT=dlrm_s_pytorch.py + + # Create the output directory in case it doesn't already exist + mkdir -p ${tuned_checkpoint}/dlrm_inference_accuracy_log + + LOG=${tuned_checkpoint}/dlrm_inference_accuracy_log + + CORES=`lscpu | grep Core | awk '{print $4}'` + + ARGS="" + if [[ ${int8} == "true" ]]; then + echo "running int8 path" + ARGS="$ARGS --int8" + else + echo "running fp32 path" + fi + + if [[ ${mode} == "accuracy" ]]; then + python -u $MODEL_SCRIPT \ + --raw-data-file=${dataset_location}/day --processed-data-file=${dataset_location}/terabyte_processed.npz \ + --data-set=terabyte \ + --memory-map --mlperf-bin-loader --round-targets=True 
--learning-rate=1.0 \ + --arch-mlp-bot=13-512-256-128 --arch-mlp-top=1024-1024-512-256-1 \ + --arch-sparse-feature-size=128 --max-ind-range=40000000 \ + --numpy-rand-seed=727 --inference-only --ipex-interaction \ + --print-freq=100 --print-time --mini-batch-size=2048 --test-mini-batch-size=16384 \ + --save-model ${tuned_checkpoint} --test-freq=2048 --print-auc $ARGS \ + --load-model=${input_model} --accuracy_only + elif [[ ${mode} == "performance" ]]; then + python -u $MODEL_SCRIPT \ + --raw-data-file=${dataset_location}/day --processed-data-file=${dataset_location}/terabyte_processed.npz \ + --data-set=terabyte --benchmark \ + --memory-map --mlperf-bin-loader --round-targets=True --learning-rate=1.0 \ + --arch-mlp-bot=13-512-256-128 --arch-mlp-top=1024-1024-512-256-1 \ + --arch-sparse-feature-size=128 --max-ind-range=40000000 --ipex-interaction \ + --numpy-rand-seed=727 --inference-only --num-batches=1000 \ + --print-freq=10 --print-time --mini-batch-size=128 --test-mini-batch-size=${batch_size} \ + --save-model ${tuned_checkpoint} + else + echo "Error: No such mode: ${mode}" + exit 1 + fi +} + +main "$@" diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_quant.sh b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_quant.sh new file mode 100755 index 00000000000..58d8b1fe491 --- /dev/null +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_quant.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + tuned_checkpoint=saved_results + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +CORES=`lscpu | grep Core | awk '{print $4}'` +# use first socket +numa_cmd="numactl -C 0-$((CORES-1)) " +echo "will run on core 0-$((CORES-1)) on socket 0" + +export OMP_NUM_THREADS=$CORES + +# run_tuning +function run_tuning { + MODEL_SCRIPT=dlrm_s_pytorch.py + + # Create the output directory in case it doesn't already exist + mkdir -p ${tuned_checkpoint}/dlrm_inference_accuracy_log + + LOG=${tuned_checkpoint}/dlrm_inference_accuracy_log + CORES=`lscpu | grep Core | awk '{print $4}'` + ARGS="" + + $numa_cmd python -u $MODEL_SCRIPT \ + --raw-data-file=${dataset_location}/day --processed-data-file=${dataset_location}/terabyte_processed.npz \ + --data-set=terabyte \ + --memory-map --mlperf-bin-loader --round-targets=True --learning-rate=1.0 \ + --arch-mlp-bot=13-512-256-128 --arch-mlp-top=1024-1024-512-256-1 \ + --arch-sparse-feature-size=128 --max-ind-range=40000000 \ + --numpy-rand-seed=727 --inference-only --ipex-interaction \ + --print-freq=100 --print-time --mini-batch-size=2048 --test-mini-batch-size=16384 \ + --test-freq=2048 --print-auc --tune --save-model=${tuned_checkpoint} $ARGS \ + --load-model=${input_model} --num-cpu-cores=${CORES} | tee $LOG +} + +main "$@" diff --git a/neural_compressor/torch/quantization/autotune.py b/neural_compressor/torch/quantization/autotune.py index 279b6be4633..e54a9d97748 100644 --- a/neural_compressor/torch/quantization/autotune.py +++ b/neural_compressor/torch/quantization/autotune.py @@ -93,17 +93,19 @@ def autotune( tuning_logger.trial_end(trial_index) if tuning_monitor.need_stop(): 
logger.info("Stopped tuning.") - del q_model # maybe gc.collect() is needed for memory release - best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() - # !!! Make sure to use deepcopy only when inplace is set to `True`. - q_model = quantize( - deepcopy(model), - quant_config=best_quant_config, - run_fn=run_fn, - run_args=run_args, - inplace=True, - example_inputs=example_inputs, - ) + if trial_index == 0: # recover the best q_model from previous results. + logger.info("Reconvering the best quantized model...") + del q_model # maybe gc.collect() is needed for memory release + best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() + # !!! Make sure to use deepcopy only when inplace is set to `True`. + q_model = quantize( + deepcopy(model), + quant_config=best_quant_config, + run_fn=run_fn, + run_args=run_args, + inplace=True, + example_inputs=example_inputs, + ) best_quant_model = q_model # quantize model inplace break tuning_logger.tuning_end() diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 57197a91972..ff8298dad88 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -97,6 +97,7 @@ def quantize( example_inputs=example_inputs, mode=Mode.QUANTIZE, ) + setattr(q_model, "is_quantized", True) return q_model @@ -152,7 +153,7 @@ def prepare( example_inputs=example_inputs, mode=Mode.PREPARE, ) - setattr(prepared_model, "prepared", True) + setattr(prepared_model, "is_prepared", True) setattr(prepared_model, "quant_config", quant_config) setattr(prepared_model, "example_inputs", example_inputs) return prepared_model @@ -177,12 +178,12 @@ def convert( q_model = model if inplace else copy.deepcopy(model) # TODO: Optimize the check for prepared flag after adding HQT FP8 Quant - assert getattr(model, "prepared", False), "Please run prepare function before convert." + assert getattr(model, "is_prepared", False), "Please run prepare function before convert." - if getattr(model, "prepared", False): + if getattr(model, "is_prepared", False): if quant_config is None: quant_config = model.quant_config - example_inputs = model.example_inputs if getattr(model, "prepared", False) else None + example_inputs = model.example_inputs if getattr(model, "is_prepared", False) else None registered_configs = config_registry.get_cls_configs() if isinstance(quant_config, dict): @@ -215,4 +216,5 @@ def convert( example_inputs=example_inputs, mode=Mode.CONVERT, ) + setattr(q_model, "is_quantized", True) return q_model