From 81b77c8776d82c61afd1ba0834b60d37923b8615 Mon Sep 17 00:00:00 2001 From: sreedes <70613743+sreedes@users.noreply.github.com> Date: Sat, 18 Dec 2021 04:39:09 +0530 Subject: [PATCH 01/13] fix: Model Registration with BYO scripts (#2797) Co-authored-by: Basil Beirouti Co-authored-by: Payton Staub Co-authored-by: Ahsan Khan Co-authored-by: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Co-authored-by: Basil Beirouti Co-authored-by: Payton Staub Co-authored-by: Shreya Pandit --- src/sagemaker/model.py | 23 +++++++++++-------- tests/integ/test_mxnet.py | 48 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 9 deletions(-) diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 4461345fa0..5af5539a96 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -178,21 +178,26 @@ def register( """ if self.model_data is None: raise ValueError("SageMaker Model Package cannot be created without model data.") + if image_uri is not None: + self.image_uri = image_uri + if model_package_group_name is not None: + container_def = self.prepare_container_def() + else: + container_def = {"Image": self.image_uri, "ModelDataUrl": self.model_data} model_pkg_args = sagemaker.get_model_package_args( content_types, response_types, inference_instances, transform_instances, - model_package_name, - model_package_group_name, - self.model_data, - image_uri or self.image_uri, - model_metrics, - metadata_properties, - marketplace_cert, - approval_status, - description, + model_package_name=model_package_name, + model_package_group_name=model_package_group_name, + model_metrics=model_metrics, + metadata_properties=metadata_properties, + marketplace_cert=marketplace_cert, + approval_status=approval_status, + description=description, + container_def_list=[container_def], drift_check_baselines=drift_check_baselines, ) model_package = self.sagemaker_session.create_model_package_from_containers( diff --git a/tests/integ/test_mxnet.py b/tests/integ/test_mxnet.py index 65c89c5876..d13108d471 100644 --- a/tests/integ/test_mxnet.py +++ b/tests/integ/test_mxnet.py @@ -231,6 +231,54 @@ def test_register_model_package( sagemaker_session.sagemaker_client.delete_model_package(ModelPackageName=model_package_name) +def test_register_model_package_versioned( + mxnet_training_job, + sagemaker_session, + mxnet_inference_latest_version, + mxnet_inference_latest_py_version, + cpu_instance_type, +): + endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp()) + + with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): + desc = sagemaker_session.sagemaker_client.describe_training_job( + TrainingJobName=mxnet_training_job + ) + model_package_group_name = "register-model-package-{}".format(sagemaker_timestamp()) + sagemaker_session.sagemaker_client.create_model_package_group( + ModelPackageGroupName=model_package_group_name + ) + model_data = desc["ModelArtifacts"]["S3ModelArtifacts"] + script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py") + model = MXNetModel( + model_data, + "SageMakerRole", + entry_point=script_path, + py_version=mxnet_inference_latest_py_version, + sagemaker_session=sagemaker_session, + framework_version=mxnet_inference_latest_version, + ) + model_pkg = model.register( + content_types=["application/json"], + response_types=["application/json"], + inference_instances=["ml.m5.large"], + transform_instances=["ml.m5.large"], + model_package_group_name=model_package_group_name, + approval_status="Approved", + ) + 
assert isinstance(model_pkg, ModelPackage) + predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name) + data = numpy.zeros(shape=(1, 1, 28, 28)) + result = predictor.predict(data) + assert result is not None + sagemaker_session.sagemaker_client.delete_model_package( + ModelPackageName=model_pkg.model_package_arn + ) + sagemaker_session.sagemaker_client.delete_model_package_group( + ModelPackageGroupName=model_package_group_name + ) + + def test_deploy_model_with_tags_and_kms( mxnet_training_job, sagemaker_session, From c8ca3b723844f16a1cf06663b8da426ee35344dd Mon Sep 17 00:00:00 2001 From: Navin Soni Date: Mon, 27 Dec 2021 23:01:24 +0000 Subject: [PATCH 02/13] fix: Add ContentType in test_auto_ml_describe --- tests/integ/test_auto_ml.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/integ/test_auto_ml.py b/tests/integ/test_auto_ml.py index 1d1e656144..617ebae539 100644 --- a/tests/integ/test_auto_ml.py +++ b/tests/integ/test_auto_ml.py @@ -15,12 +15,12 @@ import os import pytest -import tests.integ -from sagemaker import AutoML, CandidateEstimator, AutoMLInput - from botocore.exceptions import ClientError + +import tests.integ +from sagemaker import AutoML, AutoMLInput, CandidateEstimator from sagemaker.utils import unique_name_from_base -from tests.integ import DATA_DIR, AUTO_ML_DEFAULT_TIMEMOUT_MINUTES, auto_ml_utils +from tests.integ import AUTO_ML_DEFAULT_TIMEMOUT_MINUTES, DATA_DIR, auto_ml_utils from tests.integ.timeout import timeout ROLE = "SageMakerRole" @@ -169,6 +169,7 @@ def test_auto_ml_describe_auto_ml_job(sagemaker_session): } }, "TargetAttributeName": TARGET_ATTRIBUTE_NAME, + "ContentType": "text/csv;header=present", } ] expected_default_output_config = { @@ -205,6 +206,7 @@ def test_auto_ml_attach(sagemaker_session): } }, "TargetAttributeName": TARGET_ATTRIBUTE_NAME, + "ContentType": "text/csv;header=present", } ] expected_default_output_config = { From 71043371f4abd7926d30d064670903a4afa192ba Mon Sep 17 00:00:00 2001 From: Payton Staub Date: Mon, 27 Dec 2021 15:33:34 -0600 Subject: [PATCH 03/13] fix: Re-deploy static integ test endpoint if it is not found --- tests/integ/sagemaker/lineage/conftest.py | 27 +++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tests/integ/sagemaker/lineage/conftest.py b/tests/integ/sagemaker/lineage/conftest.py index dfc1ce585a..5b814bab5b 100644 --- a/tests/integ/sagemaker/lineage/conftest.py +++ b/tests/integ/sagemaker/lineage/conftest.py @@ -36,8 +36,8 @@ from tests.integ.sagemaker.lineage.helpers import name, names SLEEP_TIME_SECONDS = 1 -STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline14" -STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint14" +STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline15" +STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint15" @pytest.fixture @@ -518,6 +518,13 @@ def _get_static_pipeline_execution_arn(sagemaker_session): def static_endpoint_context(sagemaker_session, static_pipeline_execution_arn): endpoint_arn = get_endpoint_arn_from_static_pipeline(sagemaker_session) + if endpoint_arn is None: + _deploy_static_endpoint( + execution_arn=static_pipeline_execution_arn, + sagemaker_session=sagemaker_session, + ) + endpoint_arn = get_endpoint_arn_from_static_pipeline(sagemaker_session) + contexts = sagemaker_session.sagemaker_client.list_contexts(SourceUri=endpoint_arn)[ "ContextSummaries" ] @@ -584,11 +591,17 @@ def static_dataset_artifact(static_model_artifact, sagemaker_session): def 
get_endpoint_arn_from_static_pipeline(sagemaker_session): - endpoint_arn = sagemaker_session.sagemaker_client.describe_endpoint( - EndpointName=STATIC_ENDPOINT_NAME - )["EndpointArn"] + try: + endpoint_arn = sagemaker_session.sagemaker_client.describe_endpoint( + EndpointName=STATIC_ENDPOINT_NAME + )["EndpointArn"] - return endpoint_arn + return endpoint_arn + except ClientError as e: + error = e.response["Error"] + if error["Code"] == "ValidationException": + return None + raise e def get_model_package_arn_from_static_pipeline(pipeline_execution_arn, sagemaker_session): @@ -654,7 +667,7 @@ def _deploy_static_endpoint(execution_arn, sagemaker_session): sagemaker_session=sagemaker_session, ) model_package.deploy(1, "ml.t2.medium", endpoint_name=STATIC_ENDPOINT_NAME) - time.sleep(60) + time.sleep(120) except ClientError as e: if e.response["Error"]["Code"] == "ValidationException": print(f"Endpoint {STATIC_ENDPOINT_NAME} already exists. Continuing.") From ad29a0cd7d2d4cfa5b7a3823951e9926aaa37f53 Mon Sep 17 00:00:00 2001 From: Miyoung Date: Thu, 30 Dec 2021 07:30:29 -0800 Subject: [PATCH 04/13] documentation :SageMaker model parallel library 1.6.0 API doc (#2814) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update smdmp change log, archive api doc for 1.4.0 and 1.5.0 * add no-index flags * finish api doc archive * fix: Set ProcessingStep upload locations deterministically to avoid c… (#2790) * fix: Prevent repack_model script from referencing nonexistent directories (#2755) Co-authored-by: Payton Staub Co-authored-by: Ahsan Khan * fix: S3Input - add support for instance attributes (#2754) * fix: typos and broken link (#2765) Co-authored-by: Shreya Pandit * add all api docs * add appendix, fix links * structural changes, fix links * incorporate feedback * prepare release v2.72.1 * update development version to v2.72.2.dev0 Co-authored-by: Payton Staub Co-authored-by: Payton Staub Co-authored-by: Ahsan Khan Co-authored-by: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Co-authored-by: Mohamed Ali Jamaoui Co-authored-by: Shreya Pandit Co-authored-by: ci Co-authored-by: Jeniya Tabassum --- doc/api/training/smd_data_parallel.rst | 6 +- doc/api/training/smd_model_parallel.rst | 64 +- .../training/smd_model_parallel_general.rst | 683 +++++++------- .../smd_model_parallel_change_log.rst | 75 +- doc/api/training/smp_versions/archives.rst | 10 + doc/api/training/smp_versions/latest.rst | 26 +- .../latest/smd_model_parallel_common_api.rst | 100 ++- .../latest/smd_model_parallel_pytorch.rst | 125 ++- ...model_parallel_pytorch_tensor_parallel.rst | 835 ++++++++++++++++++ .../latest/smd_model_parallel_tensorflow.rst | 9 +- .../v1.4.0/smd_model_parallel_common_api.rst | 488 ++++++++++ .../v1.4.0/smd_model_parallel_pytorch.rst | 572 ++++++++++++ .../v1.4.0/smd_model_parallel_tensorflow.rst | 172 ++++ .../v1.5.0/smd_model_parallel_common_api.rst | 488 ++++++++++ .../v1.5.0/smd_model_parallel_pytorch.rst | 572 ++++++++++++ .../v1.5.0/smd_model_parallel_tensorflow.rst | 172 ++++ doc/api/training/smp_versions/v1_4_0.rst | 12 + doc/api/training/smp_versions/v1_5_0.rst | 12 + 18 files changed, 3979 insertions(+), 442 deletions(-) create mode 100644 doc/api/training/smp_versions/archives.rst create mode 100644 doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst create mode 100644 doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst create mode 100644 
doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst create mode 100644 doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst create mode 100644 doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst create mode 100644 doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst create mode 100644 doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst create mode 100644 doc/api/training/smp_versions/v1_4_0.rst create mode 100644 doc/api/training/smp_versions/v1_5_0.rst diff --git a/doc/api/training/smd_data_parallel.rst b/doc/api/training/smd_data_parallel.rst index 27c0e5dea7..14f70a777f 100644 --- a/doc/api/training/smd_data_parallel.rst +++ b/doc/api/training/smd_data_parallel.rst @@ -1,6 +1,6 @@ -########################## -Distributed data parallel -########################## +############################################### +The SageMaker Distributed Data Parallel Library +############################################### SageMaker's distributed data parallel library extends SageMaker’s training capabilities on deep learning models with near-linear scaling efficiency, diff --git a/doc/api/training/smd_model_parallel.rst b/doc/api/training/smd_model_parallel.rst index 47a0af6775..c40bc258fb 100644 --- a/doc/api/training/smd_model_parallel.rst +++ b/doc/api/training/smd_model_parallel.rst @@ -1,5 +1,5 @@ -Distributed model parallel --------------------------- +The SageMaker Distributed Model Parallel Library +------------------------------------------------ The Amazon SageMaker distributed model parallel library is a model parallelism library for training large deep learning models that were previously difficult to train due to GPU memory limitations. @@ -9,49 +9,35 @@ allowing you to increase prediction accuracy by creating larger models with more You can use the library to automatically partition your existing TensorFlow and PyTorch workloads across multiple GPUs with minimal code changes. The library's API can be accessed through the Amazon SageMaker SDK. -Use the following sections to learn more about the model parallelism and the library. - -Use with the SageMaker Python SDK -================================= - -Use the following page to learn how to configure and enable distributed model parallel -when you configure an Amazon SageMaker Python SDK `Estimator`. +See the following sections to learn more about the SageMaker model parallel library APIs. .. toctree:: - :maxdepth: 1 + :maxdepth: 3 + smp_versions/latest smd_model_parallel_general -API Documentation -================= - -The library contains a Common API that is shared across frameworks, as well as APIs -that are specific to supported frameworks, TensorFlow and PyTorch. - -Select a version to see the API documentation for version. To use the library, reference the -**Common API** documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - smp_versions/latest.rst - smp_versions/v1_3_0.rst - smp_versions/v1_2_0.rst - smp_versions/v1_1_0.rst - -It is recommended to use this documentation alongside `SageMaker Distributed Model Parallel -`__ in the Amazon SageMaker -developer guide. This developer guide documentation includes: - - An overview of model parallelism and the library - `core features `__ - - Instructions on how to modify `TensorFlow - `__ - and `PyTorch - `__ - training scripts - - `Configuration tips and pitfalls - `__ +.. 
tip:: + + We recommended using this API documentation with the conceptual guide at + `SageMaker's Distributed Model Parallel + `_ + in the *Amazon SageMaker developer guide*. This developer guide documentation includes: + + - An overview of model parallelism, and the library's + `core features `_, + and `extended features for PyTorch `_. + - Instructions on how to modify `TensorFlow + `_ + and `PyTorch + `_ + training scripts. + - Instructions on how to `run a distributed training job using the SageMaker Python SDK + and the SageMaker model parallel library + `_. + - `Configuration tips and pitfalls + `_. .. important:: diff --git a/doc/api/training/smd_model_parallel_general.rst b/doc/api/training/smd_model_parallel_general.rst index 03c9c0078a..71f9115580 100644 --- a/doc/api/training/smd_model_parallel_general.rst +++ b/doc/api/training/smd_model_parallel_general.rst @@ -1,338 +1,212 @@ -.. admonition:: Contents +################################# +Use with the SageMaker Python SDK +################################# - - :ref:`sm-sdk-modelparallel-params` - - :ref:`ranking-basics` +Walk through the following pages to learn about the SageMaker model parallel library's APIs +to configure and enable distributed model parallelism +through an Amazon SageMaker estimator. .. _sm-sdk-modelparallel-params: -Required SageMaker Python SDK parameters -======================================== - -The TensorFlow and PyTorch ``Estimator`` objects contains a ``distribution`` parameter, -which is used to enable and specify parameters for the -initialization of the SageMaker distributed model parallel library. The library internally uses MPI, -so in order to use model parallelism, MPI must also be enabled using the ``distribution`` parameter. - -The following is an example of how you can launch a new PyTorch training job with the library. - -.. code-block:: python3 - - sagemaker_session = sagemaker.session.Session(boto_session=session) - - mpi_options = { - "enabled" : True, - "processes_per_host" : 8, - "custom_mpi_options" : "--mca btl_vader_single_copy_mechanism none " - } - - smp_options = { - "enabled":True, - "parameters": { - "microbatches": 4, - "placement_strategy": "spread", - "pipeline": "interleaved", - "optimize": "speed", - "partitions": 2, - "ddp": True, - } - } - - smd_mp_estimator = PyTorch( - entry_point="training-script.py", # Pick your train script - source_dir='utils', - role=role, - instance_type='ml.p3.16xlarge', - sagemaker_session=sagemaker_session, - framework_version='1.6.0', - py_version='py3', - instance_count=1, - distribution={ - "smdistributed": {"modelparallel": smp_options}, - "mpi": mpi_options - }, - base_job_name="SMD-MP-demo", - ) - - smd_mp_estimator.fit('s3://my_bucket/my_training_data/') - -``smdistributed`` Parameters -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can use the following parameters to initialize the library using the ``parameters`` -in the ``smdistributed`` of ``distribution``. - -Note: ``partitions`` is required in ``parameters`` of ``smp_options``. All other parameters in the following -table are optional. - -.. 
table:: - :widths: 10 20 10 60 +Configuration Parameters for ``distribution`` +============================================= - +---------------------------+-------------------------+-------------------+-----------------------+ - | **Parameter** | **Type / Valid values** | **Default** | **Description** | - | | | | | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``partitions`` (required) | int | - | The number of | - | | | | partitions to | - | | | | split the model | - | | | | into. | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``microbatches`` | int | 1 | The number of | - | | | | microbatches to | - | | | | perform | - | | | | pipelining | - | | | | over. 1 means | - | | | | no pipelining. | - | | | | Batch size must | - | | | | be divisible by | - | | | | the number of | - | | | | microbatches. | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``pipeline`` | ``"interleaved"`` | ``"interleaved"`` | The pipeline | - | | or ``"simple"`` | | schedule. | - | | | | | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``optimize`` | ``"memory"`` or | ``"memory"`` | Whether the library | - | | ``"speed"`` | | should optimize | - | | | | for speed or | - | | | | memory during | - | | | | partitioning | - | | | | decision and | - | | | | pipeline | - | | | | execution. | - | | | | | - | | | | | - | | | | **speed** | - | | | | When the library is | - | | | | configured to | - | | | | optimize speed, | - | | | | it attempts to | - | | | | balance the | - | | | | number of | - | | | | operations | - | | | | executed in | - | | | | each device, | - | | | | and executes a | - | | | | less strict | - | | | | pipeline | - | | | | schedule in | - | | | | which a | - | | | | microbatch can | - | | | | start executing | - | | | | before the | - | | | | previous | - | | | | microbatch is | - | | | | completely | - | | | | finished on | - | | | | that device. | - | | | | | - | | | | | - | | | | **memory** | - | | | | When the library | - | | | | optimizes | - | | | | memory, it | - | | | | attempts to | - | | | | balance the | - | | | | total number of | - | | | | stored | - | | | | trainable | - | | | | parameters and | - | | | | activations on | - | | | | each device and | - | | | | imposes a | - | | | | strict pipeline | - | | | | schedule on the | - | | | | backend. | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``placement_strategy`` | ``"cluster"`` or | ``"cluster"`` | When hybrid | - | | ``"spread"`` | | model/data | - | | | | parallelism is | - | | | | used, | - | | | | cluster | - | | | | places a single | - | | | | model replica | - | | | | in neighboring | - | | | | device IDs, | - | | | | whereas | - | | | | spread | - | | | | places them as | - | | | | far as | - | | | | possible. | - | | | | | - | | | | Example: | - | | | | - 8 GPUs: [0, | - | | | | 1, 2, 3, 4, 5, | - | | | | 6, 7], 4-way | - | | | | model | - | | | | parallelism, | - | | | | 2-way data | - | | | | parallelism. | - | | | | Two model | - | | | | replicas, each | - | | | | partitioned | - | | | | across 4 GPUs. | - | | | | | - | | | | | - | | | | **spread** | - | | | | places | - | | | | the two model | - | | | | replicas in [0, | - | | | | 2, 4, 6] and | - | | | | [1, 3, 5, 7]. 
| - | | | | | - | | | | | - | | | | **cluster** | - | | | | places the two | - | | | | model replicas | - | | | | in [0, 1, 2, 3] | - | | | | and [4, 5, 6, 7]. | - | | | | | - | | | | This can be | - | | | | useful, for | - | | | | instance, for | - | | | | performing | - | | | | model | - | | | | parallelism | - | | | | across | - | | | | instances, and | - | | | | leaving the | - | | | | intra-node | - | | | | high-bandwidth | - | | | | NVLinks for | - | | | | data | - | | | | parallelism. | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``auto_partition`` | bool | ``True`` | Enable | - | | | | auto-partitioning. | - | | | | If disabled, | - | | | | ``default_partition`` | - | | | | parameter | - | | | | must be | - | | | | provided. | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``default_partition`` | int | ``0`` | The partition | - | | | | ID to place | - | (required if | | | operations/modules | - | auto_partition if false) | | | that are not | - | | | | placed in any | - | | | | ``smp.partition`` | - | | | | contexts. | - +---------------------------+-------------------------+-------------------+-----------------------+ - - -.. rubric:: TensorFlow-specific parameters - -.. table:: - :widths: 10 20 10 60 +Amazon SageMaker's TensorFlow and PyTorch estimator objects contain a ``distribution`` parameter, +which you can use to enable and specify parameters for SageMaker distributed training. +The SageMaker model parallel library internally uses MPI. +To use model parallelism, both ``smdistributed`` and MPI must be enabled +through the ``distribution`` parameter. + +.. tip:: + + This page provides you a complete list of parameters you can use + when you construct a SageMaker estimator and configure for distributed training. + + To find examples of how to construct a SageMaker estimator with the distributed training parameters, see + `Launch a SageMaker Distributed Model Parallel Training Job `_ + in the `SageMaker's Distributed Model Parallel developer guide `_. + +.. contents:: Table of Contents + :depth: 3 + :local: + +Parameters for ``smdistributed`` +---------------------------------- - +----------------+-------------------------+-------------+-----------------+ - | **Parameter** | **Type / Valid values** | **Default** | **Description** | - | | | | | - +----------------+-------------------------+-------------+-----------------+ - | ``contiguous`` | bool | ``True`` | Whether the | - | | | | model | - | | | | partitions | - | | | | should be | - | | | | contiguous. If | - | | | | true, each | - | | | | partition forms | - | | | | a connected | - | | | | component in | - | | | | the | - | | | | computational | - | | | | graph, unless | - | | | | the graph | - | | | | itself is not | - | | | | connected. | - +----------------+-------------------------+-------------+-----------------+ - | ``horovod`` | bool | ``False`` | Must be set to | - | | | | ``True`` if | - | | | | hybrid | - | | | | model/data | - | | | | parallelism is | - | | | | used and the | - | | | | data | - | | | | parallelism | - | | | | (DP) framework | - | | | | is Horovod. | - +----------------+-------------------------+-------------+-----------------+ - -.. rubric:: PyTorch-specific parameters - -.. table:: +You can use the following parameters to initialize the library +configuring a dictionary for ``modelparallel``, which goes +into the ``smdistributed`` option for the ``distribution`` parameter. 
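+
+For example, a ``distribution`` dictionary with this structure might look like the
+following sketch. The parameter values and the number of processes per host shown
+here are illustrative placeholders, not recommendations:
+
+.. code:: python
+
+   smp_options = {
+       "enabled": True,
+       "parameters": {
+           "partitions": 2,     # pipeline_parallel_degree for PyTorch with the library >= v1.6
+           "microbatches": 4,
+       },
+   }
+
+   mpi_options = {
+       "enabled": True,
+       "processes_per_host": 8,
+   }
+
+   distribution = {
+       "smdistributed": {"modelparallel": smp_options},
+       "mpi": mpi_options,
+   }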
+
+.. note::
+
+   ``partitions`` for TensorFlow and ``pipeline_parallel_degree`` for PyTorch are required parameters.
+   All other parameters in the following
+   table are optional.
+
+Common Parameters
+~~~~~~~~~~~~~~~~~
+
+.. list-table::
+   :widths: 10 20 10 60
+   :header-rows: 1
+
+   * - Parameter
+     - Type / Valid values
+     - Default
+     - Description
+   * - ``partitions`` for TensorFlow, or ``pipeline_parallel_degree`` for PyTorch (**smdistributed-modelparallel**>=v1.6)
+     - int
+     -
+     - **Required.** The number of partitions to split the model into.
+       In case of ``pipeline_parallel_degree`` for PyTorch, this is the number of devices
+       over which pipeline parallelism will be performed.
+   * - ``microbatches``
+     - int
+     - 1
+     - The number of microbatches to perform pipelining over. 1 means no pipelining.
+       Batch size must be divisible by the number of microbatches.
+   * - ``pipeline``
+     - ``"interleaved"`` or ``"simple"``
+     - ``"interleaved"``
+     - The pipeline schedule.
+   * - ``optimize``
+     - ``"memory"`` or ``"speed"``
+     - ``"memory"``
+     - Determines the distribution mechanism of transformer layers.
+       If optimizing for ``speed``, there will be less communication across tensor-parallel ranks
+       and layer normalization will not be distributed. However, there will be duplicate activations
+       stored across tensor-parallel ranks.
+       If optimizing for ``memory``, there will be no redundant activations stored,
+       but this will result in more communication overhead across tensor-parallel ranks.
+   * - ``placement_strategy``
+     - ``"cluster"``, ``"spread"``, or a permutation of the letters ``D``, ``P``, and ``T``
+     - ``"cluster"``
+     - Determines the mapping of model partitions onto physical devices.
+       When hybrid model/data parallelism is used, ``cluster`` places a single model replica in
+       neighboring device IDs, whereas ``spread`` places a model replica as far apart as possible.
+       For more information, see :ref:`ranking-basics`.
+
+       When a permutation string is used, ``D`` stands for reduced-data parallelism,
+       ``P`` stands for pipeline parallelism,
+       and ``T`` stands for tensor parallelism.
+       ``spread`` is equivalent to ``"TPD"``, and ``cluster`` is equivalent to ``"DPT"``.
+       For more information, see :ref:`ranking-basics-tensor-parallelism`.
+
+       Note: For TensorFlow, tensor parallelism is not implemented, and the
+       only available values are ``"spread"`` and ``"cluster"``.
+   * - ``auto_partition``
+     - bool
+     - ``True``
+     - Enable auto-partitioning. If disabled, the ``default_partition`` parameter must be provided.
+   * - ``default_partition``
+     - int
+     - ``0``
+     - **Required** if ``auto_partition`` is false. The partition ID to place operations/modules
+       that are not placed in any ``smp.partition`` contexts.
+
+TensorFlow-specific Parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. list-table::
+   :widths: 10 20 10 60
+   :header-rows: 1
+
+   * - Parameter
+     - Type / Valid values
+     - Default
+     - Description
+   * - ``contiguous``
+     - bool
+     - ``True``
+     - Whether the model partitions should be contiguous. If true, each partition forms a connected component in the computational graph, unless the graph itself is not connected.
+   * - ``horovod``
+     - bool
+     - ``False``
+     - Must be set to ``True`` if hybrid model/data parallelism is used and the data parallelism (DP) framework is Horovod.
+
+
+PyTorch-specific Parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. list-table::
+   :widths: 10 20 10 60
+   :header-rows: 1
+
+   * - Parameter
+     - Type / Valid values
+     - Default
+     - Description
+   * - ``memory_weight``
+     - float [0.0, 1.0]
+     - ``0.2`` if ``optimize`` is ``"speed"``, else ``0.8``
+     - The weight of memory balancing in the auto-partitioning objective, as opposed to balancing computational load. If 0.0, the library only tries to balance computation; if 1.0, the library only tries to balance memory use. Any value in between interpolates between these extremes.
+   * - ``ddp``
+     - bool
+     - ``False``
+     - Must be set to ``True`` if hybrid model/data parallelism is used with ``DistributedDataParallel``. ``DistributedDataParallel`` is used with the NCCL backend, and uses the ``MASTER_PORT`` provided by SageMaker.
+   * - ``active_microbatches`` (**smdistributed-modelparallel**>=v1.3)
+     - int
+     - ``partitions`` + 2
+     - The maximum number of microbatches that are simultaneously in execution during pipelining. Jointly scaling batch size and number of microbatches can often mitigate the pipeline bubble overhead, but that can lead to increased memory usage if too many microbatches are simultaneously in execution. In such cases, setting the number of active microbatches to a lower number can help control memory usage. By default, this is set to two plus the number of partitions of the model.
+   * - ``deterministic_server`` (**smdistributed-modelparallel**>=v1.3)
+     - bool
+     - ``False``
+     - Setting this to true ensures that the execution server for pipelining executes requests in the same order across all data parallel ranks.
+   * - ``offload_activations`` (**smdistributed-modelparallel**>=v1.6)
+     - bool
+     - False
+     - Enables activation offloading. To improve GPU memory usage, use activation offloading only when (1) ``microbatches`` and ``active_microbatches`` are greater than 1, and (2) activation checkpointing is enabled for at least one module in the model.
+   * - ``activation_loading_horizon`` (**smdistributed-modelparallel**>=v1.6)
+     - int
+     - 4
+     - The activation loading horizon, expressed in number of pipeline tasks. This determines how early offloaded activations should be loaded back to the GPU. A smaller value indicates that activations are loaded closer in time to when they are needed for the backward pass. Setting this value too small might improve memory usage, but might potentially cause throughput loss and GPU bottlenecks during the CPU-to-GPU data transfer.
+   * - ``tensor_parallel_degree`` (**smdistributed-modelparallel**>=v1.6)
+     - int
+     - 1
+     - The number of devices over which the tensor parallel modules will be distributed. If ``tensor_parallel_degree`` is greater than 1, then ``ddp`` must be set to ``True``.
+   * - ``fp16_params`` (**smdistributed-modelparallel**>=v1.6)
+     - bool
+     - ``False``
+     - If ``True``, the parameters of the distributed modules will be initialized in FP16.
+   * - ``shard_optimizer_state`` (**smdistributed-modelparallel**>=v1.6)
+     - bool
+     - ``False``
+     - If ``True``, the library shards the optimizer state of all parameters across the data parallel processes which hold the same parameter. This optimizer state sharding happens in a balanced manner. Note that when sharding optimizer state, full optimizer saving is not currently supported. Please save partial optimizer state. For more information about saving and loading checkpoints with optimizer state sharding, see `Instructions for Checkpointing with Tensor Parallelism `_.
+ * - ``prescaled_batch`` (**smdistributed-modelparallel**>=v1.6) + - bool + - ``False`` + - If ``True`` and when ``smp.nn.DistributedTransformerLMHead`` is used + (this is typically used for GPT-2 or GPT-3 models), + the library assumes that the devices in the same tensor parallelism group + receive the same input data. Otherwise, it is assumed that they receive + different examples. To learn more, see :ref:`prescaled-batch`. + * - ``skip_tracing`` (**smdistributed-modelparallel**>=v1.6) + - bool + - False + - Skips the initial tracing step. This can be useful in very large models + where even model tracing at the CPU is not possible due to memory constraints. + + +Parameters for ``mpi`` +---------------------- - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | **Parameter** | **Type / Valid values** | **Default** | **Description** | - | | | | | - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | ``memory_weight`` | float (between | 0.2 if | The weight of | - | | 0.0 and 1.0) | ``optimize`` is | memory | - | | | ``"speed"``, | balancing in | - | | | else 0.8 | the | - | | | | auto-partitioni | - | | | | ng | - | | | | objective, as | - | | | | opposed to | - | | | | balancing | - | | | | computational | - | | | | load. If 0.0, | - | | | | the library only tries | - | | | | to balance | - | | | | computation; if | - | | | | 1.0 the library only | - | | | | tries to | - | | | | balance the | - | | | | memory use. Any | - | | | | value in | - | | | | between | - | | | | interpolates | - | | | | between these | - | | | | extremes. | - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | ``ddp`` | bool | ``False`` | Must be set to | - | | | | ``True`` if | - | | | | hybrid | - | | | | model/data | - | | | | parallelism is | - | | | | used | - | | | | with ``DistributedDataParallel``. | - | | | | ``DistributedDataParallel`` | - | | | | is used with | - | | | | NCCL backend, | - | | | | and uses the | - | | | | ``MASTER_PORT`` | - | | | | provided by | - | | | | SageMaker. | - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | ``active_microbatches`` | int | ``partitions`` + 2 | This is the maximum number of | - | (Only >= v1.3) | | | microbatches that are simultaneously | - | | | | in execution during pipelining. | - | | | | Jointly scaling batch | - | | | | size and number of microbatches | - | | | | can often mitigate the pipeline | - | | | | bubble overhead, but that can | - | | | | lead to increased memory usage | - | | | | if too many microbatches are | - | | | | simultaneously in execution. | - | | | | In such cases setting the | - | | | | number of active | - | | | | microbatches to a lower number | - | | | | can help control memory usage. | - | | | | By default this is set to two | - | | | | plus the number of | - | | | | partitions of the model. | - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | ``deterministic_server`` | bool | ``False`` | Setting this to true | - | (Only >= v1.3) | | | ensures that the execution | - | | | | server for pipelining | - | | | | executes requests in the | - | | | | same order across all | - | | | | data parallel ranks. 
| - +--------------------------+-------------------------+--------------------+--------------------------------------+ - - -``mpi`` Parameters -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ For the ``"mpi"`` key, a dict must be passed which contains: * ``"enabled"``: Set to ``True`` to launch the training job with MPI. @@ -369,8 +243,8 @@ For the ``"mpi"`` key, a dict must be passed which contains: .. _ranking-basics: -Ranking Basics -============== +Ranking Basics without Tensor Parallelism +========================================= The library maintains a one-to-one mapping between processes and available GPUs: for each GPU, there is a corresponding CPU process. Each CPU process @@ -387,27 +261,136 @@ launched in the instance. For instance, in the preceding example, ``local_rank``\ s of the processes will range from 0 to 7, since there are 8 GPUs in a ``p3dn.24xlarge`` instance. -When the library is used together with data parallelism (Horovod for TensorFlow +When model parallelism is used together with data parallelism (Horovod for TensorFlow and DDP for PyTorch), the library partitions the set of processes into disjoint \ ``mp_group``\ s. An ``mp_group`` is a subset of all processes -that together hold a single, partitioned model replica. For instance, if -a single node job is launched with 8 local processes, and -``partitions`` is 2 (meaning the model will be split into 2), there are +that together hold a single, partitioned model replica. + +For instance, if +a single node job is launched with 8 local processes with +``partitions=2`` (meaning the model will be split into 2), there are four \ ``mp_group``\ s. The specific sets of processes that form the -``mp_group``\ s can be adjusted by the ``placement_strategy`` option. In -this example, if ``placement_strategy`` is ``spread``, then the four -``mp_group``\ s are ``[0, 4], [1, 5], [2, 6], [3, 7]``. An -``mp_rank`` is the rank of a process within its own ``mp_group``. In the -previous example, the ``mp_rank`` of process 1 is 0, and ``mp_rank`` of -process 6 is 1. - -Analogously, the library defines ``dp_group``\ s as the sets of processes that -all hold the same model partition, and perform data parallelism among -each other. In the example above, there are two ``dp_group``\ s, -``[0, 1, 2, 3]`` and ``[4, 5, 6, 7]``, - -since each process within the ``dp_group`` holds the same partition of -the model, and makes allreduce calls among themselves. Allreduce for -data parallelism does not take place *across* ``dp_group``\ s. -``dp_rank`` is defined as the rank of a process within its ``dp_group``. -In the preceding example, the \ ``dp_rank`` of process 6 is 2. +``mp_group``\ s can be adjusted by the ``placement_strategy`` option. + +- If ``placement_strategy`` is ``spread``, then the four + ``mp_group``\ s are ``[0, 4], [1, 5], [2, 6], [3, 7]``. The + ``mp_rank`` is the rank of a process within each ``mp_group``. For example, + the ``mp_rank`` is 0 for the processes 0, 1, 2, and 3, and the ``mp_rank`` is 1 for + the processes 4, 5, 6, and 7. + + Analogously, the library defines ``dp_group``\ s as sets of processes that + all hold the same model partition, and perform data parallelism among + each other. If ``placement_strategy`` is ``spread``, there are two ``dp_group``\ s: + ``[0, 1, 2, 3]`` and ``[4, 5, 6, 7]``. + + Since each process within the ``dp_group`` holds the same partition of + the model, and makes allreduce calls among themselves. Allreduce for + data parallelism does not take place *across* ``dp_group``\ s. 
``dp_rank`` is defined as the rank of a process within its ``dp_group``.
+  In the preceding example, the \ ``dp_rank`` of process 6 is 2.
+
+- If ``placement_strategy`` is ``cluster``, the four ``mp_group``\ s
+  become ``[0, 1], [2, 3], [4, 5], [6, 7]``, and the two ``dp_group``\ s become
+  ``[0, 2, 4, 6]`` and ``[1, 3, 5, 7]``.
+
+.. _ranking-basics-tensor-parallelism:
+
+Placement Strategy with Tensor Parallelism
+==========================================
+
+In addition to the two placement strategies introduced in the previous section,
+the library accepts a permutation of three letters as a placement strategy when the
+extended tensor parallelism features for PyTorch are used. The letters denote the
+following parallelism types:
+
+- ``D`` stands for (reduced) data parallelism.
+- ``P`` stands for pipeline parallelism.
+- ``T`` stands for tensor parallelism.
+
+Given a permutation of these three letters, the library performs the parallelism type
+represented by the right-most letter over neighboring global ranks, in ascending order.
+Conversely, the parallelism type represented by the left-most letter is performed
+over the ranks that are as distant as possible.
+
+- **Example:** Given 8 devices with ``tp_size() == 2``,
+  ``pp_size() == 2``, ``rdp_size() == 2``
+
+  - ``placement_strategy: "DPT"`` gives
+
+    ==== ======== ======= =======
+    rank rdp_rank pp_rank tp_rank
+    ==== ======== ======= =======
+    0    0        0       0
+    1    0        0       1
+    2    0        1       0
+    3    0        1       1
+    4    1        0       0
+    5    1        0       1
+    6    1        1       0
+    7    1        1       1
+    ==== ======== ======= =======
+
+  - ``placement_strategy: "PTD"`` gives
+
+    ==== ======== ======= =======
+    rank rdp_rank pp_rank tp_rank
+    ==== ======== ======= =======
+    0    0        0       0
+    1    1        0       0
+    2    0        0       1
+    3    1        0       1
+    4    0        1       0
+    5    1        1       0
+    6    0        1       1
+    7    1        1       1
+    ==== ======== ======= =======
+
+Because the neighboring ranks are placed on the same instance with
+high-bandwidth NVLinks, it is recommended to place the
+parallelism type that has higher bandwidth requirements for your model
+on the right-most position in the ``placement_strategy`` string. Because
+tensor parallelism often requires frequent communication, placing
+``T`` in the right-most position is recommended (as in the default
+``"cluster"`` strategy). In many large models, keeping the default of
+``"cluster"`` would result in the best performance.
+
+
+.. _prescaled-batch:
+
+Prescaled Batch
+===============
+
+``prescaled_batch`` is a configuration parameter that can be useful for
+``DistributedTransformerLMHead``, which is typically used for GPT-2 and GPT-3 models.
+
+The way tensor parallelism works is that when a module is distributed,
+the inputs to the distributed module in different ``tp_rank``\ s get
+shuffled around in a way that is sliced by the hidden dimension and
+scaled by the batch dimension. For example, if the tensor parallel degree is
+8, the inputs to ``DistributedTransformer`` (a tensor with shape
+``[B, S, H]`` where ``B``\ =batch size, ``S``\ =sequence length,
+``H``\ =hidden width) in different ``tp_rank``\ s will be communicated
+around, and the shapes will become ``[8B, S, H/8]``. Each ``tp_rank``
+has the batch from all the peer ``tp_rank``\ s, but only the slice that
+interacts with its local partition of the module.
+
+By default, the library assumes that each ``tp_rank`` gets assigned a
+different batch, and performs the communication described above. If
+``prescaled_batch`` is true, then the library assumes that the input
+batch is already scaled (and is the same across the ``tp_rank``\ s), and
+only does the slicing.
In the example above, the library assumes that +input tensor has shape ``[8B, S, H]``, and only converts it into +``[8B, S, H/8]``. So if ``prescaled_batch`` is true, it is the user’s +responsibility to feed the same batch to the ``tp_rank``\ s in the same +``TP_GROUP``. This can be done by doing the data sharding based on +``smp.rdp_size()`` and ``smp.rdp_rank()``, instead of ``smp.dp_size()`` +and ``smp.dp_rank()``. When ``prescaled_batch`` is true, the global +batch size is ``smp.rdp_size()`` multiplied by the per-``MP_GROUP`` +batch size. When ``prescaled_batch`` is false, global batch size is +``smp.dp_size()`` multiplied by the per-``PP_GROUP`` batch size. + +If you use pipeline parallelism degree 1, then you can keep +``prescaled_batch`` false (the default option). If you use a pipeline +parallellism degree more than 1, it is recommended to use +``prescaled_batch`` true, so that you can increase per-``MP_GROUP`` +batch size for efficient pipelining, without running into out-of-memory +issues. diff --git a/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst b/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst index 249a38573e..feed17a101 100644 --- a/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst +++ b/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst @@ -1,6 +1,67 @@ -Sagemaker Distributed Model Parallel 1.4.0 Release Notes +Sagemaker Distributed Model Parallel 1.6.0 Release Notes ======================================================== +*Date: December. 20. 2021* + +**New Features** + +- **PyTorch** + + - Added extended memory-saving features for PyTorch 1.8.1: + + - Tensor parallelism + - Optimizer state sharding + - Activation checkpointing + - Activation offloading + + For more information, see the following documentation: + + - `SageMaker distributed model parallel developer guide `_ + - `SageMaker distributed model parallel API documentation for v1.6.0 `_ + +**Migration to AWS Deep Learning Containers** + +This version passed benchmark testing and is migrated to the following +AWS Deep Learning Container(s): + +- Deep Learning Container for PyTorch 1.8.1: + + .. code:: + + 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04 + +---- + +Release History +=============== + +Sagemaker Distributed Model Parallel 1.5.0 Release Notes +-------------------------------------------------------- + +*Date: November. 03. 2021* + +**New Features** + +- **PyTorch** + + - Currency update for PyTorch 1.10.0 + +**Migration to AWS Deep Learning Containers** + +This version passed benchmark testing and is migrated to the following +AWS Deep Learning Containers: + +- Deep Learning Container for PyTorch 1.10.0: + + .. code:: + + 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.10.0-gpu-py38-cu113-ubuntu20.04-sagemaker + +---- + +Sagemaker Distributed Model Parallel 1.4.0 Release Notes +-------------------------------------------------------- + *Date: June. 29. 2021* **New Features** @@ -15,17 +76,19 @@ Sagemaker Distributed Model Parallel 1.4.0 Release Notes This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers: -- TensorFlow 2.5.0 DLC release: `v1.0-tf-2.5.0-tr-py37 - `__ +- Deep Learning Container for TensorFlow 2.5.0: .. 
code:: 763104351884.dkr.ecr..amazonaws.com/tensorflow-training:2.5.0-gpu-py37-cu112-ubuntu18.04-v1.0 ----- +- Deep Learning Container for PyTorch 1.9.1: -Release History -=============== + .. code:: + + 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.9.1-gpu-py38-cu111-ubuntu20.04 + +---- Sagemaker Distributed Model Parallel 1.3.1 Release Notes -------------------------------------------------------- diff --git a/doc/api/training/smp_versions/archives.rst b/doc/api/training/smp_versions/archives.rst new file mode 100644 index 0000000000..c1b3d55491 --- /dev/null +++ b/doc/api/training/smp_versions/archives.rst @@ -0,0 +1,10 @@ +.. _smdmp-pt-version-archive: + +.. toctree:: + :maxdepth: 1 + + v1_5_0.rst + v1_4_0.rst + v1_3_0.rst + v1_2_0.rst + v1_1_0.rst diff --git a/doc/api/training/smp_versions/latest.rst b/doc/api/training/smp_versions/latest.rst index c99975cd27..336fe7df87 100644 --- a/doc/api/training/smp_versions/latest.rst +++ b/doc/api/training/smp_versions/latest.rst @@ -1,5 +1,16 @@ +############################################### +Use the Library's API to Adapt Training Scripts +############################################### -Version 1.4.0 (Latest) +The library provides Common APIs that you can use across frameworks, +as well as framework-specific APIs for TensorFlow and PyTorch. + +Select the latest or one of the previous versions of the API documentation +depending on which version of the library you need to use. +To use the library, reference the +**Common API** documentation alongside the framework specific API documentation. + +Version 1.6.0 (Latest) ====================== To use the library, reference the Common API documentation alongside the framework specific API documentation. @@ -9,4 +20,17 @@ To use the library, reference the Common API documentation alongside the framewo latest/smd_model_parallel_common_api latest/smd_model_parallel_pytorch + latest/smd_model_parallel_pytorch_tensor_parallel latest/smd_model_parallel_tensorflow + +To find archived API documentation for the previous versions of the library, +see the following link: + + +Documentation Archive +===================== + +.. toctree:: + :maxdepth: 1 + + archives diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst index 82ef6c6df0..d1f6b4d45b 100644 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst +++ b/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst @@ -1,14 +1,16 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - Common API ========== The following SageMaker distribute model parallel APIs are common across all frameworks. -**Important**: This API document assumes you use the following import statement in your training scripts. +.. contents:: Table of Contents + :depth: 3 + :local: + +The Library's Core APIs +----------------------- + +This API document assumes you use the following import statement in your training scripts. **TensorFlow** @@ -254,30 +256,78 @@ The following SageMaker distribute model parallel APIs are common across all fra .. _mpi_basics: MPI Basics -^^^^^^^^^^ +---------- The library exposes the following basic MPI primitives to its Python API: -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. 
-- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - - .. _communication_api: +**Global** + +- ``smp.rank()`` : The global rank of the current process. +- ``smp.size()`` : The total number of processes. +- ``smp.get_world_process_group()`` : + ``torch.distributed.ProcessGroup`` that contains all processes. +- ``smp.CommGroup.WORLD``: The communication group corresponding to all processes. +- ``smp.local_rank()``: The rank among the processes on the current instance. +- ``smp.local_size()``: The total number of processes on the current instance. +- ``smp.get_mp_group()``: The list of ranks over which the current model replica is partitioned. +- ``smp.get_dp_group()``: The list of ranks that hold different replicas of the same model partition. + +**Tensor Parallelism** + +- ``smp.tp_rank()`` : The rank of the process within its + tensor-parallelism group. +- ``smp.tp_size()`` : The size of the tensor-parallelism group. +- ``smp.get_tp_process_group()`` : Equivalent to + ``torch.distributed.ProcessGroup`` that contains the processes in the + current tensor-parallelism group. +- ``smp.CommGroup.TP_GROUP`` : The communication group corresponding to + the current tensor parallelism group. + +**Pipeline Parallelism** + +- ``smp.pp_rank()`` : The rank of the process within its + pipeline-parallelism group. +- ``smp.pp_size()`` : The size of the pipeline-parallelism group. +- ``smp.get_pp_process_group()`` : ``torch.distributed.ProcessGroup`` + that contains the processes in the current pipeline-parallelism group. +- ``smp.CommGroup.PP_GROUP`` : The communication group corresponding to + the current pipeline parallelism group. + +**Reduced-Data Parallelism** + +- ``smp.rdp_rank()`` : The rank of the process within its + reduced-data-parallelism group. +- ``smp.rdp_size()`` : The size of the reduced-data-parallelism group. +- ``smp.get_rdp_process_group()`` : ``torch.distributed.ProcessGroup`` + that contains the processes in the current reduced data parallelism + group. +- ``smp.CommGroup.RDP_GROUP`` : The communication group corresponding + to the current reduced data parallelism group. + +**Model Parallelism** + +- ``smp.mp_rank()`` : The rank of the process within its model-parallelism + group. +- ``smp.mp_size()`` : The size of the model-parallelism group. +- ``smp.get_mp_process_group()`` : ``torch.distributed.ProcessGroup`` + that contains the processes in the current model-parallelism group. +- ``smp.CommGroup.MP_GROUP`` : The communication group corresponding to + the current model parallelism group. + +**Data Parallelism** + +- ``smp.dp_rank()`` : The rank of the process within its data-parallelism + group. +- ``smp.dp_size()`` : The size of the data-parallelism group. +- ``smp.get_dp_process_group()`` : ``torch.distributed.ProcessGroup`` + that contains the processes in the current data-parallelism group. +- ``smp.CommGroup.DP_GROUP`` : The communication group corresponding to + the current data-parallelism group. + +.. 
_communication_api: Communication API -^^^^^^^^^^^^^^^^^ +----------------- The library provides a few communication primitives which can be helpful while developing the training script. These primitives use the following diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst index 6e98e7fc66..3ca65c17cb 100644 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst +++ b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst @@ -1,14 +1,8 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - PyTorch API =========== -**Supported versions: 1.7.1, 1.8.1** - -This API document assumes you use the following import statements in your training scripts. +To use the PyTorch-specific APIs for SageMaker distributed model parallism, +you need to add the following import statement at the top of your training script. .. code:: python @@ -19,10 +13,10 @@ This API document assumes you use the following import statements in your traini Refer to `Modify a PyTorch Training Script - `_ + `_ to learn how to use the following API in your PyTorch training script. -.. class:: smp.DistributedModel +.. py:class:: smp.DistributedModel() A sub-class of ``torch.nn.Module`` which specifies the model to be partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is @@ -42,7 +36,6 @@ This API document assumes you use the following import statements in your traini is \ ``model``) can only be made inside a ``smp.step``-decorated function. - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can be performed by calling the \ ``DistributedModel`` object on the input tensors. @@ -56,7 +49,6 @@ This API document assumes you use the following import statements in your traini arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` or ``torch.autograd.backward``. - The API for ``model.backward`` is very similar to ``torch.autograd.backward``. For example, the following ``backward`` calls: @@ -90,7 +82,7 @@ This API document assumes you use the following import statements in your traini **Using DDP** - If DDP is enabled, do not not place a PyTorch + If DDP is enabled with the SageMaker model parallel library, do not not place a PyTorch ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because the ``DistributedModel`` wrapper will also handle data parallelism. @@ -284,6 +276,113 @@ This API document assumes you use the following import statements in your traini `register_comm_hook `__ in the PyTorch documentation. + **Behavior of** ``smp.DistributedModel`` **with Tensor Parallelism** + + When a model is wrapped by ``smp.DistributedModel``, the library + immediately traverses the modules of the model object, and replaces the + modules that are supported for tensor parallelism with their distributed + counterparts. This replacement happens in place. If there are no other + references to the original modules in the script, they are + garbage-collected. The module attributes that previously referred to the + original submodules now refer to the distributed versions of those + submodules. + + **Example:** + + .. 
code:: python + + # register DistributedSubmodule as the distributed version of Submodule + # (note this is a hypothetical example, smp.nn.DistributedSubmodule does not exist) + smp.tp_register_with_module(Submodule, smp.nn.DistributedSubmodule) + + class MyModule(nn.Module): + def __init__(self): + ... + + self.submodule = Submodule() + ... + + # enabling tensor parallelism for the entire model + with smp.tensor_parallelism(): + model = MyModule() + + # here model.submodule is still a Submodule object + assert isinstance(model.submodule, Submodule) + + model = smp.DistributedModel(model) + + # now model.submodule is replaced with an equivalent instance + # of smp.nn.DistributedSubmodule + assert isinstance(model.module.submodule, smp.nn.DistributedSubmodule) + + If ``pipeline_parallel_degree`` (equivalently, ``partitions``) is 1, the + placement of model partitions into GPUs and the initial broadcast of + model parameters and buffers across data-parallel ranks take place + immediately. This is because it does not need to wait for the model + partition when ``smp.DistributedModel`` wrapper is called. For other + cases with ``pipeline_parallel_degree`` greater than 1, the broadcast + and device placement will be deferred until the first call of an + ``smp.step``-decorated function happens. This is because the first + ``smp.step``-decorated function call is when the model partitioning + happens if pipeline parallelism is enabled. + + Because of the module replacement during the ``smp.DistributedModel`` + call, any ``load_state_dict`` calls on the model, as well as any direct + access to model parameters, such as during the optimizer creation, + should be done **after** the ``smp.DistributedModel`` call. + + Since the broadcast of the model parameters and buffers happens + immediately during ``smp.DistributedModel`` call when the degree of + pipeline parallelism is 1, using ``@smp.step`` decorators is not + required when tensor parallelism is used by itself (without pipeline + parallelism). + + For more information about the library's tensor parallelism APIs for PyTorch, + see :ref:`smdmp-pytorch-tensor-parallel`. + + **Additional Methods of** ``smp.DistributedModel`` **for Tensor Parallelism** + + The following are the new methods of ``smp.DistributedModel``, in + addition to the ones listed in the + `documentation `__. + + .. function:: distributed_modules() + + - An iterator that runs over the set of distributed + (tensor-parallelized) modules in the model + + .. function:: is_distributed_parameter(param) + + - Returns ``True`` if the given ``nn.Parameter`` is distributed over + tensor-parallel ranks. + + .. function:: is_distributed_buffer(buf) + + - Returns ``True`` if the given buffer is distributed over + tensor-parallel ranks. + + .. function:: is_scaled_batch_parameter(param) + + - Returns ``True`` if the given ``nn.Parameter`` is operates on the + scaled batch (batch over the entire ``TP_GROUP``, and not only the + local batch). + + .. function:: is_scaled_batch_buffer(buf) + + - Returns ``True`` if the parameter corresponding to the given + buffer operates on the scaled batch (batch over the entire + ``TP_GROUP``, and not only the local batch). + + .. function:: default_reducer_named_parameters() + + - Returns an iterator that runs over ``(name, param)`` tuples, for + ``param`` that is allreduced over the ``DP_GROUP``. + + .. 
function:: scaled_batch_reducer_named_parameters() + + - Returns an iterator that runs over ``(name, param)`` tuples, for + ``param`` that is allreduced over the ``RDP_GROUP``. + .. class:: smp.DistributedOptimizer diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst new file mode 100644 index 0000000000..413fc7cc46 --- /dev/null +++ b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst @@ -0,0 +1,835 @@ +.. _smdmp-pytorch-tensor-parallel: + +PyTorch API for Tensor Parallelism +================================== + +SageMaker distributed tensor parallelism works by replacing specific submodules +in the model with their distributed implementations. The distributed modules +have their parameters and optimizer states partitioned across tensor-parallel +ranks. This is to compute the same output as it would have been computed by +the original modules. Since tensor parallelism occurs across data-parallel +ranks, a rank might collect slices of the activations corresponding to the +data shards on other devices that are part of the same tensor parallelism group. + +You can enable or disable tensor parallelism for specific parts of the model. +Within the enabled parts, the replacements with distributed modules will take +place on a best-effort basis for those module supported for tensor parallelism. +Alternatively, you can directly import and use the library’s distributed +modules in the model definition. + +Some of the supported modules (such as ``smp.nn.Transformer``) are high-level +blocks that contain many operations. Because custom implementations +(as opposed to the built-in PyTorch modules) are typically used for these +high-level blocks, the library offers an API that you can use to register +specific distributed versions with such custom modules (provided that they +are functionally equivalent). This allows the library to automatically replace +the occurrences of such PyTorch modules with their distributed counterparts +provided by the library. +For more information, see the following topics. + +.. contents:: Topics + :depth: 3 + :local: + +.. _registering-tp-modules: + +Registering Tensor Parallelism Distributed Modules +-------------------------------------------------- + +Although PyTorch natively provides some of the commonly used (and +tensor-parallelizable) building blocks such as Transformer, users often +use custom implementations for such higher-level modules. To distribute +such modules with tensor parallelism, you need to register the +distributed modules to the custom module implementation in your class, +so that the library knows how to distribute the custom module. When you +register the distributed modules, make sure the custom module that you +use is functionally equivalent to the distributed module. You can verify +this by taking a look at the equivalent reference implementations in the +:ref:`smdmp-tp-appendix`. +These implementations are functionally equivalent to their distributed +versions in ``smp.nn`` module. + +.. decorator:: @smp.tp_register(dist_module, init_hook=None, forward_hook=None, return_hook=None) + + - A class decorator that registers the ``dist_module`` class with + the module class that it is attached to. The hooks can be used to + adapt to different interfaces used with ``__init__`` and + ``forward`` methods. 
+ - **Arguments:** + + - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` + that implements the distributed version of the module class the + decorator is attached to. Any distributed module class defined + in ``smp.nn`` module can be used. + - ``init_hook``: A callable that translates the arguments of the + original module ``__init__`` method to an ``(args, kwargs)`` + tuple compatible with the arguments of the corresponding + distributed module ``__init__`` method. Must return a tuple, + whose first element is an iterable representing the positional + arguments, and second element is a ``dict`` representing the + keyword arguments. The input signature of the ``init_hook`` + must **exactly** match the signature of the original + ``__init__`` method (including argument order and default + values), except it must exclude ``self``. + - ``forward_hook``: A callable that translates the arguments of + the original module ``forward`` method to an ``(args, kwargs)`` + tuple compatible with the arguments of the corresponding + distributed module ``forward`` method. Must return a tuple, + whose first element is an iterable representing the positional + arguments, and second element is a ``dict`` representing the + keyword arguments. The input signature of the ``init_hook`` + must **exactly** match the signature of the original + ``forward`` method (including argument order and default + values), except it must exclude ``self``. + - ``return_hook``: A callable that translates the object returned + from the distributed module to the return object expected of + the original module. + + - **Example:** + + .. code:: python + + init_hook = lambda config: ((), config.to_dict()) + + # register smp.nn.DistributedTransformer + # as the distributed version of MyTransformer + @smp.tp_register(smp.nn.DistributedTransformer, init_hook=init_hook) + class MyTransformer(nn.Module): + def __init__(self, config): + ... + + def forward(self, hidden_states, attention_mask): + ... + +.. function:: smp.tp_register_with_module(module_cls, dist_module, init_hook=None, forward_hook=None, return_hook=None) + + - When you do not have direct access to model definition code, you + can use this API to similarly register a distributed module with + an existing module class. + + - **Arguments:** + + - ``module_cls``: The existing module class that will be + distributed. + - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` + that implements the distributed version of the module class the + decorator is attached to. Any distributed module class defined + in ``smp.nn`` module can be used. + - ``init_hook``: A callable that translates the arguments of the + original module ``__init__`` method to an ``(args, kwargs)`` + tuple compatible with the arguments of the corresponding + distributed module ``__init__`` method. Must return a tuple, + whose first element is an iterable representing the positional + arguments, and second element is a ``dict`` representing the + keyword arguments. The input signature of the ``init_hook`` + must **exactly** match the signature of the original + ``__init__`` method (including argument order and default + values), except it must exclude ``self``. + - ``forward_hook``: A callable that translates the arguments of + the original module ``forward`` method to an ``(args, kwargs)`` + tuple compatible with the arguments of the corresponding + distributed module ``forward`` method. 
Must return a tuple, + whose first element is an iterable representing the positional + arguments, and second element is a ``dict`` representing the + keyword arguments. The input signature of the ``init_hook`` + must **exactly** match the signature of the original + ``forward`` method (including argument order and default + values), except it must exclude ``self``. + - ``return_hook``: A callable that translates the object returned + from the distributed module to the return object expected of + the original module. + + - **Example:** + + .. code:: python + + from somelibrary import MyTransformer + + init_hook = lambda config: ((), config.to_dict()) + + # register smp.nn.DistributedTransformer as the distributed version of MyTransformer + smp.tp_register_with_module(MyTransformer, + smp.nn.DistributedTransformer, + init_hook=init_hook) + +.. _smdmp-supported-modules-for-tp: + +Supported Modules for Tensor Parallelism +---------------------------------------- + +The following modules are supported for tensor +parallelism. + +- ``smp.nn.DistributedLinear`` (implements ``nn.Linear``) +- ``smp.nn.DistributedTransformerLMHead`` +- ``smp.nn.DistributedTransformer`` +- ``smp.nn.DistributedTransformerLayer`` +- ``smp.nn.DistributedAttentionLayer`` +- ``smp.nn.DistributedTransformerOutputLayer`` +- ``smp.nn.DistributedEmbedding`` + +.. contents:: Topics + :depth: 3 + :local: + +.. _tp-module-api: + +Tensor Parallelism Module APIs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. class:: smp.nn.DistributedLinear(in_features, out_features) + + - Tensor-parallel implementation of the ``nn.Linear`` class. + Functionally equivalent to an ``nn.Linear`` module with the same + ``in_features`` and ``out_features``. In other words, + ``in_features`` and ``out_features`` are the number of *global* + channels across tensor-parallel ranks. + - **Arguments:** + + - ``in_features``: The total number of input channels for the + linear layer across all tensor-parallel ranks. + - ``out_features``: The total number of output channels for the + linear layer across all tensor-parallel ranks. + +.. class:: smp.nn.DistributedTransformerLMHead(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, vocab_size=30522, num_positions=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, num_token_types=0, causal_mask_size=None, add_cross_attention=False, add_lm_head=True, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) + + - Constructs a distributed transformer model, including embeddings + and a single LM head. A word embedding of size + ``(vocab_size, hidden_size)`` is created, as well as a positional + embedding of size ``(num_positions, hidden_size)``, and the + embeddings are added together. If ``num_token_types`` is larger + than 0, a separate embedding of size + ``(num_token_types, hidden_size)`` is created, and further added + on top. + - The embeddings are fed through a ``DistributedTransformer``, and + if ``add_lm_head`` is ``True``, the output passes through a single + LM head, which is a linear module without bias whose weight is + tied to the word embeddings. + - See ``DistributedTransformerLayer`` for a description of the rest + of the arguments. 
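+   - **Example:**
+
+     A minimal construction sketch; the hyperparameter values below are
+     illustrative only (they are not taken from the library documentation),
+     and ``smp.init()`` is assumed to have been called earlier in the script.
+
+     .. code:: python
+
+        import smdistributed.modelparallel.torch as smp
+
+        lm_head = smp.nn.DistributedTransformerLMHead(
+            num_layers=12,
+            num_attention_heads=12,
+            attention_head_size=64,
+            hidden_size=768,
+            intermediate_size=3072,
+            vocab_size=50257,
+            num_positions=1024,
+            causal_mask_size=1024,  # apply a causal mask, as in GPT-2-style models
+        )
+
+     The forward pass then takes a single tuple argument, with the layout
+     described under **Methods** below.
+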
+ - **Methods:** + + - ``forward(self, inputs)`` + + - If ``add_cross_attention`` is ``True``, ``inputs`` must be a + tuple + ``(input_ids, attention_mask, token_type_ids, position_ids, cross_states, cross_states, cross_mask, labels)``. + - Otherwise, ``inputs`` must be a tuple + ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``. + - If ``token_type_ids`` is ``None``, token type embedding will + not be used. + - ``input_ids`` is assumed to be of shape ``[N, S]``, where + ``N`` is the batch size and ``S`` is sequence length. + - ``attention_mask`` is assumed to be a 0-1 tensor of shape + ``[N, S]``, where 1 represents a masked position. + +.. class:: smp.nn.DistributedTransformer(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) + + - A sequence of ``smp.nn.DistributedTransformerLayer``\ s, whose + number is given by ``num_layers`` argument. For the other + arguments and methods, refer to + ``smp.nn.DistributedTransformerLayer``. + - If both ``pre_layernorm`` and ``post_layernorm`` are ``True``, + layer normalization is applied to both the input and the output of + the ``DistributedTransformer``, in addition to the intermediate + attention and transformer-output layers. + +.. class:: smp.nn.DistributedTransformerLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) + + - Tensor-parallel implementation of a single transformer layer. + Number of attention heads, hidden size, and intermediate size + refer to the global quantities across all tensor-parallel ranks. + - **Arguments:** + + - ``num_attention_heads``: The total number of attention heads + across tensor-parallel ranks + - ``attention_head_size``: The number of channels of a single + attention head. + - ``hidden_size``: The hidden dimension of the transformer. The + input tensor ``hidden_states`` is assumed to have its last + dimension size equal to ``hidden_size``. + - ``intermediate_size``: The number of output channels in the + first linear transformation of the transformer output layer. + ``DistributedTransformerOutputLayer`` first maps + ``hidden_size`` dimensions of its input tensor into + ``intermediate_size`` dimensions, and then maps it back into + ``hidden_size`` dimensions. + - ``attention_dropout_prob``: The dropout probability applied to + the attention probabilities. + - ``hidden_dropout_prob``: The dropout probability used in + dropout layers other than the one applied to the attention + probabilities. + - ``activation``: Choice of activation function to use at the + output layer. Must be ``"gelu"`` or ``"relu"``. + - ``layernorm_epsilon``: The epsilon added to the denominator of + layer normalization for numerical stability. + - ``initializer_range``: If ``use_normal_initialization`` is + ``True``, the standard deviation of the normal random variable + to initialize the weights with. 
+ - ``use_normal_initialization``: If ``True``, the weights are + initialized with normal distribution with standard deviation + given by ``initializer_range``. Otherwise, default PyTorch + initialization is used. + - ``causal_mask_size``: If ``None``, no causal mask is used on + attentions. Otherwise, should be set to maximum sequence length + to apply a causal mask to the attention scores. This is used, + for instance, in GPT-2. + - ``add_cross_attention``: If ``True``, a cross-attention layer + will be added after the self-attention block. The + cross-attention layer computes the attention keys and values + based on the ``cross_states`` input (instead of + ``hidden_states`` input, as in self-attention. This is used in + the decoder block of encoder-decoder architectures. For + encoder-only architectures that only use self-attention, this + should be kept ``False``. + - ``pre_layernorm``: If ``True``, inserts layer normalization at + the input. At least one of ``pre_layernorm`` and + ``post_layernorm`` must be ``True``. + - ``post_layernorm``: If ``True``, inserts layer normalization at + the output. At least one of ``pre_layernorm`` and + ``post_layernorm`` must be ``True``. + + - **Methods:** + + - ``forward(self, inputs)``: Forward pass for the transformer + layer. + + - **Arguments:** + + - If ``add_cross_attention=False``, ``inputs`` must be a + tuple ``(hidden_states, attention_mask)``, where + ``hidden_states`` is assumed to be a tensor of dimensions + ``[N, S, H]``, where ``N`` is batch size, ``S`` is + sequence length, and ``H`` is ``hidden_size``. + ``attention_mask`` is assumed to be a tensor of + dimensions ``[N, 1, 1, S]``, where ``N`` is the batch + size, and ``S`` is the sequence length. + - If ``add_cross_attention=True``, ``inputs`` must be a + tuple + ``(hidden_states, cross_states, attention_mask, cross_mask)``, + where ``hidden_states`` is assumed to be a tensor of + dimensions ``[N, S_1, H]``, where ``N`` is batch size, + ``S_1`` is sequence length, and ``H`` is ``hidden_size``. + ``cross_states`` is assumed to be a tensor of size + ``[N, S_2, H]``, similarly interpreted. + ``attention_mask`` is assumed to be a tensor of + dimensions ``[N, 1, 1, S_1]``, where ``N`` is the batch + size, and ``S_1`` is the sequence length, and + ``cross_mask`` is assumed to be a tensor of size + ``[N, 1, 1, S_2]``. Keys and values for the attention + heads in the cross-attention layer (but not the + self-attention layer) are computed using + ``cross_states``, and ``cross_mask`` is applied as the + attention mask in the cross-attention layer (but not the + self-attention layer). + + - **Returns:** + + - If ``add_cross_attention=False``, a tuple + ``(hidden_states, attention_mask)``, where + ``hidden_states`` is the output of the transformer, and + ``attention_mask`` is the same the ``attention_mask`` + argument. + - If ``add_cross_attention=True``, a tuple + ``(hidden_states, cross_states, attention_mask, cross_mask)``, + where ``hidden_states`` is the output of the transformer, + and the next three tensors are the same as the input + arguments. + +.. class:: smp.nn.DistributedAttentionLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, cross_attention=False, causal_mask_size=None, pre_layernorm=False, post_layernorm=True) + + - A distributed implementation for the attention block. 
Includes the + computation of the self- or cross-attention (context layer), + followed by a linear mapping and dropout, which is optionally + followed by the residual-connection and layer normalization. + - **Arguments:** + + - See ``DistributedTransformerLayer`` for a description of the + arguments. + - If ``cross_attention`` is ``True``, computes the attentions + with respect to the ``cross_states`` tensor of the ``forward`` + method input tuple. + + - **Methods:** + + - ``forward(self, inputs)``: Forward pass for the attention + layer. + + - **Arguments:** + + - If ``cross_attention=False``, ``inputs`` must be a tuple + ``(hidden_states, attention_mask)``, where + ``hidden_states`` is assumed to be a tensor of dimensions + ``[N, S, H]``, where ``N`` is batch size, ``S`` is + sequence length, and ``H`` is ``hidden_size``. + ``attention_mask`` is assumed to be a tensor of + dimensions ``[N, 1, 1, S]``, \***\* where ``N`` is the + batch size, and ``S`` is the sequence length. + - If ``cross_attention=True``, ``inputs`` must be a tuple + ``(hidden_states, cross_states, attention_mask)``, where + ``hidden_states`` is assumed to be a tensor of dimensions + ``[N, S_1, H]``, where ``N`` is batch size, ``S_1`` is + sequence length, and ``H`` is ``hidden_size``. + ``cross_states`` is assumed to be a tensor of size + ``[N, S_2, H]``, similarly interpreted. + ``attention_mask`` is assumed to be a tensor of + dimensions ``[N, 1, 1, S_2]``, where ``N`` is the batch + size, and ``S_2`` is the sequence length. Keys and values + for the attention heads are computed using + ``cross_states``. + + - **Returns:** + + - A single tensor that is the output of the attention + layer. + +.. class:: smp.nn.DistributedTransformerOutputLayer(hidden_size=1024, intermediate_size=4096, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) + + - Distributed implementation of a single transformer output layer. A + single ``DistributedTransformerLayer`` with + ``add_cross_attention=False`` consists of a single + ``DistributedAttentionLayer`` immediately followed by a single + ``DistributedTransformerOutputLayer``. The latter linearly maps + the last channel of the input tensor from ``hidden_size`` to + ``intermediate_size``, and then maps it back to ``hidden_size``. + - **Arguments:** + + - See ``DistributedTransformerLayer`` for a description of the + arguments. + +.. class:: smp.nn.DistributedEmbedding(num_embeddings,embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None, initializer_range=0.02, _skip_allgather=False,_skip_scatter_and_merge=False,) + + - Distributed implementation of a single Embedding Layer. Currently + only supports splitting across the embedding_dim. + - **Arguments:** + + - See ``DistributedEmbedding`` for a description of the + arguments. + +.. _enabling-tp: + +Enabling Tensor Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are two ways tensor parallelism can be enabled. + +First, you can use +the distributed module implementations in ``smp.nn`` module directly in +your model definition. See :ref:`smdmp-supported-modules-for-tp` +for a complete list of built-in distributed modules. Here is an example +of how this can be done: + +.. 
code:: python + + import torch.nn as nn + import smdistributed.modelparallel.torch as smp + + class TransformerModel: + def __init__(self): + self.embedding = nn.Embedding(vocab_size, hidden_size) + + # directly instantiate smp.nn.DistributedTransformer and use it + self.encoder = smp.nn.DistributedTransformer(num_layers, hidden_size, **kwargs) + + self.pooler = nn.Linear(hidden_size, hidden_size) + + def forward(self, hidden_states): + emb_out = self.embedding(hidden_states) + enc_out = self.encoder(emb_out) + return self.pooler(enc_out) + +Second, you can enable tensor parallelism for specific modules or blocks +of code, which will automatically enable tensor parallelism for the +supported modules within that scope. To do this, you can use the +following API: + +.. decorator:: smp.tensor_parallelism(enabled=True, **kwargs) + + - A context manager that enables or disables tensor parallelism for + any supported module that is created inside. If there are nested + contexts, the innermost will override the rest. If there are + multiple supported modules created within the context, where one + is the submodule of the other, only the outermost module will be + distributed. If a supported module shares weights with another + (supported or unsupported) module, or if its hyperparameters do + not support distribution (e.g., not divisible by the tensor + parallelism degree), tensor parallelism will **not** be enabled + for this module even if this API is used. + + **Example:** + + .. code:: python + + with smp.tensor_parallelism(): + self.m0 = nn.Linear(20, 20) # will be distributed + with smp.tensor_parallelism(enabled=False): + self.m1 = nn.Linear(20, 20) # will not be distributed + + - Keyword arguments `kwargs` can be used to modify the configurations of the distributed modules created inside the context. If a keyword argument provided here matches any `__init__` method arguments of a `DistributedModule` that substitutes a module created inside the `smp.tensor_parallelism` context, this keyword will override the value defined in the `init_hook`. + +.. function:: smp.set_tensor_parallelism(module, enabled=True, **kwargs) + + - Enables or disables tensor parallelism for the supported + submodules of ``module``. If enabling, the outermost supported + modules will be distributed. If disabling, tensor parallelism will + be disabled for the entire module subtree of ``module``. Unlike + the context manager, this API can be used after the model creation + (but before wrapping with :class:`smp.DistributedModel`), so direct + access to model definition code is not required. If a supported + module shares weights with another (supported or unsupported) + module, or if its hyperparameters do not support distribution + (e.g., not divisible by the tensor parallelism degree), tensor + parallelism will **not** be enabled for this module. + - Keyword arguments ``kwargs`` can be used to modify the + configurations of the distributed modules created inside the + context. If a keyword argument provided here matches any + ``__init__`` method arguments of a :class:`smp.DistributedModel` that + substitutes a module created inside the ``smp.tensor_parallelism`` + context, this keyword will override the value defined in the + ``init_hook``. + - **Example:** + + .. 
code:: python + + model = MyModel() + smp.set_tensor_parallelism(model.encoder, True) + smp.set_tensor_parallelism(model.encoder.embedding, True) + + # outermost supported submodules in model.encoder will be distributed, except for + # model.encoder.embedding + model = smp.DistributedModel(model) + optimizer = smp.DistributedOptimizer(optimizer) + +.. _activation-checkpointing-api: + +Activation Checkpointing APIs +----------------------------- + +``smdistributed.modelparallel`` provides three APIs to enable +activation checkpointing: one for checkpointing modules, +one for checkpointing sequential modules, and +one for checkpointing pretrained models. + +For a conceptual guide and examples, see +`Activation Checkpointing `_ +in the *SageMaker's Distributed Model Parallel developer guide*. + +.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint(module, *args, preserve_rng_state=True) + + - Checkpoints the module passed. Throws error if, during manual + partitioning, all children of module are not on same rank as the + module itself, i.e. the module tree is split across multiple + partitions. During auto-partitioning, if the module is split + across multiple partitions, then this call is ignored(with a + warning). Note that this call applies to the module instance only, + not to the module class. + + - **Arguments:** + + - ``module (Instance of nn.Module)``: The module to be + checkpointed. Note that unlike native checkpointing in + PyTorch’s, activation checkpointing in + ``smdistributed.modelparallel`` is at the granularity of a + module. A generic function cannot be passed here. + - ``args``: Tuple containing inputs to the module. + - ``preserve_rng_state (bool, default=True)``: Omit stashing and + restoring the RNG state during each checkpoint. + +.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint_sequential(sequential_module, input, strategy="each", preserve_rng_state=True, pack_args_as_tuple=False) + + - Checkpoints the modules inside + `nn.Sequential `__. + This can be used even if different layers that are part of the + sequential container lie on different partitions. Each layer part + of the sequential module that is checkpointed must lie completely + within one partition. If this is not the case during manual + partitioning, then an error will be thrown. If this is not the + case during auto partitioning, a warning will be raised and this + module will be run without checkpointing. + + - **Arguments** + + - ``sequential_module (nn.Sequential)``: the sequential module to + be checkpointed. + - ``input (torch.Tensor or a tuple of torch.Tensors)``: input to + the module, which can be a tensor or a tuple of tensors. If a + tuple is passed, then pack_args_as_tuple should be set to True. + - ``strategy (string, default=“each”)`` : Strategy determines how + many layers part of the sequential module need to be grouped + together for one checkpointing call. This determines how much + memory can be reduced. It can take the following values + + - ``each`` : The default is to checkpoint each module inside + the sequential separately. + - ``contiguous``: Groups consecutive layers on the same + partition together. For example, if a sequential consists of + [a, b, c, d] where a,b are on pp_rank0 and c,d are on + pp_rank 1, then this strategy would checkpoint a,b together + and then c,d together. This means effectively, inputs of a, + outputs of b, inputs of c, and outputs of d are in memory; + the reamining activations are recomputed. 
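+
+        For instance, a minimal sketch of a call using this strategy
+        (``self.features`` is an illustrative ``nn.Sequential`` attribute,
+        ``hidden_states`` an intermediate activation, and the fully qualified
+        name shown above is assumed to be importable as written):
+
+        .. code:: python
+
+           from smdistributed.modelparallel.torch.patches.checkpoint import (
+               checkpoint_sequential,
+           )
+
+           # inside the model's forward method: checkpoint the sequential block,
+           # grouping consecutive layers on the same partition together
+           out = checkpoint_sequential(self.features, hidden_states, strategy="contiguous")
+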
+ - ``group_2, group_3, group_4, etc:`` More generally, + ``group_x`` where x is an integer. This strategy provides + more flexibility in how many layers to group together. + ``group_x`` groups x layers together on a best effort basis. + It can group x layers together if there are x layers + consecutively on the same partition. For example: + [a,b,c,d,e] where a,b are on pp_rank0 and c,d,e are on + pp_rank 1. If the strategy is ``group_3,`` then a,b are + checkpointed together on pp_rank0 and c,d,e are checkpointed + together on pp_rank1. + + - ``preserve_rng_state (bool, default=True)``: Set to ``False`` + to omit stashing and restoring the RNG state during each + checkpoint. + - ``pack_args_as_tuple (bool, default=False)``: To ensure that + backward works correctly, the autograd function has to unpack + any tuples received. If the checkpointed layer takes a tuple as + input, then this needs to be set to True. + +.. class:: smp.set_activation_checkpointing(module, preserve_rng_state=True, pack_args_as_tuple=False, strategy="each") + + - This API is recommended when importing pretrained models from + libraries, such as PyTorch and Hugging Face Transformers. This is + particularly useful when you don’t have access to the model + definition code and not be able to replace a module call with + checkpoint. + + - **Arguments**: + + - ``module (Instance of nn.Module or nn.Sequential)``: The module + to checkpoint. + - ``preserve_rng_state (bool, default=True)``: Set to ``False`` + to omit stashing and restoring the RNG state during each + checkpoint. + - ``pack_args_as_tuple (bool, default=False)``: *Can only be + passed when module is a sequential module.* To ensure that + backward works correctly, the autograd function has to unpack + any tuples received. If the layer checkpointed takes a tuple as + input, then this needs to be set to True. + - ``strategy: (string, default=“each”)``: *Can only be passed + when module is a sequential module.* Strategy determines how + many layers part of the sequential module need to be grouped + together for one checkpointing call. + - This determines how much memory can be reduced. It can take the + following values + + - ``each`` : The default is to checkpoint each module inside + the sequential separately. + - ``contiguous``: Groups consecutive layers on the same + partition together. For example if a sequential consists of + ``[a, b, c, d]`` where ``a, b`` are on ``pp_rank0`` and ``c, d`` are on + ``pp_rank 1``, then this strategy would checkpoint a,b together + and then ``c, d`` together. This means effectively, the inputs of + ``a``, outputs of ``b``, inputs of ``c``, and outputs of ``d`` are in + memory, and the rest of the activations are recomputed. + - ``group_2, group_3, group_4, etc:`` More generally, + ``group_x`` where x is an integer. This strategy provides + more flexibility in how many layers to group together. + ``group_x`` groups x number of layers together on a best + effort basis if there are x layers consecutively in the same + partition. **Example**: Assume a module with layers ``[a, b, + c, d, e]``. The layers a and b are on pp_rank0, and ``c``, ``d``, and + ``e`` are on ``pp_rank 1``. If the strategy is ``group_3,`` then ``a``, + ``b`` are checkpointed together on ``pp_rank0``, and ``c``, ``d``, ``e`` are + checkpointed together on ``pp_rank1``. + +.. 
_smdmp-tp-appendix: + +Appendix: Reference Implementations for Modules +----------------------------------------------- + +The following are reference implementations for transformer-related +modules. Note that this is not the actual ``smdistributed`` source code, +but the distributed implementations provided in the library are the +distributed versions of these reference implementations, and can be used +to determine whether the distributed modules perform the same operations +as the custom modules in your script. + +To keep the implementations simple, we only assume keyword arguments, +and assume the existence of a method ``parse_args(kwargs)``, which +parses the arguments to ``__init__`` methods and sets the relevant +attributes of the module, such as ``hidden_size`` and +``num_attention_heads``. + +``smp.nn.DistributedTransformer`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + class Transformer(nn.Module): + def __init__(self, **kwargs): + super(Transformer, self).__init__() + self.parse_args(kwargs) + + self.layers = [] + for l in range(self.num_layers): + self.layers.append(TransformerLayer(**kwargs)) + + self.seq_layers = nn.Sequential(*self.layers) + + def forward(self, inp): + return self.seq_layers(inp) + +``smp.nn.DistributedTransformerLayer`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + class TransformerLayer(nn.Module): + def __init__(self, **kwargs): + super(TransformerLayer, self).__init__() + self.parse_args(kwargs) + + self.attention = AttentionLayer(**kwargs) + self.output = TransformerOutputLayer(**kwargs) + + if self.add_cross_attention: + self.cross_attention = AttentionLayer(cross_attention=True, **kwargs) + + def forward(self, inp): + if self.add_cross_attention: + hidden_states, cross_states, attention_mask, cross_mask = inp + else: + hidden_states, attention_mask = inp + + attention_output = self.attention((hidden_states, attention_mask)) + if self.add_cross_attention: + attention_output = self.cross_attention((attention_output, + cross_states, + cross_mask)) + + output = self.output(attention_output) + + if self.add_cross_attention: + return output, cross_states, attention_mask, cross_mask + else: + return output, attention_mask + +``smp.nn.DistributedAttentionLayer`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: python + + class AttentionLayer(nn.Module): + def __init__(self, **kwargs): + super(AttentionLayer, self).__init__() + self.parse_args(kwargs) + self.attention_head_size = self.hidden_size // self.num_attention_heads + + self.query = nn.Linear(self.hidden_size, self.hidden_size) + self.key = nn.Linear(self.hidden_size, self.hidden_size) + self.value = nn.Linear(self.hidden_size, self.hidden_size) + self.dense = nn.Linear(self.hidden_size, self.hidden_size) + + self.dropout1 = nn.Dropout(self.attention_dropout_prob) + self.dropout2 = nn.Dropout(self.hidden_dropout_prob) + + if self.pre_layernorm: + self.pre_layernorm = nn.LayerNorm(self.hidden_size, + eps=self.layernorm_epsilon) + + if self.post_layernorm: + self.layernorm = nn.LayerNorm(self.hidden_size, + eps=self.layernorm_epsilon) + + def transpose(self, tensor, key=False): + shape = tensor.size()[:-1] + + (self.num_attention_heads, self.attention_head_size) + tensor = torch.reshape(tensor, shape) + if key: + return tensor.permute(0, 2, 3, 1) + else: + return tensor.permute(0, 2, 1, 3) + + def forward(self, inp): + if self.cross_attention: + hidden_states, cross_states, attention_mask = inp + else: + hidden_states, attention_mask = inp + + if self.pre_layernorm: + norm_states = self.pre_layernorm(hidden_states) + else: + norm_states = hidden_states + + query_layer = self.query(norm_states) + + if self.cross_attention: + key_layer = self.key(cross_states) + value_layer = self.value(cross_states) + else: + key_layer = self.key(norm_states) + value_layer = self.value(norm_states) + + query_layer = self.transpose(query_layer) + key_layer = self.transpose(key_layer, key=True) + value_layer = self.transpose(value_layer) + + attention_scores = torch.matmul(query_layer, key_layer) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if not self.cross_attention and self.causal_mask is not None: + attention_scores = self.apply_causal_mask(attention_scores) + + attention_scores = attention_scores + attention_mask + + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = self.dropout1(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.local_attention_size,) + context_layer = torch.reshape(context_layer, new_context_layer_shape) + + self_attention = self.dense(context_layer) + self_attention = self.dropout2(self_attention) + + if self.post_layernorm: + return self.layernorm(self_attention + hidden_states) + else: + return self_attention + +``smp.nn.DistributedTransformerOutputLayer`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: python + + class TransformerOutputLayer(nn.Module): + def __init__(self, **kwargs): + super(TransformerOutputLayer, self).__init__() + self.parse_args(kwargs) + + self.dense1 = nn.Linear(self.hidden_size, self.intermediate_size) + self.dense2 = nn.Linear(self.intermediate_size, self.hidden_size) + + self.dropout = nn.Dropout(self.attention_dropout_prob) + + if self.pre_layernorm: + self.pre_layernorm = nn.LayerNorm(self.hidden_size, + eps=self.layernorm_epsilon) + + if self.post_layernorm: + self.layernorm = nn.LayerNorm(self.hidden_size, + eps=self.layernorm_epsilon) + + def forward(self, inp): + if self.pre_layernorm: + norm_inp = self.pre_layernorm(inp) + else: + norm_inp = inp + + dense1_output = self.dense1(norm_inp) + if self.activation == "gelu": + act_output = F.gelu(dense1_output) + else: + act_output = F.relu(dense1_output) + + dense2_output = self.dense2(act_output) + output = self.dropout(dense2_output) + + if self.post_layernorm: + return self.layernorm(inp + output) + else: + return output diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst index 6eefe5cad8..7f21f7a557 100644 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst +++ b/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst @@ -1,9 +1,8 @@ TensorFlow API ============== -**Supported version: 2.3.1, 2.4.1, 2.5.0** - -**Important**: This API document assumes you use the following import statement in your training scripts. +To use the TensorFlow-specific APIs for SageMaker distributed model parallism, +you need to add the following import statement at the top of your training script. .. code:: python @@ -13,8 +12,8 @@ TensorFlow API Refer to `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. + `_ + to learn how to use the following APIs in your TensorFlow training script. .. class:: smp.DistributedModel :noindex: diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst new file mode 100644 index 0000000000..625a7fcbf1 --- /dev/null +++ b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst @@ -0,0 +1,488 @@ +.. admonition:: Contents + + - :ref:`communication_api` + - :ref:`mpi_basics` + +Common API +========== + +The following SageMaker distribute model parallel APIs are common across all frameworks. + +**Important**: This API document assumes you use the following import statement in your training scripts. + +**TensorFlow** + +.. code:: python + + import smdistributed.modelparallel.tensorflow as smp + +**PyTorch** + +.. code:: python + + import smdistributed.modelparallel.torch as smp + + +.. function:: smp.init( ) + :noindex: + + Initialize the library. Must be called at the beginning of training script. + +.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) + :noindex: + + A decorator that must be placed over a function that represents a single + forward and backward pass (for training use cases), or a single forward + pass (for evaluation use cases). Any computation that is defined inside + the ``smp.step``-decorated function is executed in a pipelined manner. + + By default, every tensor input to the function is split across its batch + dimension into a number of microbatches specified while launching the + training job. 
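+   For example, in the following PyTorch sketch, ``data`` and ``target`` are
+   each split along the batch dimension into microbatches before the function
+   body runs once per microbatch (the loss function is illustrative, and
+   ``model`` is assumed to be an ``smp.DistributedModel`` created elsewhere in
+   the script):
+
+   .. code:: python
+
+      import torch.nn.functional as F
+      import smdistributed.modelparallel.torch as smp
+
+      @smp.step()
+      def train_step(model, data, target):
+          output = model(data)
+          loss = F.nll_loss(output, target, reduction="mean")
+          model.backward(loss)  # replaces loss.backward() for smp.DistributedModel
+          return loss
+
+      # the returned loss is a StepOutput (see below); average it across microbatches
+      loss = train_step(model, data, target).reduce_mean()
+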
This behavior can be customized through the arguments to + ``smp.step``, described below. The library then orchestrates the execution of + each microbatch across all partitions, based on the chosen pipeline + type. + + In a typical use case, forward pass and back-propagation are executed + inside an \ ``smp.step``-decorated function and gradients, loss, and + other relevant metrics (such as accuracy, etc.) are returned from + ``smp.step``-decorated function. + + Any gradient post-processing operation, such as gradient clipping and + allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or + ``optimizer.step`` (for PT) should be applied on the gradients returned + from the ``smp.step`` function, and not inside the ``smp.step`` + function. This is because every operation inside ``smp.step`` is + executed once per microbatch, so having these operations inside + ``smp.step`` can either be inefficient (in the case of allreduce), or + lead to wrong results (in the case of ``apply_gradients`` / + ``optimizer.step``). + + If the objects returned from the ``smp.step``-decorated function contain + ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to + ``StepOutput`` objects. A ``StepOutput`` object encapsulates all + versions of the tensor across different microbatches + (see ``StepOutput`` entry for more information). + + The argument to ``smp.step`` decorated function should either be a tensor + or an instance of list, tuple, dict or set for it to be split across + microbatches. If your object doesn't fall into this category, you can make + the library split your object, by implementing ``smp_slice`` method. + + Below is an example of how to use it with PyTorch. + + .. code:: python + + class CustomType: + def __init__(self, tensor): + self.data = tensor + + # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) + # and the current microbatch index (mb). + def smp_slice(self, num_mb, mb, axis): + dim_size = list(self.data.size())[axis] + + split_size = dim_size // num_mb + sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) + return CustomType(sliced_tensor, self.other) + + custom_obj = CustomType(torch.ones(4,)) + + @smp.step() + def step(custom_obj): + loss = model(custom_obj) + model.backward(loss) + return loss + + + **Important:** ``smp.step`` splits the batch into microbatches, and + executes everything inside the decorated function once per microbatch. + This might affect the behavior of batch normalization, any operation + that explicitly uses the batch size information, or any other Python + code that is expected to run once. + + **TensorFlow-specific behavior** + + ``smp.step`` is a wrapper that + inherits from and extends the behavior of ``tf.function``, and as such, + all the caveats that apply to the use of ``tf.function``\ s also apply + to ``smp.step``. In particular, any operation that is inside + ``smp.step`` executes in graph mode, and not eager mode. + + In the first call, ``smp.step`` performs tracing of the wrapped function every time + one of the tensor arguments changes their shape or dtype, or for every + new value of a Python argument, if there is one. Tracing is expensive, + so such scenarios should be avoided as much as possible or, + alternatively, an ``input_signature`` argument must be provided. 
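+   For instance, because ``smp.step`` accepts the arguments of ``tf.function``
+   (see **TensorFlow-only parameters** below), a fixed input signature can be
+   declared as in the following sketch (the shapes, dtypes, and function body
+   are illustrative; ``model`` and ``loss_object`` are assumed to be defined
+   elsewhere in the script):
+
+   .. code:: python
+
+      import tensorflow as tf
+      import smdistributed.modelparallel.tensorflow as smp
+
+      @smp.step(input_signature=[
+          tf.TensorSpec(shape=[None, 28, 28, 1], dtype=tf.float32),
+          tf.TensorSpec(shape=[None], dtype=tf.int64),
+      ])
+      def forward_step(images, labels):
+          predictions = model(images, training=True)
+          return loss_object(labels, predictions)
+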
For + more information on the usage of ``tf.function``, refer to the + TensorFlow documentation: + + - https://www.tensorflow.org/api_docs/python/tf/function\ + - https://www.tensorflow.org/guide/function\ + + Each ``smp.step`` decorated function must have a return value that depends on the + output of ``smp.DistributedModel``. + + **Common parameters** + + - ``non_split_inputs`` (``list``): The list of arguments to the decorated function + that should not be split along the batch dimension. Should be used + for all input tensors that do not have a batch dimension. Should be a + list of argument names as ``str``, as they appear in the signature of + the ``smp.step``-decorated function. By default it is considered an + empty list. + + - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch + axis. The keys should be the argument names as ``str``, as they + appear in the signature of the ``smp.step``-decorated function.  By + default all batch axes are assumed to be the 0-axis. + + **TensorFlow-only parameters** + + - All arguments of ``tf.function``. Note: + The \ ``experimental_compile`` argument of ``tf.function`` may not + work as expected with ``smp.step``, since it interferes with + pipelining and model partitioning. To enable XLA with the library, you can + instead use \ ``tf.config.optimizer.set_jit(True)``. + + **PyTorch-only parameters** + + - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on + all returned ``torch.Tensor`` outputs. Setting it to ``False`` + increases memory consumption, unless ``detach()`` is manually called + on the returned tensors, because the model graph is not cleared from + memory after the training step. Set to \ ``True`` by default. + + **Returns** + + - The same object(s) returned from the decorated function. All + returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or + ``torch.Tensor`` objects (for PT) are wrapped inside + a \ ``StepOutput`` object, even when they are inside a Python + ``list``, ``tuple``, or ``dict``. + + + +.. class:: StepOutput + :noindex: + + + A class that encapsulates all versions of a ``tf.Tensor`` + or \ ``torch.Tensor`` across all microbatches. + + When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside + ``smp.step``, different versions of the tensor are computed for each + microbatch. + + When this tensor is returned from ``smp.step`` and is accessed outside + of the decorated function, it appears as a ``StepOutput`` object, which + contains all such versions. For example, + + - In the case of Tensorflow, the gradient for a particular + ``tf.Variable`` is computed on each microbatch individually, and if + this gradient is returned from ``smp.step``, all gradients for this + ``tf.Variable`` become part of the same ``StepOutput`` object. The + ``StepOutput`` class offers the following API for commonly-used + post-processing operations on such tensors. + - In the case of PyTorch, the loss for each microbatch is computed + individually and all the ``torch.Tensor``\ s that represent the loss + for different microbatches become part of same ``StepOutput`` object, + if loss is returned from the ``smp.step`` function. + + + The ``StepOutput`` class offers the following API for commonly-used + post-processing operations on tensors. + + .. data:: StepOutput.outputs + :noindex: + + Returns a list of the underlying tensors, indexed by microbatch. + + .. 
function:: StepOutput.reduce_mean( ) + :noindex: + + Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s + ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. + + .. function:: StepOutput.reduce_sum( ) + :noindex: + + Returns a ``tf.Tensor`` / + ``torch.Tensor`` that sums the constituent + ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. + + .. function:: StepOutput.concat( ) + :noindex: + + Returns a + ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the + batch dimension using ``tf.concat`` / ``torch.cat``. + + .. function:: StepOutput.stack( ) + :noindex: + + Applies ``tf.stack`` / ``torch.stack`` + operation to the list of constituent ``tf.Tensor``\ s / + ``torch.Tensor``\ s. + + **TensorFlow-only methods** + + .. function:: StepOutput.merge( ) + :noindex: + + Returns a ``tf.Tensor`` that + concatenates the constituent ``tf.Tensor``\ s along the batch + dimension. This is commonly used for merging the model predictions + across microbatches. + + .. function:: StepOutput.accumulate(method="variable", var=None) + :noindex: + + Functionally the same as ``StepOutput.reduce_mean()``. However, it is + more memory-efficient, especially for large numbers of microbatches, + since it does not wait for all constituent \ ``tf.Tensor``\ s to be + ready to start averaging them, thereby saving memory. + + In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end + up being more memory-efficient than ``StepOutput.accumulate()``. + + **Parameters** + + - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): + If ``"add_n"`` or ``"accumulate_n"``, the library uses + ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement + accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` + into which to accumulate the tensors. Default is \ ``"variable"``. + Note: Memory usage behavior of these choices can depend on the model + and implementation. + + - ``var``: A ``tf.Variable`` into which, if provided, the library uses to + accumulate the tensors. If \ ``None``, the library internally creates a + variable. If ``method`` is not ``"variable"``, this argument is + ignored. + +.. _mpi_basics: + :noindex: + +MPI Basics +^^^^^^^^^^ + +The library exposes the following basic MPI primitives to its Python API: + +- ``smp.rank()``: The rank of the current process. +- ``smp.size()``: The total number of processes. +- ``smp.mp_rank()``: The rank of the process among the processes that + hold the current model replica. +- ``smp.dp_rank()``: The rank of the process among the processes that + hold different replicas of the same model partition. +- ``smp.dp_size()``: The total number of model replicas. +- ``smp.local_rank()``: The rank among the processes on the current + instance. +- ``smp.local_size()``: The total number of processes on the current + instance. +- ``smp.get_mp_group()``: The list of ranks over which the current + model replica is partitioned. +- ``smp.get_dp_group()``: The list of ranks that hold different + replicas of the same model partition. + + .. _communication_api: + :noindex: + +Communication API +^^^^^^^^^^^^^^^^^ + +The library provides a few communication primitives which can be helpful while +developing the training script. These primitives use the following +``enum`` s as arguments to specify which processes the communication +should involve. +​ + +**Helper structures** + +.. 
data:: smp.CommGroup + :noindex: + + An ``enum`` that takes the values + ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. + These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, + and ``smp.DP_GROUP`` respectively. + + - ``CommGroup.WORLD``: Represents the entire group of processes used in + training + - ``CommGroup.MP_GROUP``: Represents the group of processes that hold + the same model replica as the current process. The processes in a + single ``MP_GROUP`` collectively store an entire replica of the + model. + - ``CommGroup.DP_GROUP``: Represents the group of processes that hold + the same model partition as the current process. The processes in a + single ``DP_GROUP`` perform data parallelism/allreduce among + themselves. + +.. data:: smp.RankType + :noindex: + + An ``enum`` that takes the values + ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. + + - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as + the rank of the process across all processes used in training. + - ``RankType.MP_RANK``: The associated rank is to be interpreted as the + rank of the process within the ``MP_GROUP``. + - ``RankType.DP_RANK``: The associated rank is to be interpreted as the + rank of the process within the ``DP_GROUP``. + + +**Communication primitives:** + +.. function:: smp.broadcast(obj, group) + :noindex: + + Sends the object to all processes in the + group. The receiving process must call ``smp.recv_from`` to receive the + sent object. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be broadcast. + + - ``group``: A ``CommGroup`` argument that represents to which group of + processes the object will be sent. + + **Notes** + + - When you use ``broadcast`` on the sender process, there needs + to be an accompanying ``smp.recv_from()`` call on the receiver + processes. + + - This is a synchronous call; the ``broadcast`` statement + returns only after all ranks participating in the call have made a + matching ``recv_from`` call. + + **Example** + + .. code:: python + + if smp.rank() == 0: +     smp.broadcast(something, group=smp.CommGroup.WORLD) + else: +     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) + +.. function:: smp.send(obj, dest_rank, rank_type) + :noindex: + + Sends the object ``obj`` to + ``dest_rank``, which is of a type specified by ``rank_type``. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be sent. + + - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. + + - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how + ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 + and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process + with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current + process. + + **Notes** + + - Note: \ This is a synchronous call; the ``send`` statement returns + only after the destination rank has made a matching + ``recv_from`` call. + +.. function:: smp.recv_from(src_rank, rank_type) + :noindex: + + Receive an object from a peer process. Can be used with a matching + ``smp.send`` or a ``smp.broadcast`` call. + + **Inputs** + + - ``src_rank`` (``int``): An integer denoting rank of the sending process. + + - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how + ``dest_rank`` is to be interpreted. 
For example if ``src_rank`` is 1 + and ``rank_type`` is ``MP_RANK``, then the object is received from + the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the + current process. + + **Returns** + + Returns the python object that is sent by the peer process. + + **Notes** + + - Note: This is a synchronous call; the ``recv_from`` statement returns + only after the source rank has made a matching ``send`` or + ``broadcast`` call, and the object is received. + +.. function:: smp.allgather(obj, group) + :noindex: + + A collective call that gathers all the + submitted objects across all ranks in the specified ``group``. Returns a + list whose ``i``\ th index contains the object submitted by the + ``i``\ th rank in ``group``. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be + allgathered. + + - ``group`` : A ``CommGroup`` argument that represents which group of + processes participate in ``allgather``. + + **Notes** + + - Note: This is a synchronous call; the ``allgather`` statement returns + only after all ranks participating in the call have made a matching + ``allgather`` call, and all the objects are received at the current + rank. + + **Examples** + + .. code:: python + + # assuming mp_size() == 2 + + if smp.mp_rank() == 0: +     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] + else: +     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] + +.. function:: smp.barrier(group=smp.WORLD) + :noindex: + + A statement that hangs until all + processes in the specified group reach the barrier statement, similar to + ``MPI_Barrier()``. + + **Inputs** + + - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of + processes participating in the barrier call. Defaults to + ``smp.WORLD``. + + **Examples** + + - Assume there are 8 processes and 2 model partitions, and + therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If + the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its + group argument, then each process only waits until the other process + of its own ``mp_group`` reaches that point. It does not wait for + processes outside that ``mp_group``. + +.. function:: smp.dp_barrier() + :noindex: + + Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. + Waits for the processes in the same \ ``dp_group`` as + the current process to reach the same point in execution. + +.. function:: smp.mp_barrier() + :noindex: + + Same as passing ``smp.MP_GROUP`` to + ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as + the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst new file mode 100644 index 0000000000..d2fcb95954 --- /dev/null +++ b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst @@ -0,0 +1,572 @@ +.. admonition:: Contents + + - :ref:`pytorch_saving_loading` + - :ref:`pytorch_saving_loading_instructions` + +PyTorch API +=========== + +**Supported versions: 1.7.1, 1.8.1** + +This API document assumes you use the following import statements in your training scripts. + +.. code:: python + + import smdistributed.modelparallel.torch as smp + + +.. tip:: + + Refer to + `Modify a PyTorch Training Script + `_ + to learn how to use the following API in your PyTorch training script. + +.. 
class:: smp.DistributedModel + :noindex: + + A sub-class of ``torch.nn.Module`` which specifies the model to be + partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is + the model to be partitioned. The returned ``DistributedModel`` object + internally manages model parallelism and data parallelism. Only one + model in the training script can be wrapped with + ``smp.DistributedModel``. + + **Example:** + + .. code:: python + + model = smp.DistributedModel(model) + + **Important**: The ``__call__`` and  ``backward`` method calls on the + ``smp.DistributedModel`` object (in the following example, the object + is \ ``model``) can only be made inside a ``smp.step``-decorated + function. + + + Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can + be performed by calling the \ ``DistributedModel`` object on the input + tensors. + + .. code:: python + + predictions = model(inputs)   # model is a smp.DistributedModel object + + For a backward pass, one needs to call the backward function on + the \ ``DistributedModel`` object, with tensors and gradients as + arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` + or ``torch.autograd.backward``. + + + The API for ``model.backward`` is very similar to + ``torch.autograd.backward``. For example, the following + ``backward`` calls: + + .. code:: python + + torch.autograd.backward(loss) or loss.backward() + + should be replaced with: + + .. code:: python + + model.backward(loss) # loss is a tensor with only one element as its data + + Similarly, for non-scalar tensors, replace the following + ``backward`` call containing incoming gradient arguments: + + .. code:: python + + torch.autograd.backward(outputs, out_grads) + + with the following line: + + .. code:: python + + model.backward(outputs, out_grads) + + In these examples, all ``__call__``  and ``backward`` method calls on + the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside + a ``smp.step``-decorated function. + + **Using DDP** + + If DDP is enabled, do not not place a PyTorch + ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because + the ``DistributedModel`` wrapper will also handle data parallelism. + + Unlike the original DDP wrapper, when you use ``DistributedModel``, + model parameters and buffers are not immediately broadcast across + processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the + ``smp.step``-decorated function when the partition is done. + + **Parameters** + + - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). + + - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) + Whether to perform the tracing step on the GPU or CPU. The tracing step gathers + information on the order of execution of modules, the shapes of + intermediate outputs, and execution times, to be used by the + partitioning algorithm. If ``trace_device`` is set to GPU, accurate + module execution times can be gathered during tracing for potentially + improved partitioning decision. However, if the model is too large to + fit in a single GPU, then ``trace_device`` should be set to CPU. + + - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, + the library profiles the execution time of each module during tracing, and uses + it in the partitioning decision. This improves the partitioning + decision, but it might make the tracing slower. 
It may also introduce + some degree of non-determinism in partitioning results, because of the + inherent randomness in module execution times. Must be ``False`` if + ``trace_device`` is ``"cpu"``. + + - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only + applicable for hybrid data parallelism/model parallelism use cases (when + ``ddp`` is set to ``True`` while launching training). The library uses this flag + to decide whether to do overlapping allreduce whenever a parameter + gradients are ready. This leads to overlapping of communication and + computation and can improve performance. If this is set to ``False`` , + allreduce is performed at the end of the step. + + - ``backward_passes_per_step`` (``int``) (default: 1): This is only + applicable for hybrid data parallelism/model parallelism use cases (when + ``ddp`` is set to ``True`` in config). This parameter indicates the + number of backward passes to perform before calling allreduce on DDP. + This allows accumulating updates over multiple mini-batches before + reducing and applying them. + + - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): + Whether or not the computed gradients should be averaged across + microbatches. If ``False``, the computed gradients will be summed across + microbatches, but not divided by the number of microbatches. In typical + use case where the computed loss is averaged over the mini-batch, this + should be left as ``True``. If you use a loss function that only sums + the per-sample loss across the batch (and not divide by the batch size), + then this must be set to ``False`` for correctness. + + - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets + parameters into multiple buckets so that gradient reduction of each + bucket can potentially overlap with backward + computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes + (MB). + + - ``trace_memory_usage`` (default: False): When set to True, the library attempts + to measure memory usage per module during tracing. If this is disabled, + memory usage will be estimated through the sizes of tensors returned from + the module. + + - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. + This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. + Please see: `broadcast_buffer `__. + + - ``gradient_as_bucket_view`` (default: False): To be + used with ``ddp=True``. This parameter is forwarded to the underlying + ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. + + **Properties** + + - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` + otherwise. Initialized to ``False`` when ``DistributedModel`` is first + created. It becomes be ``True`` during the first call + to ``smp.step``-decorated function. Once the model is partitioned, the + local parameters or local ``state_dict`` can be fetched using the + following methods. + + **Methods** + + .. function:: backward(tensors, grad_tensors) + :noindex: + + Triggers a distributed backward + pass across model partitions. Example usage provided in the previous + section. The API is very similar + to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. + ``retain_grad`` and ``create_graph``  flags are not supported. + + .. function:: local_buffers( ) + :noindex: + + Returns an iterator over buffers for the modules in + the partitioned model that have been assigned to the current process. + + .. 
function:: local_named_buffers( ) + :noindex: + + Returns an iterator over buffers for the + modules in the partitioned model that have been assigned to the current + process. This yields both the name of the buffer as well as the buffer + itself. + + .. function:: local_parameters( ) + :noindex: + + Returns an iterator over parameters for the + modules in the partitioned model that have been assigned to the current + process. + + .. function:: local_named_parameters( ) + :noindex: + + Returns an iterator over parameters for + the modules in the partitioned model that have been assigned to the + current process. This yields both the name of the parameter as well as + the parameter itself. + + .. function:: local_modules( ) + :noindex: + + Returns an iterator over the modules in the + partitioned model that have been assigned to the current process. + + .. function:: local_named_modules( ) + :noindex: + + Returns an iterator over the modules in the + partitioned model that have been assigned to the current process. This + yields both the name of the module as well as the module itself. + + .. function:: local_state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains local + parameters that belong to the current \ ``mp_rank``. This ``state_dict`` + contains a key \ ``_smp_is_partial`` to indicate this is a + partial \ ``state_dict``, which indicates whether the + ``state_dict`` contains elements corresponding to only the current + partition, or to the entire model. + + .. function:: state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains parameters + for the entire model. It first collects the \ ``local_state_dict``  and + gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to + create a full ``state_dict``. Please note that this needs to be called on all ranks with + ``dp_rank()==0`` to ensure the gather happens properly. + If it is only called on all such ranks, it can hang. + + .. function:: load_state_dict( ) + :noindex: + + Same as the ``torch.module.load_state_dict()`` , + except: It first gathers and merges the ``state_dict``\ s across + ``mp_rank``\ s, if they are partial. The actual loading happens after the + model partition so that each rank knows its local parameters. + + .. function:: register_post_partition_hook(hook) + :noindex: + + Registers a callable ``hook`` to + be executed after the model is partitioned. This is useful in situations + where an operation needs to be executed after the model partition during + the first call to ``smp.step``, but before the actual execution of the + first forward pass. Returns a ``RemovableHandle`` object ``handle``, + which can be used to remove the hook by calling ``handle.remove()``. + + .. function:: cpu( ) + :noindex: + + Allgathers parameters and buffers across all ``mp_rank``\ s and moves them + to the CPU. + + .. function:: join( ) + :noindex: + + A context manager to be used in conjunction with an instance of + ``smp.DistributedModel`` to be able to train with uneven inputs across + participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped + ``DistributedDataParallel`` instance. For more information, see: + `join `__ + in the PyTorch documentation. + + .. 
function:: register_comm_hook( state, callable ) + :noindex: + + **Available for PyTorch 1.8.1 only** + Registers a communication hook which is an enhancement that provides + a flexible hook ``callable`` to users where they can specify how + gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. + + Please note that when you register a comm hook you have full control of how the gradients are processed. + When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. + Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. + In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. + See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. + + This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). + For more information, see: + `register_comm_hook `__ + in the PyTorch documentation. + + + +.. class:: smp.DistributedOptimizer + :noindex: + + **Parameters** + - ``optimizer`` + + An optimizer wrapper for saving/loading optimizer states. This wrapper + returns ``optimizer`` with the following methods overridden: + + .. function:: state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains optimizer state for the entire model. + It first collects the ``local_state_dict`` and gathers and merges + the ``local_state_dict`` from all ``mp_rank``s to create a full + ``state_dict``. + + .. function:: load_state_dict( ) + :noindex: + + Same as the ``torch.optimizer.load_state_dict()`` , except: + + - It first gathers and merges the local ``state_dict``\ s if they are + partial. + - The actual loading happens after the model partition so that each + rank knows its local parameters. + + .. function:: local_state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains the + local optimizer state that belongs to the current \ ``mp_rank``. This + ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is + a partial \ ``state_dict``, which indicates whether the + ``state_dict`` contains elements corresponding to only the current + partition, or to the entire model. + + ​ +.. function:: smp.partition(index) + :noindex: + + **Inputs** + + - ``index`` (int) - The index of the partition. + + A context manager which places all modules defined inside into the + partition with ID ``index``.  The ``index`` argument must be less than + the number of partitions. + + Use ``smp.partition`` to implement manual partitioning. + If ``"auto_partition"`` is ``True``, then the + ``smp.partition`` contexts are ignored. Any module that is not placed in + any ``smp.partition`` context is placed in the + ``default_partition`` defined through the SageMaker Python SDK. + + When ``smp.partition`` contexts are nested, the innermost context + overrides the rest (see the following example). In PyTorch, manual + partitioning should be done inside the module \ ``__init__``, and the + partition assignment applies to the modules that are *created* inside + the ``smp.partition`` context. + + Example: + + .. 
code:: python + + class Model(torch.nn.Module): +     def __init__(self): +         with smp.partition(1): +             self.child0 = Child0()            # child0 on partition 1 +             with smp.partition(2): +                 self.child1 = Child1()        # child1 on partition 2 +             self.child2 = Child2()            # child2 on partition 1 +         self.child3 = Child3()                # child3 on default_partition + +.. function:: smp.get_world_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all + processes, which can be used with the ``torch.distributed`` API. + Requires ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.get_mp_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the + processes in the ``MP_GROUP`` which contains the current process, which + can be used with the \ ``torch.distributed`` API. Requires + ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.get_dp_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the + processes in the ``DP_GROUP`` which contains the current process, which + can be used with the \ ``torch.distributed`` API. Requires + ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.is_initialized( ) + :noindex: + + Returns ``True`` if ``smp.init`` has already been called for the + process, and ``False`` otherwise. + +.. function::smp.is_tracing( ) + :noindex: + + Returns ``True`` if the current process is running the tracing step, and + ``False`` otherwise. + +.. data:: smp.nn.FusedLayerNorm + :noindex: + + `Apex Fused Layer Norm `__ is currently not + supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` + ``FusedLayerNorm`` and provides the same functionality. This requires + ``apex`` to be installed on the system. + +.. data:: smp.optimizers.FusedNovoGrad + :noindex: + + + `Fused Novo Grad optimizer `__ is + currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` + optimizer and provides the same functionality. This requires ``apex`` to + be installed on the system. + +.. data:: smp.optimizers.FusedLamb + :noindex: + + + `FusedLamb optimizer `__ + currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces + ``apex`` ``FusedLamb`` optimizer and provides the same functionality. + This requires ``apex`` to be installed on the system. + +.. data:: smp.amp.GradScaler + :noindex: + + `Torch AMP Gradscaler `__ + currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces + ``torch.amp.GradScaler`` and provides the same functionality. + +.. _pytorch_saving_loading: + :noindex: + +APIs for Saving and Loading +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. function:: smp.save( ) + :noindex: + + Saves an object. This operation is similar to ``torch.save()``, except + it has an additional keyword argument, ``partial``, and accepts only + string type for the argument ``f`` (file). If ``partial=True``, each + ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` + index to your saved file. + + **Parameters** + + - ``obj`` (dict): A saved object. + - ``f`` (str): A string containing a file name. + - ``partial`` (bool, default= ``True``):  When set to ``True``, each + ``mp_rank`` saves a separate checkpoint file and the library adds an + ``mp_rank`` index to the saved file. 
If you want to be able to load
+     and further train a model that you save with ``smp.save()``, you must
+     set ``partial=True``.
+   - ``pickle_module`` (pickle module, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``):
+     A module used for pickling metadata and objects.
+   - ``pickle_protocol``  (int, default=2): Can be specified to
+     override the default protocol.
+
+.. function:: smp.load( )
+   :noindex:
+
+   Loads an object saved with ``smp.save()`` from a file.
+
+   Similar to `torch.load() `__,
+   except it has an additional keyword argument, ``partial``, and accepts
+   only string type for the argument ``f`` (file). If \ ``partial=True``,
+   then each ``mp_rank`` loads a separate checkpoint file.
+
+   **Parameters**
+
+   - ``f`` (string): A string containing a file name.
+   - ``map_location`` (function): A function
+     `torch.device `__,
+     a string, or a dict specifying how to remap storage locations.
+   - ``pickle_module`` (pickle module): A module used for unpickling
+     metadata and objects (has to match the ``pickle_module`` used to
+     serialize the file).
+   - ``pickle_load_args`` (Python 3 only): Optional keyword arguments
+     passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``.
+   - ``partial`` (bool, default= ``True``): When set to ``True``, each
+     ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``.
+     Should be used when loading a model trained with the library.
+
+.. _pytorch_saving_loading_instructions:
+   :noindex:
+
+General Instruction For Saving and Loading
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The library can save partial or full checkpoints.
+
+- For partial checkpoints, each ``mp_rank`` saves its own checkpoint
+  file with only the parameters that belong to that rank.
+- For full checkpoints, the library saves a single checkpoint that contains
+  the entire model's parameters.
+
+When **saving** using ``smp.save()``, each rank only holds its own
+parameters. If you want to save the full model, there will be some
+communication between the ranks to create the full model. If you save
+checkpoints often, you should save partial checkpoints for best
+performance.
+
+When **loading** using ``smp.load()``, the library can load either partial or
+full checkpoints, or full checkpoints saved by a non-model-parallel model. If you
+want to resume training with a non-model-parallel model or do inference, you need
+a full checkpoint.
+
+The following is an example of how you can save and load a checkpoint:
+
+.. code:: python
+
+   # Original model and optimizer
+   model = MyModel(...)
+   optimizer = MyOpt(...)
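+   # (MyModel and MyOpt stand in for your own model and optimizer classes;
+   #  smp.init() is assumed to have been called earlier in the script.)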
+ + # model parallel wrapper + model = smp.DistributedModel(model) + optimizer = smp.DistributedOptimizer(optimizer) + + # To save, always save on dp_rank 0 to avoid data racing + if partial: +     # To save the partial model on each mp rank +     # the library will create `checkpoint.pt_{mprank}` for each mp rank +     if save_partial_model: +         if smp.dp_rank() == 0: +             model_dict = model.local_state_dict() # save the partial model +             opt_dict = optimizer.local_state_dict() # save the partial optimizer state +             smp.save( +                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, +                 f"/checkpoint.pt", +                 partial=True, +             ) + +     # To save the full model +     if save_full_model: +         if smp.dp_rank() == 0: +             model_dict = model.state_dict() # save the full model +             opt_dict = optimizer.state_dict() # save the full optimizer state +             smp.save( +                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, +                 "/checkpoint.pt", +                 partial=False, +             ) + + # To load, load on all ranks. + # The only difference for partial/full loading is the partial flag in smp.load + # Load partial checkpoint + if partial_checkpoint: +    checkpoint = smp.load("/checkpoint.pt", partial=True) +    model.load_state_dict(checkpoint["model_state_dict"]) +    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + # Load full checkpoint + if full_checkpoint: +    checkpoint = smp.load("/checkpoint.pt", partial=False) +    model.load_state_dict(checkpoint["model_state_dict"]) +    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst new file mode 100644 index 0000000000..131fc327ac --- /dev/null +++ b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst @@ -0,0 +1,172 @@ +TensorFlow API +============== + +**Supported version: 2.3.1, 2.4.1, 2.5.0** + +**Important**: This API document assumes you use the following import statement in your training scripts. + +.. code:: python + + import smdistributed.modelparallel.tensorflow as smp + +.. tip:: + + Refer to + `Modify a TensorFlow Training Script + `_ + to learn how to use the following API in your TensorFlow training script. + +.. class:: smp.DistributedModel + :noindex: + + A sub-class of the Keras \ ``Model`` class, which defines the model to + be partitioned. Model definition is done by sub-classing + ``smp.DistributedModel`` class, and implementing the ``call()`` method, + in the same way as the Keras model sub-classing API. Any operation that + is part of the \ ``smp.DistributedModel.call()`` method is subject to + partitioning, meaning that every operation placed inside executes in + exactly one of the devices (the operations outside run on all devices). + + + Similar to the regular Keras API, the forward pass is done by directly + calling the model object on the input tensors. For example: + + .. code:: python + + predictions = model(inputs)   # model is a smp.DistributedModel object + + However, ``model()`` calls can only be made inside a + ``smp.step``-decorated function. + + The outputs from a ``smp.DistributedModel`` are available in all ranks, + regardless of which rank computed the last operation. + + **Methods:** + + .. 
function:: save_model(save_path="/opt/ml/model") + :noindex: + + **Inputs** + - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. + + Saves the entire, + unpartitioned model with the latest trained weights to ``save_path`` in + TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which + SageMaker monitors to upload the model artifacts to Amazon S3. + +.. function:: smp.partition(index) + :noindex: + + **Inputs** + + - ``index`` (``int``): The index of the partition. + + A context manager which places all operations defined inside into the + partition whose ID is equal to ``index``. When + ``smp.partition`` contexts are nested, the innermost context overrides + the rest. The ``index`` argument must be smaller than the number of + partitions. + + ``smp.partition`` is used in the manual partitioning API; + if \ ``"auto_partition"`` parameter is set to ``True`` while launching + training, then ``smp.partition`` contexts are ignored. Any operation + that is not placed in any ``smp.partition`` context is placed in the + ``default_partition``, as shown in the following example: + + .. code:: python + + # auto_partition: False + # default_partition: 0 + smp.init() + [...] + x = tf.constant(1.2)                     # placed in partition 0 + with smp.partition(1): +     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 +     with smp.partition(3): +         z = tf.reduce_sum(y)             # placed in partition 3 + + +.. function:: register_post_partition_hook(hook) + :noindex: + + Registers a callable ``hook`` to + be executed after the model is partitioned. This is useful in situations + where an operation needs to be executed after the model partition during + the first call to ``smp.step``, but before the actual execution of the + first forward pass. + + .. code:: python + + @smp.register_post_partition_hook + def test_eager(): + # All statements here will be executed right after partition but before the first forward pass + tf.print("Entered hook through eager context") + +.. class:: smp.CheckpointManager + :noindex: + + + A subclass of TensorFlow + `CheckpointManager `__, + which is used to manage checkpoints. The usage is similar to TensorFlow + ``CheckpointManager``. + + The following returns a ``CheckpointManager`` object. + + .. code:: python + + smp.CheckpointManager(checkpoint, +                       directory="/opt/ml/checkpoints", +                       max_to_keep=None, +                       checkpoint_name="ckpt") + + **Parameters** + + - ``checkpoint``: A `tf.train.Checkpoint + `__ instance + that represents a model checkpoint. + + - ``directory``: (``str``) The path to a directory in which to write + checkpoints. A file named "checkpoint" is also written to this + directory (in a human-readable text format) which contains the state + of the ``CheckpointManager``. Defaults to + ``"/opt/ml/checkpoints"``, which is the directory that SageMaker + monitors for uploading the checkpoints to Amazon S3. + - ``max_to_keep`` (``int``): The number of checkpoints to keep. If + ``None``, all checkpoints are kept. + - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. + Defaults to ``"ckpt"``. + + + **Methods:** + + .. function:: save( ) + :noindex: + + Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. + + .. function:: restore( ) + :noindex: + + Restores the latest checkpoint in the specified directory. 
+ Internally uses ``tf.train.CheckpointManager.restore()``. + + + **Examples:** + + .. code:: python + + checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) + ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints + + for inputs in train_ds: +     loss = train_step(inputs) +     # [...] +     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints + + .. code:: python + + for step, inputs in enumerate(train_ds): +     if step == 0: +         ckpt_manager.restore() +     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst new file mode 100644 index 0000000000..625a7fcbf1 --- /dev/null +++ b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst @@ -0,0 +1,488 @@ +.. admonition:: Contents + + - :ref:`communication_api` + - :ref:`mpi_basics` + +Common API +========== + +The following SageMaker distribute model parallel APIs are common across all frameworks. + +**Important**: This API document assumes you use the following import statement in your training scripts. + +**TensorFlow** + +.. code:: python + + import smdistributed.modelparallel.tensorflow as smp + +**PyTorch** + +.. code:: python + + import smdistributed.modelparallel.torch as smp + + +.. function:: smp.init( ) + :noindex: + + Initialize the library. Must be called at the beginning of training script. + +.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) + :noindex: + + A decorator that must be placed over a function that represents a single + forward and backward pass (for training use cases), or a single forward + pass (for evaluation use cases). Any computation that is defined inside + the ``smp.step``-decorated function is executed in a pipelined manner. + + By default, every tensor input to the function is split across its batch + dimension into a number of microbatches specified while launching the + training job. This behavior can be customized through the arguments to + ``smp.step``, described below. The library then orchestrates the execution of + each microbatch across all partitions, based on the chosen pipeline + type. + + In a typical use case, forward pass and back-propagation are executed + inside an \ ``smp.step``-decorated function and gradients, loss, and + other relevant metrics (such as accuracy, etc.) are returned from + ``smp.step``-decorated function. + + Any gradient post-processing operation, such as gradient clipping and + allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or + ``optimizer.step`` (for PT) should be applied on the gradients returned + from the ``smp.step`` function, and not inside the ``smp.step`` + function. This is because every operation inside ``smp.step`` is + executed once per microbatch, so having these operations inside + ``smp.step`` can either be inefficient (in the case of allreduce), or + lead to wrong results (in the case of ``apply_gradients`` / + ``optimizer.step``). + + If the objects returned from the ``smp.step``-decorated function contain + ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to + ``StepOutput`` objects. A ``StepOutput`` object encapsulates all + versions of the tensor across different microbatches + (see ``StepOutput`` entry for more information). 
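+
+   To make this concrete, the following is a minimal sketch of such a
+   PyTorch training step. It assumes ``model`` and ``optimizer`` have already
+   been wrapped with ``smp.DistributedModel`` and ``smp.DistributedOptimizer``,
+   that ``train_loader`` is an ordinary data loader, and that ``F`` refers to
+   ``torch.nn.functional``. The loss returned from the decorated function is a
+   ``StepOutput``, so it is reduced across microbatches outside of ``smp.step``
+   before the optimizer is applied.
+
+   .. code:: python
+
+      import torch.nn.functional as F
+
+      @smp.step
+      def train_step(model, data, target):
+          output = model(data)
+          loss = F.nll_loss(output, target, reduction="mean")
+          model.backward(loss)          # replaces loss.backward()
+          return output, loss
+
+      for data, target in train_loader:
+          optimizer.zero_grad()
+          _, mb_loss = train_step(model, data, target)
+          loss = mb_loss.reduce_mean()  # average the StepOutput across microbatches
+          optimizer.step()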
+
+   An argument to an ``smp.step``-decorated function must be a tensor
+   or an instance of ``list``, ``tuple``, ``dict``, or ``set`` for it to be
+   split across microbatches. If your object does not fall into one of these
+   categories, you can make the library split your object by implementing an
+   ``smp_slice`` method.
+
+   Below is an example of how to use it with PyTorch.
+
+   .. code:: python
+
+      class CustomType:
+          def __init__(self, tensor):
+              self.data = tensor
+
+          # The library calls this method to slice the object, passing in the total
+          # number of microbatches (num_mb) and the current microbatch index (mb).
+          def smp_slice(self, num_mb, mb, axis):
+              dim_size = list(self.data.size())[axis]
+
+              split_size = dim_size // num_mb
+              sliced_tensor = self.data.narrow(axis, mb * split_size, split_size)
+              return CustomType(sliced_tensor)
+
+      custom_obj = CustomType(torch.ones(4,))
+
+      @smp.step()
+      def step(custom_obj):
+          loss = model(custom_obj)
+          model.backward(loss)
+          return loss
+
+
+   **Important:** ``smp.step`` splits the batch into microbatches, and
+   executes everything inside the decorated function once per microbatch.
+   This might affect the behavior of batch normalization, any operation
+   that explicitly uses the batch size information, or any other Python
+   code that is expected to run once.
+
+   **TensorFlow-specific behavior**
+
+   ``smp.step`` is a wrapper that
+   inherits from and extends the behavior of ``tf.function``, and as such,
+   all the caveats that apply to the use of ``tf.function``\ s also apply
+   to ``smp.step``. In particular, any operation that is inside
+   ``smp.step`` executes in graph mode, and not eager mode.
+
+   In the first call, ``smp.step`` performs tracing of the wrapped function every time
+   one of the tensor arguments changes its shape or dtype, or for every
+   new value of a Python argument, if there is one. Tracing is expensive,
+   so such scenarios should be avoided as much as possible or,
+   alternatively, an ``input_signature`` argument must be provided. For
+   more information on the usage of ``tf.function``, refer to the
+   TensorFlow documentation:
+
+   - https://www.tensorflow.org/api_docs/python/tf/function\
+   - https://www.tensorflow.org/guide/function\
+
+   Each ``smp.step``-decorated function must have a return value that depends on the
+   output of ``smp.DistributedModel``.
+
+   **Common parameters**
+
+   - ``non_split_inputs`` (``list``): The list of arguments to the decorated function
+     that should not be split along the batch dimension. Should be used
+     for all input tensors that do not have a batch dimension. Should be a
+     list of argument names as ``str``, as they appear in the signature of
+     the ``smp.step``-decorated function. Defaults to an empty list.
+
+   - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch
+     axis. The keys should be the argument names as ``str``, as they
+     appear in the signature of the ``smp.step``-decorated function. By
+     default, all batch axes are assumed to be the 0-axis.
+
+   **TensorFlow-only parameters**
+
+   - All arguments of ``tf.function``. Note:
+     the ``experimental_compile`` argument of ``tf.function`` may not
+     work as expected with ``smp.step``, since it interferes with
+     pipelining and model partitioning. To enable XLA with the library, you can
+     instead use ``tf.config.optimizer.set_jit(True)``.
+
+   **PyTorch-only parameters**
+
+   - ``detach_outputs`` (``bool``): If ``True``, calls ``torch.Tensor.detach()`` on
+     all returned ``torch.Tensor`` outputs.
Setting it to ``False`` + increases memory consumption, unless ``detach()`` is manually called + on the returned tensors, because the model graph is not cleared from + memory after the training step. Set to \ ``True`` by default. + + **Returns** + + - The same object(s) returned from the decorated function. All + returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or + ``torch.Tensor`` objects (for PT) are wrapped inside + a \ ``StepOutput`` object, even when they are inside a Python + ``list``, ``tuple``, or ``dict``. + + + +.. class:: StepOutput + :noindex: + + + A class that encapsulates all versions of a ``tf.Tensor`` + or \ ``torch.Tensor`` across all microbatches. + + When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside + ``smp.step``, different versions of the tensor are computed for each + microbatch. + + When this tensor is returned from ``smp.step`` and is accessed outside + of the decorated function, it appears as a ``StepOutput`` object, which + contains all such versions. For example, + + - In the case of Tensorflow, the gradient for a particular + ``tf.Variable`` is computed on each microbatch individually, and if + this gradient is returned from ``smp.step``, all gradients for this + ``tf.Variable`` become part of the same ``StepOutput`` object. The + ``StepOutput`` class offers the following API for commonly-used + post-processing operations on such tensors. + - In the case of PyTorch, the loss for each microbatch is computed + individually and all the ``torch.Tensor``\ s that represent the loss + for different microbatches become part of same ``StepOutput`` object, + if loss is returned from the ``smp.step`` function. + + + The ``StepOutput`` class offers the following API for commonly-used + post-processing operations on tensors. + + .. data:: StepOutput.outputs + :noindex: + + Returns a list of the underlying tensors, indexed by microbatch. + + .. function:: StepOutput.reduce_mean( ) + :noindex: + + Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s + ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. + + .. function:: StepOutput.reduce_sum( ) + :noindex: + + Returns a ``tf.Tensor`` / + ``torch.Tensor`` that sums the constituent + ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. + + .. function:: StepOutput.concat( ) + :noindex: + + Returns a + ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the + batch dimension using ``tf.concat`` / ``torch.cat``. + + .. function:: StepOutput.stack( ) + :noindex: + + Applies ``tf.stack`` / ``torch.stack`` + operation to the list of constituent ``tf.Tensor``\ s / + ``torch.Tensor``\ s. + + **TensorFlow-only methods** + + .. function:: StepOutput.merge( ) + :noindex: + + Returns a ``tf.Tensor`` that + concatenates the constituent ``tf.Tensor``\ s along the batch + dimension. This is commonly used for merging the model predictions + across microbatches. + + .. function:: StepOutput.accumulate(method="variable", var=None) + :noindex: + + Functionally the same as ``StepOutput.reduce_mean()``. However, it is + more memory-efficient, especially for large numbers of microbatches, + since it does not wait for all constituent \ ``tf.Tensor``\ s to be + ready to start averaging them, thereby saving memory. + + In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end + up being more memory-efficient than ``StepOutput.accumulate()``. 
+ + **Parameters** + + - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): + If ``"add_n"`` or ``"accumulate_n"``, the library uses + ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement + accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` + into which to accumulate the tensors. Default is \ ``"variable"``. + Note: Memory usage behavior of these choices can depend on the model + and implementation. + + - ``var``: A ``tf.Variable`` into which, if provided, the library uses to + accumulate the tensors. If \ ``None``, the library internally creates a + variable. If ``method`` is not ``"variable"``, this argument is + ignored. + +.. _mpi_basics: + :noindex: + +MPI Basics +^^^^^^^^^^ + +The library exposes the following basic MPI primitives to its Python API: + +- ``smp.rank()``: The rank of the current process. +- ``smp.size()``: The total number of processes. +- ``smp.mp_rank()``: The rank of the process among the processes that + hold the current model replica. +- ``smp.dp_rank()``: The rank of the process among the processes that + hold different replicas of the same model partition. +- ``smp.dp_size()``: The total number of model replicas. +- ``smp.local_rank()``: The rank among the processes on the current + instance. +- ``smp.local_size()``: The total number of processes on the current + instance. +- ``smp.get_mp_group()``: The list of ranks over which the current + model replica is partitioned. +- ``smp.get_dp_group()``: The list of ranks that hold different + replicas of the same model partition. + + .. _communication_api: + :noindex: + +Communication API +^^^^^^^^^^^^^^^^^ + +The library provides a few communication primitives which can be helpful while +developing the training script. These primitives use the following +``enum`` s as arguments to specify which processes the communication +should involve. +​ + +**Helper structures** + +.. data:: smp.CommGroup + :noindex: + + An ``enum`` that takes the values + ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. + These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, + and ``smp.DP_GROUP`` respectively. + + - ``CommGroup.WORLD``: Represents the entire group of processes used in + training + - ``CommGroup.MP_GROUP``: Represents the group of processes that hold + the same model replica as the current process. The processes in a + single ``MP_GROUP`` collectively store an entire replica of the + model. + - ``CommGroup.DP_GROUP``: Represents the group of processes that hold + the same model partition as the current process. The processes in a + single ``DP_GROUP`` perform data parallelism/allreduce among + themselves. + +.. data:: smp.RankType + :noindex: + + An ``enum`` that takes the values + ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. + + - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as + the rank of the process across all processes used in training. + - ``RankType.MP_RANK``: The associated rank is to be interpreted as the + rank of the process within the ``MP_GROUP``. + - ``RankType.DP_RANK``: The associated rank is to be interpreted as the + rank of the process within the ``DP_GROUP``. + + +**Communication primitives:** + +.. function:: smp.broadcast(obj, group) + :noindex: + + Sends the object to all processes in the + group. The receiving process must call ``smp.recv_from`` to receive the + sent object. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be broadcast. 
+ + - ``group``: A ``CommGroup`` argument that represents to which group of + processes the object will be sent. + + **Notes** + + - When you use ``broadcast`` on the sender process, there needs + to be an accompanying ``smp.recv_from()`` call on the receiver + processes. + + - This is a synchronous call; the ``broadcast`` statement + returns only after all ranks participating in the call have made a + matching ``recv_from`` call. + + **Example** + + .. code:: python + + if smp.rank() == 0: +     smp.broadcast(something, group=smp.CommGroup.WORLD) + else: +     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) + +.. function:: smp.send(obj, dest_rank, rank_type) + :noindex: + + Sends the object ``obj`` to + ``dest_rank``, which is of a type specified by ``rank_type``. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be sent. + + - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. + + - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how + ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 + and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process + with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current + process. + + **Notes** + + - Note: \ This is a synchronous call; the ``send`` statement returns + only after the destination rank has made a matching + ``recv_from`` call. + +.. function:: smp.recv_from(src_rank, rank_type) + :noindex: + + Receive an object from a peer process. Can be used with a matching + ``smp.send`` or a ``smp.broadcast`` call. + + **Inputs** + + - ``src_rank`` (``int``): An integer denoting rank of the sending process. + + - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how + ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 + and ``rank_type`` is ``MP_RANK``, then the object is received from + the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the + current process. + + **Returns** + + Returns the python object that is sent by the peer process. + + **Notes** + + - Note: This is a synchronous call; the ``recv_from`` statement returns + only after the source rank has made a matching ``send`` or + ``broadcast`` call, and the object is received. + +.. function:: smp.allgather(obj, group) + :noindex: + + A collective call that gathers all the + submitted objects across all ranks in the specified ``group``. Returns a + list whose ``i``\ th index contains the object submitted by the + ``i``\ th rank in ``group``. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be + allgathered. + + - ``group`` : A ``CommGroup`` argument that represents which group of + processes participate in ``allgather``. + + **Notes** + + - Note: This is a synchronous call; the ``allgather`` statement returns + only after all ranks participating in the call have made a matching + ``allgather`` call, and all the objects are received at the current + rank. + + **Examples** + + .. code:: python + + # assuming mp_size() == 2 + + if smp.mp_rank() == 0: +     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] + else: +     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] + +.. function:: smp.barrier(group=smp.WORLD) + :noindex: + + A statement that hangs until all + processes in the specified group reach the barrier statement, similar to + ``MPI_Barrier()``. 
+ + **Inputs** + + - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of + processes participating in the barrier call. Defaults to + ``smp.WORLD``. + + **Examples** + + - Assume there are 8 processes and 2 model partitions, and + therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If + the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its + group argument, then each process only waits until the other process + of its own ``mp_group`` reaches that point. It does not wait for + processes outside that ``mp_group``. + +.. function:: smp.dp_barrier() + :noindex: + + Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. + Waits for the processes in the same \ ``dp_group`` as + the current process to reach the same point in execution. + +.. function:: smp.mp_barrier() + :noindex: + + Same as passing ``smp.MP_GROUP`` to + ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as + the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst new file mode 100644 index 0000000000..d2fcb95954 --- /dev/null +++ b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst @@ -0,0 +1,572 @@ +.. admonition:: Contents + + - :ref:`pytorch_saving_loading` + - :ref:`pytorch_saving_loading_instructions` + +PyTorch API +=========== + +**Supported versions: 1.7.1, 1.8.1** + +This API document assumes you use the following import statements in your training scripts. + +.. code:: python + + import smdistributed.modelparallel.torch as smp + + +.. tip:: + + Refer to + `Modify a PyTorch Training Script + `_ + to learn how to use the following API in your PyTorch training script. + +.. class:: smp.DistributedModel + :noindex: + + A sub-class of ``torch.nn.Module`` which specifies the model to be + partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is + the model to be partitioned. The returned ``DistributedModel`` object + internally manages model parallelism and data parallelism. Only one + model in the training script can be wrapped with + ``smp.DistributedModel``. + + **Example:** + + .. code:: python + + model = smp.DistributedModel(model) + + **Important**: The ``__call__`` and  ``backward`` method calls on the + ``smp.DistributedModel`` object (in the following example, the object + is \ ``model``) can only be made inside a ``smp.step``-decorated + function. + + + Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can + be performed by calling the \ ``DistributedModel`` object on the input + tensors. + + .. code:: python + + predictions = model(inputs)   # model is a smp.DistributedModel object + + For a backward pass, one needs to call the backward function on + the \ ``DistributedModel`` object, with tensors and gradients as + arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` + or ``torch.autograd.backward``. + + + The API for ``model.backward`` is very similar to + ``torch.autograd.backward``. For example, the following + ``backward`` calls: + + .. code:: python + + torch.autograd.backward(loss) or loss.backward() + + should be replaced with: + + .. code:: python + + model.backward(loss) # loss is a tensor with only one element as its data + + Similarly, for non-scalar tensors, replace the following + ``backward`` call containing incoming gradient arguments: + + .. 
code:: python + + torch.autograd.backward(outputs, out_grads) + + with the following line: + + .. code:: python + + model.backward(outputs, out_grads) + + In these examples, all ``__call__``  and ``backward`` method calls on + the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside + a ``smp.step``-decorated function. + + **Using DDP** + + If DDP is enabled, do not not place a PyTorch + ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because + the ``DistributedModel`` wrapper will also handle data parallelism. + + Unlike the original DDP wrapper, when you use ``DistributedModel``, + model parameters and buffers are not immediately broadcast across + processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the + ``smp.step``-decorated function when the partition is done. + + **Parameters** + + - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). + + - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) + Whether to perform the tracing step on the GPU or CPU. The tracing step gathers + information on the order of execution of modules, the shapes of + intermediate outputs, and execution times, to be used by the + partitioning algorithm. If ``trace_device`` is set to GPU, accurate + module execution times can be gathered during tracing for potentially + improved partitioning decision. However, if the model is too large to + fit in a single GPU, then ``trace_device`` should be set to CPU. + + - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, + the library profiles the execution time of each module during tracing, and uses + it in the partitioning decision. This improves the partitioning + decision, but it might make the tracing slower. It may also introduce + some degree of non-determinism in partitioning results, because of the + inherent randomness in module execution times. Must be ``False`` if + ``trace_device`` is ``"cpu"``. + + - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only + applicable for hybrid data parallelism/model parallelism use cases (when + ``ddp`` is set to ``True`` while launching training). The library uses this flag + to decide whether to do overlapping allreduce whenever a parameter + gradients are ready. This leads to overlapping of communication and + computation and can improve performance. If this is set to ``False`` , + allreduce is performed at the end of the step. + + - ``backward_passes_per_step`` (``int``) (default: 1): This is only + applicable for hybrid data parallelism/model parallelism use cases (when + ``ddp`` is set to ``True`` in config). This parameter indicates the + number of backward passes to perform before calling allreduce on DDP. + This allows accumulating updates over multiple mini-batches before + reducing and applying them. + + - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): + Whether or not the computed gradients should be averaged across + microbatches. If ``False``, the computed gradients will be summed across + microbatches, but not divided by the number of microbatches. In typical + use case where the computed loss is averaged over the mini-batch, this + should be left as ``True``. If you use a loss function that only sums + the per-sample loss across the batch (and not divide by the batch size), + then this must be set to ``False`` for correctness. 
+ + - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets + parameters into multiple buckets so that gradient reduction of each + bucket can potentially overlap with backward + computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes + (MB). + + - ``trace_memory_usage`` (default: False): When set to True, the library attempts + to measure memory usage per module during tracing. If this is disabled, + memory usage will be estimated through the sizes of tensors returned from + the module. + + - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. + This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. + Please see: `broadcast_buffer `__. + + - ``gradient_as_bucket_view`` (default: False): To be + used with ``ddp=True``. This parameter is forwarded to the underlying + ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. + + **Properties** + + - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` + otherwise. Initialized to ``False`` when ``DistributedModel`` is first + created. It becomes be ``True`` during the first call + to ``smp.step``-decorated function. Once the model is partitioned, the + local parameters or local ``state_dict`` can be fetched using the + following methods. + + **Methods** + + .. function:: backward(tensors, grad_tensors) + :noindex: + + Triggers a distributed backward + pass across model partitions. Example usage provided in the previous + section. The API is very similar + to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. + ``retain_grad`` and ``create_graph``  flags are not supported. + + .. function:: local_buffers( ) + :noindex: + + Returns an iterator over buffers for the modules in + the partitioned model that have been assigned to the current process. + + .. function:: local_named_buffers( ) + :noindex: + + Returns an iterator over buffers for the + modules in the partitioned model that have been assigned to the current + process. This yields both the name of the buffer as well as the buffer + itself. + + .. function:: local_parameters( ) + :noindex: + + Returns an iterator over parameters for the + modules in the partitioned model that have been assigned to the current + process. + + .. function:: local_named_parameters( ) + :noindex: + + Returns an iterator over parameters for + the modules in the partitioned model that have been assigned to the + current process. This yields both the name of the parameter as well as + the parameter itself. + + .. function:: local_modules( ) + :noindex: + + Returns an iterator over the modules in the + partitioned model that have been assigned to the current process. + + .. function:: local_named_modules( ) + :noindex: + + Returns an iterator over the modules in the + partitioned model that have been assigned to the current process. This + yields both the name of the module as well as the module itself. + + .. function:: local_state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains local + parameters that belong to the current \ ``mp_rank``. This ``state_dict`` + contains a key \ ``_smp_is_partial`` to indicate this is a + partial \ ``state_dict``, which indicates whether the + ``state_dict`` contains elements corresponding to only the current + partition, or to the entire model. + + .. function:: state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains parameters + for the entire model. 
It first collects the \ ``local_state_dict``  and + gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to + create a full ``state_dict``. Please note that this needs to be called on all ranks with + ``dp_rank()==0`` to ensure the gather happens properly. + If it is only called on all such ranks, it can hang. + + .. function:: load_state_dict( ) + :noindex: + + Same as the ``torch.module.load_state_dict()`` , + except: It first gathers and merges the ``state_dict``\ s across + ``mp_rank``\ s, if they are partial. The actual loading happens after the + model partition so that each rank knows its local parameters. + + .. function:: register_post_partition_hook(hook) + :noindex: + + Registers a callable ``hook`` to + be executed after the model is partitioned. This is useful in situations + where an operation needs to be executed after the model partition during + the first call to ``smp.step``, but before the actual execution of the + first forward pass. Returns a ``RemovableHandle`` object ``handle``, + which can be used to remove the hook by calling ``handle.remove()``. + + .. function:: cpu( ) + :noindex: + + Allgathers parameters and buffers across all ``mp_rank``\ s and moves them + to the CPU. + + .. function:: join( ) + :noindex: + + A context manager to be used in conjunction with an instance of + ``smp.DistributedModel`` to be able to train with uneven inputs across + participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped + ``DistributedDataParallel`` instance. For more information, see: + `join `__ + in the PyTorch documentation. + + .. function:: register_comm_hook( state, callable ) + :noindex: + + **Available for PyTorch 1.8.1 only** + Registers a communication hook which is an enhancement that provides + a flexible hook ``callable`` to users where they can specify how + gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. + + Please note that when you register a comm hook you have full control of how the gradients are processed. + When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. + Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. + In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. + See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. + + This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). + For more information, see: + `register_comm_hook `__ + in the PyTorch documentation. + + + +.. class:: smp.DistributedOptimizer + :noindex: + + **Parameters** + - ``optimizer`` + + An optimizer wrapper for saving/loading optimizer states. This wrapper + returns ``optimizer`` with the following methods overridden: + + .. function:: state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains optimizer state for the entire model. + It first collects the ``local_state_dict`` and gathers and merges + the ``local_state_dict`` from all ``mp_rank``s to create a full + ``state_dict``. + + .. function:: load_state_dict( ) + :noindex: + + Same as the ``torch.optimizer.load_state_dict()`` , except: + + - It first gathers and merges the local ``state_dict``\ s if they are + partial. 
+ - The actual loading happens after the model partition so that each + rank knows its local parameters. + + .. function:: local_state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains the + local optimizer state that belongs to the current \ ``mp_rank``. This + ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is + a partial \ ``state_dict``, which indicates whether the + ``state_dict`` contains elements corresponding to only the current + partition, or to the entire model. + + ​ +.. function:: smp.partition(index) + :noindex: + + **Inputs** + + - ``index`` (int) - The index of the partition. + + A context manager which places all modules defined inside into the + partition with ID ``index``.  The ``index`` argument must be less than + the number of partitions. + + Use ``smp.partition`` to implement manual partitioning. + If ``"auto_partition"`` is ``True``, then the + ``smp.partition`` contexts are ignored. Any module that is not placed in + any ``smp.partition`` context is placed in the + ``default_partition`` defined through the SageMaker Python SDK. + + When ``smp.partition`` contexts are nested, the innermost context + overrides the rest (see the following example). In PyTorch, manual + partitioning should be done inside the module \ ``__init__``, and the + partition assignment applies to the modules that are *created* inside + the ``smp.partition`` context. + + Example: + + .. code:: python + + class Model(torch.nn.Module): +     def __init__(self): +         with smp.partition(1): +             self.child0 = Child0()            # child0 on partition 1 +             with smp.partition(2): +                 self.child1 = Child1()        # child1 on partition 2 +             self.child2 = Child2()            # child2 on partition 1 +         self.child3 = Child3()                # child3 on default_partition + +.. function:: smp.get_world_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all + processes, which can be used with the ``torch.distributed`` API. + Requires ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.get_mp_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the + processes in the ``MP_GROUP`` which contains the current process, which + can be used with the \ ``torch.distributed`` API. Requires + ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.get_dp_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the + processes in the ``DP_GROUP`` which contains the current process, which + can be used with the \ ``torch.distributed`` API. Requires + ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.is_initialized( ) + :noindex: + + Returns ``True`` if ``smp.init`` has already been called for the + process, and ``False`` otherwise. + +.. function::smp.is_tracing( ) + :noindex: + + Returns ``True`` if the current process is running the tracing step, and + ``False`` otherwise. + +.. data:: smp.nn.FusedLayerNorm + :noindex: + + `Apex Fused Layer Norm `__ is currently not + supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` + ``FusedLayerNorm`` and provides the same functionality. This requires + ``apex`` to be installed on the system. + +.. data:: smp.optimizers.FusedNovoGrad + :noindex: + + + `Fused Novo Grad optimizer `__ is + currently not supported by the library. 
``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` + optimizer and provides the same functionality. This requires ``apex`` to + be installed on the system. + +.. data:: smp.optimizers.FusedLamb + :noindex: + + + `FusedLamb optimizer `__ + currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces + ``apex`` ``FusedLamb`` optimizer and provides the same functionality. + This requires ``apex`` to be installed on the system. + +.. data:: smp.amp.GradScaler + :noindex: + + `Torch AMP Gradscaler `__ + currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces + ``torch.amp.GradScaler`` and provides the same functionality. + +.. _pytorch_saving_loading: + :noindex: + +APIs for Saving and Loading +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. function:: smp.save( ) + :noindex: + + Saves an object. This operation is similar to ``torch.save()``, except + it has an additional keyword argument, ``partial``, and accepts only + string type for the argument ``f`` (file). If ``partial=True``, each + ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` + index to your saved file. + + **Parameters** + + - ``obj`` (dict): A saved object. + - ``f`` (str): A string containing a file name. + - ``partial`` (bool, default= ``True``):  When set to ``True``, each + ``mp_rank`` saves a separate checkpoint file and the library adds an + ``mp_rank`` index to the saved file. If you want to be able to load + and further train a model that you save with ``smp.save()``, you must + set ``partial=True``. + - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): + A module used for pickling metadata and objects. + - ``pickle_protocol``  (int, default=2): Can be specified to + override the defaultprotocol. + +.. function:: smp.load( ) + :noindex: + + Loads an object saved with ``smp.save()`` from a file. + + Similar to, `torch.load() `__, + except it has an additional keyword argument, ``partial``, and accepts + only string type for the argument ``f`` (file). If \ ``partial=True``, + then each ``mp_rank`` loads a separate checkpoint file. + + **Parameters** + + - ``f`` (string): A string containing a file name. + - ``map_location`` (function): A function + `torch.device `__, + a string, or a dict specifying how to remap storage locations. + - ``pickle_module`` (pickle module): A module used for unpickling + metadata and objects (has to match the \ ``pickle_module``\ used to + serialize file). + - ``pickle_load_args`` (Python 3 only): Optional keyword arguments + passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. + - ``partial`` (bool, default= ``True``): When set to ``True``, each + ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. + Should be used when loading a model trained with the library. + +.. _pytorch_saving_loading_instructions: + :noindex: + +General Instruction For Saving and Loading +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The library can save partial or full checkpoints. + +- For partial checkpoints, each ``mp_rank`` saves its own checkpoint + file with only the parameters that belong to that rank. +- For full checkpoints, the library saves a single checkpoint that contains + entire model parameters. + +When **saving** using ``smp.save()``, each rank only holds its own +parameters. If you want to save the full model, there will be some +communication between the ranks to create the full model. 
If you save +checkpoints often, you should save partial checkpoints for best +performance. + +When **loading** using ``smp.load()``, the library can load either partial or | +full checkpoints or full checkpoints saved by a non-model-parallel model. If you +want to resume training with a non-model-parallel model or do inference, you need +a full checkpoint. + +The following is an example of how you can save and load a checkpoint: + +.. code:: python + + # Original model and optimizer + model = MyModel(...) + optimizer = MyOpt(...) + + # model parallel wrapper + model = smp.DistributedModel(model) + optimizer = smp.DistributedOptimizer(optimizer) + + # To save, always save on dp_rank 0 to avoid data racing + if partial: +     # To save the partial model on each mp rank +     # the library will create `checkpoint.pt_{mprank}` for each mp rank +     if save_partial_model: +         if smp.dp_rank() == 0: +             model_dict = model.local_state_dict() # save the partial model +             opt_dict = optimizer.local_state_dict() # save the partial optimizer state +             smp.save( +                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, +                 f"/checkpoint.pt", +                 partial=True, +             ) + +     # To save the full model +     if save_full_model: +         if smp.dp_rank() == 0: +             model_dict = model.state_dict() # save the full model +             opt_dict = optimizer.state_dict() # save the full optimizer state +             smp.save( +                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, +                 "/checkpoint.pt", +                 partial=False, +             ) + + # To load, load on all ranks. + # The only difference for partial/full loading is the partial flag in smp.load + # Load partial checkpoint + if partial_checkpoint: +    checkpoint = smp.load("/checkpoint.pt", partial=True) +    model.load_state_dict(checkpoint["model_state_dict"]) +    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + # Load full checkpoint + if full_checkpoint: +    checkpoint = smp.load("/checkpoint.pt", partial=False) +    model.load_state_dict(checkpoint["model_state_dict"]) +    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst new file mode 100644 index 0000000000..131fc327ac --- /dev/null +++ b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst @@ -0,0 +1,172 @@ +TensorFlow API +============== + +**Supported version: 2.3.1, 2.4.1, 2.5.0** + +**Important**: This API document assumes you use the following import statement in your training scripts. + +.. code:: python + + import smdistributed.modelparallel.tensorflow as smp + +.. tip:: + + Refer to + `Modify a TensorFlow Training Script + `_ + to learn how to use the following API in your TensorFlow training script. + +.. class:: smp.DistributedModel + :noindex: + + A sub-class of the Keras \ ``Model`` class, which defines the model to + be partitioned. Model definition is done by sub-classing + ``smp.DistributedModel`` class, and implementing the ``call()`` method, + in the same way as the Keras model sub-classing API. 
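   For illustration, a minimal sketch of such a sub-class (the layer names and
   sizes below are placeholders, not part of the library API):

   .. code:: python

      import tensorflow as tf
      import smdistributed.modelparallel.tensorflow as smp

      class MyModel(smp.DistributedModel):
          def __init__(self):
              super(MyModel, self).__init__()
              # hypothetical layers; replace with your own architecture
              self.dense1 = tf.keras.layers.Dense(32, activation="relu")
              self.dense2 = tf.keras.layers.Dense(10)

          def call(self, inputs):
              # everything inside call() is subject to partitioning
              x = self.dense1(inputs)
              return self.dense2(x)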
Any operation that + is part of the \ ``smp.DistributedModel.call()`` method is subject to + partitioning, meaning that every operation placed inside executes in + exactly one of the devices (the operations outside run on all devices). + + + Similar to the regular Keras API, the forward pass is done by directly + calling the model object on the input tensors. For example: + + .. code:: python + + predictions = model(inputs)   # model is a smp.DistributedModel object + + However, ``model()`` calls can only be made inside a + ``smp.step``-decorated function. + + The outputs from a ``smp.DistributedModel`` are available in all ranks, + regardless of which rank computed the last operation. + + **Methods:** + + .. function:: save_model(save_path="/opt/ml/model") + :noindex: + + **Inputs** + - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. + + Saves the entire, + unpartitioned model with the latest trained weights to ``save_path`` in + TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which + SageMaker monitors to upload the model artifacts to Amazon S3. + +.. function:: smp.partition(index) + :noindex: + + **Inputs** + + - ``index`` (``int``): The index of the partition. + + A context manager which places all operations defined inside into the + partition whose ID is equal to ``index``. When + ``smp.partition`` contexts are nested, the innermost context overrides + the rest. The ``index`` argument must be smaller than the number of + partitions. + + ``smp.partition`` is used in the manual partitioning API; + if \ ``"auto_partition"`` parameter is set to ``True`` while launching + training, then ``smp.partition`` contexts are ignored. Any operation + that is not placed in any ``smp.partition`` context is placed in the + ``default_partition``, as shown in the following example: + + .. code:: python + + # auto_partition: False + # default_partition: 0 + smp.init() + [...] + x = tf.constant(1.2)                     # placed in partition 0 + with smp.partition(1): +     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 +     with smp.partition(3): +         z = tf.reduce_sum(y)             # placed in partition 3 + + +.. function:: register_post_partition_hook(hook) + :noindex: + + Registers a callable ``hook`` to + be executed after the model is partitioned. This is useful in situations + where an operation needs to be executed after the model partition during + the first call to ``smp.step``, but before the actual execution of the + first forward pass. + + .. code:: python + + @smp.register_post_partition_hook + def test_eager(): + # All statements here will be executed right after partition but before the first forward pass + tf.print("Entered hook through eager context") + +.. class:: smp.CheckpointManager + :noindex: + + + A subclass of TensorFlow + `CheckpointManager `__, + which is used to manage checkpoints. The usage is similar to TensorFlow + ``CheckpointManager``. + + The following returns a ``CheckpointManager`` object. + + .. code:: python + + smp.CheckpointManager(checkpoint, +                       directory="/opt/ml/checkpoints", +                       max_to_keep=None, +                       checkpoint_name="ckpt") + + **Parameters** + + - ``checkpoint``: A `tf.train.Checkpoint + `__ instance + that represents a model checkpoint. + + - ``directory``: (``str``) The path to a directory in which to write + checkpoints. 
A file named "checkpoint" is also written to this + directory (in a human-readable text format) which contains the state + of the ``CheckpointManager``. Defaults to + ``"/opt/ml/checkpoints"``, which is the directory that SageMaker + monitors for uploading the checkpoints to Amazon S3. + - ``max_to_keep`` (``int``): The number of checkpoints to keep. If + ``None``, all checkpoints are kept. + - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. + Defaults to ``"ckpt"``. + + + **Methods:** + + .. function:: save( ) + :noindex: + + Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. + + .. function:: restore( ) + :noindex: + + Restores the latest checkpoint in the specified directory. + Internally uses ``tf.train.CheckpointManager.restore()``. + + + **Examples:** + + .. code:: python + + checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) + ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints + + for inputs in train_ds: +     loss = train_step(inputs) +     # [...] +     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints + + .. code:: python + + for step, inputs in enumerate(train_ds): +     if step == 0: +         ckpt_manager.restore() +     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1_4_0.rst b/doc/api/training/smp_versions/v1_4_0.rst new file mode 100644 index 0000000000..4485ae6a40 --- /dev/null +++ b/doc/api/training/smp_versions/v1_4_0.rst @@ -0,0 +1,12 @@ + +Version 1.4.x +============= + +To use the library, reference the Common API documentation alongside the framework specific API documentation. + +.. toctree:: + :maxdepth: 1 + + v1.4.0/smd_model_parallel_common_api + v1.4.0/smd_model_parallel_pytorch + v1.4.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_5_0.rst b/doc/api/training/smp_versions/v1_5_0.rst new file mode 100644 index 0000000000..c93761efa4 --- /dev/null +++ b/doc/api/training/smp_versions/v1_5_0.rst @@ -0,0 +1,12 @@ + +Version 1.5.x +============= + +To use the library, reference the Common API documentation alongside the framework specific API documentation. + +.. 
toctree:: + :maxdepth: 1 + + v1.5.0/smd_model_parallel_common_api + v1.5.0/smd_model_parallel_pytorch + v1.5.0/smd_model_parallel_tensorflow From bbe62847cbc4160ffcb37397eeaef6bb1d527bb2 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Date: Fri, 31 Dec 2021 13:00:05 -0800 Subject: [PATCH 05/13] fix: fix kmeans test deletion sequence, increment lineage statics (#2815) --- src/sagemaker/session.py | 1 + tests/integ/sagemaker/lineage/conftest.py | 4 ++-- tests/integ/test_kmeans.py | 9 ++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 828371c6dc..189c9cb308 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -3565,6 +3565,7 @@ def endpoint_from_production_variants( if data_capture_config_dict is not None: config_options["DataCaptureConfig"] = data_capture_config_dict + LOGGER.info("Creating endpoint-config with name %s", name) self.sagemaker_client.create_endpoint_config(**config_options) return self.create_endpoint(endpoint_name=name, config_name=name, tags=tags, wait=wait) diff --git a/tests/integ/sagemaker/lineage/conftest.py b/tests/integ/sagemaker/lineage/conftest.py index 5b814bab5b..bb051b9634 100644 --- a/tests/integ/sagemaker/lineage/conftest.py +++ b/tests/integ/sagemaker/lineage/conftest.py @@ -36,8 +36,8 @@ from tests.integ.sagemaker.lineage.helpers import name, names SLEEP_TIME_SECONDS = 1 -STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline15" -STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint15" +STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline16" +STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint16" @pytest.fixture diff --git a/tests/integ/test_kmeans.py b/tests/integ/test_kmeans.py index c4def3b439..056b068f3b 100644 --- a/tests/integ/test_kmeans.py +++ b/tests/integ/test_kmeans.py @@ -76,11 +76,10 @@ def test_kmeans(sagemaker_session, cpu_instance_type, training_set): for record in result: assert record.label["closest_cluster"] is not None assert record.label["distance_to_cluster"] is not None - - predictor.delete_model() - with pytest.raises(Exception) as exception: - sagemaker_session.sagemaker_client.describe_model(ModelName=model.name) - assert "Could not find model" in str(exception.value) + predictor.delete_model() + with pytest.raises(Exception) as exception: + sagemaker_session.sagemaker_client.describe_model(ModelName=model.name) + assert "Could not find model" in str(exception.value) def test_async_kmeans(sagemaker_session, cpu_instance_type, training_set): From b02676922b76e23b2a46cb770eaa8bfb1076517d Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Date: Mon, 3 Jan 2022 15:42:03 -0800 Subject: [PATCH 06/13] fix: Increment static lineage pipeline (#2817) --- tests/integ/sagemaker/lineage/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/sagemaker/lineage/conftest.py b/tests/integ/sagemaker/lineage/conftest.py index bb051b9634..b6cebdcb61 100644 --- a/tests/integ/sagemaker/lineage/conftest.py +++ b/tests/integ/sagemaker/lineage/conftest.py @@ -36,8 +36,8 @@ from tests.integ.sagemaker.lineage.helpers import name, names SLEEP_TIME_SECONDS = 1 -STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline16" -STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint16" +STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline17" +STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint17" @pytest.fixture From 
87c1d2cc71d92f01e3a44264a8bb19b4b01cebfe Mon Sep 17 00:00:00 2001 From: Payton Staub Date: Wed, 5 Jan 2022 15:25:39 -0800 Subject: [PATCH 07/13] fix: Fix lineage query integ tests (#2823) Co-authored-by: Payton Staub --- .../lineage/test_dataset_artifact.py | 15 +--------- .../lineage/test_endpoint_context.py | 25 ++--------------- .../sagemaker/lineage/test_model_artifact.py | 28 ++----------------- 3 files changed, 6 insertions(+), 62 deletions(-) diff --git a/tests/integ/sagemaker/lineage/test_dataset_artifact.py b/tests/integ/sagemaker/lineage/test_dataset_artifact.py index 4b1d35aa16..be03a85e86 100644 --- a/tests/integ/sagemaker/lineage/test_dataset_artifact.py +++ b/tests/integ/sagemaker/lineage/test_dataset_artifact.py @@ -12,11 +12,9 @@ # language governing permissions and limitations under the License. """This module contains code to test SageMaker ``DatasetArtifact``""" from __future__ import absolute_import -from tests.integ.sagemaker.lineage.helpers import traverse_graph_forward def test_trained_models( - sagemaker_session, dataset_artifact_associated_models, trial_component_obj, model_artifact_obj1, @@ -31,20 +29,9 @@ def test_trained_models( def test_endpoint_contexts( static_dataset_artifact, - sagemaker_session, ): contexts_from_query = static_dataset_artifact.endpoint_contexts() - associations_from_api = traverse_graph_forward( - static_dataset_artifact.artifact_arn, sagemaker_session=sagemaker_session - ) - assert len(contexts_from_query) > 0 for context in contexts_from_query: - # assert that the contexts from the query - # appear in the association list from the lineage API - assert any( - x - for x in associations_from_api - if x["DestinationArn"] == context.context_arn and x["DestinationType"] == "Endpoint" - ) + assert context.context_type == "Endpoint" diff --git a/tests/integ/sagemaker/lineage/test_endpoint_context.py b/tests/integ/sagemaker/lineage/test_endpoint_context.py index d3b0c225bd..07cc48142d 100644 --- a/tests/integ/sagemaker/lineage/test_endpoint_context.py +++ b/tests/integ/sagemaker/lineage/test_endpoint_context.py @@ -12,15 +12,9 @@ # language governing permissions and limitations under the License. 
"""This module contains code to test SageMaker ``Contexts``""" from __future__ import absolute_import -from tests.integ.sagemaker.lineage.helpers import traverse_graph_back -def test_model( - endpoint_context_associate_with_model, - model_obj, - endpoint_action_obj, - sagemaker_session, -): +def test_model(endpoint_context_associate_with_model, model_obj, endpoint_action_obj): model_list = endpoint_context_associate_with_model.models() for model in model_list: assert model.source_arn == endpoint_action_obj.action_arn @@ -29,25 +23,12 @@ def test_model( assert model.destination_type == "Model" -def test_dataset_artifacts( - static_endpoint_context, - sagemaker_session, -): +def test_dataset_artifacts(static_endpoint_context): artifacts_from_query = static_endpoint_context.dataset_artifacts() - associations_from_api = traverse_graph_back( - static_endpoint_context.context_arn, sagemaker_session=sagemaker_session - ) - assert len(artifacts_from_query) > 0 for artifact in artifacts_from_query: - # assert that the artifacts from the query - # appear in the association list from the lineage API - assert any( - x - for x in associations_from_api - if x["SourceArn"] == artifact.artifact_arn and x["SourceType"] == "DataSet" - ) + assert artifact.artifact_type == "DataSet" def test_training_job_arns( diff --git a/tests/integ/sagemaker/lineage/test_model_artifact.py b/tests/integ/sagemaker/lineage/test_model_artifact.py index ca4dc2d94c..8d9048726d 100644 --- a/tests/integ/sagemaker/lineage/test_model_artifact.py +++ b/tests/integ/sagemaker/lineage/test_model_artifact.py @@ -12,11 +12,9 @@ # language governing permissions and limitations under the License. """This module contains code to test SageMaker ``DatasetArtifact``""" from __future__ import absolute_import -from tests.integ.sagemaker.lineage.helpers import traverse_graph_forward, traverse_graph_back def test_endpoints( - sagemaker_session, model_artifact_associated_endpoints, endpoint_deployment_action_obj, endpoint_context_obj, @@ -32,44 +30,22 @@ def test_endpoints( def test_endpoint_contexts( static_model_artifact, - sagemaker_session, ): contexts_from_query = static_model_artifact.endpoint_contexts() - associations_from_api = traverse_graph_forward( - static_model_artifact.artifact_arn, sagemaker_session=sagemaker_session - ) - assert len(contexts_from_query) > 0 for context in contexts_from_query: - # assert that the contexts from the query - # appear in the association list from the lineage API - assert any( - x - for x in associations_from_api - if x["DestinationArn"] == context.context_arn and x["DestinationType"] == "Endpoint" - ) + assert context.context_type == "Endpoint" def test_dataset_artifacts( static_model_artifact, - sagemaker_session, ): artifacts_from_query = static_model_artifact.dataset_artifacts() - associations_from_api = traverse_graph_back( - static_model_artifact.artifact_arn, sagemaker_session=sagemaker_session - ) - assert len(artifacts_from_query) > 0 for artifact in artifacts_from_query: - # assert that the artifacts from the query - # appear in the association list from the lineage API - assert any( - x - for x in associations_from_api - if x["SourceArn"] == artifact.artifact_arn and x["SourceType"] == "DataSet" - ) + assert artifact.artifact_type == "DataSet" def test_training_job_arns( From 496b2595f48dbfa82e068c8878d2fcf23f00f108 Mon Sep 17 00:00:00 2001 From: Xiaoguang Chen <68292680+xgchena@users.noreply.github.com> Date: Fri, 7 Jan 2022 10:31:32 -0800 Subject: [PATCH 08/13] change: Add label_headers option 
for Clarify ModelExplainabilityMonitor (#2707) Co-authored-by: Payton Staub Co-authored-by: Ahsan Khan Co-authored-by: Shreya Pandit Co-authored-by: Basil Beirouti Co-authored-by: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Co-authored-by: Basil Beirouti Co-authored-by: Payton Staub Co-authored-by: Mohamed Ali Jamaoui Co-authored-by: ci Co-authored-by: Jeniya Tabassum Co-authored-by: sreedes <70613743+sreedes@users.noreply.github.com> Co-authored-by: Navin Soni Co-authored-by: Miyoung --- src/sagemaker/clarify.py | 14 ++++--- .../model_monitor/clarify_model_monitoring.py | 31 +++++++++++---- tests/integ/test_clarify_model_monitor.py | 3 +- .../monitor/test_clarify_model_monitor.py | 38 ++++++++++++++++--- 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/src/sagemaker/clarify.py b/src/sagemaker/clarify.py index 0829e25f4b..006cc4846c 100644 --- a/src/sagemaker/clarify.py +++ b/src/sagemaker/clarify.py @@ -290,11 +290,15 @@ def __init__( probability_threshold (float): An optional value for binary prediction tasks in which the model returns a probability, to indicate the threshold to convert the prediction to a boolean value. Default is 0.5. - label_headers (list): List of label values - one for each score of the ``probability``. + label_headers (list[str]): List of headers, each for a predicted score in model output. + For bias analysis, it is used to extract the label value with the highest score as + predicted label. For explainability job, It is used to beautify the analysis report + by replacing placeholders like "label0". """ self.label = label self.probability = probability self.probability_threshold = probability_threshold + self.label_headers = label_headers if probability_threshold is not None: try: float(probability_threshold) @@ -1060,10 +1064,10 @@ def run_explainability( explainability_config (:class:`~sagemaker.clarify.ExplainabilityConfig` or list): Config of the specific explainability method or a list of ExplainabilityConfig objects. Currently, SHAP and PDP are the two methods supported. - model_scores(str|int|ModelPredictedLabelConfig): Index or JSONPath location in the - model output for the predicted scores to be explained. This is not required if the - model output is a single score. Alternatively, an instance of - ModelPredictedLabelConfig can be provided. + model_scores (int or str or :class:`~sagemaker.clarify.ModelPredictedLabelConfig`): + Index or JSONPath to locate the predicted scores in the model output. This is not + required if the model output is a single score. Alternatively, it can be an instance + of ModelPredictedLabelConfig to provide more parameters like label_headers. wait (bool): Whether the call should wait until the job completes (default: True). logs (bool): Whether to show the logs produced by the job. Only meaningful when ``wait`` is True (default: True). 
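(For orientation only: a minimal sketch of how the new ``label_headers`` option can reach the
analysis through ``model_scores``. The monitor and the data, SHAP, and model config objects
below are assumed to exist already and are not part of this patch.)

.. code:: python

    from sagemaker.clarify import ModelPredictedLabelConfig

    # Name the predicted score "Decision" in the explainability report,
    # instead of a default placeholder such as "label0".
    predicted_label_config = ModelPredictedLabelConfig(
        label=0,                      # index of the score in the model output
        label_headers=["Decision"],
    )

    model_explainability_monitor.suggest_baseline(
        data_config=data_config,              # assumed DataConfig
        explainability_config=shap_config,    # assumed SHAPConfig
        model_config=model_config,            # assumed ModelConfig
        model_scores=predicted_label_config,  # int, str, or ModelPredictedLabelConfig
    )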
diff --git a/src/sagemaker/model_monitor/clarify_model_monitoring.py b/src/sagemaker/model_monitor/clarify_model_monitoring.py index 10da0bf6c9..09de7b5c05 100644 --- a/src/sagemaker/model_monitor/clarify_model_monitoring.py +++ b/src/sagemaker/model_monitor/clarify_model_monitoring.py @@ -26,7 +26,7 @@ from sagemaker import image_uris, s3 from sagemaker.session import Session from sagemaker.utils import name_from_base -from sagemaker.clarify import SageMakerClarifyProcessor +from sagemaker.clarify import SageMakerClarifyProcessor, ModelPredictedLabelConfig _LOGGER = logging.getLogger(__name__) @@ -833,9 +833,10 @@ def suggest_baseline( specific explainability method. Currently, only SHAP is supported. model_config (:class:`~sagemaker.clarify.ModelConfig`): Config of the model and its endpoint to be created. - model_scores (int or str): Index or JSONPath location in the model output for the - predicted scores to be explained. This is not required if the model output is - a single score. + model_scores (int or str or :class:`~sagemaker.clarify.ModelPredictedLabelConfig`): + Index or JSONPath to locate the predicted scores in the model output. This is not + required if the model output is a single score. Alternatively, it can be an instance + of ModelPredictedLabelConfig to provide more parameters like label_headers. wait (bool): Whether the call should wait until the job completes (default: False). logs (bool): Whether to show the logs produced by the job. Only meaningful when wait is True (default: False). @@ -865,14 +866,24 @@ def suggest_baseline( headers = copy.deepcopy(data_config.headers) if headers and data_config.label in headers: headers.remove(data_config.label) + if model_scores is None: + inference_attribute = None + label_headers = None + elif isinstance(model_scores, ModelPredictedLabelConfig): + inference_attribute = str(model_scores.label) + label_headers = model_scores.label_headers + else: + inference_attribute = str(model_scores) + label_headers = None self.latest_baselining_job_config = ClarifyBaseliningConfig( analysis_config=ExplainabilityAnalysisConfig( explainability_config=explainability_config, model_config=model_config, headers=headers, + label_headers=label_headers, ), features_attribute=data_config.features, - inference_attribute=model_scores if model_scores is None else str(model_scores), + inference_attribute=inference_attribute, ) self.latest_baselining_job_name = baselining_job_name self.latest_baselining_job = ClarifyBaseliningJob( @@ -1166,7 +1177,7 @@ def attach(cls, monitor_schedule_name, sagemaker_session=None): class ExplainabilityAnalysisConfig: """Analysis configuration for ModelExplainabilityMonitor.""" - def __init__(self, explainability_config, model_config, headers=None): + def __init__(self, explainability_config, model_config, headers=None, label_headers=None): """Creates an analysis config dictionary. Args: @@ -1175,13 +1186,19 @@ def __init__(self, explainability_config, model_config, headers=None): model_config (sagemaker.clarify.ModelConfig): Config object related to bias configurations. headers (list[str]): A list of feature names (without label) of model/endpint input. + label_headers (list[str]): List of headers, each for a predicted score in model output. + It is used to beautify the analysis report by replacing placeholders like "label0". 
+ """ + predictor_config = model_config.get_predictor_config() self.analysis_config = { "methods": explainability_config.get_explainability_config(), - "predictor": model_config.get_predictor_config(), + "predictor": predictor_config, } if headers is not None: self.analysis_config["headers"] = headers + if label_headers is not None: + predictor_config["label_headers"] = label_headers def _to_dict(self): """Generates a request dictionary using the parameters provided to the class.""" diff --git a/tests/integ/test_clarify_model_monitor.py b/tests/integ/test_clarify_model_monitor.py index 6891082285..3f48fa1032 100644 --- a/tests/integ/test_clarify_model_monitor.py +++ b/tests/integ/test_clarify_model_monitor.py @@ -53,6 +53,7 @@ HEADER_OF_LABEL = "Label" HEADERS_OF_FEATURES = ["F1", "F2", "F3", "F4", "F5", "F6", "F7"] ALL_HEADERS = [*HEADERS_OF_FEATURES, HEADER_OF_LABEL] +HEADER_OF_PREDICTION = "Decision" DATASET_TYPE = "text/csv" CONTENT_TYPE = DATASET_TYPE ACCEPT_TYPE = DATASET_TYPE @@ -325,7 +326,7 @@ def scheduled_explainability_monitor( ): monitor_schedule_name = utils.unique_name_from_base("explainability-monitor") analysis_config = ExplainabilityAnalysisConfig( - shap_config, model_config, headers=HEADERS_OF_FEATURES + shap_config, model_config, headers=HEADERS_OF_FEATURES, label_headers=[HEADER_OF_PREDICTION] ) s3_uri_monitoring_output = os.path.join( "s3://", diff --git a/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py b/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py index e13755f208..7c1d497d64 100644 --- a/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py +++ b/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py @@ -279,6 +279,7 @@ # for bias ANALYSIS_CONFIG_LABEL = "Label" ANALYSIS_CONFIG_HEADERS_OF_FEATURES = ["F1", "F2", "F3"] +ANALYSIS_CONFIG_LABEL_HEADERS = ["Decision"] ANALYSIS_CONFIG_ALL_HEADERS = [*ANALYSIS_CONFIG_HEADERS_OF_FEATURES, ANALYSIS_CONFIG_LABEL] ANALYSIS_CONFIG_LABEL_VALUES = [1] ANALYSIS_CONFIG_FACET_NAME = "F1" @@ -330,6 +331,11 @@ "content_type": CONTENT_TYPE, }, } +EXPLAINABILITY_ANALYSIS_CONFIG_WITH_LABEL_HEADERS = copy.deepcopy(EXPLAINABILITY_ANALYSIS_CONFIG) +# noinspection PyTypeChecker +EXPLAINABILITY_ANALYSIS_CONFIG_WITH_LABEL_HEADERS["predictor"][ + "label_headers" +] = ANALYSIS_CONFIG_LABEL_HEADERS @pytest.fixture() @@ -1048,12 +1054,31 @@ def test_explainability_analysis_config(shap_config, model_config): explainability_config=shap_config, model_config=model_config, headers=ANALYSIS_CONFIG_HEADERS_OF_FEATURES, + label_headers=ANALYSIS_CONFIG_LABEL_HEADERS, ) - assert EXPLAINABILITY_ANALYSIS_CONFIG == config._to_dict() + assert EXPLAINABILITY_ANALYSIS_CONFIG_WITH_LABEL_HEADERS == config._to_dict() +@pytest.mark.parametrize( + "model_scores,explainability_analysis_config", + [ + (INFERENCE_ATTRIBUTE, EXPLAINABILITY_ANALYSIS_CONFIG), + ( + ModelPredictedLabelConfig( + label=INFERENCE_ATTRIBUTE, label_headers=ANALYSIS_CONFIG_LABEL_HEADERS + ), + EXPLAINABILITY_ANALYSIS_CONFIG_WITH_LABEL_HEADERS, + ), + ], +) def test_model_explainability_monitor_suggest_baseline( - model_explainability_monitor, sagemaker_session, data_config, shap_config, model_config + model_explainability_monitor, + sagemaker_session, + data_config, + shap_config, + model_config, + model_scores, + explainability_analysis_config, ): clarify_model_monitor = model_explainability_monitor # suggest baseline @@ -1061,12 +1086,12 @@ def test_model_explainability_monitor_suggest_baseline( data_config=data_config, explainability_config=shap_config, 
model_config=model_config, - model_scores=INFERENCE_ATTRIBUTE, + model_scores=model_scores, job_name=BASELINING_JOB_NAME, ) assert isinstance(clarify_model_monitor.latest_baselining_job, ClarifyBaseliningJob) assert ( - EXPLAINABILITY_ANALYSIS_CONFIG + explainability_analysis_config == clarify_model_monitor.latest_baselining_job_config.analysis_config._to_dict() ) clarify_baselining_job = clarify_model_monitor.latest_baselining_job @@ -1081,6 +1106,7 @@ def test_model_explainability_monitor_suggest_baseline( analysis_config=None, # will pick up config from baselining job baseline_job_name=BASELINING_JOB_NAME, endpoint_input=ENDPOINT_NAME, + explainability_analysis_config=explainability_analysis_config, # will pick up attributes from baselining job ) @@ -1133,6 +1159,7 @@ def test_model_explainability_monitor_created_with_config( sagemaker_session=sagemaker_session, analysis_config=analysis_config, constraints=CONSTRAINTS, + explainability_analysis_config=EXPLAINABILITY_ANALYSIS_CONFIG, ) # update schedule @@ -1263,6 +1290,7 @@ def _test_model_explainability_monitor_create_schedule( features_attribute=FEATURES_ATTRIBUTE, inference_attribute=str(INFERENCE_ATTRIBUTE), ), + explainability_analysis_config=None, ): # create schedule with patch( @@ -1278,7 +1306,7 @@ def _test_model_explainability_monitor_create_schedule( ) if not isinstance(analysis_config, str): upload.assert_called_once() - assert json.loads(upload.call_args[0][0]) == EXPLAINABILITY_ANALYSIS_CONFIG + assert json.loads(upload.call_args[0][0]) == explainability_analysis_config # validation expected_arguments = { From 8e9d9b71724ac7aecfdfb3f808d95b1c3af276d2 Mon Sep 17 00:00:00 2001 From: Jonathan Guinegagne <12092593+JGuinegagne@users.noreply.github.com> Date: Fri, 7 Jan 2022 15:29:58 -0500 Subject: [PATCH 09/13] feat: default repack encryption (#2821) Co-authored-by: Shreya Pandit --- .gitignore | 3 ++- src/sagemaker/estimator.py | 1 + src/sagemaker/fw_utils.py | 11 ++++++++++ src/sagemaker/model.py | 1 + src/sagemaker/session.py | 5 +++++ src/sagemaker/session_settings.py | 34 ++++++++++++++++++++++++++++++ src/sagemaker/utils.py | 8 +++++++ tests/unit/test_estimator.py | 4 ++-- tests/unit/test_fw_utils.py | 35 +++++++++++++++++++++++++++++++ tests/unit/test_utils.py | 25 ++++++++++++++++++++++ tox.ini | 1 + 11 files changed, 125 insertions(+), 3 deletions(-) create mode 100644 src/sagemaker/session_settings.py diff --git a/.gitignore b/.gitignore index 84e184aa92..1b6b4ca1cf 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,5 @@ venv/ *.swp .docker/ env/ -.vscode/ \ No newline at end of file +.vscode/ +.python-version \ No newline at end of file diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index d603188f74..cf039fa010 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -2343,6 +2343,7 @@ def _stage_user_code_in_s3(self): dependencies=self.dependencies, kms_key=kms_key, s3_resource=self.sagemaker_session.s3_resource, + settings=self.sagemaker_session.settings, ) def _model_source_dir(self): diff --git a/src/sagemaker/fw_utils.py b/src/sagemaker/fw_utils.py index 87b94711ae..ce1bbd538d 100644 --- a/src/sagemaker/fw_utils.py +++ b/src/sagemaker/fw_utils.py @@ -19,8 +19,10 @@ import shutil import tempfile from collections import namedtuple +from typing import Optional import sagemaker.image_uris +from sagemaker.session_settings import SessionSettings import sagemaker.utils from sagemaker.deprecations import renamed_warning @@ -203,6 +205,7 @@ def tar_and_upload_dir( 
dependencies=None, kms_key=None, s3_resource=None, + settings: Optional[SessionSettings] = None, ): """Package source files and upload a compress tar file to S3. @@ -230,6 +233,9 @@ def tar_and_upload_dir( s3_resource (boto3.resource("s3")): Optional. Pre-instantiated Boto3 Resource for S3 connections, can be used to customize the configuration, e.g. set the endpoint URL (default: None). + settings (sagemaker.session_settings.SessionSettings): Optional. The settings + of the SageMaker ``Session``, can be used to override the default encryption + behavior (default: None). Returns: sagemaker.fw_utils.UserCode: An object with the S3 bucket and key (S3 prefix) and script name. @@ -241,6 +247,7 @@ def tar_and_upload_dir( dependencies = dependencies or [] key = "%s/sourcedir.tar.gz" % s3_key_prefix tmp = tempfile.mkdtemp() + encrypt_artifact = True if settings is None else settings.encrypt_repacked_artifacts try: source_files = _list_files_to_compress(script, directory) + dependencies @@ -250,6 +257,10 @@ def tar_and_upload_dir( if kms_key: extra_args = {"ServerSideEncryption": "aws:kms", "SSEKMSKeyId": kms_key} + elif encrypt_artifact: + # encrypt the tarball at rest in S3 with the default AWS managed KMS key for S3 + # see https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutObject.html#API_PutObject_RequestSyntax + extra_args = {"ServerSideEncryption": "aws:kms"} else: extra_args = None diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 5af5539a96..830bb50dab 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -1131,6 +1131,7 @@ def _upload_code(self, key_prefix, repack=False): script=self.entry_point, directory=self.source_dir, dependencies=self.dependencies, + settings=self.sagemaker_session.settings, ) if repack and self.model_data is not None and self.entry_point is not None: diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 189c9cb308..56f008be84 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -42,6 +42,7 @@ sts_regional_endpoint, ) from sagemaker import exceptions +from sagemaker.session_settings import SessionSettings LOGGER = logging.getLogger("sagemaker") @@ -85,6 +86,7 @@ def __init__( sagemaker_runtime_client=None, sagemaker_featurestore_runtime_client=None, default_bucket=None, + settings=SessionSettings(), ): """Initialize a SageMaker ``Session``. @@ -110,6 +112,8 @@ def __init__( If not provided, a default bucket will be created based on the following format: "sagemaker-{region}-{aws-account-id}". Example: "sagemaker-my-custom-bucket". + settings (sagemaker.session_settings.SessionSettings): Optional. Set of optional + parameters to apply to the session. """ self._default_bucket = None self._default_bucket_name_override = default_bucket @@ -117,6 +121,7 @@ def __init__( self.s3_client = None self.config = None self.lambda_client = None + self.settings = settings self._initialize( boto_session=boto_session, diff --git a/src/sagemaker/session_settings.py b/src/sagemaker/session_settings.py new file mode 100644 index 0000000000..53ff9a9f0d --- /dev/null +++ b/src/sagemaker/session_settings.py @@ -0,0 +1,34 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. 
This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +"""Defines classes to parametrize a SageMaker ``Session``.""" + +from __future__ import absolute_import + + +class SessionSettings(object): + """Optional container class for settings to apply to a SageMaker session.""" + + def __init__(self, encrypt_repacked_artifacts=True) -> None: + """Initialize the ``SessionSettings`` of a SageMaker ``Session``. + + Args: + encrypt_repacked_artifacts (bool): Flag to indicate whether to encrypt the artifacts + at rest in S3 using the default AWS managed KMS key for S3 when a custom KMS key + is not provided (Default: True). + """ + self._encrypt_repacked_artifacts = encrypt_repacked_artifacts + + @property + def encrypt_repacked_artifacts(self) -> bool: + """Return True if repacked artifacts at rest in S3 should be encrypted by default.""" + return self._encrypt_repacked_artifacts diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 4409c0b954..5c617b0155 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -29,6 +29,7 @@ from six.moves.urllib import parse from sagemaker import deprecations +from sagemaker.session_settings import SessionSettings ECR_URI_PATTERN = r"^(\d+)(\.)dkr(\.)ecr(\.)(.+)(\.)(.*)(/)(.*:.*)$" @@ -429,8 +430,15 @@ def _save_model(repacked_model_uri, tmp_model_path, sagemaker_session, kms_key): bucket, key = url.netloc, url.path.lstrip("/") new_key = key.replace(os.path.basename(key), os.path.basename(repacked_model_uri)) + settings = ( + sagemaker_session.settings if sagemaker_session is not None else SessionSettings() + ) + encrypt_artifact = settings.encrypt_repacked_artifacts + if kms_key: extra_args = {"ServerSideEncryption": "aws:kms", "SSEKMSKeyId": kms_key} + elif encrypt_artifact: + extra_args = {"ServerSideEncryption": "aws:kms"} else: extra_args = None sagemaker_session.boto_session.resource( diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 69e030b567..248eda1aa5 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -2323,8 +2323,8 @@ def test_different_code_location_kms_key(utils, sagemaker_session): obj = sagemaker_session.boto_session.resource("s3").Object obj.assert_called_with("another-location", "%s/source/sourcedir.tar.gz" % fw._current_job_name) - - obj().upload_file.assert_called_with(utils.create_tar_file(), ExtraArgs=None) + extra_args = {"ServerSideEncryption": "aws:kms"} + obj().upload_file.assert_called_with(utils.create_tar_file(), ExtraArgs=extra_args) @patch("sagemaker.utils") diff --git a/tests/unit/test_fw_utils.py b/tests/unit/test_fw_utils.py index be70182be8..42aebf3dc5 100644 --- a/tests/unit/test_fw_utils.py +++ b/tests/unit/test_fw_utils.py @@ -24,6 +24,7 @@ from sagemaker import fw_utils from sagemaker.utils import name_from_image +from sagemaker.session_settings import SessionSettings TIMESTAMP = "2017-10-10-14-14-15" @@ -93,6 +94,40 @@ def test_tar_and_upload_dir_s3_with_kms(utils, sagemaker_session): obj.upload_file.assert_called_with(utils.create_tar_file(), ExtraArgs=extra_args) +@patch("sagemaker.utils") +def test_tar_and_upload_dir_s3_kms_enabled_by_default(utils, sagemaker_session): + bucket = "mybucket" + s3_key_prefix = "something/source" + script = "inference.py" + result = fw_utils.tar_and_upload_dir(sagemaker_session, bucket, s3_key_prefix, script) + + assert result == 
fw_utils.UploadedCode( + "s3://{}/{}/sourcedir.tar.gz".format(bucket, s3_key_prefix), script + ) + + extra_args = {"ServerSideEncryption": "aws:kms"} + obj = sagemaker_session.resource("s3").Object("", "") + obj.upload_file.assert_called_with(utils.create_tar_file(), ExtraArgs=extra_args) + + +@patch("sagemaker.utils") +def test_tar_and_upload_dir_s3_without_kms_with_overridden_settings(utils, sagemaker_session): + bucket = "mybucket" + s3_key_prefix = "something/source" + script = "inference.py" + settings = SessionSettings(encrypt_repacked_artifacts=False) + result = fw_utils.tar_and_upload_dir( + sagemaker_session, bucket, s3_key_prefix, script, settings=settings + ) + + assert result == fw_utils.UploadedCode( + "s3://{}/{}/sourcedir.tar.gz".format(bucket, s3_key_prefix), script + ) + + obj = sagemaker_session.resource("s3").Object("", "") + obj.upload_file.assert_called_with(utils.create_tar_file(), ExtraArgs=None) + + def test_mp_config_partition_exists(): mp_parameters = {} with pytest.raises(ValueError): diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 5c0b217299..4b8ce1de20 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -27,6 +27,7 @@ from mock import call, patch, Mock, MagicMock import sagemaker +from sagemaker.session_settings import SessionSettings BUCKET_WITHOUT_WRITING_PERMISSION = "s3://bucket-without-writing-permission" @@ -390,6 +391,13 @@ def test_repack_model_without_source_dir(tmp, fake_s3): "/code/inference.py", } + extra_args = {"ServerSideEncryption": "aws:kms"} + object_mock = fake_s3.object_mock + _, _, kwargs = object_mock.mock_calls[0] + + assert "ExtraArgs" in kwargs + assert kwargs["ExtraArgs"] == extra_args + def test_repack_model_with_entry_point_without_path_without_source_dir(tmp, fake_s3): @@ -415,12 +423,20 @@ def test_repack_model_with_entry_point_without_path_without_source_dir(tmp, fake "s3://fake/location", "s3://destination-bucket/model.tar.gz", fake_s3.sagemaker_session, + kms_key="kms_key", ) finally: os.chdir(cwd) assert list_tar_files(fake_s3.fake_upload_path, tmp) == {"/code/inference.py", "/model"} + extra_args = {"ServerSideEncryption": "aws:kms", "SSEKMSKeyId": "kms_key"} + object_mock = fake_s3.object_mock + _, _, kwargs = object_mock.mock_calls[0] + + assert "ExtraArgs" in kwargs + assert kwargs["ExtraArgs"] == extra_args + def test_repack_model_from_s3_to_s3(tmp, fake_s3): @@ -434,6 +450,7 @@ def test_repack_model_from_s3_to_s3(tmp, fake_s3): ) fake_s3.tar_and_upload("model-dir", "s3://fake/location") + fake_s3.sagemaker_session.settings = SessionSettings(encrypt_repacked_artifacts=False) sagemaker.utils.repack_model( "inference.py", @@ -450,6 +467,11 @@ def test_repack_model_from_s3_to_s3(tmp, fake_s3): "/model", } + object_mock = fake_s3.object_mock + _, _, kwargs = object_mock.mock_calls[0] + assert "ExtraArgs" in kwargs + assert kwargs["ExtraArgs"] is None + def test_repack_model_from_file_to_file(tmp): create_file_tree(tmp, ["model", "dependencies/a", "source-dir/inference.py"]) @@ -581,6 +603,7 @@ def __init__(self, tmp): self.sagemaker_session = MagicMock() self.location_map = {} self.current_bucket = None + self.object_mock = MagicMock() self.sagemaker_session.boto_session.resource().Bucket().download_file.side_effect = ( self.download_file @@ -606,6 +629,7 @@ def tar_and_upload(self, path, fake_location): def mock_s3_upload(self): dst = os.path.join(self.tmp, "dst") + object_mock = self.object_mock class MockS3Object(object): def __init__(self, bucket, key): @@ -616,6 +640,7 @@ def 
upload_file(self, target, **kwargs): if self.bucket in BUCKET_WITHOUT_WRITING_PERMISSION: raise exceptions.S3UploadFailedError() shutil.copy2(target, dst) + object_mock.upload_file(target, **kwargs) self.sagemaker_session.boto_session.resource().Object = MockS3Object return dst diff --git a/tox.ini b/tox.ini index b8dc0292f9..d9e3b41b41 100644 --- a/tox.ini +++ b/tox.ini @@ -19,6 +19,7 @@ exclude = .tox tests/data/ venv/ + env/ max-complexity = 10 From b18435c2c0200169a109ac340af3327b28b0ad2e Mon Sep 17 00:00:00 2001 From: Zhankui Lu Date: Fri, 7 Jan 2022 14:47:06 -0800 Subject: [PATCH 10/13] feature: support large pipeline (#2825) Co-authored-by: Ameen Khan Co-authored-by: Zhankui Lu Co-authored-by: Ahsan Khan --- .../sagemaker.workflow.pipelines.rst | 6 ++ src/sagemaker/workflow/parallelism_config.py | 34 +++++++ src/sagemaker/workflow/pipeline.py | 62 ++++++++++-- tests/integ/test_workflow.py | 96 +++++++++++++++++++ .../unit/sagemaker/workflow/test_pipeline.py | 88 ++++++++++++++++- 5 files changed, 276 insertions(+), 10 deletions(-) create mode 100644 src/sagemaker/workflow/parallelism_config.py diff --git a/doc/workflows/pipelines/sagemaker.workflow.pipelines.rst b/doc/workflows/pipelines/sagemaker.workflow.pipelines.rst index 9071d05145..908621ea1c 100644 --- a/doc/workflows/pipelines/sagemaker.workflow.pipelines.rst +++ b/doc/workflows/pipelines/sagemaker.workflow.pipelines.rst @@ -82,6 +82,12 @@ Pipeline .. autoclass:: sagemaker.workflow.pipeline._PipelineExecution :members: +Parallelism Configuration +------------------------- + +.. autoclass:: sagemaker.workflow.parallelism_config.ParallelismConfiguration + :members: + Pipeline Experiment Config -------------------------- diff --git a/src/sagemaker/workflow/parallelism_config.py b/src/sagemaker/workflow/parallelism_config.py new file mode 100644 index 0000000000..72c180517a --- /dev/null +++ b/src/sagemaker/workflow/parallelism_config.py @@ -0,0 +1,34 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+"""Pipeline Parallelism Configuration""" +from __future__ import absolute_import +from sagemaker.workflow.entities import RequestType + + +class ParallelismConfiguration: + """Parallelism config for SageMaker pipeline.""" + + def __init__(self, max_parallel_execution_steps: int): + """Create a ParallelismConfiguration + + Args: + max_parallel_execution_steps, int: + max number of steps which could be parallelized + """ + self.max_parallel_execution_steps = max_parallel_execution_steps + + def to_request(self) -> RequestType: + """Returns: the request structure.""" + return { + "MaxParallelExecutionSteps": self.max_parallel_execution_steps, + } diff --git a/src/sagemaker/workflow/pipeline.py b/src/sagemaker/workflow/pipeline.py index 4982c6f5fd..606ba38bc2 100644 --- a/src/sagemaker/workflow/pipeline.py +++ b/src/sagemaker/workflow/pipeline.py @@ -22,6 +22,7 @@ import botocore from botocore.exceptions import ClientError +from sagemaker import s3 from sagemaker._studio import _append_project_tags from sagemaker.session import Session from sagemaker.workflow.callback_step import CallbackOutput, CallbackStep @@ -34,6 +35,7 @@ from sagemaker.workflow.execution_variables import ExecutionVariables from sagemaker.workflow.parameters import Parameter from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig +from sagemaker.workflow.parallelism_config import ParallelismConfiguration from sagemaker.workflow.properties import Properties from sagemaker.workflow.steps import Step from sagemaker.workflow.step_collections import StepCollection @@ -94,6 +96,7 @@ def create( role_arn: str, description: str = None, tags: List[Dict[str, str]] = None, + parallelism_config: ParallelismConfiguration = None, ) -> Dict[str, Any]: """Creates a Pipeline in the Pipelines service. @@ -102,37 +105,62 @@ def create( description (str): A description of the pipeline. tags (List[Dict[str, str]]): A list of {"Key": "string", "Value": "string"} dicts as tags. + parallelism_config (Optional[ParallelismConfiguration]): Parallelism configuration + that is applied to each of the executions of the pipeline. It takes precedence + over the parallelism configuration of the parent pipeline. Returns: A response dict from the service. """ tags = _append_project_tags(tags) - - kwargs = self._create_args(role_arn, description) + kwargs = self._create_args(role_arn, description, parallelism_config) update_args( kwargs, Tags=tags, ) return self.sagemaker_session.sagemaker_client.create_pipeline(**kwargs) - def _create_args(self, role_arn: str, description: str): + def _create_args( + self, role_arn: str, description: str, parallelism_config: ParallelismConfiguration + ): """Constructs the keyword argument dict for a create_pipeline call. Args: role_arn (str): The role arn that is assumed by pipelines to create step artifacts. description (str): A description of the pipeline. + parallelism_config (Optional[ParallelismConfiguration]): Parallelism configuration + that is applied to each of the executions of the pipeline. It takes precedence + over the parallelism configuration of the parent pipeline. Returns: A keyword argument dict for calling create_pipeline. """ + pipeline_definition = self.definition() kwargs = dict( PipelineName=self.name, - PipelineDefinition=self.definition(), RoleArn=role_arn, ) + + # If pipeline definition is large, upload to S3 bucket and + # provide PipelineDefinitionS3Location to request instead. 
+ if len(pipeline_definition.encode("utf-8")) < 1024 * 100: + kwargs["PipelineDefinition"] = pipeline_definition + else: + desired_s3_uri = s3.s3_path_join( + "s3://", self.sagemaker_session.default_bucket(), self.name + ) + s3.S3Uploader.upload_string_as_file_body( + body=pipeline_definition, + desired_s3_uri=desired_s3_uri, + sagemaker_session=self.sagemaker_session, + ) + kwargs["PipelineDefinitionS3Location"] = { + "Bucket": self.sagemaker_session.default_bucket(), + "ObjectKey": self.name, + } + update_args( - kwargs, - PipelineDescription=description, + kwargs, PipelineDescription=description, ParallelismConfiguration=parallelism_config ) return kwargs @@ -146,17 +174,25 @@ def describe(self) -> Dict[str, Any]: """ return self.sagemaker_session.sagemaker_client.describe_pipeline(PipelineName=self.name) - def update(self, role_arn: str, description: str = None) -> Dict[str, Any]: + def update( + self, + role_arn: str, + description: str = None, + parallelism_config: ParallelismConfiguration = None, + ) -> Dict[str, Any]: """Updates a Pipeline in the Workflow service. Args: role_arn (str): The role arn that is assumed by pipelines to create step artifacts. description (str): A description of the pipeline. + parallelism_config (Optional[ParallelismConfiguration]): Parallelism configuration + that is applied to each of the executions of the pipeline. It takes precedence + over the parallelism configuration of the parent pipeline. Returns: A response dict from the service. """ - kwargs = self._create_args(role_arn, description) + kwargs = self._create_args(role_arn, description, parallelism_config) return self.sagemaker_session.sagemaker_client.update_pipeline(**kwargs) def upsert( @@ -164,6 +200,7 @@ def upsert( role_arn: str, description: str = None, tags: List[Dict[str, str]] = None, + parallelism_config: ParallelismConfiguration = None, ) -> Dict[str, Any]: """Creates a pipeline or updates it, if it already exists. @@ -172,12 +209,14 @@ def upsert( description (str): A description of the pipeline. tags (List[Dict[str, str]]): A list of {"Key": "string", "Value": "string"} dicts as tags. + parallelism_config (Optional[Config for parallel steps, Parallelism configuration that + is applied to each of. the executions Returns: response dict from service """ try: - response = self.create(role_arn, description, tags) + response = self.create(role_arn, description, tags, parallelism_config) except ClientError as e: error = e.response["Error"] if ( @@ -215,6 +254,7 @@ def start( parameters: Dict[str, Union[str, bool, int, float]] = None, execution_display_name: str = None, execution_description: str = None, + parallelism_config: ParallelismConfiguration = None, ): """Starts a Pipeline execution in the Workflow service. @@ -223,6 +263,9 @@ def start( pipeline parameters. execution_display_name (str): The display name of the pipeline execution. execution_description (str): A description of the execution. + parallelism_config (Optional[ParallelismConfiguration]): Parallelism configuration + that is applied to each of the executions of the pipeline. It takes precedence + over the parallelism configuration of the parent pipeline. Returns: A `_PipelineExecution` instance, if successful. 
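(For orientation only: a hedged sketch of how a caller might exercise the new
``parallelism_config`` parameter. The ``pipeline`` and ``role`` objects are assumed to
exist; the dict form mirrors the integration tests added further down in this patch.)

.. code:: python

    # ParallelismConfiguration(max_parallel_execution_steps=50).to_request()
    # from sagemaker.workflow.parallelism_config yields the same structure
    # as the literal dict used here.
    pipeline.upsert(role, parallelism_config={"MaxParallelExecutionSteps": 50})

    # A configuration passed at start time takes precedence over the one
    # registered on the pipeline.
    execution = pipeline.start(parallelism_config={"MaxParallelExecutionSteps": 10})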
@@ -245,6 +288,7 @@ def start( PipelineParameters=format_start_parameters(parameters), PipelineExecutionDescription=execution_description, PipelineExecutionDisplayName=execution_display_name, + ParallelismConfiguration=parallelism_config, ) response = self.sagemaker_session.sagemaker_client.start_pipeline_execution(**kwargs) return _PipelineExecution( diff --git a/tests/integ/test_workflow.py b/tests/integ/test_workflow.py index 2fe674a203..58b681fd0e 100644 --- a/tests/integ/test_workflow.py +++ b/tests/integ/test_workflow.py @@ -2757,3 +2757,99 @@ def cleanup_feature_group(feature_group: FeatureGroup): except Exception as e: print(f"Delete FeatureGroup failed with error: {e}.") pass + + +def test_large_pipeline(sagemaker_session, role, pipeline_name, region_name): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + + outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String) + + callback_steps = [ + CallbackStep( + name=f"callback-step{count}", + sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue", + inputs={"arg1": "foo"}, + outputs=[outputParam], + ) + for count in range(2000) + ] + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=callback_steps, + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + response = pipeline.describe() + assert len(json.loads(pipeline.describe()["PipelineDefinition"])["Steps"]) == 2000 + + pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] + response = pipeline.update(role) + update_arn = response["PipelineArn"] + assert re.match( + fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + update_arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_create_and_update_with_parallelism_config( + sagemaker_session, role, pipeline_name, region_name +): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + + outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String) + + callback_steps = [ + CallbackStep( + name=f"callback-step{count}", + sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue", + inputs={"arg1": "foo"}, + outputs=[outputParam], + ) + for count in range(500) + ] + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=callback_steps, + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role, parallelism_config={"MaxParallelExecutionSteps": 50}) + create_arn = response["PipelineArn"] + assert re.match( + fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + response = pipeline.describe() + assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 50 + + pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] + response = pipeline.update(role, parallelism_config={"MaxParallelExecutionSteps": 55}) + update_arn = response["PipelineArn"] + assert re.match( + fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + update_arn, + ) + + response = pipeline.describe() + assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 55 + + finally: + try: + pipeline.delete() + except Exception: + pass diff --git 
a/tests/unit/sagemaker/workflow/test_pipeline.py b/tests/unit/sagemaker/workflow/test_pipeline.py index 4b68abceeb..be90a8a876 100644 --- a/tests/unit/sagemaker/workflow/test_pipeline.py +++ b/tests/unit/sagemaker/workflow/test_pipeline.py @@ -21,9 +21,11 @@ from mock import Mock +from sagemaker import s3 from sagemaker.workflow.execution_variables import ExecutionVariables from sagemaker.workflow.parameters import ParameterString from sagemaker.workflow.pipeline import Pipeline +from sagemaker.workflow.parallelism_config import ParallelismConfiguration from sagemaker.workflow.pipeline_experiment_config import ( PipelineExperimentConfig, PipelineExperimentConfigProperties, @@ -62,7 +64,9 @@ def role_arn(): @pytest.fixture def sagemaker_session_mock(): - return Mock() + session_mock = Mock() + session_mock.default_bucket = Mock(name="default_bucket", return_value="s3_bucket") + return session_mock def test_pipeline_create(sagemaker_session_mock, role_arn): @@ -78,6 +82,47 @@ def test_pipeline_create(sagemaker_session_mock, role_arn): ) +def test_pipeline_create_with_parallelism_config(sagemaker_session_mock, role_arn): + pipeline = Pipeline( + name="MyPipeline", + parameters=[], + steps=[], + pipeline_experiment_config=ParallelismConfiguration(max_parallel_execution_steps=10), + sagemaker_session=sagemaker_session_mock, + ) + pipeline.create(role_arn=role_arn) + assert sagemaker_session_mock.sagemaker_client.create_pipeline.called_with( + PipelineName="MyPipeline", + PipelineDefinition=pipeline.definition(), + RoleArn=role_arn, + ParallelismConfiguration={"MaxParallelExecutionSteps": 10}, + ) + + +def test_large_pipeline_create(sagemaker_session_mock, role_arn): + parameter = ParameterString("MyStr") + pipeline = Pipeline( + name="MyPipeline", + parameters=[parameter], + steps=[CustomStep(name="MyStep", input_data=parameter)] * 2000, + sagemaker_session=sagemaker_session_mock, + ) + + s3.S3Uploader.upload_string_as_file_body = Mock() + + pipeline.create(role_arn=role_arn) + + assert s3.S3Uploader.upload_string_as_file_body.called_with( + body=pipeline.definition(), s3_uri="s3://s3_bucket/MyPipeline" + ) + + assert sagemaker_session_mock.sagemaker_client.create_pipeline.called_with( + PipelineName="MyPipeline", + PipelineDefinitionS3Location={"Bucket": "s3_bucket", "ObjectKey": "MyPipeline"}, + RoleArn=role_arn, + ) + + def test_pipeline_update(sagemaker_session_mock, role_arn): pipeline = Pipeline( name="MyPipeline", @@ -91,6 +136,47 @@ def test_pipeline_update(sagemaker_session_mock, role_arn): ) +def test_pipeline_update_with_parallelism_config(sagemaker_session_mock, role_arn): + pipeline = Pipeline( + name="MyPipeline", + parameters=[], + steps=[], + pipeline_experiment_config=ParallelismConfiguration(max_parallel_execution_steps=10), + sagemaker_session=sagemaker_session_mock, + ) + pipeline.create(role_arn=role_arn) + assert sagemaker_session_mock.sagemaker_client.update_pipeline.called_with( + PipelineName="MyPipeline", + PipelineDefinition=pipeline.definition(), + RoleArn=role_arn, + ParallelismConfiguration={"MaxParallelExecutionSteps": 10}, + ) + + +def test_large_pipeline_update(sagemaker_session_mock, role_arn): + parameter = ParameterString("MyStr") + pipeline = Pipeline( + name="MyPipeline", + parameters=[parameter], + steps=[CustomStep(name="MyStep", input_data=parameter)] * 2000, + sagemaker_session=sagemaker_session_mock, + ) + + s3.S3Uploader.upload_string_as_file_body = Mock() + + pipeline.create(role_arn=role_arn) + + assert 
s3.S3Uploader.upload_string_as_file_body.called_with( + body=pipeline.definition(), s3_uri="s3://s3_bucket/MyPipeline" + ) + + assert sagemaker_session_mock.sagemaker_client.update_pipeline.called_with( + PipelineName="MyPipeline", + PipelineDefinitionS3Location={"Bucket": "s3_bucket", "ObjectKey": "MyPipeline"}, + RoleArn=role_arn, + ) + + def test_pipeline_upsert(sagemaker_session_mock, role_arn): sagemaker_session_mock.side_effect = [ ClientError( From 016e70dd49043a2cf1f93d473b35c3e722dd350f Mon Sep 17 00:00:00 2001 From: Yifei Zhu <66866419+yzhu0@users.noreply.github.com> Date: Fri, 7 Jan 2022 14:47:25 -0800 Subject: [PATCH 11/13] change: Add action type to lineage object (#2798) Co-authored-by: Ahsan Khan --- src/sagemaker/lineage/query.py | 6 ++++- tests/unit/sagemaker/lineage/test_query.py | 31 +++++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/lineage/query.py b/src/sagemaker/lineage/query.py index f2d1bf8c14..a1ab295b05 100644 --- a/src/sagemaker/lineage/query.py +++ b/src/sagemaker/lineage/query.py @@ -83,10 +83,11 @@ def __init__( self._session = sagemaker_session def to_lineage_object(self): - """Convert the ``Vertex`` object to its corresponding ``Artifact`` or ``Context`` object.""" + """Convert the ``Vertex`` object to its corresponding Artifact, Action, Context object.""" from sagemaker.lineage.artifact import Artifact, ModelArtifact from sagemaker.lineage.context import Context, EndpointContext from sagemaker.lineage.artifact import DatasetArtifact + from sagemaker.lineage.action import Action if self.lineage_entity == LineageEntityEnum.CONTEXT.value: resource_name = get_resource_name_from_arn(self.arn) @@ -103,6 +104,9 @@ def to_lineage_object(self): return DatasetArtifact.load(artifact_arn=self.arn, sagemaker_session=self._session) return Artifact.load(artifact_arn=self.arn, sagemaker_session=self._session) + if self.lineage_entity == LineageEntityEnum.ACTION.value: + return Action.load(action_name=self.arn.split("/")[1], sagemaker_session=self._session) + raise ValueError("Vertex cannot be converted to a lineage object.") diff --git a/tests/unit/sagemaker/lineage/test_query.py b/tests/unit/sagemaker/lineage/test_query.py index 17d3eabe92..c25ca6f38f 100644 --- a/tests/unit/sagemaker/lineage/test_query.py +++ b/tests/unit/sagemaker/lineage/test_query.py @@ -13,6 +13,7 @@ from __future__ import absolute_import from sagemaker.lineage.artifact import DatasetArtifact, ModelArtifact, Artifact from sagemaker.lineage.context import EndpointContext, Context +from sagemaker.lineage.action import Action from sagemaker.lineage.query import LineageEntityEnum, LineageSourceEnum, Vertex, LineageQuery import pytest @@ -240,10 +241,38 @@ def test_vertex_to_object_artifact(sagemaker_session): assert isinstance(artifact, Artifact) +def test_vertex_to_object_action(sagemaker_session): + vertex = Vertex( + arn="arn:aws:sagemaker:us-west-2:0123456789012:action/cp-m5-20210424t041405868z-1619237657-1-aws-endpoint", + lineage_entity=LineageEntityEnum.ACTION.value, + lineage_source="A", + sagemaker_session=sagemaker_session, + ) + + sagemaker_session.sagemaker_client.describe_action.return_value = { + "ActionName": "cp-m5-20210424t041405868z-1619237657-1-aws-endpoint", + "Source": { + "SourceUri": "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3", + "SourceTypes": [], + }, + "ActionType": "A", + "Properties": {}, + "CreationTime": 1608224704.149, + "CreatedBy": {}, + "LastModifiedTime": 1608224704.149, + 
"LastModifiedBy": {}, + } + + action = vertex.to_lineage_object() + + assert action.action_name == "cp-m5-20210424t041405868z-1619237657-1-aws-endpoint" + assert isinstance(action, Action) + + def test_vertex_to_object_unconvertable(sagemaker_session): vertex = Vertex( arn="arn:aws:sagemaker:us-west-2:0123456789012:artifact/e66eef7f19c05e75284089183491bd4f", - lineage_entity=LineageEntityEnum.ACTION.value, + lineage_entity=LineageEntityEnum.TRIAL_COMPONENT.value, lineage_source=LineageSourceEnum.TENSORBOARD.value, sagemaker_session=sagemaker_session, ) From a8323a9b79a404771498486cdc5cbe9a1ea61639 Mon Sep 17 00:00:00 2001 From: Yifei Zhu <66866419+yzhu0@users.noreply.github.com> Date: Fri, 7 Jan 2022 14:56:26 -0800 Subject: [PATCH 12/13] fix: Collapse cross-account artifacts in query lineage response (#2796) --- src/sagemaker/lineage/query.py | 42 ++++++- tests/unit/sagemaker/lineage/test_query.py | 137 +++++++++++++++++++++ 2 files changed, 178 insertions(+), 1 deletion(-) diff --git a/src/sagemaker/lineage/query.py b/src/sagemaker/lineage/query.py index a1ab295b05..033e838137 100644 --- a/src/sagemaker/lineage/query.py +++ b/src/sagemaker/lineage/query.py @@ -212,6 +212,44 @@ def _convert_api_response(self, response) -> LineageQueryResult: return converted + def _collapse_cross_account_artifacts(self, query_response): + """Collapse the duplicate vertices and edges for cross-account.""" + for edge in query_response.edges: + if ( + "artifact" in edge.source_arn + and "artifact" in edge.destination_arn + and edge.source_arn.split("/")[1] == edge.destination_arn.split("/")[1] + and edge.source_arn != edge.destination_arn + ): + edge_source_arn = edge.source_arn + edge_destination_arn = edge.destination_arn + self._update_cross_account_edge( + edges=query_response.edges, + arn=edge_source_arn, + duplicate_arn=edge_destination_arn, + ) + self._update_cross_account_vertex( + query_response=query_response, duplicate_arn=edge_destination_arn + ) + + # remove the duplicate edges from cross account + new_edge = [e for e in query_response.edges if not e.source_arn == e.destination_arn] + query_response.edges = new_edge + + return query_response + + def _update_cross_account_edge(self, edges, arn, duplicate_arn): + """Replace the duplicate arn with arn in edges list.""" + for idx, e in enumerate(edges): + if e.destination_arn == duplicate_arn: + edges[idx].destination_arn = arn + elif e.source_arn == duplicate_arn: + edges[idx].source_arn = arn + + def _update_cross_account_vertex(self, query_response, duplicate_arn): + """Remove the vertex with duplicate arn in the vertices list.""" + query_response.vertices = [v for v in query_response.vertices if not v.arn == duplicate_arn] + def query( self, start_arns: List[str], @@ -239,5 +277,7 @@ def query( Filters=query_filter._to_request_dict() if query_filter else {}, MaxDepth=max_depth, ) + query_response = self._convert_api_response(query_response) + query_response = self._collapse_cross_account_artifacts(query_response) - return self._convert_api_response(query_response) + return query_response diff --git a/tests/unit/sagemaker/lineage/test_query.py b/tests/unit/sagemaker/lineage/test_query.py index c25ca6f38f..595e7e1d0f 100644 --- a/tests/unit/sagemaker/lineage/test_query.py +++ b/tests/unit/sagemaker/lineage/test_query.py @@ -45,6 +45,143 @@ def test_lineage_query(sagemaker_session): assert response.vertices[1].lineage_entity == "Context" +def test_lineage_query_cross_account_same_artifact(sagemaker_session): + lineage_query = 
LineageQuery(sagemaker_session) + sagemaker_session.sagemaker_client.query_lineage.return_value = { + "Vertices": [ + { + "Arn": "arn:aws:sagemaker:us-east-2:012345678901:artifact/e1f29799189751939405b0f2b5b9d2a0", + "Type": "Endpoint", + "LineageType": "Artifact", + }, + { + "Arn": "arn:aws:sagemaker:us-east-2:012345678902:artifact/e1f29799189751939405b0f2b5b9d2a0", + "Type": "Endpoint", + "LineageType": "Artifact", + }, + ], + "Edges": [ + { + "SourceArn": "arn:aws:sagemaker:us-east-2:012345678901:artifact/e1f29799189751939405b0f2b5b9d2a0", + "DestinationArn": "arn:aws:sagemaker:us-east-2:012345678902:artifact/e1f29799189751939405b0f2b5b9d2a0", + "AssociationType": "SAME_AS", + }, + { + "SourceArn": "arn:aws:sagemaker:us-east-2:012345678902:artifact/e1f29799189751939405b0f2b5b9d2a0", + "DestinationArn": "arn:aws:sagemaker:us-east-2:012345678901:artifact/e1f29799189751939405b0f2b5b9d2a0", + "AssociationType": "SAME_AS", + }, + ], + } + + response = lineage_query.query( + start_arns=["arn:aws:sagemaker:us-west-2:0123456789012:context/mycontext"] + ) + assert len(response.edges) == 0 + assert len(response.vertices) == 1 + assert ( + response.vertices[0].arn + == "arn:aws:sagemaker:us-east-2:012345678901:artifact/e1f29799189751939405b0f2b5b9d2a0" + ) + assert response.vertices[0].lineage_source == "Endpoint" + assert response.vertices[0].lineage_entity == "Artifact" + + +def test_lineage_query_cross_account(sagemaker_session): + lineage_query = LineageQuery(sagemaker_session) + sagemaker_session.sagemaker_client.query_lineage.return_value = { + "Vertices": [ + { + "Arn": "arn:aws:sagemaker:us-east-2:012345678901:artifact/e1f29799189751939405b0f2b5b9d2a0", + "Type": "Endpoint", + "LineageType": "Artifact", + }, + { + "Arn": "arn:aws:sagemaker:us-east-2:012345678902:artifact/e1f29799189751939405b0f2b5b9d2a0", + "Type": "Endpoint", + "LineageType": "Artifact", + }, + { + "Arn": "arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9abcd", + "Type": "Endpoint", + "LineageType": "Artifact", + }, + { + "Arn": "arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9efgh", + "Type": "Endpoint", + "LineageType": "Artifact", + }, + ], + "Edges": [ + { + "SourceArn": "arn:aws:sagemaker:us-east-2:012345678901:artifact/e1f29799189751939405b0f2b5b9d2a0", + "DestinationArn": "arn:aws:sagemaker:us-east-2:012345678902:artifact/e1f29799189751939405b0f2b5b9d2a0", + "AssociationType": "SAME_AS", + }, + { + "SourceArn": "arn:aws:sagemaker:us-east-2:012345678902:artifact/e1f29799189751939405b0f2b5b9d2a0", + "DestinationArn": "arn:aws:sagemaker:us-east-2:012345678901:artifact/e1f29799189751939405b0f2b5b9d2a0", + "AssociationType": "SAME_AS", + }, + { + "SourceArn": "arn:aws:sagemaker:us-east-2:012345678902:artifact/e1f29799189751939405b0f2b5b9d2a0", + "DestinationArn": "arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9abcd", + "AssociationType": "ABC", + }, + { + "SourceArn": "arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9abcd", + "DestinationArn": "arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9efgh", + "AssociationType": "DEF", + }, + ], + } + + response = lineage_query.query( + start_arns=["arn:aws:sagemaker:us-west-2:0123456789012:context/mycontext"] + ) + + assert len(response.edges) == 2 + assert ( + response.edges[0].source_arn + == "arn:aws:sagemaker:us-east-2:012345678901:artifact/e1f29799189751939405b0f2b5b9d2a0" + ) + assert ( + response.edges[0].destination_arn + == 
"arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9abcd" + ) + assert response.edges[0].association_type == "ABC" + + assert ( + response.edges[1].source_arn + == "arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9abcd" + ) + assert ( + response.edges[1].destination_arn + == "arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9efgh" + ) + assert response.edges[1].association_type == "DEF" + + assert len(response.vertices) == 3 + assert ( + response.vertices[0].arn + == "arn:aws:sagemaker:us-east-2:012345678901:artifact/e1f29799189751939405b0f2b5b9d2a0" + ) + assert response.vertices[0].lineage_source == "Endpoint" + assert response.vertices[0].lineage_entity == "Artifact" + assert ( + response.vertices[1].arn + == "arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9abcd" + ) + assert response.vertices[1].lineage_source == "Endpoint" + assert response.vertices[1].lineage_entity == "Artifact" + assert ( + response.vertices[2].arn + == "arn:aws:sagemaker:us-east-2:012345678903:artifact/e1f29799189751939405b0f2b5b9efgh" + ) + assert response.vertices[2].lineage_source == "Endpoint" + assert response.vertices[2].lineage_entity == "Artifact" + + def test_vertex_to_object_endpoint_context(sagemaker_session): vertex = Vertex( arn="arn:aws:sagemaker:us-west-2:0123456789012:context/mycontext", From b7734ca4f62d6a3f1bdffebbd64b757054628383 Mon Sep 17 00:00:00 2001 From: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Date: Mon, 10 Jan 2022 09:13:33 -0800 Subject: [PATCH 13/13] feature: add support for pytorch 1.10.0 (#2795) Co-authored-by: Ahsan Khan --- src/sagemaker/fw_utils.py | 15 ++++- src/sagemaker/image_uri_config/pytorch.json | 72 ++++++++++++++++++++- tests/unit/test_fw_utils.py | 1 + 3 files changed, 85 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/fw_utils.py b/src/sagemaker/fw_utils.py index ce1bbd538d..79b9e803d7 100644 --- a/src/sagemaker/fw_utils.py +++ b/src/sagemaker/fw_utils.py @@ -75,7 +75,20 @@ "2.6.0", "2.6.2", ], - "pytorch": ["1.6", "1.6.0", "1.7", "1.7.1", "1.8", "1.8.0", "1.8.1", "1.9", "1.9.0", "1.9.1"], + "pytorch": [ + "1.6", + "1.6.0", + "1.7", + "1.7.1", + "1.8", + "1.8.0", + "1.8.1", + "1.9", + "1.9.0", + "1.9.1", + "1.10", + "1.10.0", + ], } SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"] diff --git a/src/sagemaker/image_uri_config/pytorch.json b/src/sagemaker/image_uri_config/pytorch.json index a64a710692..9c96858efe 100644 --- a/src/sagemaker/image_uri_config/pytorch.json +++ b/src/sagemaker/image_uri_config/pytorch.json @@ -63,7 +63,8 @@ "1.6": "1.6.0", "1.7": "1.7.1", "1.8": "1.8.1", - "1.9": "1.9.1" + "1.9": "1.9.1", + "1.10": "1.10.0" }, "versions": { "0.4.0": { @@ -500,6 +501,39 @@ "us-west-2": "763104351884" }, "repository": "pytorch-inference" + }, + "1.10.0": { + "py_versions": [ + "py38" + ], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + 
"sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "pytorch-inference" } } }, @@ -519,7 +553,8 @@ "1.6": "1.6.0", "1.7": "1.7.1", "1.8": "1.8.1", - "1.9": "1.9.1" + "1.9": "1.9.1", + "1.10": "1.10.0" }, "versions": { "0.4.0": { @@ -957,6 +992,39 @@ "us-west-2": "763104351884" }, "repository": "pytorch-training" + }, + "1.10.0": { + "py_versions": [ + "py38" + ], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "pytorch-training" } } } diff --git a/tests/unit/test_fw_utils.py b/tests/unit/test_fw_utils.py index 42aebf3dc5..c2470a5ba6 100644 --- a/tests/unit/test_fw_utils.py +++ b/tests/unit/test_fw_utils.py @@ -693,6 +693,7 @@ def test_validate_smdataparallel_args_not_raises(): ("ml.p3.16xlarge", "pytorch", "1.8", "py3", smdataparallel_enabled), ("ml.p3.16xlarge", "pytorch", "1.9.1", "py38", smdataparallel_enabled), ("ml.p3.16xlarge", "pytorch", "1.9", "py38", smdataparallel_enabled), + ("ml.p3.16xlarge", "pytorch", "1.10", "py38", smdataparallel_enabled), ("ml.p3.16xlarge", "tensorflow", "2.4.1", "py3", smdataparallel_enabled_custom_mpi), ("ml.p3.16xlarge", "tensorflow", "2.4.1", "py37", smdataparallel_enabled_custom_mpi), ("ml.p3.16xlarge", "tensorflow", "2.5.1", "py37", smdataparallel_enabled_custom_mpi),