From 37a60effc97dfe4c00fde0bb613216e38251ca4d Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 09:13:17 +0000 Subject: [PATCH 01/13] add pp doc --- .pre-commit-config.yaml | 1 + docs/source/performance.rst | 59 ++++++++++++++++++++++++++++++++- docs/source/training_tricks.rst | 57 +++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 78684a2ab74df..1a4cbed8efd54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,5 +32,6 @@ repos: types: [python] - repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.790 hooks: - id: mypy diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 0f97942128cda..a15d6ebfbf857 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -131,4 +131,61 @@ To use Optimizer Sharded Training, refer to :ref:`model-parallelism`. Sharded DDP can work across all DDP variants by adding the additional ``--plugins ddp_sharded`` flag. -Refer to the :ref:`distributed computing guide for more details `. \ No newline at end of file +Refer to the :ref:`distributed computing guide for more details `. + + +Sequential Model Parallelism with Checkpointing to reduce peak memory +--------------------------------------------------------------------- +Pipe Pipeline is a lightning integration of Pipeline Parallelism provided by Fairscale. +Pipe combines pipeline parallelism with checkpointing to reduce peak memory required to train while minimizing device under-utilization. + +Find more explanation at https://arxiv.org/abs/1811.06965 + +Before running, install Fairscale by using pip install pytorch-lightning["extra"]. + +To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. +If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. + +.. code-block:: python + + # from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + # from pytorch_lightning import LightningModule + + class MyModel(LightningModule): + def __init__(self): + ... + self.sequential_module = torch.nn.Sequential(my_layers) + + # Split my module across 4 gpus, one layer each + model = MyModel() + plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) + trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) + trainer.fit(model) + +Find a [conv_sequential_example](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/pl_examples/basic_examples/conv_sequential_example.py) tutorial on cifar10. + +When running this example on 2 GPUS. + +.. list-table:: GPU Memory Utilization + :widths: 25 25 50 + :header-rows: 1 + + * - GPUS + - Without Balancing + - With Balancing + * - Gpu 0 + - 4436 MB + - 1554 MB + * - Gpu 1 + - ~0 + - 994 MB + +Run with Balancing +.. code-block:: bash + + python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 1 --batch_size 1024 + +Run without Balancing +.. code-block:: bash + + python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 0 --batch_size 1024 diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 6ff9dfd0a30d3..3095cf321d7de 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -123,3 +123,60 @@ The algorithm in short works by: :members: scale_batch_size .. warning:: Batch size finder is not supported for DDP yet, it is coming soon. 
+ + +Sequential Model Parallelism with Checkpointing to reduce peak memory +--------------------------------------------------------------------- +Pipe Pipeline is a lightning integration of Pipeline Parallelism provided by Fairscale. +Pipe combines pipeline parallelism with checkpointing to reduce peak memory required to train while minimizing device under-utilization. + +Find more explanation at https://arxiv.org/abs/1811.06965 + +Before running, install Fairscale by using pip install pytorch-lightning["extra"]. + +To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. +If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. + +.. code-block:: python + + # from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + # from pytorch_lightning import LightningModule + + class MyModel(LightningModule): + def __init__(self): + ... + self.sequential_module = torch.nn.Sequential(my_layers) + + # Split my module across 4 gpus, one layer each + model = MyModel() + plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) + trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) + trainer.fit(model) + +Find a [conv_sequential_example](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/pl_examples/basic_examples/conv_sequential_example.py) tutorial on cifar10. + +When running this example on 2 GPUS. + +.. list-table:: GPU Memory Utilization + :widths: 25 25 50 + :header-rows: 1 + + * - GPUS + - Without Balancing + - With Balancing + * - Gpu 0 + - 4436 MB + - 1554 MB + * - Gpu 1 + - ~0 + - 994 MB + +Run with Balancing +.. code-block:: bash + + python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 1 --batch_size 1024 + +Run without Balancing +.. code-block:: bash + + python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 0 --batch_size 1024 From e11a2a2dd39cea26762061b8302252016cddbed3 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 09:30:26 +0000 Subject: [PATCH 02/13] udpate doc --- docs/source/performance.rst | 8 +++++--- docs/source/training_tricks.rst | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/source/performance.rst b/docs/source/performance.rst index a15d6ebfbf857..c985b8798dd95 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -141,7 +141,9 @@ Pipe combines pipeline parallelism with checkpointing to reduce peak memory requ Find more explanation at https://arxiv.org/abs/1811.06965 -Before running, install Fairscale by using pip install pytorch-lightning["extra"]. +.. note:: DDPSequentialPlugin is currently supported only for torch 1.6 + +Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. @@ -183,9 +185,9 @@ When running this example on 2 GPUS. Run with Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 1 --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --use_ddp_sequential --batch_size 1024 Run without Balancing .. 
code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 0 --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 3095cf321d7de..964519e147464 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -132,7 +132,9 @@ Pipe combines pipeline parallelism with checkpointing to reduce peak memory requ Find more explanation at https://arxiv.org/abs/1811.06965 -Before running, install Fairscale by using pip install pytorch-lightning["extra"]. +.. note:: DDPSequentialPlugin is currently supported only for torch 1.6 + +Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. @@ -174,9 +176,9 @@ When running this example on 2 GPUS. Run with Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 1 --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --use_ddp_sequential --batch_size 1024 Run without Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 0 --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 From ef0cf518aa6f1e3c6139f8297d6850aea9452a4a Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 09:40:36 +0000 Subject: [PATCH 03/13] update doc --- docs/source/performance.rst | 4 ++-- docs/source/training_tricks.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/performance.rst b/docs/source/performance.rst index c985b8798dd95..48f6c589372e5 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -185,9 +185,9 @@ When running this example on 2 GPUS. Run with Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_ddp_sequential --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential Run without Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 964519e147464..9dc2b4525e8a9 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -176,9 +176,9 @@ When running this example on 2 GPUS. Run with Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_ddp_sequential --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential Run without Balancing .. 
code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 From 83da2c854588e86568be0c05a59b66c3f16f2170 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 09:57:41 +0000 Subject: [PATCH 04/13] update doc --- docs/source/performance.rst | 12 +++++++----- docs/source/training_tricks.rst | 12 +++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 48f6c589372e5..4997ebc57d597 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -141,7 +141,7 @@ Pipe combines pipeline parallelism with checkpointing to reduce peak memory requ Find more explanation at https://arxiv.org/abs/1811.06965 -.. note:: DDPSequentialPlugin is currently supported only for torch 1.6 +.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6 Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts @@ -150,8 +150,8 @@ If the module requires lots of memory, Pipe can be used to reduce this by levera .. code-block:: python - # from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin - # from pytorch_lightning import LightningModule + from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + from pytorch_lightning import LightningModule class MyModel(LightningModule): def __init__(self): @@ -183,11 +183,13 @@ When running this example on 2 GPUS. - 994 MB Run with Balancing -.. code-block:: bash + +.. code-block:: python python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential Run without Balancing -.. code-block:: bash + +.. code-block:: python python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 9dc2b4525e8a9..a740292fde571 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -132,7 +132,7 @@ Pipe combines pipeline parallelism with checkpointing to reduce peak memory requ Find more explanation at https://arxiv.org/abs/1811.06965 -.. note:: DDPSequentialPlugin is currently supported only for torch 1.6 +.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6 Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts @@ -141,8 +141,8 @@ If the module requires lots of memory, Pipe can be used to reduce this by levera .. code-block:: python - # from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin - # from pytorch_lightning import LightningModule + from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + from pytorch_lightning import LightningModule class MyModel(LightningModule): def __init__(self): @@ -174,11 +174,13 @@ When running this example on 2 GPUS. - 994 MB Run with Balancing -.. code-block:: bash + +.. code-block:: python python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential Run without Balancing -.. code-block:: bash + +.. 
code-block:: python python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 From 05c8102a72716e1d1148e64843721a1f085b5c80 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Dec 2020 11:08:27 +0000 Subject: [PATCH 05/13] Update docs --- docs/source/multi_gpu.rst | 67 +++++++++++++++++++++++++++++++++ docs/source/performance.rst | 44 ++-------------------- docs/source/training_tricks.rst | 44 ++-------------------- 3 files changed, 75 insertions(+), 80 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index b6ffad20e741b..b0f0799fb13d1 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -612,6 +612,7 @@ This is useful when dealing with large Transformer based models, or in environme Lightning currently offers the following methods to leverage model parallelism: - Sharded Training (partitioning your gradients and optimizer state across multiple GPUs, for reduced memory overhead with **no performance loss**) +- Sequential Model Parallelism with Checkpointing (partition your :class:`nn.Sequential ` module across multiple GPUs, leverage checkpointing and microbatching for further memory improvements and device utilization) Sharded Training ^^^^^^^^^^^^^^^^ @@ -678,6 +679,72 @@ Sharded Training can work across all DDP variants by adding the additional ``--p Internally we re-initialize your optimizers and shard them across your machines and processes. We handle all communication using PyTorch distributed, so no code changes are required. +---------- + +.. _sequential-parallelism: + +Sequential Model Parallelism with Checkpointing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. +Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. +We also provide auto-balancing techniques through FairScale, to find optimal balances for the model across GPUs. +In addition, we use Gradient Checkpointing to reduce GPU memory requirements further, and micro-batches to minimizing device under-utilization automatically. + +Reference: https://arxiv.org/abs/1811.06965 + +.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. + +Before running, install FairScale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts +To use Sequential Model Parallelism, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]``. + +.. code-block:: bash + + pip install https://github.com/facebookresearch/fairscale/archive/bb468670838b98dc8f8d67be4eabf195042a7994.zip + +To use Sequential Model Parallelism, you must define a :class:`nn.Sequential ` module that defines the layers you wish to parallelize across GPUs. +This should be kept within the ``sequential_module`` variable within your ``LightningModule`` like below. + +.. code-block:: python + + from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + from pytorch_lightning import LightningModule + + class MyModel(LightningModule): + def __init__(self): + ... 
+ self.sequential_module = torch.nn.Sequential(my_layers) + + # Split my module across 4 gpus, one layer each + model = MyModel() + plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) + trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) + trainer.fit(model) + + +We provide a minimal example of Sequential Model Parallelism using a convolutional model training on cifar10, split onto GPUs `here `_. + +When running the Sequential Model Parallelism example on 2 GPUS we achieve these memory savings. + +.. list-table:: GPU Memory Utilization + :widths: 25 25 50 + :header-rows: 1 + + * - GPUS + - Without Balancing + - With Balancing + * - Gpu 0 + - 4436 MB + - 1554 MB + * - Gpu 1 + - ~0 + - 994 MB + +To run the example with Sequential Model Parallelism: + +.. code-block:: python + + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential + Batch size ---------- diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 4997ebc57d597..7eab4c8bdf6a0 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -134,19 +134,10 @@ Sharded DDP can work across all DDP variants by adding the additional ``--plugin Refer to the :ref:`distributed computing guide for more details `. -Sequential Model Parallelism with Checkpointing to reduce peak memory +Sequential Model Parallelism with Checkpointing --------------------------------------------------------------------- -Pipe Pipeline is a lightning integration of Pipeline Parallelism provided by Fairscale. -Pipe combines pipeline parallelism with checkpointing to reduce peak memory required to train while minimizing device under-utilization. - -Find more explanation at https://arxiv.org/abs/1811.06965 - -.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6 - -Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts - -To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. -If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. +PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. +Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. .. code-block:: python @@ -164,32 +155,5 @@ If the module requires lots of memory, Pipe can be used to reduce this by levera trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) trainer.fit(model) -Find a [conv_sequential_example](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/pl_examples/basic_examples/conv_sequential_example.py) tutorial on cifar10. - -When running this example on 2 GPUS. - -.. list-table:: GPU Memory Utilization - :widths: 25 25 50 - :header-rows: 1 - - * - GPUS - - Without Balancing - - With Balancing - * - Gpu 0 - - 4436 MB - - 1554 MB - * - Gpu 1 - - ~0 - - 994 MB - -Run with Balancing - -.. code-block:: python - - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential - -Run without Balancing - -.. code-block:: python - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 +For more information, refer to :ref:`sequential-parallelism`. 
diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index a740292fde571..29810d7733b59 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -125,19 +125,10 @@ The algorithm in short works by: .. warning:: Batch size finder is not supported for DDP yet, it is coming soon. -Sequential Model Parallelism with Checkpointing to reduce peak memory +Sequential Model Parallelism with Checkpointing --------------------------------------------------------------------- -Pipe Pipeline is a lightning integration of Pipeline Parallelism provided by Fairscale. -Pipe combines pipeline parallelism with checkpointing to reduce peak memory required to train while minimizing device under-utilization. - -Find more explanation at https://arxiv.org/abs/1811.06965 - -.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6 - -Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts - -To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. -If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. +PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. +Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. .. code-block:: python @@ -155,32 +146,5 @@ If the module requires lots of memory, Pipe can be used to reduce this by levera trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) trainer.fit(model) -Find a [conv_sequential_example](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/pl_examples/basic_examples/conv_sequential_example.py) tutorial on cifar10. - -When running this example on 2 GPUS. - -.. list-table:: GPU Memory Utilization - :widths: 25 25 50 - :header-rows: 1 - - * - GPUS - - Without Balancing - - With Balancing - * - Gpu 0 - - 4436 MB - - 1554 MB - * - Gpu 1 - - ~0 - - 994 MB - -Run with Balancing - -.. code-block:: python - - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential - -Run without Balancing - -.. code-block:: python - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 +For more information, refer to :ref:`sequential-parallelism`. \ No newline at end of file From dcaa85acce4f4c88c224d51ab382ea4caafad141 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 11:50:53 +0000 Subject: [PATCH 06/13] update doc --- docs/source/multi_gpu.rst | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index b0f0799fb13d1..f436ecdca091f 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -694,8 +694,8 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -Before running, install FairScale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts -To use Sequential Model Parallelism, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]``. +To use Sequential Model Parallelism, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]`` +and ``pip install pytorch-lightning-bolts`` .. 
code-block:: bash @@ -745,6 +745,12 @@ To run the example with Sequential Model Parallelism: python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential +To run the same example without Sequential Model Parallelism: + +.. code-block:: python + + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 + Batch size ---------- @@ -795,8 +801,8 @@ Lightning supports the use of TorchElastic to enable fault-tolerant and elastic .. code-block:: python Trainer(gpus=8, accelerator='ddp') - - + + Following the `TorchElastic Quickstart documentation `_, you then need to start a single-node etcd server on one of the hosts: .. code-block:: bash @@ -804,8 +810,8 @@ Following the `TorchElastic Quickstart documentation `_ for details on installation and more use cases. From 4ff1a46995592b5f8cc084521c14f4a77a4e5b68 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 11:52:55 +0000 Subject: [PATCH 07/13] udpate --- docs/source/multi_gpu.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index f436ecdca091f..8be564f57980d 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -694,13 +694,9 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -To use Sequential Model Parallelism, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]`` +First, install FairScale install all extras using ``pip install pytorch-lightning["extra"]`` and ``pip install pytorch-lightning-bolts`` -.. code-block:: bash - - pip install https://github.com/facebookresearch/fairscale/archive/bb468670838b98dc8f8d67be4eabf195042a7994.zip - To use Sequential Model Parallelism, you must define a :class:`nn.Sequential ` module that defines the layers you wish to parallelize across GPUs. This should be kept within the ``sequential_module`` variable within your ``LightningModule`` like below. From c4d456a3377915fdd197285e14e0e288cc1865ab Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 11:54:08 +0000 Subject: [PATCH 08/13] update doc --- docs/source/multi_gpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 8be564f57980d..233cad9fb5ea6 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -694,8 +694,8 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -First, install FairScale install all extras using ``pip install pytorch-lightning["extra"]`` -and ``pip install pytorch-lightning-bolts`` +To get started, install all extras using with ``pip install pytorch-lightning["extra"]`` +and Bolts with ``pip install pytorch-lightning-bolts`` To use Sequential Model Parallelism, you must define a :class:`nn.Sequential ` module that defines the layers you wish to parallelize across GPUs. This should be kept within the ``sequential_module`` variable within your ``LightningModule`` like below. 
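As a reading aid for the ``balance`` argument used throughout this series: each entry gives the number of consecutive layers of the ``nn.Sequential`` placed on the corresponding GPU. The sketch below is not taken from the patches themselves; the layer sizes and the ``[4, 2]`` split are made up purely to illustrate an uneven partition across two GPUs.

.. code-block:: python

    import torch
    from torch import nn
    from pytorch_lightning import LightningModule, Trainer
    from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin

    class MySequentialModel(LightningModule):
        def __init__(self):
            super().__init__()
            # Six consecutive layers; sizes are arbitrary, chosen only to show the split.
            self.sequential_module = nn.Sequential(
                nn.Linear(32, 64), nn.ReLU(),
                nn.Linear(64, 64), nn.ReLU(),
                nn.Linear(64, 64),
                nn.Linear(64, 10),
            )
        ...  # training_step, configure_optimizers, etc. as usual

    # First four layers land on GPU 0, the last two on GPU 1.
    model = MySequentialModel()
    plugin = DDPSequentialPlugin(balance=[4, 2])
    trainer = Trainer(accelerator='ddp', gpus=2, plugins=[plugin])
    trainer.fit(model)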
From 49b408ba6323e4bf14a9225afdfeb63e01568850 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 13:02:45 +0000 Subject: [PATCH 09/13] update doc --- docs/source/multi_gpu.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 233cad9fb5ea6..3dfc726c9fd6c 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -694,8 +694,13 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -To get started, install all extras using with ``pip install pytorch-lightning["extra"]`` -and Bolts with ``pip install pytorch-lightning-bolts`` +To get started, install FairScale through extras using with ``pip install pytorch-lightning["extra"]`` + +or directly using + +.. code-block:: bash + + pip install https://github.com/PyTorchLightning/fairscale/archive/pl_1.1.0.zip To use Sequential Model Parallelism, you must define a :class:`nn.Sequential ` module that defines the layers you wish to parallelize across GPUs. This should be kept within the ``sequential_module`` variable within your ``LightningModule`` like below. @@ -716,6 +721,7 @@ This should be kept within the ``sequential_module`` variable within your ``Ligh trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) trainer.fit(model) +To run the example, you will need Bolts. Install with ``pip install pytorch-lightning-bolts`` We provide a minimal example of Sequential Model Parallelism using a convolutional model training on cifar10, split onto GPUs `here `_. From f7520815e75cbe1023cf5a94dfaf24be17e90d7c Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Dec 2020 13:31:07 +0000 Subject: [PATCH 10/13] Formatting, update sharded zip link --- docs/source/multi_gpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 3dfc726c9fd6c..de1cbb80ed30c 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -667,7 +667,7 @@ To use Sharded Training, you need to first install FairScale using the command b .. code-block:: bash - pip install https://github.com/facebookresearch/fairscale/archive/bb468670838b98dc8f8d67be4eabf195042a7994.zip + pip install https://github.com/PyTorchLightning/fairscale/archive/pl_1.1.0.zip .. code-block:: python @@ -721,9 +721,9 @@ This should be kept within the ``sequential_module`` variable within your ``Ligh trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) trainer.fit(model) -To run the example, you will need Bolts. Install with ``pip install pytorch-lightning-bolts`` We provide a minimal example of Sequential Model Parallelism using a convolutional model training on cifar10, split onto GPUs `here `_. +To run the example, you will to install `Bolts `_. Install with ``pip install pytorch-lightning-bolts``. When running the Sequential Model Parallelism example on 2 GPUS we achieve these memory savings. 
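The figures in the GPU memory table come from running the cifar10 example on 2 GPUs; to collect comparable numbers for your own model, a rough sketch using stock PyTorch memory counters (run inside each DDP process, nothing specific to the plugin) could look like this:

.. code-block:: python

    import torch

    # Reset the peak-memory counter on this process' GPU before the steps to profile.
    torch.cuda.reset_peak_memory_stats()

    # ... run a few training batches here ...

    peak_mb = torch.cuda.max_memory_allocated() / 2 ** 20
    rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
    print(f"rank {rank}: peak GPU memory {peak_mb:.0f} MB")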
From 681a666ba0363c3f02c84a061290c66236c79ec1 Mon Sep 17 00:00:00 2001 From: chaton Date: Wed, 9 Dec 2020 13:36:01 +0000 Subject: [PATCH 11/13] Update docs/source/multi_gpu.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos MocholĂ­ --- docs/source/multi_gpu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index de1cbb80ed30c..cf40ec41a8e19 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -723,7 +723,7 @@ This should be kept within the ``sequential_module`` variable within your ``Ligh We provide a minimal example of Sequential Model Parallelism using a convolutional model training on cifar10, split onto GPUs `here `_. -To run the example, you will to install `Bolts `_. Install with ``pip install pytorch-lightning-bolts``. +To run the example, you need to install `Bolts `_. Install with ``pip install pytorch-lightning-bolts``. When running the Sequential Model Parallelism example on 2 GPUS we achieve these memory savings. From 5e24528b68edcef2af59f62ae4c40d84e31273d0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 9 Dec 2020 14:51:10 +0100 Subject: [PATCH 12/13] Apply suggestions from code review --- docs/source/multi_gpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index cf40ec41a8e19..2ce66c9a719ca 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -743,13 +743,13 @@ When running the Sequential Model Parallelism example on 2 GPUS we achieve these To run the example with Sequential Model Parallelism: -.. code-block:: python +.. code-block:: bash python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential To run the same example without Sequential Model Parallelism: -.. code-block:: python +.. code-block:: bash python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 From cc979042755d34972c6897ccaef34142e6d9ff8f Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Dec 2020 14:16:46 +0000 Subject: [PATCH 13/13] Reference directly to section --- docs/source/performance.rst | 17 ----------------- docs/source/training_tricks.rst | 17 ----------------- 2 files changed, 34 deletions(-) diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 7eab4c8bdf6a0..394f6e5f3ca13 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -139,21 +139,4 @@ Sequential Model Parallelism with Checkpointing PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. -.. code-block:: python - - from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin - from pytorch_lightning import LightningModule - - class MyModel(LightningModule): - def __init__(self): - ... - self.sequential_module = torch.nn.Sequential(my_layers) - - # Split my module across 4 gpus, one layer each - model = MyModel() - plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) - trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) - trainer.fit(model) - - For more information, refer to :ref:`sequential-parallelism`. 
diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 29810d7733b59..10ee668a97fa8 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -130,21 +130,4 @@ Sequential Model Parallelism with Checkpointing PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. -.. code-block:: python - - from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin - from pytorch_lightning import LightningModule - - class MyModel(LightningModule): - def __init__(self): - ... - self.sequential_module = torch.nn.Sequential(my_layers) - - # Split my module across 4 gpus, one layer each - model = MyModel() - plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) - trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) - trainer.fit(model) - - For more information, refer to :ref:`sequential-parallelism`. \ No newline at end of file
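The checkpointing half of the technique can also be seen in isolation, without pipeline parallelism or the plugin added in this series. The single-GPU sketch below uses PyTorch's built-in ``torch.utils.checkpoint.checkpoint_sequential``: activations inside each segment are dropped during the forward pass and recomputed during the backward pass, trading extra compute for a lower peak memory footprint (the layer sizes and segment count here are arbitrary).

.. code-block:: python

    import torch
    from torch import nn
    from torch.utils.checkpoint import checkpoint_sequential

    model = nn.Sequential(
        nn.Linear(1024, 1024), nn.ReLU(),
        nn.Linear(1024, 1024), nn.ReLU(),
        nn.Linear(1024, 10),
    ).cuda()

    # requires_grad on the input lets the checkpointed segments rebuild their graph on backward.
    x = torch.randn(64, 1024, device="cuda", requires_grad=True)

    # Split the sequential model into 2 segments; only the segment boundaries keep activations.
    out = checkpoint_sequential(model, 2, x)
    out.sum().backward()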