From 37a60effc97dfe4c00fde0bb613216e38251ca4d Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 09:13:17 +0000 Subject: [PATCH 01/13] add pp doc --- .pre-commit-config.yaml | 1 + docs/source/performance.rst | 59 ++++++++++++++++++++++++++++++++- docs/source/training_tricks.rst | 57 +++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 78684a2ab74df..1a4cbed8efd54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,5 +32,6 @@ repos: types: [python] - repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.790 hooks: - id: mypy diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 0f97942128cda..a15d6ebfbf857 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -131,4 +131,61 @@ To use Optimizer Sharded Training, refer to :ref:`model-parallelism`. Sharded DDP can work across all DDP variants by adding the additional ``--plugins ddp_sharded`` flag. -Refer to the :ref:`distributed computing guide for more details `. \ No newline at end of file +Refer to the :ref:`distributed computing guide for more details `. + + +Sequential Model Parallelism with Checkpointing to reduce peak memory +--------------------------------------------------------------------- +Pipe Pipeline is a lightning integration of Pipeline Parallelism provided by Fairscale. +Pipe combines pipeline parallelism with checkpointing to reduce peak memory required to train while minimizing device under-utilization. + +Find more explanation at https://arxiv.org/abs/1811.06965 + +Before running, install Fairscale by using pip install pytorch-lightning["extra"]. + +To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. +If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. + +.. code-block:: python + + # from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + # from pytorch_lightning import LightningModule + + class MyModel(LightningModule): + def __init__(self): + ... + self.sequential_module = torch.nn.Sequential(my_layers) + + # Split my module across 4 gpus, one layer each + model = MyModel() + plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) + trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) + trainer.fit(model) + +Find a [conv_sequential_example](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/pl_examples/basic_examples/conv_sequential_example.py) tutorial on cifar10. + +When running this example on 2 GPUS. + +.. list-table:: GPU Memory Utilization + :widths: 25 25 50 + :header-rows: 1 + + * - GPUS + - Without Balancing + - With Balancing + * - Gpu 0 + - 4436 MB + - 1554 MB + * - Gpu 1 + - ~0 + - 994 MB + +Run with Balancing +.. code-block:: bash + + python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 1 --batch_size 1024 + +Run without Balancing +.. code-block:: bash + + python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 0 --batch_size 1024 diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 6ff9dfd0a30d3..3095cf321d7de 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -123,3 +123,60 @@ The algorithm in short works by: :members: scale_batch_size .. warning:: Batch size finder is not supported for DDP yet, it is coming soon. 
+ + +Sequential Model Parallelism with Checkpointing to reduce peak memory +--------------------------------------------------------------------- +Pipe Pipeline is a lightning integration of Pipeline Parallelism provided by Fairscale. +Pipe combines pipeline parallelism with checkpointing to reduce peak memory required to train while minimizing device under-utilization. + +Find more explanation at https://arxiv.org/abs/1811.06965 + +Before running, install Fairscale by using pip install pytorch-lightning["extra"]. + +To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. +If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. + +.. code-block:: python + + # from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + # from pytorch_lightning import LightningModule + + class MyModel(LightningModule): + def __init__(self): + ... + self.sequential_module = torch.nn.Sequential(my_layers) + + # Split my module across 4 gpus, one layer each + model = MyModel() + plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) + trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) + trainer.fit(model) + +Find a [conv_sequential_example](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/pl_examples/basic_examples/conv_sequential_example.py) tutorial on cifar10. + +When running this example on 2 GPUS. + +.. list-table:: GPU Memory Utilization + :widths: 25 25 50 + :header-rows: 1 + + * - GPUS + - Without Balancing + - With Balancing + * - Gpu 0 + - 4436 MB + - 1554 MB + * - Gpu 1 + - ~0 + - 994 MB + +Run with Balancing +.. code-block:: bash + + python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 1 --batch_size 1024 + +Run without Balancing +.. code-block:: bash + + python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 0 --batch_size 1024 From e11a2a2dd39cea26762061b8302252016cddbed3 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 09:30:26 +0000 Subject: [PATCH 02/13] udpate doc --- docs/source/performance.rst | 8 +++++--- docs/source/training_tricks.rst | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/source/performance.rst b/docs/source/performance.rst index a15d6ebfbf857..c985b8798dd95 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -141,7 +141,9 @@ Pipe combines pipeline parallelism with checkpointing to reduce peak memory requ Find more explanation at https://arxiv.org/abs/1811.06965 -Before running, install Fairscale by using pip install pytorch-lightning["extra"]. +.. note:: DDPSequentialPlugin is currently supported only for torch 1.6 + +Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. @@ -183,9 +185,9 @@ When running this example on 2 GPUS. Run with Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 1 --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --use_ddp_sequential --batch_size 1024 Run without Balancing .. 
code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 0 --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 3095cf321d7de..964519e147464 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -132,7 +132,9 @@ Pipe combines pipeline parallelism with checkpointing to reduce peak memory requ Find more explanation at https://arxiv.org/abs/1811.06965 -Before running, install Fairscale by using pip install pytorch-lightning["extra"]. +.. note:: DDPSequentialPlugin is currently supported only for torch 1.6 + +Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. @@ -174,9 +176,9 @@ When running this example on 2 GPUS. Run with Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 1 --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --use_ddp_sequential --batch_size 1024 Run without Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_pipe 0 --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 From ef0cf518aa6f1e3c6139f8297d6850aea9452a4a Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 09:40:36 +0000 Subject: [PATCH 03/13] update doc --- docs/source/performance.rst | 4 ++-- docs/source/training_tricks.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/performance.rst b/docs/source/performance.rst index c985b8798dd95..48f6c589372e5 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -185,9 +185,9 @@ When running this example on 2 GPUS. Run with Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_ddp_sequential --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential Run without Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 964519e147464..9dc2b4525e8a9 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -176,9 +176,9 @@ When running this example on 2 GPUS. Run with Balancing .. code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --use_ddp_sequential --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential Run without Balancing .. 
code-block:: bash - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 From 83da2c854588e86568be0c05a59b66c3f16f2170 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 09:57:41 +0000 Subject: [PATCH 04/13] update doc --- docs/source/performance.rst | 12 +++++++----- docs/source/training_tricks.rst | 12 +++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 48f6c589372e5..4997ebc57d597 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -141,7 +141,7 @@ Pipe combines pipeline parallelism with checkpointing to reduce peak memory requ Find more explanation at https://arxiv.org/abs/1811.06965 -.. note:: DDPSequentialPlugin is currently supported only for torch 1.6 +.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6 Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts @@ -150,8 +150,8 @@ If the module requires lots of memory, Pipe can be used to reduce this by levera .. code-block:: python - # from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin - # from pytorch_lightning import LightningModule + from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + from pytorch_lightning import LightningModule class MyModel(LightningModule): def __init__(self): @@ -183,11 +183,13 @@ When running this example on 2 GPUS. - 994 MB Run with Balancing -.. code-block:: bash + +.. code-block:: python python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential Run without Balancing -.. code-block:: bash + +.. code-block:: python python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 9dc2b4525e8a9..a740292fde571 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -132,7 +132,7 @@ Pipe combines pipeline parallelism with checkpointing to reduce peak memory requ Find more explanation at https://arxiv.org/abs/1811.06965 -.. note:: DDPSequentialPlugin is currently supported only for torch 1.6 +.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6 Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts @@ -141,8 +141,8 @@ If the module requires lots of memory, Pipe can be used to reduce this by levera .. code-block:: python - # from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin - # from pytorch_lightning import LightningModule + from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + from pytorch_lightning import LightningModule class MyModel(LightningModule): def __init__(self): @@ -174,11 +174,13 @@ When running this example on 2 GPUS. - 994 MB Run with Balancing -.. code-block:: bash + +.. code-block:: python python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential Run without Balancing -.. code-block:: bash + +.. 
code-block:: python python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 From 05c8102a72716e1d1148e64843721a1f085b5c80 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Dec 2020 11:08:27 +0000 Subject: [PATCH 05/13] Update docs --- docs/source/multi_gpu.rst | 67 +++++++++++++++++++++++++++++++++ docs/source/performance.rst | 44 ++-------------------- docs/source/training_tricks.rst | 44 ++-------------------- 3 files changed, 75 insertions(+), 80 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index b6ffad20e741b..b0f0799fb13d1 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -612,6 +612,7 @@ This is useful when dealing with large Transformer based models, or in environme Lightning currently offers the following methods to leverage model parallelism: - Sharded Training (partitioning your gradients and optimizer state across multiple GPUs, for reduced memory overhead with **no performance loss**) +- Sequential Model Parallelism with Checkpointing (partition your :class:`nn.Sequential ` module across multiple GPUs, leverage checkpointing and microbatching for further memory improvements and device utilization) Sharded Training ^^^^^^^^^^^^^^^^ @@ -678,6 +679,72 @@ Sharded Training can work across all DDP variants by adding the additional ``--p Internally we re-initialize your optimizers and shard them across your machines and processes. We handle all communication using PyTorch distributed, so no code changes are required. +---------- + +.. _sequential-parallelism: + +Sequential Model Parallelism with Checkpointing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. +Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. +We also provide auto-balancing techniques through FairScale, to find optimal balances for the model across GPUs. +In addition, we use Gradient Checkpointing to reduce GPU memory requirements further, and micro-batches to minimizing device under-utilization automatically. + +Reference: https://arxiv.org/abs/1811.06965 + +.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. + +Before running, install FairScale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts +To use Sequential Model Parallelism, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]``. + +.. code-block:: bash + + pip install https://github.com/facebookresearch/fairscale/archive/bb468670838b98dc8f8d67be4eabf195042a7994.zip + +To use Sequential Model Parallelism, you must define a :class:`nn.Sequential ` module that defines the layers you wish to parallelize across GPUs. +This should be kept within the ``sequential_module`` variable within your ``LightningModule`` like below. + +.. code-block:: python + + from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin + from pytorch_lightning import LightningModule + + class MyModel(LightningModule): + def __init__(self): + ... 
+ self.sequential_module = torch.nn.Sequential(my_layers) + + # Split my module across 4 gpus, one layer each + model = MyModel() + plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) + trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) + trainer.fit(model) + + +We provide a minimal example of Sequential Model Parallelism using a convolutional model training on cifar10, split onto GPUs `here `_. + +When running the Sequential Model Parallelism example on 2 GPUS we achieve these memory savings. + +.. list-table:: GPU Memory Utilization + :widths: 25 25 50 + :header-rows: 1 + + * - GPUS + - Without Balancing + - With Balancing + * - Gpu 0 + - 4436 MB + - 1554 MB + * - Gpu 1 + - ~0 + - 994 MB + +To run the example with Sequential Model Parallelism: + +.. code-block:: python + + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential + Batch size ---------- diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 4997ebc57d597..7eab4c8bdf6a0 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -134,19 +134,10 @@ Sharded DDP can work across all DDP variants by adding the additional ``--plugin Refer to the :ref:`distributed computing guide for more details `. -Sequential Model Parallelism with Checkpointing to reduce peak memory +Sequential Model Parallelism with Checkpointing --------------------------------------------------------------------- -Pipe Pipeline is a lightning integration of Pipeline Parallelism provided by Fairscale. -Pipe combines pipeline parallelism with checkpointing to reduce peak memory required to train while minimizing device under-utilization. - -Find more explanation at https://arxiv.org/abs/1811.06965 - -.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6 - -Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts - -To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. -If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. +PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. +Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. .. code-block:: python @@ -164,32 +155,5 @@ If the module requires lots of memory, Pipe can be used to reduce this by levera trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) trainer.fit(model) -Find a [conv_sequential_example](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/pl_examples/basic_examples/conv_sequential_example.py) tutorial on cifar10. - -When running this example on 2 GPUS. - -.. list-table:: GPU Memory Utilization - :widths: 25 25 50 - :header-rows: 1 - - * - GPUS - - Without Balancing - - With Balancing - * - Gpu 0 - - 4436 MB - - 1554 MB - * - Gpu 1 - - ~0 - - 994 MB - -Run with Balancing - -.. code-block:: python - - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential - -Run without Balancing - -.. code-block:: python - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 +For more information, refer to :ref:`sequential-parallelism`. 
diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index a740292fde571..29810d7733b59 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -125,19 +125,10 @@ The algorithm in short works by: .. warning:: Batch size finder is not supported for DDP yet, it is coming soon. -Sequential Model Parallelism with Checkpointing to reduce peak memory +Sequential Model Parallelism with Checkpointing --------------------------------------------------------------------- -Pipe Pipeline is a lightning integration of Pipeline Parallelism provided by Fairscale. -Pipe combines pipeline parallelism with checkpointing to reduce peak memory required to train while minimizing device under-utilization. - -Find more explanation at https://arxiv.org/abs/1811.06965 - -.. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6 - -Before running, install Fairscale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts - -To use Sequential Model Parallelism, one need to provide a :class:`nn.Sequential ` module. -If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. +PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. +Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. .. code-block:: python @@ -155,32 +146,5 @@ If the module requires lots of memory, Pipe can be used to reduce this by levera trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) trainer.fit(model) -Find a [conv_sequential_example](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/pl_examples/basic_examples/conv_sequential_example.py) tutorial on cifar10. - -When running this example on 2 GPUS. - -.. list-table:: GPU Memory Utilization - :widths: 25 25 50 - :header-rows: 1 - - * - GPUS - - Without Balancing - - With Balancing - * - Gpu 0 - - 4436 MB - - 1554 MB - * - Gpu 1 - - ~0 - - 994 MB - -Run with Balancing - -.. code-block:: python - - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential - -Run without Balancing - -.. code-block:: python - python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 +For more information, refer to :ref:`sequential-parallelism`. \ No newline at end of file From dcaa85acce4f4c88c224d51ab382ea4caafad141 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 11:50:53 +0000 Subject: [PATCH 06/13] update doc --- docs/source/multi_gpu.rst | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index b0f0799fb13d1..f436ecdca091f 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -694,8 +694,8 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -Before running, install FairScale by using pip install pytorch-lightning["extra"] and pip install pytorch-lightning-bolts -To use Sequential Model Parallelism, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]``. +To use Sequential Model Parallelism, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]`` +and ``pip install pytorch-lightning-bolts`` .. 
code-block:: bash @@ -745,6 +745,12 @@ To run the example with Sequential Model Parallelism: python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential +To run the same example without Sequential Model Parallelism: + +.. code-block:: python + + python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 + Batch size ---------- @@ -795,8 +801,8 @@ Lightning supports the use of TorchElastic to enable fault-tolerant and elastic .. code-block:: python Trainer(gpus=8, accelerator='ddp') - - + + Following the `TorchElastic Quickstart documentation `_, you then need to start a single-node etcd server on one of the hosts: .. code-block:: bash @@ -804,8 +810,8 @@ Following the `TorchElastic Quickstart documentation `_ for details on installation and more use cases. From 4ff1a46995592b5f8cc084521c14f4a77a4e5b68 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 11:52:55 +0000 Subject: [PATCH 07/13] udpate --- docs/source/multi_gpu.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index f436ecdca091f..8be564f57980d 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -694,13 +694,9 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -To use Sequential Model Parallelism, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]`` +First, install FairScale install all extras using ``pip install pytorch-lightning["extra"]`` and ``pip install pytorch-lightning-bolts`` -.. code-block:: bash - - pip install https://github.com/facebookresearch/fairscale/archive/bb468670838b98dc8f8d67be4eabf195042a7994.zip - To use Sequential Model Parallelism, you must define a :class:`nn.Sequential ` module that defines the layers you wish to parallelize across GPUs. This should be kept within the ``sequential_module`` variable within your ``LightningModule`` like below. From c4d456a3377915fdd197285e14e0e288cc1865ab Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 11:54:08 +0000 Subject: [PATCH 08/13] update doc --- docs/source/multi_gpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 8be564f57980d..233cad9fb5ea6 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -694,8 +694,8 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -First, install FairScale install all extras using ``pip install pytorch-lightning["extra"]`` -and ``pip install pytorch-lightning-bolts`` +To get started, install all extras using with ``pip install pytorch-lightning["extra"]`` +and Bolts with ``pip install pytorch-lightning-bolts`` To use Sequential Model Parallelism, you must define a :class:`nn.Sequential ` module that defines the layers you wish to parallelize across GPUs. This should be kept within the ``sequential_module`` variable within your ``LightningModule`` like below. 
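As a reading aid for the ``balance`` argument used throughout this series: each entry gives the number of consecutive layers of the ``nn.Sequential`` placed on the corresponding GPU. The sketch below is not taken from the patches themselves; the layer sizes and the ``[4, 2]`` split are made up purely to illustrate an uneven partition across two GPUs.

.. code-block:: python

    import torch
    from torch import nn
    from pytorch_lightning import LightningModule, Trainer
    from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin

    class MySequentialModel(LightningModule):
        def __init__(self):
            super().__init__()
            # Six consecutive layers; sizes are arbitrary, chosen only to show the split.
            self.sequential_module = nn.Sequential(
                nn.Linear(32, 64), nn.ReLU(),
                nn.Linear(64, 64), nn.ReLU(),
                nn.Linear(64, 64),
                nn.Linear(64, 10),
            )
        ...  # training_step, configure_optimizers, etc. as usual

    # First four layers land on GPU 0, the last two on GPU 1.
    model = MySequentialModel()
    plugin = DDPSequentialPlugin(balance=[4, 2])
    trainer = Trainer(accelerator='ddp', gpus=2, plugins=[plugin])
    trainer.fit(model)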
From 49b408ba6323e4bf14a9225afdfeb63e01568850 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 9 Dec 2020 13:02:45 +0000 Subject: [PATCH 09/13] update doc --- docs/source/multi_gpu.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 233cad9fb5ea6..3dfc726c9fd6c 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -694,8 +694,13 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -To get started, install all extras using with ``pip install pytorch-lightning["extra"]`` -and Bolts with ``pip install pytorch-lightning-bolts`` +To get started, install FairScale through extras using with ``pip install pytorch-lightning["extra"]`` + +or directly using + +.. code-block:: bash + + pip install https://github.com/PyTorchLightning/fairscale/archive/pl_1.1.0.zip To use Sequential Model Parallelism, you must define a :class:`nn.Sequential ` module that defines the layers you wish to parallelize across GPUs. This should be kept within the ``sequential_module`` variable within your ``LightningModule`` like below. @@ -716,6 +721,7 @@ This should be kept within the ``sequential_module`` variable within your ``Ligh trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) trainer.fit(model) +To run the example, you will need Bolts. Install with ``pip install pytorch-lightning-bolts`` We provide a minimal example of Sequential Model Parallelism using a convolutional model training on cifar10, split onto GPUs `here `_. From f7520815e75cbe1023cf5a94dfaf24be17e90d7c Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Dec 2020 13:31:07 +0000 Subject: [PATCH 10/13] Formatting, update sharded zip link --- docs/source/multi_gpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 3dfc726c9fd6c..de1cbb80ed30c 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -667,7 +667,7 @@ To use Sharded Training, you need to first install FairScale using the command b .. code-block:: bash - pip install https://github.com/facebookresearch/fairscale/archive/bb468670838b98dc8f8d67be4eabf195042a7994.zip + pip install https://github.com/PyTorchLightning/fairscale/archive/pl_1.1.0.zip .. code-block:: python @@ -721,9 +721,9 @@ This should be kept within the ``sequential_module`` variable within your ``Ligh trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) trainer.fit(model) -To run the example, you will need Bolts. Install with ``pip install pytorch-lightning-bolts`` We provide a minimal example of Sequential Model Parallelism using a convolutional model training on cifar10, split onto GPUs `here `_. +To run the example, you will to install `Bolts `_. Install with ``pip install pytorch-lightning-bolts``. When running the Sequential Model Parallelism example on 2 GPUS we achieve these memory savings. 
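The figures in the GPU memory table come from running the cifar10 example on 2 GPUs; to collect comparable numbers for your own model, a rough sketch using stock PyTorch memory counters (run inside each DDP process, nothing specific to the plugin) could look like this:

.. code-block:: python

    import torch

    # Reset the peak-memory counter on this process' GPU before the steps to profile.
    torch.cuda.reset_peak_memory_stats()

    # ... run a few training batches here ...

    peak_mb = torch.cuda.max_memory_allocated() / 2 ** 20
    rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
    print(f"rank {rank}: peak GPU memory {peak_mb:.0f} MB")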
From 681a666ba0363c3f02c84a061290c66236c79ec1 Mon Sep 17 00:00:00 2001 From: chaton Date: Wed, 9 Dec 2020 13:36:01 +0000 Subject: [PATCH 11/13] Update docs/source/multi_gpu.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos MocholĂ­ --- docs/source/multi_gpu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index de1cbb80ed30c..cf40ec41a8e19 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -723,7 +723,7 @@ This should be kept within the ``sequential_module`` variable within your ``Ligh We provide a minimal example of Sequential Model Parallelism using a convolutional model training on cifar10, split onto GPUs `here `_. -To run the example, you will to install `Bolts `_. Install with ``pip install pytorch-lightning-bolts``. +To run the example, you need to install `Bolts `_. Install with ``pip install pytorch-lightning-bolts``. When running the Sequential Model Parallelism example on 2 GPUS we achieve these memory savings. From 5e24528b68edcef2af59f62ae4c40d84e31273d0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 9 Dec 2020 14:51:10 +0100 Subject: [PATCH 12/13] Apply suggestions from code review --- docs/source/multi_gpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index cf40ec41a8e19..2ce66c9a719ca 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -743,13 +743,13 @@ When running the Sequential Model Parallelism example on 2 GPUS we achieve these To run the example with Sequential Model Parallelism: -.. code-block:: python +.. code-block:: bash python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 2 --accelerator ddp --use_ddp_sequential To run the same example without Sequential Model Parallelism: -.. code-block:: python +.. code-block:: bash python pl_examples/basic_examples/conv_sequential_example.py --batch_size 1024 --gpus 1 From cc979042755d34972c6897ccaef34142e6d9ff8f Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Dec 2020 14:16:46 +0000 Subject: [PATCH 13/13] Reference directly to section --- docs/source/performance.rst | 17 ----------------- docs/source/training_tricks.rst | 17 ----------------- 2 files changed, 34 deletions(-) diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 7eab4c8bdf6a0..394f6e5f3ca13 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -139,21 +139,4 @@ Sequential Model Parallelism with Checkpointing PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. -.. code-block:: python - - from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin - from pytorch_lightning import LightningModule - - class MyModel(LightningModule): - def __init__(self): - ... - self.sequential_module = torch.nn.Sequential(my_layers) - - # Split my module across 4 gpus, one layer each - model = MyModel() - plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) - trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) - trainer.fit(model) - - For more information, refer to :ref:`sequential-parallelism`. 
diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 29810d7733b59..10ee668a97fa8 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -130,21 +130,4 @@ Sequential Model Parallelism with Checkpointing PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. -.. code-block:: python - - from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin - from pytorch_lightning import LightningModule - - class MyModel(LightningModule): - def __init__(self): - ... - self.sequential_module = torch.nn.Sequential(my_layers) - - # Split my module across 4 gpus, one layer each - model = MyModel() - plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1]) - trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin]) - trainer.fit(model) - - For more information, refer to :ref:`sequential-parallelism`. \ No newline at end of file
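The checkpointing half of the technique can also be seen in isolation, without pipeline parallelism or the plugin added in this series. The single-GPU sketch below uses PyTorch's built-in ``torch.utils.checkpoint.checkpoint_sequential``: activations inside each segment are dropped during the forward pass and recomputed during the backward pass, trading extra compute for a lower peak memory footprint (the layer sizes and segment count here are arbitrary).

.. code-block:: python

    import torch
    from torch import nn
    from torch.utils.checkpoint import checkpoint_sequential

    model = nn.Sequential(
        nn.Linear(1024, 1024), nn.ReLU(),
        nn.Linear(1024, 1024), nn.ReLU(),
        nn.Linear(1024, 10),
    ).cuda()

    # requires_grad on the input lets the checkpointed segments rebuild their graph on backward.
    x = torch.randn(64, 1024, device="cuda", requires_grad=True)

    # Split the sequential model into 2 segments; only the segment boundaries keep activations.
    out = checkpoint_sequential(model, 2, x)
    out.sum().backward()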