
Commit ef8ef12

Authored by tchaton, SeanNaren, Borda and carmocca
[feat] pp 2/n (#5026)
* Added changes for RPC plugin
* Add missing kwargs
* Fix code format
* Loading refactors by introducing is_distributed var, fix optimizer step flow
* Add rpc guard
* Added docstrings and typing
* resolve comments
* Add additional rpc hook, refactor name of exit process hook for clarity
* remove annotation
* Modify behaviour to allow optional return, add test for rpc plugin
* resolve tests
* rename is_ddp_based
* update
* update for windows
* update
* resolve test
* code smell
* Added sequential plugin
* resolve bug
* update
* cleanup
* add Exception
* resolve docs
* Remove ddp support
* Revert distributed -> ddp
* Update pl_examples/basic_examples/conv_sequential_example.py (Co-authored-by: Jirka Borovec <[email protected]>)
* Update pl_examples/basic_examples/conv_sequential_example.py (Co-authored-by: Jirka Borovec <[email protected]>)
* Update pytorch_lightning/plugins/ddp_sequential_plugin.py (Co-authored-by: Jirka Borovec <[email protected]>)
* Address code review points
* Update pytorch_lightning/plugins/ddp_sequential_plugin.py (Co-authored-by: Jirka Borovec <[email protected]>)
* Update pytorch_lightning/plugins/ddp_sequential_plugin.py (Co-authored-by: Jirka Borovec <[email protected]>)
* Add missing return
* Fix formatting, add datamodule args
* add small comment
* resolve comments
* resolve comments
* update source for fairscale
* update extras
* remove staticmethod
* resolve flake8
* Skip tests that are failing due to bug upstream with multiple optimizers and shard
* update
* update on comments
* clean test
* latest comments
* remove old comments
* add todo
* Update version
* update
* resolve bugs
* resolve bugs
* update test
* remove hanging test
* Update pytorch_lightning/plugins/ddp_sequential_plugin.py (Co-authored-by: Carlos Mocholí <[email protected]>)
* resolve on comments
* Update pytorch_lightning/plugins/ddp_sequential_plugin.py (Co-authored-by: Carlos Mocholí <[email protected]>)
* resolve on comments
* Update pytorch_lightning/plugins/ddp_sequential_plugin.py (Co-authored-by: Carlos Mocholí <[email protected]>)
* Update pytorch_lightning/plugins/ddp_sequential_plugin.py (Co-authored-by: Carlos Mocholí <[email protected]>)
* Update pytorch_lightning/plugins/ddp_sequential_plugin.py (Co-authored-by: Carlos Mocholí <[email protected]>)
* Update pytorch_lightning/plugins/ddp_sequential_plugin.py (Co-authored-by: Carlos Mocholí <[email protected]>)
* remove ImportError

Co-authored-by: SeanNaren <[email protected]>
Co-authored-by: Sean Naren <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent 7d9784e commit ef8ef12

File tree

13 files changed: +881 / -27 lines

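Before the per-file diffs, here is a minimal, hedged sketch of how the new plugin is attached to a Trainer, distilled from the example script added in this commit; the gpus and max_epochs values simply mirror the run command in that script's docstring, and a multi-GPU machine with FairScale and PyTorch 1.6 is assumed.

    # Sketch only: mirrors the wiring in the example script added below.
    import pytorch_lightning as pl
    from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin

    def build_trainer() -> pl.Trainer:
        # The LightningModule to be pipelined exposes its layers as
        # `self.sequential_module` (an nn.Sequential), as LitResnet does below.
        return pl.Trainer(
            accelerator='ddp',
            gpus=4,
            max_epochs=1,
            plugins=[DDPSequentialPlugin()],  # splits sequential_module across the GPUs
        )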

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -32,5 +32,6 @@ repos:
         types: [python]

   - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: master
     hooks:
       - id: mypy
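To try the new hook locally, the usual invocation is pre-commit run mypy --all-files (assuming pre-commit is installed in your environment).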

benchmarks/test_sharded_parity.py

Lines changed: 3 additions & 1 deletion
@@ -131,6 +131,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None):
     )


+@pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 @pytest.mark.skipif(platform.system() == "Windows",
                     reason="Distributed training is not supported on Windows")
@@ -148,6 +149,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim():
     )


+@pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 @pytest.mark.skipif(platform.system() == "Windows",
                     reason="Distributed training is not supported on Windows")
@@ -189,7 +191,7 @@ def training_step(self, batch, batch_idx, optimizer_idx):

         # ensure we forward the correct params to the optimizer
         # without retain_graph we can't do multiple backward passes
-        self.manual_backward(loss_2, opt_b, retain_graph=True)
+        self.manual_backward(loss_2, opt_b)
         # todo: understand why synchronization breaks there.
         # self.manual_backward(loss_2, opt_a, retain_graph=True)
         opt_b.step()
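The last hunk drops retain_graph=True from the second manual_backward call. For context, a hedged sketch of the manual-optimization pattern this test family exercises, with two optimizers each doing its own backward and step; the module and layer names below are illustrative, not taken from the test file.

    # Illustrative module, not from the test file: two optimizers, manual backward/step.
    import torch
    import pytorch_lightning as pl

    class TwoOptimizerModule(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer_a = torch.nn.Linear(32, 2)
            self.layer_b = torch.nn.Linear(32, 2)

        @property
        def automatic_optimization(self) -> bool:
            # Disable Lightning's automatic loop; backward/step are called by hand.
            return False

        def training_step(self, batch, batch_idx, optimizer_idx=None):
            opt_a, opt_b = self.optimizers()

            # Optimizer A: its own forward, backward and step.
            loss_a = self.layer_a(batch).sum()
            self.manual_backward(loss_a, opt_a)
            opt_a.step()
            opt_a.zero_grad()

            # Optimizer B: a separate graph, so retain_graph is not needed,
            # matching the change in the hunk above.
            loss_b = self.layer_b(batch).sum()
            self.manual_backward(loss_b, opt_b)
            opt_b.step()
            opt_b.zero_grad()

        def configure_optimizers(self):
            return (
                torch.optim.SGD(self.layer_a.parameters(), lr=0.1),
                torch.optim.SGD(self.layer_b.parameters(), lr=0.1),
            )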

pl_examples/basic_examples/conv_sequential_example.py

Lines changed: 216 additions & 0 deletions

@@ -0,0 +1,216 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+Example script of running the experimental DDP Sequential Plugin.
+This script splits a convolutional model onto multiple GPUs, whilst using the internal built in balancer
+to balance across your GPUs.
+
+To run:
+python conv_model_sequential_example.py --accelerator ddp --gpus 4 --max_epochs 1 --batch_size 256 --use_ddp_sequential
+"""
+import math
+from argparse import ArgumentParser
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+
+import pytorch_lightning as pl
+from pytorch_lightning import Trainer
+from pytorch_lightning.metrics.functional import accuracy
+from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin
+from pytorch_lightning.utilities import BOLTS_AVAILABLE, FAIRSCALE_PIPE_AVAILABLE
+
+if BOLTS_AVAILABLE:
+    import pl_bolts
+    from pl_bolts.transforms.dataset_normalizations import cifar10_normalization
+
+
+#####################
+#      Modules      #
+#####################
+
+
+class Flatten(nn.Module):
+    def forward(self, x):
+        return x.view(x.size(0), -1)
+
+###############################
+#       LightningModule       #
+###############################
+
+
+class LitResnet(pl.LightningModule):
+    def __init__(self, lr=0.05, batch_size=32, manual_optimization=False):
+        super().__init__()
+
+        self.save_hyperparameters()
+        self.sequential_module = nn.Sequential(
+            # Conv Layer block 1
+            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
+            nn.BatchNorm2d(32),
+            nn.ReLU(inplace=False),
+            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
+            nn.ReLU(inplace=False),
+            nn.MaxPool2d(kernel_size=2, stride=2),
+
+            # Conv Layer block 2
+            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
+            nn.BatchNorm2d(128),
+            nn.ReLU(inplace=False),
+            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
+            nn.ReLU(inplace=False),
+            nn.MaxPool2d(kernel_size=2, stride=2),
+            nn.Dropout2d(p=0.05),
+
+            # Conv Layer block 3
+            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
+            nn.BatchNorm2d(256),
+            nn.ReLU(inplace=False),
+            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
+            nn.ReLU(inplace=False),
+            nn.MaxPool2d(kernel_size=2, stride=2),
+
+            Flatten(),
+
+            nn.Dropout(p=0.1),
+            nn.Linear(4096, 1024),
+            nn.ReLU(inplace=False),
+            nn.Linear(1024, 512),
+            nn.ReLU(inplace=False),
+            nn.Dropout(p=0.1),
+            nn.Linear(512, 10)
+        )
+        self._example_input_array = torch.randn((1, 3, 32, 32))
+        self._manual_optimization = manual_optimization
+        if self._manual_optimization:
+            self.training_step = self.training_step_manual
+
+    def forward(self, x):
+        out = self.sequential_module(x)
+        return F.log_softmax(out, dim=-1)
+
+    def training_step_manual(self, batch, batch_idx):
+        opt = self.optimizers()
+
+        def closure():
+            x, y = batch
+            logits = self.forward(x)
+            loss = F.nll_loss(logits, y)
+            self.manual_backward(loss, opt)
+            self.log('train_loss', loss, prog_bar=True)
+
+        opt.step(closure=closure)
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self.forward(x)
+        loss = F.nll_loss(logits, y)
+        self.log('Training Loss', loss)
+        return loss
+
+    def _evaluate(self, batch, batch_idx, stage=None):
+        x, y = batch
+        out = self.forward(x)
+        logits = F.log_softmax(out, dim=-1)
+        loss = F.nll_loss(logits, y)
+        preds = torch.argmax(logits, dim=-1)
+        acc = accuracy(preds, y)
+
+        if stage:
+            self.log(f'{stage}_loss', loss, prog_bar=True)
+            self.log(f'{stage}_acc', acc, prog_bar=True)
+
+        return loss, acc
+
+    def validation_step(self, batch, batch_idx):
+        return self._evaluate(batch, batch_idx, 'val')[0]
+
+    def test_step(self, batch, batch_idx):
+        loss, acc = self._evaluate(batch, batch_idx, 'test')
+        self.log_dict({'test_loss': loss, 'test_acc': acc})
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.lr, momentum=0.9, weight_decay=5e-4)
+        return {
+            'optimizer': optimizer,
+            'lr_scheduler': {
+                'scheduler': torch.optim.lr_scheduler.OneCycleLR(
+                    optimizer,
+                    0.1,
+                    epochs=self.trainer.max_epochs,
+                    steps_per_epoch=math.ceil(45000 / self.hparams.batch_size)),
+                'interval': 'step',
+            }
+        }
+
+    @property
+    def automatic_optimization(self) -> bool:
+        return not self._manual_optimization
+
+
+#################################
+#    Instantiate Data Module    #
+#################################
+
+def instantiate_datamodule(args):
+    train_transforms = torchvision.transforms.Compose([
+        torchvision.transforms.RandomCrop(32, padding=4),
+        torchvision.transforms.RandomHorizontalFlip(),
+        torchvision.transforms.ToTensor(),
+        cifar10_normalization(),
+    ])
+
+    test_transforms = torchvision.transforms.Compose([
+        torchvision.transforms.ToTensor(),
+        cifar10_normalization(),
+    ])
+
+    cifar10_dm = pl_bolts.datamodules.CIFAR10DataModule(
+        batch_size=args.batch_size,
+        train_transforms=train_transforms,
+        test_transforms=test_transforms,
+        val_transforms=test_transforms,
+    )
+
+    return cifar10_dm
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Pipe Example")
+    parser.add_argument("--use_ddp_sequential", action="store_true")
+    parser = Trainer.add_argparse_args(parser)
+    parser = pl_bolts.datamodules.CIFAR10DataModule.add_argparse_args(parser)
+    args = parser.parse_args()
+
+    assert BOLTS_AVAILABLE, "Bolts is required for this example, install it via pip install pytorch-lightning-bolts"
+    assert FAIRSCALE_PIPE_AVAILABLE, "FairScale and PyTorch 1.6 is required for this example."
+
+    cifar10_dm = instantiate_datamodule(args)
+
+    plugins = None
+    if args.use_ddp_sequential:
+        plugins = DDPSequentialPlugin()
+
+    model = LitResnet(batch_size=args.batch_size, manual_optimization=not args.automatic_optimization)
+
+    trainer = pl.Trainer.from_argparse_args(args, plugins=[plugins] if plugins else None)
+    trainer.fit(model, cifar10_dm)
+    trainer.test(model, datamodule=cifar10_dm)
+
+    if trainer.accelerator_backend.rpc_enabled:
+        # Called at the end of trainer to ensure all processes are killed
+        trainer.accelerator_backend.ddp_plugin.exit_rpc_process()

pytorch_lightning/overrides/data_parallel.py

Lines changed: 3 additions & 1 deletion
@@ -155,6 +155,7 @@ class LightningDistributedDataParallel(DistributedDataParallel):
     """
     Override the forward call in lightning so it goes to training and validation step respectively
     """
+    PREPARE_FOR_BACKWARDS = True

     def parallel_apply(self, replicas, inputs, kwargs):
         return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
@@ -165,6 +166,7 @@ def forward(self, *inputs, **kwargs): # pragma: no-cover
         fx_called: str = ''

         if self.device_ids:
+
             inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
             if len(self.device_ids) == 1:
                 # --------------
@@ -195,7 +197,7 @@ def forward(self, *inputs, **kwargs): # pragma: no-cover
         else:
             output = self.module.validation_step(*inputs, **kwargs)

-        if not self._reducer_prepared_for_backwards:
+        if not self._reducer_prepared_for_backwards and self.PREPARE_FOR_BACKWARDS:
             self.reducer_prepare_for_backwards(output)

         if output is None:
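What the new class attribute buys, as a hedged sketch: a subclass (the name below is hypothetical, not part of this commit) can switch off the eager reducer preparation so that backward bookkeeping is deferred to whatever drives the pipeline instead of happening inside forward().

    # Hypothetical subclass, not part of this commit: opts out of eager reducer preparation.
    from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel

    class PipelineFriendlyDDP(LightningDistributedDataParallel):
        # With the flag off, forward() skips reducer_prepare_for_backwards(output).
        PREPARE_FOR_BACKWARDS = False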
