From 808edcdebf4bb51a1e4c56ec7c7be190d8a202b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 27 Oct 2021 13:16:09 +0200 Subject: [PATCH 1/4] update type (#10163) --- pytorch_lightning/plugins/precision/precision_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/precision/precision_plugin.py b/pytorch_lightning/plugins/precision/precision_plugin.py index ed60df7dd971a..62e1e33232480 100644 --- a/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/plugins/precision/precision_plugin.py @@ -103,7 +103,7 @@ def post_backward(self, model: "pl.LightningModule", closure_loss: Tensor) -> Te model.trainer.call_hook("on_after_backward") return closure_loss - def _run_backward(self, tensor: Tensor, model: Module, *args: Any, **kwargs: Any) -> None: + def _run_backward(self, tensor: Tensor, model: Optional[Module], *args: Any, **kwargs: Any) -> None: """Lightning-independent backward logic. Currently only used by Lightning Lite. Subject to further refactors. From dbe1662dc38d5217328ab459743b1113869a628c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 27 Oct 2021 14:38:39 +0200 Subject: [PATCH 2/4] Replace `_TORCH_GREATER_EQUAL_DEV_1_10` with `_TORCH_GREATER_EQUAL_1_10` (#10157) --- docs/source/advanced/mixed_precision.rst | 4 ++-- docs/source/conf.py | 2 +- pytorch_lightning/callbacks/quantization.py | 6 +++--- pytorch_lightning/core/lightning.py | 4 ++-- pytorch_lightning/plugins/precision/native_amp.py | 8 ++++---- pytorch_lightning/utilities/__init__.py | 1 - pytorch_lightning/utilities/imports.py | 2 +- tests/core/test_lightning_module.py | 6 +----- tests/models/test_amp.py | 6 ++---- tests/plugins/test_amp_plugins.py | 5 ++--- 10 files changed, 18 insertions(+), 26 deletions(-) diff --git a/docs/source/advanced/mixed_precision.rst b/docs/source/advanced/mixed_precision.rst index 1c98f663ed5f3..9889c05db243d 100644 --- a/docs/source/advanced/mixed_precision.rst +++ b/docs/source/advanced/mixed_precision.rst @@ -50,14 +50,14 @@ BFloat16 Mixed precision is similar to FP16 mixed precision, however we maintain Since BFloat16 is more stable than FP16 during training, we do not need to worry about any gradient scaling or nan gradient values that comes with using FP16 mixed precision. .. testcode:: - :skipif: not _TORCH_GREATER_EQUAL_DEV_1_10 or not torch.cuda.is_available() + :skipif: not _TORCH_GREATER_EQUAL_1_10 or not torch.cuda.is_available() Trainer(gpus=1, precision="bf16") It is also possible to use BFloat16 mixed precision on the CPU, relying on MKLDNN under the hood. .. 
testcode:: - :skipif: not _TORCH_GREATER_EQUAL_DEV_1_10 + :skipif: not _TORCH_GREATER_EQUAL_1_10 Trainer(precision="bf16") diff --git a/docs/source/conf.py b/docs/source/conf.py index f5f9605263217..16b2ed7509ee3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -373,7 +373,7 @@ def package_list_from_file(file): _XLA_AVAILABLE, _TPU_AVAILABLE, _TORCHVISION_AVAILABLE, - _TORCH_GREATER_EQUAL_DEV_1_10, + _TORCH_GREATER_EQUAL_1_10, _module_available, ) _JSONARGPARSE_AVAILABLE = _module_available("jsonargparse") diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py index bf0088575e8b4..ca82a574f71d1 100644 --- a/pytorch_lightning/callbacks/quantization.py +++ b/pytorch_lightning/callbacks/quantization.py @@ -33,10 +33,10 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_DEV_1_10 +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_10 from pytorch_lightning.utilities.exceptions import MisconfigurationException -if _TORCH_GREATER_EQUAL_DEV_1_10: +if _TORCH_GREATER_EQUAL_1_10: from torch.ao.quantization.qconfig import QConfig else: from torch.quantization import QConfig @@ -245,7 +245,7 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - # version=None corresponds to using FakeQuantize rather than # FusedMovingAvgObsFakeQuantize which was introduced in PT1.10 # details in https://github.com/pytorch/pytorch/issues/64564 - extra_kwargs = dict(version=None) if _TORCH_GREATER_EQUAL_DEV_1_10 else {} + extra_kwargs = dict(version=None) if _TORCH_GREATER_EQUAL_1_10 else {} pl_module.qconfig = torch.quantization.get_default_qat_qconfig(self._qconfig, **extra_kwargs) elif isinstance(self._qconfig, QConfig): diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 7a58f91adda7d..cfac84be1367b 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -39,7 +39,7 @@ from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.utilities import ( _IS_WINDOWS, - _TORCH_GREATER_EQUAL_DEV_1_10, + _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType, rank_zero_deprecation, rank_zero_warn, @@ -2043,7 +2043,7 @@ def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None: These hooks ensure that ShardedTensors are included when saving, and are loaded the LightningModule correctly. 
""" - if not _TORCH_GREATER_EQUAL_DEV_1_10 or _IS_WINDOWS: + if not _TORCH_GREATER_EQUAL_1_10 or _IS_WINDOWS: return from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook diff --git a/pytorch_lightning/plugins/precision/native_amp.py b/pytorch_lightning/plugins/precision/native_amp.py index 3fc903cbb3fce..487d80005c222 100644 --- a/pytorch_lightning/plugins/precision/native_amp.py +++ b/pytorch_lightning/plugins/precision/native_amp.py @@ -21,10 +21,10 @@ import pytorch_lightning as pl from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_DEV_1_10, AMPType +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_10, AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException -if _TORCH_GREATER_EQUAL_DEV_1_10: +if _TORCH_GREATER_EQUAL_1_10: from torch import autocast else: from torch.cuda.amp import autocast @@ -47,7 +47,7 @@ def __init__(self, precision: Union[int, str] = 16, use_cpu: bool = False) -> No def _select_precision_dtype(self, precision: Union[int, str] = 16) -> torch.dtype: if precision == "bf16": - if not _TORCH_GREATER_EQUAL_DEV_1_10: + if not _TORCH_GREATER_EQUAL_1_10: raise MisconfigurationException( "To use bfloat16 with native amp you must install torch greater or equal to 1.10." ) @@ -97,7 +97,7 @@ def optimizer_step( self.scaler.update() def autocast_context_manager(self) -> autocast: - if _TORCH_GREATER_EQUAL_DEV_1_10: + if _TORCH_GREATER_EQUAL_1_10: return autocast("cpu" if self.use_cpu else "cuda", dtype=self._dtype) return autocast() diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index bc19aa1366a55..158d7356c91ce 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -48,7 +48,6 @@ _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, - _TORCH_GREATER_EQUAL_DEV_1_10, _TORCH_QUANTIZE_AVAILABLE, _TORCHTEXT_AVAILABLE, _TORCHVISION_AVAILABLE, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index c7ad70895672a..811e81a370601 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -75,7 +75,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1") _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0") _TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") -_TORCH_GREATER_EQUAL_DEV_1_10 = _compare_version("torch", operator.ge, "1.10.0", use_base_version=True) +# _TORCH_GREATER_EQUAL_DEV_1_11 = _compare_version("torch", operator.ge, "1.11.0", use_base_version=True) _APEX_AVAILABLE = _module_available("apex.amp") _DEEPSPEED_AVAILABLE = _module_available("deepspeed") diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index d661228ee09d8..ff8ffa3c50acd 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -21,7 +21,6 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger -from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_DEV_1_10 from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -312,10 +311,7 @@ def __init__(self, spec): 
self.sharded_tensor.local_shards()[0].tensor.fill_(0) -@pytest.mark.skipif( - not _TORCH_GREATER_EQUAL_DEV_1_10, reason="Test requires the torch version to support `ShardedTensor`" -) -@pytest.mark.skipif(_IS_WINDOWS, reason="Not supported on Windows") +@RunIf(min_torch="1.10", skip_windows=True) def test_sharded_tensor_state_dict(tmpdir, single_process_pg): spec = dist._sharding_spec.ChunkShardingSpec( dim=0, diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 716c0f17f203d..86863238da057 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -22,7 +22,6 @@ import tests.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.plugins.environments import SLURMEnvironment -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_DEV_1_10 from tests.helpers import BoringModel, RandomDataset from tests.helpers.runif import RunIf @@ -68,7 +67,7 @@ def _assert_autocast_enabled(self): assert torch.is_autocast_enabled() -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_DEV_1_10, reason="Needs bfloat16 support") +@RunIf(min_torch="1.10") @pytest.mark.parametrize( "strategy", [ @@ -95,8 +94,7 @@ def test_amp_cpus(tmpdir, strategy, precision, num_processes): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(min_gpus=2) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_DEV_1_10, reason="Needs bfloat16 support") +@RunIf(min_gpus=2, min_torch="1.10") @pytest.mark.parametrize("strategy", [None, "dp", "ddp_spawn"]) @pytest.mark.parametrize("precision", [16, "bf16"]) @pytest.mark.parametrize("gpus", [1, 2]) diff --git a/tests/plugins/test_amp_plugins.py b/tests/plugins/test_amp_plugins.py index ed8c653b3a78f..227d898a7da40 100644 --- a/tests/plugins/test_amp_plugins.py +++ b/tests/plugins/test_amp_plugins.py @@ -21,7 +21,6 @@ from pytorch_lightning import Trainer from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_DEV_1_10 from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -178,7 +177,7 @@ def test_amp_apex_ddp_spawn_fit(amp_level, tmpdir): trainer.fit(model) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_DEV_1_10, reason="Torch CPU AMP is not available.") +@RunIf(min_torch="1.10") def test_cpu_amp_precision_context_manager(tmpdir): """Test to ensure that the context manager correctly is set to CPU + bfloat16, and a scaler isn't set.""" plugin = NativeMixedPrecisionPlugin(precision="bf16", use_cpu=True) @@ -197,7 +196,7 @@ def test_precision_selection_raises(monkeypatch): import pytorch_lightning.plugins.precision.native_amp as amp - monkeypatch.setattr(amp, "_TORCH_GREATER_EQUAL_DEV_1_10", False) + monkeypatch.setattr(amp, "_TORCH_GREATER_EQUAL_1_10", False) with pytest.warns( UserWarning, match=r"precision=16\)` but native AMP is not supported on CPU. 
Using `precision='bf16" ), pytest.raises(MisconfigurationException, match="must install torch greater or equal to 1.10"): From 121607753f5aac03d423a70881980fac1c66d81e Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 27 Oct 2021 16:52:50 +0100 Subject: [PATCH 3/4] update --- pl_examples/basic_examples/README.md | 8 +++++--- pl_examples/basic_examples/mnist_examples/README.md | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md index cda779c459ad6..63fbc360a8c60 100644 --- a/pl_examples/basic_examples/README.md +++ b/pl_examples/basic_examples/README.md @@ -6,7 +6,7 @@ Use these examples to test how Lightning works. 5 MNIST examples showing how to gradually convert from pure PyTorch to PyTorch Lightning. -The transition through [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst) from pure PyTorch is optional but it might helpful to learn about it. +The transition through [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst) from pure PyTorch is optional but it might be helpful to learn about it. #### 1 . Image Classifier with Vanilla PyTorch @@ -30,6 +30,8 @@ python mnist_examples/image_classifier_2_lite.py ______________________________________________________________________ +#### 3. Image Classifier - Conversion Lite to Lightning + Trains a simple CNN over MNIST where `LightningLite` is almost a `LightningModule`. ```bash @@ -64,8 +66,8 @@ python mnist_examples/image_classifier_5_lightning_datamodule.py # gpus (any number) python mnist_examples/image_classifier_5_lightning_datamodule.py --trainer.gpus 2 -# Distributed Data Parallel -python backbone_image_classifier.py --trainer.gpus 2 --trainer.accelerator ddp +# data parallel +python mnist_examples/image_classifier_5_lightning_datamodule.py --trainer.gpus 2 --trainer.accelerator 'dp' ``` ______________________________________________________________________ diff --git a/pl_examples/basic_examples/mnist_examples/README.md b/pl_examples/basic_examples/mnist_examples/README.md index 323273d9ff718..b2cf63661e9ca 100644 --- a/pl_examples/basic_examples/mnist_examples/README.md +++ b/pl_examples/basic_examples/mnist_examples/README.md @@ -2,7 +2,7 @@ 5 MNIST examples showing how to gradually convert from pure PyTorch to PyTorch Lightning. -The transition through [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst) from pure PyTorch is optional but it might helpful to learn about it. +The transition through [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst) from pure PyTorch is optional but it might be helpful to learn about it. #### 1 . 
Image Classifier with Vanilla PyTorch

@@ -43,10 +43,10 @@ Trains a simple CNN over MNIST with `Lightning Trainer` and the converted `Light
 ```bash
 # cpu
-python mnist_examples/image_classifier_4_lightning.py
+python image_classifier_4_lightning.py

 # gpus (any number)
-python mnist_examples/image_classifier_4_lightning.py --trainer.gpus 2
+python image_classifier_4_lightning.py --trainer.gpus 2
 ```

 ______________________________________________________________________

From 46d2d54c17fa61a1a7719f4cac73017aa7edcd7d Mon Sep 17 00:00:00 2001
From: tchaton
Date: Wed, 27 Oct 2021 18:32:30 +0100
Subject: [PATCH 4/4] update

---
 pl_examples/basic_examples/README.md          |   9 +-
 .../basic_examples/mnist_examples/README.md   |   9 +-
 .../image_classifier_1_pytorch.py             | 146 +++++++++---------
 .../mnist_examples/image_classifier_2_lite.py | 127 ++++++++-------
 .../image_classifier_3_lite_to_lightning.py   |  96 ++++++------
 .../image_classifier_4_lightning.py           |   7 +-
 ...image_classifier_5_lightning_datamodule.py |   3 +-
 7 files changed, 190 insertions(+), 207 deletions(-)

diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md
index 63fbc360a8c60..b58632cf51158 100644
--- a/pl_examples/basic_examples/README.md
+++ b/pl_examples/basic_examples/README.md
@@ -21,7 +21,7 @@ ______________________________________________________________________

 #### 2. Image Classifier with LightningLite

-Trains a simple CNN over MNIST using [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst).
+This script shows you how to scale the previous script to enable GPU and multi GPU training using [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst).

 ```bash
 # cpu / multiple gpus if available
@@ -32,7 +32,8 @@ ______________________________________________________________________

 #### 3. Image Classifier - Conversion Lite to Lightning

-Trains a simple CNN over MNIST where `LightningLite` is almost a `LightningModule`.
+This script shows you how to prepare your conversion from [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst)
+to a `LightningModule`.

 ```bash
 # cpu / multiple gpus if available
@@ -43,7 +44,7 @@ ______________________________________________________________________

 #### 4. Image Classifier with LightningModule

-Trains a simple CNN over MNIST with `Lightning Trainer` and the converted `LightningModule`.
+This script shows you the result of the conversion to a `LightningModule`, where you finally get all the benefits of Lightning.

 ```bash
 # cpu
@@ -57,7 +58,7 @@ ______________________________________________________________________

 #### 5. Image Classifier with LightningModule + LightningDataModule

-Trains a simple CNN over MNIST with `Lightning Trainer` and the converted `LightningModule` and `LightningDataModule`
+This script shows you how to extract the data-related components into a `LightningDataModule`.

 ```bash
 # cpu
diff --git a/pl_examples/basic_examples/mnist_examples/README.md b/pl_examples/basic_examples/mnist_examples/README.md
index b2cf63661e9ca..68028f7059c6a 100644
--- a/pl_examples/basic_examples/mnist_examples/README.md
+++ b/pl_examples/basic_examples/mnist_examples/README.md
@@ -17,7 +17,7 @@ ______________________________________________________________________

 #### 2. Image Classifier with LightningLite

-Trains a simple CNN over MNIST using [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst).
+This script shows you how to scale the previous script to enable GPU and multi GPU training using [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst).

 ```bash
 # cpu / multiple gpus if available
@@ -28,7 +28,8 @@ ______________________________________________________________________

 #### 3. Image Classifier - Conversion Lite to Lightning

-Trains a simple CNN over MNIST where `LightningLite` is almost a `LightningModule`.
+This script shows you how to prepare your conversion from [LightningLite](https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.rst)
+to a `LightningModule`.

 ```bash
 # cpu / multiple gpus if available
@@ -39,7 +40,7 @@ ______________________________________________________________________

 #### 4. Image Classifier with LightningModule

-Trains a simple CNN over MNIST with `Lightning Trainer` and the converted `LightningModule`.
+This script shows you the result of the conversion to a `LightningModule`, where you finally get all the benefits of Lightning.

 ```bash
 # cpu
@@ -53,7 +54,7 @@ ______________________________________________________________________

 #### 5. Image Classifier with LightningModule + LightningDataModule

-Trains a simple CNN over MNIST with `Lightning Trainer` and the converted `LightningModule` and `LightningDataModule`
+This script shows you how to extract the data-related components into a `LightningDataModule`.

 ```bash
 # cpu
diff --git a/pl_examples/basic_examples/mnist_examples/image_classifier_1_pytorch.py b/pl_examples/basic_examples/mnist_examples/image_classifier_1_pytorch.py
index e7449473194ed..4073c485e6017 100644
--- a/pl_examples/basic_examples/mnist_examples/image_classifier_1_pytorch.py
+++ b/pl_examples/basic_examples/mnist_examples/image_classifier_1_pytorch.py
@@ -52,50 +52,80 @@ def forward(self, x):
         return output


-def train(args, model, device, train_loader, optimizer, epoch):
-    model.train()
-    for batch_idx, (data, target) in enumerate(train_loader):
-        data, target = data.to(device), target.to(device)
-        optimizer.zero_grad()
-        output = model(data)
-        loss = F.nll_loss(output, target)
-        loss.backward()
-        optimizer.step()
-        if (batch_idx == 0) or ((batch_idx + 1) % args.log_interval == 0):
-            print(
-                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
-                    epoch,
-                    batch_idx * len(data),
-                    len(train_loader.dataset),
-                    100.0 * batch_idx / len(train_loader),
-                    loss.item(),
-                )
-            )
-            if args.dry_run:
-                break
+def run(hparams):
+
+    torch.manual_seed(hparams.seed)
+
+    use_cuda = torch.cuda.is_available()
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    transform = T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))])
+    train_dataset = MNIST("./data", train=True, download=True, transform=transform)
+    test_dataset = MNIST("./data", train=False, transform=transform)
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=hparams.batch_size,
+    )
+    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=hparams.batch_size)
+
+    model = Net().to(device)
+    optimizer = optim.Adadelta(model.parameters(), lr=hparams.lr)
+
+    scheduler = StepLR(optimizer, step_size=1, gamma=hparams.gamma)
+
+    # EPOCH LOOP
+    for epoch in range(1, hparams.epochs + 1):

-def test(args, model, device, test_loader):
-    model.eval()
-    test_loss = 0
-    correct = 0
-    with torch.no_grad():
-        for data, target in test_loader:
+        # TRAINING LOOP
+        model.train()
+        for batch_idx, (data, target) in enumerate(train_loader):
             data, target = data.to(device), target.to(device)
+
optimizer.zero_grad() output = model(data) - test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() - if args.dry_run: - break - - test_loss /= len(test_loader.dataset) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if (batch_idx == 0) or ((batch_idx + 1) % hparams.log_interval == 0): + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + if hparams.dry_run: + break + scheduler.step() - print( - "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( - test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset) + # TESTING LOOP + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss + pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + if hparams.dry_run: + break + + test_loss /= len(test_loader.dataset) + + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset) + ) ) - ) + + if hparams.dry_run: + break + + if hparams.save_model: + torch.save(model.state_dict(), "mnist_cnn.pt") def main(): @@ -103,13 +133,9 @@ def main(): parser.add_argument( "--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)" ) - parser.add_argument( - "--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)" - ) parser.add_argument("--epochs", type=int, default=14, metavar="N", help="number of epochs to train (default: 14)") parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)") parser.add_argument("--gamma", type=float, default=0.7, metavar="M", help="Learning rate step gamma (default: 0.7)") - parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training") parser.add_argument("--dry-run", action="store_true", default=False, help="quickly check a single pass") parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") parser.add_argument( @@ -120,40 +146,8 @@ def main(): help="how many batches to wait before logging training status", ) parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model") - args = parser.parse_args() - use_cuda = not args.no_cuda and torch.cuda.is_available() - - torch.manual_seed(args.seed) - - device = torch.device("cuda" if use_cuda else "cpu") - - train_kwargs = {"batch_size": args.batch_size} - test_kwargs = {"batch_size": args.test_batch_size} - if use_cuda: - cuda_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True} - train_kwargs.update(cuda_kwargs) - test_kwargs.update(cuda_kwargs) - - transform = T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))]) - train_dataset = MNIST("./data", train=True, download=True, transform=transform) - test_dataset = MNIST("./data", train=False, transform=transform) - train_loader 
= torch.utils.data.DataLoader(train_dataset, **train_kwargs) - test_loader = torch.utils.data.DataLoader(test_dataset, **test_kwargs) - - model = Net().to(device) - optimizer = optim.Adadelta(model.parameters(), lr=args.lr) - - scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) - for epoch in range(1, args.epochs + 1): - train(args, model, device, train_loader, optimizer, epoch) - test(args, model, device, test_loader) - scheduler.step() - - if args.dry_run: - break - - if args.save_model: - torch.save(model.state_dict(), "mnist_cnn.pt") + hparams = parser.parse_args() + run(hparams) if __name__ == "__main__": diff --git a/pl_examples/basic_examples/mnist_examples/image_classifier_2_lite.py b/pl_examples/basic_examples/mnist_examples/image_classifier_2_lite.py index 78677cdf33bc4..f03850b94e92c 100644 --- a/pl_examples/basic_examples/mnist_examples/image_classifier_2_lite.py +++ b/pl_examples/basic_examples/mnist_examples/image_classifier_2_lite.py @@ -26,73 +26,80 @@ from pytorch_lightning.lite import LightningLite -def train(lite, args, model, train_loader, optimizer, epoch): - model.train() - for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - lite.backward(loss) - optimizer.step() - if (batch_idx == 0) or ((batch_idx + 1) % args.log_interval == 0): - print( - "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( - epoch, - batch_idx * len(data), - len(train_loader.dataset), - 100.0 * batch_idx / len(train_loader), - loss.item(), - ) - ) - if args.dry_run: - break - - -def test(lite, args, model, test_loader): - model.eval() - test_loss = 0 - acc = Accuracy().to(lite.device) - with torch.no_grad(): - for data, target in test_loader: - output = model(data) - test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss - acc.update(output, target) - if args.dry_run: - break - - test_loss = lite.all_gather(test_loss).sum() / len(test_loader.dataset) - - if lite.is_global_zero: - print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: ({acc.compute():.0f}%)\n") - - class Lite(LightningLite): - def run(self, args): - train_kwargs = {"batch_size": args.batch_size} - test_kwargs = {"batch_size": args.test_batch_size} + def run(self, hparams): + self.hparams = hparams + seed_everything(hparams.seed) + transform = T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))]) train_dataset = MNIST("./data", train=True, download=True, transform=transform) test_dataset = MNIST("./data", train=False, transform=transform) - train_loader = torch.utils.data.DataLoader(train_dataset, **train_kwargs) - test_loader = torch.utils.data.DataLoader(test_dataset, **test_kwargs) + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=hparams.batch_size, + ) + test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=hparams.batch_size) train_loader, test_loader = self.setup_dataloaders(train_loader, test_loader) model = Net() - optimizer = optim.Adadelta(model.parameters(), lr=args.lr) - + optimizer = optim.Adadelta(model.parameters(), lr=hparams.lr) model, optimizer = self.setup(model, optimizer) - scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) - for epoch in range(1, args.epochs + 1): - train(self, args, model, train_loader, optimizer, epoch) - test(self, args, model, test_loader) + scheduler = StepLR(optimizer, step_size=1, gamma=hparams.gamma) + + test_acc = Accuracy() + + # EPOCH LOOP + for epoch in range(1, hparams.epochs + 1): + + # 
TRAINING LOOP + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + + #################### + self.backward(loss) + #################### + + optimizer.step() + if (batch_idx == 0) or ((batch_idx + 1) % hparams.log_interval == 0): + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + if hparams.dry_run: + break + scheduler.step() - if args.dry_run: + # TESTING LOOP + model.eval() + test_loss = 0 + with torch.no_grad(): + for data, target in test_loader: + output = model(data) + test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss + test_acc(output, target) + if hparams.dry_run: + break + + test_loss = self.all_gather(test_loss).sum() / len(test_loader.dataset) + + print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: ({test_acc.compute():.0f}%)\n") + test_acc.reset() + + if hparams.dry_run: break - if args.save_model and self.is_global_zero: + if hparams.save_model and self.is_global_zero: torch.save(model.state_dict(), "mnist_cnn.pt") @@ -102,9 +109,6 @@ def run(self, args): parser.add_argument( "--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)" ) - parser.add_argument( - "--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)" - ) parser.add_argument("--epochs", type=int, default=14, metavar="N", help="number of epochs to train (default: 14)") parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)") parser.add_argument("--gamma", type=float, default=0.7, metavar="M", help="Learning rate step gamma (default: 0.7)") @@ -119,13 +123,6 @@ def run(self, args): help="how many batches to wait before logging training status", ) parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model") - args = parser.parse_args() - - seed_everything(args.seed) - - if torch.cuda.is_available(): - lite_kwargs = {"accelerator": "gpu", "devices": torch.cuda.device_count()} - else: - lite_kwargs = {"accelerator": "cpu"} + hparams = parser.parse_args() - Lite(**lite_kwargs).run(args) + Lite(accelerator="gpu" if torch.cuda.is_available() else "cpu", devices=torch.cuda.device_count()).run(hparams) diff --git a/pl_examples/basic_examples/mnist_examples/image_classifier_3_lite_to_lightning.py b/pl_examples/basic_examples/mnist_examples/image_classifier_3_lite_to_lightning.py index 223f23312586e..1095a6a54822f 100644 --- a/pl_examples/basic_examples/mnist_examples/image_classifier_3_lite_to_lightning.py +++ b/pl_examples/basic_examples/mnist_examples/image_classifier_3_lite_to_lightning.py @@ -26,48 +26,13 @@ from pytorch_lightning.lite import LightningLite -def train(lite, args, model, train_loader, optimizer, epoch): - model.train() - for batch_idx, batch in enumerate(train_loader): - optimizer.zero_grad() - loss = lite.training_step(batch, batch_idx) - lite.backward(loss) - optimizer.step() - if (batch_idx == 0) or ((batch_idx + 1) % args.log_interval == 0): - print( - "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( - epoch, - batch_idx * len(batch[0]), - len(train_loader.dataset), - 100.0 * batch_idx / len(train_loader), - loss.item(), - ) - ) - if args.dry_run: - break - - -def test(lite, args, model, test_loader): - 
model.eval()
-    test_loss = 0
-    with torch.no_grad():
-        for batch_idx, batch in enumerate(test_loader):
-            test_loss += lite.test_step(batch, batch_idx)
-            if args.dry_run:
-                break
-
-    test_loss = lite.all_gather(test_loss).sum() / len(test_loader.dataset)
-
-    if lite.is_global_zero:
-        print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: ({lite.test_acc.compute():.0f}%)\n")
-
-
 class Lite(LightningLite):
     """`Lite` is starting to look like a `LightningModule`."""

     def run(self, hparams):
         self.hparams = hparams
+        seed_everything(hparams.seed)

         self.model = Net()
         [optimizer], [scheduler] = self.configure_optimizers()
@@ -77,15 +42,49 @@ def run(self, hparams):
         self.prepare_data()
         train_loader, test_loader = self.setup_dataloaders(self.train_dataloader(), self.train_dataloader())
-
         self.test_acc = Accuracy()

+        # EPOCH LOOP
         for epoch in range(1, hparams.epochs + 1):
-            train(self, hparams, model, train_loader, optimizer, epoch)
-            test(self, hparams, model, test_loader)
+
+            # TRAINING LOOP
+            self.model.train()
+            for batch_idx, batch in enumerate(train_loader):
+                optimizer.zero_grad()
+                loss = self.training_step(batch, batch_idx)
+                self.backward(loss)
+                optimizer.step()
+
+                if (batch_idx == 0) or ((batch_idx + 1) % hparams.log_interval == 0):
+                    print(
+                        "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
+                            epoch,
+                            (batch_idx + 1) * self.hparams.batch_size,
+                            len(train_loader.dataset),
+                            100.0 * batch_idx / len(train_loader),
+                            loss.item(),
+                        )
+                    )
+                    if hparams.dry_run:
+                        break
+
             scheduler.step()

-            if args.dry_run:
+            # TESTING LOOP
+            self.model.eval()
+            test_loss = 0
+            with torch.no_grad():
+                for batch_idx, batch in enumerate(test_loader):
+                    test_loss += self.test_step(batch, batch_idx)
+                    if hparams.dry_run:
+                        break
+
+            test_loss = self.all_gather(test_loss).sum() / len(test_loader.dataset)
+
+            print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: ({self.test_acc.compute():.0f}%)\n")
+            self.test_acc.reset()
+
+            if hparams.dry_run:
                 break

         if hparams.save_model and self.is_global_zero:
@@ -97,12 +96,14 @@ def forward(self, x):
         return self.model(x)

     def training_step(self, batch, batch_idx):
+        """Here you compute and return the training loss and compute extra training metrics."""
         x, y = batch
         logits = self.forward(x)
         loss = F.nll_loss(logits, y.long())
         return loss

     def test_step(self, batch, batch_idx):
+        """Here you compute and return the testing loss and compute extra testing metrics."""
         x, y = batch
         logits = self.forward(x)
         loss = F.nll_loss(logits, y.long())
@@ -137,13 +138,9 @@ def test_dataloader(self):
     parser.add_argument(
         "--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)"
     )
-    parser.add_argument(
-        "--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)"
-    )
     parser.add_argument("--epochs", type=int, default=14, metavar="N", help="number of epochs to train (default: 14)")
     parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)")
     parser.add_argument("--gamma", type=float, default=0.7, metavar="M", help="Learning rate step gamma (default: 0.7)")
-    parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training")
     parser.add_argument("--dry-run", action="store_true", default=False, help="quickly check a single pass")
     parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
     parser.add_argument(
         "--log-interval",
         type=int,
         default=10,
         metavar="N",
         help="how many batches to wait before logging training status",
     )
     parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model")
-    args = parser.parse_args()
-
-    seed_everything(args.seed)
-
-    if torch.cuda.is_available():
-        lite_kwargs = {"accelerator": "gpu", "devices": torch.cuda.device_count()}
-    else:
-        lite_kwargs = {"accelerator": "cpu"}
+    hparams = parser.parse_args()

-    Lite(**lite_kwargs).run(args)
+    Lite(accelerator="gpu" if torch.cuda.is_available() else "cpu", devices=torch.cuda.device_count()).run(hparams)
diff --git a/pl_examples/basic_examples/mnist_examples/image_classifier_4_lightning.py b/pl_examples/basic_examples/mnist_examples/image_classifier_4_lightning.py
index 6b73bfa20bf8f..42501cdbfdddb 100644
--- a/pl_examples/basic_examples/mnist_examples/image_classifier_4_lightning.py
+++ b/pl_examples/basic_examples/mnist_examples/image_classifier_4_lightning.py
@@ -48,11 +48,9 @@ def test_step(self, batch, batch_idx):
         logits = self.forward(x)
         loss = F.nll_loss(logits, y.long())
         self.test_acc(logits, y.long())
+        self.log("test_acc", self.test_acc)
         return loss

-    def test_epoch_end(self, *_) -> None:
-        self.log("test_acc", self.test_acc.compute())
-
     def configure_optimizers(self):
         optimizer = torch.optim.Adadelta(self.model.parameters(), lr=self.hparams.lr)
         return [optimizer], [torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=self.hparams.gamma)]
@@ -76,7 +74,8 @@ def test_dataloader(self):

 def cli_main():
-    cli = LightningCLI(ImageClassifier, seed_everything_default=1234, save_config_overwrite=True, run=False)
+    # The LightningCLI removes all the boilerplate associated with argument parsing. This is purely optional.
+    cli = LightningCLI(ImageClassifier, seed_everything_default=42, save_config_overwrite=True, run=False)
     cli.trainer.fit(cli.model, datamodule=cli.datamodule)
     cli.trainer.test(ckpt_path="best", datamodule=cli.datamodule)

diff --git a/pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py b/pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py
index fc30836b6c37b..3dfb5543aca21 100644
--- a/pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py
+++ b/pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py
@@ -80,8 +80,9 @@ def test_dataloader(self):

 def cli_main():
+    # The LightningCLI removes all the boilerplate associated with argument parsing. This is purely optional.
     cli = LightningCLI(
-        ImageClassifier, MNISTDataModule, seed_everything_default=1234, save_config_overwrite=True, run=False
+        ImageClassifier, MNISTDataModule, seed_everything_default=42, save_config_overwrite=True, run=False
     )
     cli.trainer.fit(cli.model, datamodule=cli.datamodule)
     cli.trainer.test(ckpt_path="best", datamodule=cli.datamodule)
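
Patch 2 hinges on a single version gate: once `_TORCH_GREATER_EQUAL_1_10` is true, the device-agnostic `torch.autocast` (new in PyTorch 1.10, works on CPU and accepts a `dtype`) is imported instead of the older CUDA-only `torch.cuda.amp.autocast`. The sketch below shows that pattern in isolation. It assumes only that `torch` and `packaging` are installed; the `Version`-based comparison and the `bf16_autocast` helper are simplified stand-ins for Lightning's internal `_compare_version` helper and `NativeMixedPrecisionPlugin`, not the library's actual implementation.

```python
import torch
from packaging.version import Version

# Simplified stand-in for Lightning's `_compare_version("torch", operator.ge, "1.10.0")`.
_TORCH_GREATER_EQUAL_1_10 = Version(torch.__version__) >= Version("1.10.0")

if _TORCH_GREATER_EQUAL_1_10:
    from torch import autocast  # device-agnostic: "cpu" or "cuda", accepts a dtype
else:
    from torch.cuda.amp import autocast  # pre-1.10: CUDA-only context manager


def bf16_autocast(use_cpu: bool = True):
    """Roughly the context a bf16 native-AMP plugin would build (name is illustrative)."""
    if _TORCH_GREATER_EQUAL_1_10:
        return autocast("cpu" if use_cpu else "cuda", dtype=torch.bfloat16)
    # On older torch there is no CPU/bf16 autocast; fall back to the CUDA fp16 context.
    return autocast()
```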
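Patches 3 and 4 reorganize the example scripts around the `LightningLite` pattern: subclass `LightningLite`, move the loop into `run()`, wrap the model and optimizer with `self.setup()`, wrap the loaders with `self.setup_dataloaders()`, and call `self.backward(loss)` instead of `loss.backward()`. Below is a minimal, self-contained sketch of that pattern on toy data; the `ToyLite` class, the random dataset, and the linear model are invented for illustration, and only the `LightningLite` calls themselves mirror the examples above.

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from pytorch_lightning.lite import LightningLite


class ToyLite(LightningLite):
    def run(self, epochs: int = 1):
        # Toy regression data and model keep the sketch self-contained.
        dataset = TensorDataset(torch.randn(64, 16), torch.randn(64, 1))
        loader = self.setup_dataloaders(DataLoader(dataset, batch_size=8))

        model = nn.Linear(16, 1)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        # setup() moves the model/optimizer to the chosen device and applies
        # the precision/strategy selected in the constructor.
        model, optimizer = self.setup(model, optimizer)

        for _ in range(epochs):
            for x, y in loader:
                optimizer.zero_grad()
                loss = nn.functional.mse_loss(model(x), y)
                self.backward(loss)  # replaces loss.backward()
                optimizer.step()


if __name__ == "__main__":
    # Pass accelerator/devices here the same way the updated examples do.
    ToyLite(accelerator="cpu").run(epochs=1)
```

Because nothing in the loop changes besides the `setup` calls and `self.backward`, the same `run()` body is what the conversion scripts later split into `LightningModule` hooks.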