From 8bf41f1814af1a0f6c23185e4dd6f356a139b0da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Mar 2021 00:18:38 +0100 Subject: [PATCH 01/13] Improve DummyLogger (#6398) * fix dummy logger * docs * update docs * add changelog * add none return annotation * return empty string for name, version --- CHANGELOG.md | 11 ++++++-- pytorch_lightning/loggers/base.py | 34 +++++++++++++----------- tests/loggers/test_base.py | 9 +++++++ tests/trainer/flags/test_fast_dev_run.py | 1 + 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d192f814c4081..4d2f403739b47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.2.6] - 2021-03-30 + +### Changed + +- + +### Fixed + +- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) ## [1.2.5] - 2021-03-23 @@ -13,7 +22,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/PyTorchLightning/pytorch-lightning/pull/6576)) - Refactored setup for typing friendly ([#6590](https://github.com/PyTorchLightning/pytorch-lightning/pull/6590)) - ### Fixed - Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587)) @@ -36,7 +44,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed broadcast to use PyTorch `broadcast_object_list` and add `reduce_decision` ([#6410](https://github.com/PyTorchLightning/pytorch-lightning/pull/6410)) - Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/PyTorchLightning/pytorch-lightning/pull/6380)) - Fixed DeepSpeed additional memory use on rank 0 when default device not set early enough ([#6460](https://github.com/PyTorchLightning/pytorch-lightning/pull/6460)) -- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) - Fixed an issue with `Tuner.scale_batch_size` not finding the batch size attribute in the datamodule ([#5968](https://github.com/PyTorchLightning/pytorch-lightning/pull/5968)) - Fixed an exception in the layer summary when the model contains torch.jit scripted submodules ([#6511](https://github.com/PyTorchLightning/pytorch-lightning/pull/6511)) - Fixed when Train loop config was run during `Trainer.predict` ([#6541](https://github.com/PyTorchLightning/pytorch-lightning/pull/6541)) diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index 4fdb5e8c437bf..035a42338fe68 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -279,12 +279,14 @@ def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: return params @abstractmethod - def log_hyperparams(self, params: argparse.Namespace): + def log_hyperparams(self, params: argparse.Namespace, *args, **kwargs): """ Record hyperparameters. 
Args: params: :class:`~argparse.Namespace` containing the hyperparameters + args: Optional positional arguments, depends on the specific logger being used + kwargs: Optional keywoard arguments, depends on the specific logger being used """ def log_graph(self, model: LightningModule, input_array=None) -> None: @@ -418,41 +420,41 @@ def nop(*args, **kw): def __getattr__(self, _): return self.nop - def __getitem__(self, idx): - # enables self.logger[0].experiment.add_image - # and self.logger.experiment[0].add_image(...) + def __getitem__(self, idx) -> "DummyExperiment": + # enables self.logger.experiment[0].add_image(...) return self class DummyLogger(LightningLoggerBase): - """ Dummy logger for internal use. Is usefull if we want to disable users - logger for a feature, but still secure that users code can run """ + """ + Dummy logger for internal use. It is useful if we want to disable user's + logger for a feature, but still ensure that user code can run + """ def __init__(self): super().__init__() self._experiment = DummyExperiment() @property - def experiment(self): + def experiment(self) -> DummyExperiment: return self._experiment - @rank_zero_only - def log_metrics(self, metrics, step): + def log_metrics(self, *args, **kwargs) -> None: pass - @rank_zero_only - def log_hyperparams(self, params): + def log_hyperparams(self, *args, **kwargs) -> None: pass @property - def name(self): - pass + def name(self) -> str: + return "" @property - def version(self): - pass + def version(self) -> str: + return "" - def __getitem__(self, idx): + def __getitem__(self, idx) -> "DummyLogger": + # enables self.logger[0].experiment.add_image(...) return self diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index c48fef5e04b49..cf3a0cb74b3f4 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -229,15 +229,24 @@ def log_metrics(self, metrics, step): def test_dummyexperiment_support_indexing(): + """ Test that the DummyExperiment can imitate indexing the experiment in a LoggerCollection. """ experiment = DummyExperiment() assert experiment[0] == experiment def test_dummylogger_support_indexing(): + """ Test that the DummyLogger can imitate indexing of a LoggerCollection. """ logger = DummyLogger() assert logger[0] == logger +def test_dummylogger_noop_method_calls(): + """ Test that the DummyLogger methods can be called with arbitrary arguments. 
""" + logger = DummyLogger() + logger.log_hyperparams("1", 2, three="three") + logger.log_metrics("1", 2, three="three") + + def test_np_sanitization(): class CustomParamsLogger(CustomLogger): diff --git a/tests/trainer/flags/test_fast_dev_run.py b/tests/trainer/flags/test_fast_dev_run.py index 221951e788284..bcfdd6247d550 100644 --- a/tests/trainer/flags/test_fast_dev_run.py +++ b/tests/trainer/flags/test_fast_dev_run.py @@ -71,6 +71,7 @@ def test_step(self, batch, batch_idx): checkpoint_callback = ModelCheckpoint() early_stopping_callback = EarlyStopping() trainer_config = dict( + default_root_dir=tmpdir, fast_dev_run=fast_dev_run, val_check_interval=2, logger=True, From 4e19a5b4f1307a83410fc72dd4dc3721ccd7a9a8 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 25 Mar 2021 18:50:49 +0530 Subject: [PATCH 02/13] Add on_epoch_start to run at the beginning of every loop irrespective of train/val/test (#6498) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update docs * add hook and update docs * update tests * chlog * Update CHANGELOG.md Co-authored-by: Adrian Wälchli * chlog Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 2 +- docs/source/common/lightning_module.rst | 91 +++++++++++++++++-- docs/source/extensions/callbacks.rst | 12 +++ docs/source/extensions/logging.rst | 2 +- pytorch_lightning/callbacks/base.py | 4 +- .../gradient_accumulation_scheduler.py | 2 +- pytorch_lightning/callbacks/progress.py | 6 +- pytorch_lightning/core/hooks.py | 4 +- pytorch_lightning/core/lightning.py | 11 ++- pytorch_lightning/trainer/callback_hook.py | 4 +- pytorch_lightning/trainer/evaluation_loop.py | 2 + pytorch_lightning/trainer/training_loop.py | 6 +- tests/callbacks/test_callbacks.py | 3 + tests/models/test_hooks.py | 3 + .../logging_/test_eval_loop_logging_1_0.py | 15 ++- 15 files changed, 135 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d2f403739b47..524e57ac48e03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- +- Changed the behavior of `on_epoch_start` to run at the beginning of validation & test epoch ([#6498](https://github.com/PyTorchLightning/pytorch-lightning/pull/6498)) ### Fixed diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index c02f23ac60d09..7f0df33a351e4 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -1039,6 +1039,7 @@ This is the pseudocode to describe how all the hooks are called during a call to teardown() def train_loop(): + on_epoch_start() on_train_epoch_start() train_outs = [] for train_batch in train_dataloader(): @@ -1062,12 +1063,15 @@ This is the pseudocode to describe how all the hooks are called during a call to val_loop() # end training epoch - logs = training_epoch_end(outs) + outs = training_epoch_end(outs) + on_train_epoch_end(outs) + on_epoch_end() def val_loop(): model.eval() torch.set_grad_enabled(False) + on_epoch_start() on_validation_epoch_start() val_outs = [] for val_batch in val_dataloader(): @@ -1081,6 +1085,7 @@ This is the pseudocode to describe how all the hooks are called during a call to validation_epoch_end(val_outs) on_validation_epoch_end() + on_epoch_end() # set up for train model.train() @@ -1108,12 +1113,12 @@ manual_backward on_after_backward ~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_after_backward +.. 
automethod:: pytorch_lightning.core.hooks.ModelHooks.on_after_backward :noindex: on_before_zero_grad ~~~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_before_zero_grad +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_before_zero_grad :noindex: on_fit_start @@ -1132,15 +1137,38 @@ on_fit_end on_load_checkpoint ~~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_load_checkpoint +.. automethod:: pytorch_lightning.core.hooks.CheckpointHooks.on_load_checkpoint :noindex: on_save_checkpoint ~~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_save_checkpoint +.. automethod:: pytorch_lightning.core.hooks.CheckpointHooks.on_save_checkpoint :noindex: +on_train_start +~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_train_start + :noindex: + +on_train_end +~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_train_end + :noindex: + +on_validation_start +~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_start + :noindex: + +on_validation_end +~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_end + :noindex: on_pretrain_routine_start ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1178,6 +1206,11 @@ on_test_epoch_end .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_epoch_end :noindex: +on_test_end +~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_end + :noindex: on_train_batch_start ~~~~~~~~~~~~~~~~~~~~ @@ -1191,6 +1224,18 @@ on_train_batch_end .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_train_batch_end :noindex: +on_epoch_start +~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_epoch_start + :noindex: + +on_epoch_end +~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_epoch_end + :noindex: + on_train_epoch_start ~~~~~~~~~~~~~~~~~~~~ @@ -1227,6 +1272,36 @@ on_validation_epoch_end .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_epoch_end :noindex: +on_post_move_to_device +~~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_post_move_to_device + :noindex: + +on_validation_model_eval +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_model_eval + :noindex: + +on_validation_model_train +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_model_train + :noindex: + +on_test_model_eval +~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_model_eval + :noindex: + +on_test_model_train +~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_model_train + :noindex: + optimizer_step ~~~~~~~~~~~~~~ @@ -1266,19 +1341,19 @@ teardown train_dataloader ~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.train_dataloader +.. automethod:: pytorch_lightning.core.hooks.DataHooks.train_dataloader :noindex: val_dataloader ~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.val_dataloader +.. automethod:: pytorch_lightning.core.hooks.DataHooks.val_dataloader :noindex: test_dataloader ~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.test_dataloader +.. 
automethod:: pytorch_lightning.core.hooks.DataHooks.test_dataloader :noindex: transfer_batch_to_device diff --git a/docs/source/extensions/callbacks.rst b/docs/source/extensions/callbacks.rst index 63a221a06119f..73691c6dd76f5 100644 --- a/docs/source/extensions/callbacks.rst +++ b/docs/source/extensions/callbacks.rst @@ -349,3 +349,15 @@ on_load_checkpoint .. automethod:: pytorch_lightning.callbacks.Callback.on_load_checkpoint :noindex: + +on_after_backward +^^^^^^^^^^^^^^^^^ + +.. automethod:: pytorch_lightning.callbacks.Callback.on_after_backward + :noindex: + +on_before_zero_grad +^^^^^^^^^^^^^^^^^^^ + +.. automethod:: pytorch_lightning.callbacks.Callback.on_before_zero_grad + :noindex: diff --git a/docs/source/extensions/logging.rst b/docs/source/extensions/logging.rst index bfeed22fd4e66..1ac6e698ccbd3 100644 --- a/docs/source/extensions/logging.rst +++ b/docs/source/extensions/logging.rst @@ -90,7 +90,7 @@ The :func:`~~pytorch_lightning.core.lightning.LightningModule.log` method has a .. note:: - Setting ``on_epoch=True`` will cache all your logged values during the full training epoch and perform a - reduction `on_epoch_end`. We recommend using the :doc:`metrics <../extensions/metrics>` API when working with custom reduction. + reduction in ``on_train_epoch_end``. We recommend using the :doc:`metrics <../extensions/metrics>` API when working with custom reduction. - Setting both ``on_step=True`` and ``on_epoch=True`` will create two keys per metric you log with suffix ``_step`` and ``_epoch``, respectively. You can refer to these keys e.g. in the `monitor` diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index d53acf0f7030d..76e23a3118dcb 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -102,11 +102,11 @@ def on_test_epoch_end(self, trainer, pl_module: LightningModule) -> None: pass def on_epoch_start(self, trainer, pl_module: LightningModule) -> None: - """Called when the epoch begins.""" + """Called when either of train/val/test epoch begins.""" pass def on_epoch_end(self, trainer, pl_module: LightningModule) -> None: - """Called when the epoch ends.""" + """Called when either of train/val/test epoch ends.""" pass def on_batch_start(self, trainer, pl_module: LightningModule) -> None: diff --git a/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py b/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py index 0af7d61bf5dec..b1885087f4da0 100644 --- a/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py +++ b/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py @@ -74,7 +74,7 @@ def __init__(self, scheduling: Dict[int, int]): def going_to_accumulate_grad_batches(self): return any([v > 1 for v in self.scheduling.values()]) - def on_epoch_start(self, trainer, pl_module): + def on_train_epoch_start(self, trainer, pl_module): epoch = trainer.current_epoch for i in reversed(range(len(self.epochs))): if epoch >= self.epochs[i]: diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 587fee95e9cd0..46331e004c1c7 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -192,7 +192,7 @@ def on_init_end(self, trainer): def on_train_start(self, trainer, pl_module): self._train_batch_idx = trainer.batch_idx - def on_epoch_start(self, trainer, pl_module): + def on_train_epoch_start(self, trainer, pl_module): self._train_batch_idx = 0 def on_train_batch_end(self, trainer, pl_module, 
outputs, batch, batch_idx, dataloader_idx): @@ -383,8 +383,8 @@ def on_train_start(self, trainer, pl_module): super().on_train_start(trainer, pl_module) self.main_progress_bar = self.init_train_tqdm() - def on_epoch_start(self, trainer, pl_module): - super().on_epoch_start(trainer, pl_module) + def on_train_epoch_start(self, trainer, pl_module): + super().on_train_epoch_start(trainer, pl_module) total_train_batches = self.total_train_batches total_val_batches = self.total_val_batches if total_train_batches != float('inf'): diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 2e1ea31871e03..79295c7c81dc1 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -224,13 +224,13 @@ def on_predict_model_eval(self) -> None: def on_epoch_start(self) -> None: """ - Called in the training loop at the very beginning of the epoch. + Called when either of train/val/test epoch begins. """ # do something when the epoch starts def on_epoch_end(self) -> None: """ - Called in the training loop at the very end of the epoch. + Called when either of train/val/test epoch ends. """ # do something when the epoch ends diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d1a0a87c37f33..137f65baf71cb 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -706,10 +706,13 @@ def validation_step(self, *args, **kwargs): .. code-block:: python # pseudocode of order - out = validation_step() - if defined('validation_step_end'): - out = validation_step_end(out) - out = validation_epoch_end(out) + val_outs = [] + for val_batch in val_data: + out = validation_step(val_batch) + if defined('validation_step_end'): + out = validation_step_end(out) + val_outs.append(out) + val_outs = validation_epoch_end(val_outs) .. 
code-block:: python diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index d33338055a5b1..bbd968fba061e 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -105,12 +105,12 @@ def on_test_epoch_end(self): callback.on_test_epoch_end(self, self.lightning_module) def on_epoch_start(self): - """Called when the epoch begins.""" + """Called when either of train/val/test epoch begins.""" for callback in self.callbacks: callback.on_epoch_start(self, self.lightning_module) def on_epoch_end(self): - """Called when the epoch ends.""" + """Called when either of train/val/test epoch ends.""" for callback in self.callbacks: callback.on_epoch_end(self, self.lightning_module) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index e1b3688ef36e6..c7eb7e0c90ad0 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -120,6 +120,8 @@ def setup(self, model, max_batches, dataloaders): self._predictions = [[] for _ in range(self.num_dataloaders)] def on_evaluation_epoch_start(self, *args, **kwargs): + self.trainer.call_hook('on_epoch_start', *args, **kwargs) + if self.trainer.testing: self.trainer.call_hook('on_test_epoch_start', *args, **kwargs) else: diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index c3afe14285d9f..36e1f6799437e 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -189,7 +189,7 @@ def on_train_epoch_start(self, epoch): self.trainer.train_dataloader.sampler.set_epoch(epoch) # changing gradient according accumulation_scheduler - self.trainer.accumulation_scheduler.on_epoch_start(self.trainer, self.trainer.lightning_module) + self.trainer.accumulation_scheduler.on_train_epoch_start(self.trainer, self.trainer.lightning_module) # stores accumulated grad fractions per batch self.accumulated_loss = TensorRunningAccum(window_length=self.trainer.accumulate_grad_batches) @@ -555,7 +555,7 @@ def run_training_epoch(self): self.increment_accumulated_grad_global_step() # epoch end hook - self.run_on_epoch_end_hook(epoch_output) + self.on_train_epoch_end(epoch_output) # log epoch metrics self.trainer.logger_connector.log_train_epoch_end_metrics( @@ -798,7 +798,7 @@ def update_train_loop_lr_schedulers(self, monitor_metrics=None): # update lr self.trainer.optimizer_connector.update_learning_rates(interval="step", monitor_metrics=monitor_metrics) - def run_on_epoch_end_hook(self, epoch_output): + def on_train_epoch_end(self, epoch_output): # inform logger the batch loop has finished self.trainer.logger_connector.on_train_epoch_end() diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 8d01841f3636c..4b3aab7638e3d 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -61,6 +61,7 @@ def test_trainer_callback_system(torch_save, tmpdir): call.on_pretrain_routine_end(trainer, model), call.on_sanity_check_start(trainer, model), call.on_validation_start(trainer, model), + call.on_epoch_start(trainer, model), call.on_validation_epoch_start(trainer, model), call.on_validation_batch_start(trainer, model, ANY, 0, 0), call.on_validation_batch_end(trainer, model, ANY, ANY, 0, 0), @@ -92,6 +93,7 @@ def test_trainer_callback_system(torch_save, tmpdir): call.on_train_epoch_end(trainer, model, ANY), call.on_epoch_end(trainer, model), 
call.on_validation_start(trainer, model), + call.on_epoch_start(trainer, model), call.on_validation_epoch_start(trainer, model), call.on_validation_batch_start(trainer, model, ANY, 0, 0), call.on_validation_batch_end(trainer, model, ANY, ANY, 0, 0), @@ -115,6 +117,7 @@ def test_trainer_callback_system(torch_save, tmpdir): call.on_before_accelerator_backend_setup(trainer, model), call.on_fit_start(trainer, model), call.on_test_start(trainer, model), + call.on_epoch_start(trainer, model), call.on_test_epoch_start(trainer, model), call.on_test_batch_start(trainer, model, ANY, 0, 0), call.on_test_batch_end(trainer, model, ANY, ANY, 0, 0), diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 62a252eaa3128..0da13ecbd8867 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -435,6 +435,7 @@ def teardown(self, stage: str): 'on_pretrain_routine_end', 'on_validation_model_eval', 'on_validation_start', + 'on_epoch_start', 'on_validation_epoch_start', 'on_validation_batch_start', 'on_validation_batch_end', @@ -457,6 +458,7 @@ def teardown(self, stage: str): 'on_epoch_end', 'on_validation_model_eval', 'on_validation_start', + 'on_epoch_start', 'on_validation_epoch_start', 'on_validation_batch_start', 'on_validation_batch_end', @@ -479,6 +481,7 @@ def teardown(self, stage: str): 'on_fit_start', 'on_test_model_eval', 'on_test_start', + 'on_epoch_start', 'on_test_epoch_start', 'on_test_batch_start', 'on_test_batch_end', diff --git a/tests/trainer/logging_/test_eval_loop_logging_1_0.py b/tests/trainer/logging_/test_eval_loop_logging_1_0.py index 765fab229f6cf..79bdecae46424 100644 --- a/tests/trainer/logging_/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_/test_eval_loop_logging_1_0.py @@ -496,9 +496,15 @@ def on_validation_start(self, trainer, pl_module): ) def on_epoch_start(self, trainer, pl_module): - self.make_logging( - pl_module, 'on_epoch_start', 2, on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices - ) + if trainer.validating: + self.make_logging( + pl_module, + 'on_epoch_start', + 2, + on_steps=self.choices, + on_epochs=self.choices, + prob_bars=self.choices + ) def on_validation_epoch_start(self, trainer, pl_module): self.make_logging( @@ -540,7 +546,7 @@ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, self.count += 1 def on_epoch_end(self, trainer, pl_module): - if not trainer.training: + if trainer.validating: self.make_logging( pl_module, 'on_epoch_end', 8, on_steps=[False], on_epochs=self.choices, prob_bars=self.choices ) @@ -578,7 +584,6 @@ def validation_step(self, batch, batch_idx): callbacks=[test_callback], ) trainer.fit(model) - trainer.test() assert test_callback.funcs_called_count["on_epoch_start"] == 1 # assert test_callback.funcs_called_count["on_batch_start"] == 1 From c02db78e591054ae40306a503661aa0b4305a9c1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Mar 2021 23:05:04 +0100 Subject: [PATCH 03/13] update coverage config (#6524) * update coverage config * parallel * parallel * Apply suggestions from code review * Apply suggestions from code review * paralel * paralel * paralel * combine * combine * . * .. * .. * .. * rev * cb * cb * drop * drop * . * .. * ... * ... * ... * . 
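For reference, a rough local equivalent of the coverage workflow these CI steps switch to. This is only a sketch: the pytest targets below mirror the ones used in the workflows but are illustrative, and the combine step is only relevant when parallel data files are produced.

    # run the test suite under coverage, as the updated CI jobs do
    coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v
    # accumulate a further run (e.g. a subset such as tests/trainer) into the same .coverage data file
    coverage run --source pytorch_lightning --append -m pytest tests/trainer -v
    # merge parallel data files (.coverage.*), if any were produced
    coverage combine
    # summarize locally and export the XML report uploaded to codecov
    coverage report -m
    coverage xml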
--- .github/workflows/ci_test-base.yml | 2 +- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/ci_test-full.yml | 2 +- azure-pipelines.yml | 2 +- requirements/test.txt | 4 ++-- setup.cfg | 5 ----- tests/special_tests.sh | 2 +- 7 files changed, 7 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index 0e84642e2f810..77363992718af 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -68,7 +68,7 @@ jobs: - name: Test Package [only] run: | # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003 - python -m pytest pytorch_lightning -v --cov=pytorch_lightning --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml + coverage run --source pytorch_lightning -m pytest pytorch_lightning -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v2 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 419580b71cd10..94cc73f0dc184 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -44,7 +44,7 @@ jobs: - name: Tests run: | # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003 - python -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml + coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml shell: bash -l {0} - name: Upload pytest test results diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index ff8fba06adee6..5fe142577b2c0 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -134,7 +134,7 @@ jobs: - name: Tests run: | # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 - coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml + coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Examples run: | diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1447176c7ea70..fecf8f1a776ce 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -78,7 +78,7 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | diff --git a/requirements/test.txt b/requirements/test.txt index 84ddb2f981b54..48ed4727ecd51 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,8 +1,8 @@ coverage>=5.2 codecov>=2.1 pytest>=6.0 -pytest-cov>2.10 -# pytest-xdist +#pytest-cov>2.10 +#pytest-xdist flake8>=3.6 check-manifest twine==3.2 diff --git a/setup.cfg b/setup.cfg index 4c478dccb709e..528c4ffa9e214 
100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,11 +47,6 @@ omit = pytorch_lightning/utilities/xla_device_utils.py pytorch_lightning/utilities/distributed.py pytorch_lightning/tuner/auto_gpu_select.py - # TODO: temporary, until accelerator refactor is finished - pytorch_lightning/accelerators/accelerator.py - pytorch_lightning/plugins/training_type/*.py - pytorch_lightning/plugins/precision/*.py - pytorch_lightning/plugins/base_plugin.py [flake8] diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 43658721e9226..b00936398489d 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -14,7 +14,7 @@ # Running special tests set -e export PL_RUNNING_SPECIAL_TESTS=1 -DEFAULTS="-m coverage run --source pytorch_lightning -a -m pytest --verbose --capture=no" +DEFAULTS="-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no" python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu From 4aa9be21f832a9959739d04580744bf19aea0d60 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Fri, 19 Mar 2021 14:38:49 -0700 Subject: [PATCH 04/13] Automatically set sync_batchnorm for training_type_plugin (#6536) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Roger Shieh Co-authored-by: Kaushik Bokka --- .../connectors/accelerator_connector.py | 5 +++ tests/plugins/test_custom_plugin.py | 41 +++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 tests/plugins/test_custom_plugin.py diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7d5e5fb9c358c..83eddfed6c4dc 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -425,6 +425,11 @@ def resolve_training_type_plugin(self, training_type: TrainingTypePlugin) -> Tra if hasattr(training_type, 'num_nodes') and getattr(training_type, 'num_nodes') is None: training_type.num_nodes = self.num_nodes + # Automatically set sync_batchnorm if None. + # Useful for custom plugins. + if hasattr(training_type, 'sync_batchnorm') and getattr(training_type, 'sync_batchnorm') is None: + training_type.sync_batchnorm = self.sync_batchnorm + return training_type def select_accelerator(self) -> Accelerator: diff --git a/tests/plugins/test_custom_plugin.py b/tests/plugins/test_custom_plugin.py new file mode 100644 index 0000000000000..872b49ef48635 --- /dev/null +++ b/tests/plugins/test_custom_plugin.py @@ -0,0 +1,41 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pytorch_lightning import Trainer +from pytorch_lightning.plugins import DDPPlugin +from tests.helpers import BoringModel +from tests.helpers.runif import RunIf + + +class CustomParallelPlugin(DDPPlugin): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + # Set to None so it will be overwritten by the accelerator connector. + self.sync_batchnorm = None + + +@RunIf(skip_windows=True) +def test_sync_batchnorm_set(tmpdir): + """Tests if sync_batchnorm is automatically set for custom plugin.""" + model = BoringModel() + plugin = CustomParallelPlugin() + assert plugin.sync_batchnorm is None + trainer = Trainer( + max_epochs=1, + plugins=[plugin], + default_root_dir=tmpdir, + sync_batchnorm=True, + ) + trainer.fit(model) + assert plugin.sync_batchnorm is True From 836d02aa3506e45a329f320ff41ab19b418965e7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 29 Mar 2021 23:01:04 +0200 Subject: [PATCH 05/13] Add RunIf --- tests/helpers/runif.py | 184 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/helpers/runif.py diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py new file mode 100644 index 0000000000000..5483e33d9cddb --- /dev/null +++ b/tests/helpers/runif.py @@ -0,0 +1,184 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import sys +from distutils.version import LooseVersion +from typing import Optional + +import pytest +import torch +from pkg_resources import get_distribution + +from pytorch_lightning.utilities import ( + _APEX_AVAILABLE, + _DEEPSPEED_AVAILABLE, + _FAIRSCALE_AVAILABLE, + _FAIRSCALE_PIPE_AVAILABLE, + _HOROVOD_AVAILABLE, + _NATIVE_AMP_AVAILABLE, + _RPC_AVAILABLE, + _TORCH_QUANTIZE_AVAILABLE, + _TPU_AVAILABLE, +) + +try: + from horovod.common.util import nccl_built + nccl_built() +except (ImportError, ModuleNotFoundError, AttributeError): + _HOROVOD_NCCL_AVAILABLE = False +finally: + _HOROVOD_NCCL_AVAILABLE = True + + +class RunIf: + """ + RunIf wrapper for simple marking specific cases, fully compatible with pytest.mark:: + + @RunIf(min_torch="0.0") + @pytest.mark.parametrize("arg1", [1, 2.0]) + def test_wrapper(arg1): + assert arg1 > 0.0 + """ + + def __new__( + self, + *args, + min_gpus: int = 0, + min_torch: Optional[str] = None, + max_torch: Optional[str] = None, + min_python: Optional[str] = None, + quantization: bool = False, + amp_apex: bool = False, + amp_native: bool = False, + tpu: bool = False, + horovod: bool = False, + horovod_nccl: bool = False, + skip_windows: bool = False, + special: bool = False, + rpc: bool = False, + fairscale: bool = False, + fairscale_pipe: bool = False, + deepspeed: bool = False, + **kwargs + ): + """ + Args: + args: native pytest.mark.skipif arguments + min_gpus: min number of gpus required to run test + min_torch: minimum pytorch version to run test + max_torch: maximum pytorch version to run test + min_python: minimum python version required to run test + quantization: if `torch.quantization` package is required to run test + amp_apex: NVIDIA Apex is installed + amp_native: if native PyTorch native AMP is supported + tpu: if TPU is available + horovod: if Horovod is installed + horovod_nccl: if Horovod is installed with NCCL support + skip_windows: skip test for Windows platform (typically fo some limited torch functionality) + special: running in special mode, outside pytest suit + rpc: requires Remote Procedure Call (RPC) + fairscale: if `fairscale` module is required to run the test + deepspeed: if `deepspeed` module is required to run the test + kwargs: native pytest.mark.skipif keyword arguments + """ + conditions = [] + reasons = [] + + if min_gpus: + conditions.append(torch.cuda.device_count() < min_gpus) + reasons.append(f"GPUs>={min_gpus}") + + if min_torch: + torch_version = LooseVersion(get_distribution("torch").version) + conditions.append(torch_version < LooseVersion(min_torch)) + reasons.append(f"torch>={min_torch}") + + if max_torch: + torch_version = LooseVersion(get_distribution("torch").version) + conditions.append(torch_version >= LooseVersion(max_torch)) + reasons.append(f"torch<{max_torch}") + + if min_python: + py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + conditions.append(py_version < LooseVersion(min_python)) + reasons.append(f"python>={min_python}") + + if quantization: + _miss_default = 'fbgemm' not in torch.backends.quantized.supported_engines + conditions.append(not _TORCH_QUANTIZE_AVAILABLE or _miss_default) + reasons.append("PyTorch quantization") + + if amp_native: + conditions.append(not _NATIVE_AMP_AVAILABLE) + reasons.append("native AMP") + + if amp_apex: + conditions.append(not _APEX_AVAILABLE) + reasons.append("NVIDIA Apex") + + if skip_windows: + conditions.append(sys.platform == "win32") + reasons.append("unimplemented on Windows") + + if tpu: + 
conditions.append(not _TPU_AVAILABLE) + reasons.append("TPU") + + if horovod: + conditions.append(not _HOROVOD_AVAILABLE) + reasons.append("Horovod") + + if horovod_nccl: + conditions.append(not _HOROVOD_NCCL_AVAILABLE) + reasons.append("Horovod with NCCL") + + if special: + env_flag = os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') + conditions.append(env_flag != '1') + reasons.append("Special execution") + + if rpc: + conditions.append(not _RPC_AVAILABLE) + reasons.append("RPC") + + if fairscale: + conditions.append(not _FAIRSCALE_AVAILABLE) + reasons.append("Fairscale") + + if fairscale_pipe: + conditions.append(not _FAIRSCALE_PIPE_AVAILABLE) + reasons.append("Fairscale Pipe") + + if deepspeed: + conditions.append(not _DEEPSPEED_AVAILABLE) + reasons.append("Deepspeed") + + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] + return pytest.mark.skipif( + *args, + condition=any(conditions), + reason=f"Requires: [{' + '.join(reasons)}]", + **kwargs, + ) + + +@RunIf(min_torch="99") +def test_always_skip(): + exit(1) + + +@pytest.mark.parametrize("arg1", [0.5, 1.0, 2.0]) +@RunIf(min_torch="0.0") +def test_wrapper(arg1: float): + assert arg1 > 0.0 From 4df060bc8d6dba95e0853daf858a1301faeb1f83 Mon Sep 17 00:00:00 2001 From: Shengyao Zhuang <46237844+ArvinZhuang@users.noreply.github.com> Date: Thu, 25 Mar 2021 19:37:58 +1000 Subject: [PATCH 06/13] Match the number of outputs of backward with forward for AllGatherGrad (#6625) --- pytorch_lightning/utilities/distributed.py | 2 +- tests/utilities/test_all_gather_grad.py | 23 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 9e47af26f53d5..3877f774b7cd8 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -187,7 +187,7 @@ def backward(ctx, *grad_output): torch.distributed.all_reduce(grad_output, op=torch.distributed.ReduceOp.SUM, async_op=False, group=ctx.group) - return grad_output[torch.distributed.get_rank()] + return grad_output[torch.distributed.get_rank()], None def all_gather_ddp_if_available( diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index f82cfc94bcce2..86b977cfff029 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -96,3 +96,26 @@ def training_epoch_end(self, outputs) -> None: trainer.fit(model) assert model.training_epoch_end_called + + +@RunIf(min_gpus=2, skip_windows=True, special=True) +def test_all_gather_sync_grads(tmpdir): + + class TestModel(BoringModel): + + training_step_called = False + + def training_step(self, batch, batch_idx): + self.training_step_called = True + tensor = torch.rand(2, 2, requires_grad=True, device=self.device) + gathered_tensor = self.all_gather(tensor, sync_grads=True) + assert gathered_tensor.shape == torch.Size([2, 2, 2]) + + loss = gathered_tensor.sum() + + return loss + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2) + trainer.fit(model) + assert model.training_step_called From 014a6b7d026c8e9a73db0428643cd3e63ca744b8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 22 Mar 2021 17:49:01 +0100 Subject: [PATCH 07/13] hotfix: mock examples (#6632) * mock examples * drop from GA --- azure-pipelines.yml | 2 ++ pl_examples/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 
fecf8f1a776ce..f944cad2ae09d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -103,4 +103,6 @@ jobs: # cd pl_examples/basic_examples # bash submit_ddp_job.sh # bash submit_ddp2_job.sh + env: + PL_USE_MOCKED_MNIST: "1" displayName: 'Examples' diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py index ffd60f9ed71af..150ac309ddceb 100644 --- a/pl_examples/__init__.py +++ b/pl_examples/__init__.py @@ -15,10 +15,10 @@ _DATASETS_PATH = os.path.join(_PACKAGE_ROOT, 'Datasets') _TORCHVISION_AVAILABLE = _module_available("torchvision") -_TORCHVISION_MNIST_AVAILABLE = True +_TORCHVISION_MNIST_AVAILABLE = not bool(os.environ.get("PL_USE_MOCKED_MNIST", False)) _DALI_AVAILABLE = _module_available("nvidia.dali") -if _TORCHVISION_AVAILABLE: +if _TORCHVISION_MNIST_AVAILABLE: try: from torchvision.datasets.mnist import MNIST MNIST(_DATASETS_PATH, download=True) From a1829bc4058b448577338ef80a2144e955b7d6b4 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 25 Mar 2021 16:07:37 +0530 Subject: [PATCH 08/13] Fix checkpoint callback & Trainer.test(_) issue for TPUs (#6654) * Fix checkpoint callback issue for TPUs * update changelog * add barrier * apply code suggestions * update trainer test * remove spaces * fix tpu tests * Apply suggestions from code review * add comment Co-authored-by: Jirka Borovec --- CHANGELOG.md | 2 ++ .../plugins/training_type/tpu_spawn.py | 12 ++++++------ pytorch_lightning/trainer/trainer.py | 6 ++++-- tests/models/test_tpu.py | 17 +++++++++++++++-- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 524e57ac48e03..6669050a56298 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) +- Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) +- Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) ## [1.2.5] - 2021-03-23 diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 1e951329b22cc..09603f9a22bc2 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -4,7 +4,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union import torch -import torch.distributed as torch_distrib import torch.multiprocessing as mp from pytorch_lightning.core.lightning import LightningModule @@ -96,13 +95,15 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None: # replace trainer save_checkpoint to use `xm.save` trainer.save_checkpoint = self.save_checkpoint - self.barrier() + self.barrier("pre-run-stage") results = trainer.train_or_test_or_predict() self.__save_end_of_training_weights(self.lightning_module) self.transfer_distrib_spawn_state_on_fit_end(results) + self.barrier("end-process") + def __save_end_of_training_weights(self, model: LightningModule) -> None: # when training ends on these platforms dump weights to get out of the main process if on_colab_kaggle(): @@ -113,12 +114,11 @@ def model_to_device(self) -> None: self._model.to(xm.xla_device()) def barrier(self, name: Optional[str] = None) -> None: - if torch_distrib.is_initialized(): - rendezvous(f"pl.Trainer.{name}") + rendezvous(name) def transfer_distrib_spawn_state_on_fit_end(self, results): - # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
- best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path + checkpoint_callback = self.lightning_module.trainer.checkpoint_callback + best_model_path = checkpoint_callback.best_model_path if checkpoint_callback else None if self.mp_queue is not None: rank_zero_warn("cleaning up ddp environment...") diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f378ee830d261..2d5e2504a319f 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -56,7 +56,7 @@ from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.tuner.tuning import Tuner -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities import DeviceType, rank_zero_warn from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.enums import LightningEnum @@ -942,7 +942,9 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): ) return {} - self.training_type_plugin.barrier() + # only one process running at this point for TPUs, as spawn isn't triggered yet + if not self._device_type == DeviceType.TPU: + self.training_type_plugin.barrier() ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt['state_dict']) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 0554d924e6e9f..fbda891f0065f 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -349,13 +349,14 @@ def test_reduce(rank): xmp.spawn(test_reduce, nprocs=8, start_method='fork') -@pytest.mark.parametrize("clip_val", [0, 10]) -@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") +@RunIf(tpu=True) @pl_multi_process_test +@pytest.mark.parametrize("clip_val", [10]) @mock.patch("pytorch_lightning.accelerators.tpu.xla_clip_grad_norm_") def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir): """ Ensure that clip gradients is only called if the value is greater than 0. + TODO: Fix (test fails with parametrize) """ tutils.reset_seed() trainer_options = dict( @@ -375,3 +376,15 @@ def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir): mock_clip_grad_norm.assert_called() else: mock_clip_grad_norm.assert_not_called() + + +@RunIf(tpu=True) +@pl_multi_process_test +def test_if_test_works_with_checkpoint_false(tmpdir): + """Ensure that model trains properly when `checkpoint_callback` is set to False.""" + + # Train a model on TPU + model = BoringModel() + trainer = Trainer(max_epochs=1, tpu_cores=8, default_root_dir=tmpdir, fast_dev_run=True, checkpoint_callback=False) + trainer.fit(model) + assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" From d229429a9ea0fe4c763487cf08015163966387a7 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Tue, 23 Mar 2021 22:07:48 +0000 Subject: [PATCH 09/13] Fix disabled grads after call to predict (#6657) --- CHANGELOG.md | 1 + pytorch_lightning/trainer/trainer.py | 4 ++++ tests/trainer/test_trainer.py | 21 +++++++++++++++++---- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6669050a56298..003c321203a9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) - Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) - Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) +- Fixed a bug where gradients were disabled after calling `Trainer.predict` ([#6657](https://github.com/PyTorchLightning/pytorch-lightning/pull/6657)) ## [1.2.5] - 2021-03-23 diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 2d5e2504a319f..6af34b71f69c2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -834,6 +834,10 @@ def run_predict(self): self.predict_loop.predict(batch, batch_idx, dataloader_idx) results = self.predict_loop.on_predict_epoch_end() + + # re-enable grads + torch.set_grad_enabled(True) + return results def run_sanity_check(self, ref_model): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 6966edc3cbf70..fd2b48a3fa140 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1410,12 +1410,12 @@ def predict_dataloader(self): return self._dataloaders -def predict(tmpdir, accelerator, gpus, num_processes, plugins=None, datamodule=True): +def predict(tmpdir, accelerator, gpus, num_processes, model=None, plugins=None, datamodule=True): dataloaders = [torch.utils.data.DataLoader(RandomDataset(32, 2)), torch.utils.data.DataLoader(RandomDataset(32, 2))] - model = BoringModel() - datamodule = TestLightningDataModule(dataloaders) + model = model or BoringModel() + dm = TestLightningDataModule(dataloaders) trainer = Trainer( default_root_dir=tmpdir, @@ -1428,7 +1428,7 @@ def predict(tmpdir, accelerator, gpus, num_processes, plugins=None, datamodule=T plugins=plugins, ) if datamodule: - results = trainer.predict(model, datamodule=datamodule) + results = trainer.predict(model, datamodule=dm) else: results = trainer.predict(model, dataloaders=dataloaders) @@ -1439,6 +1439,19 @@ def predict(tmpdir, accelerator, gpus, num_processes, plugins=None, datamodule=T assert results[0][0].shape == torch.Size([1, 2]) +def test_trainer_predict_grad(tmpdir): + class CustomBoringModel(BoringModel): + + def predict_step(self, batch, batch_idx, dataloader_idx=None): + assert batch.expand_as(batch).grad_fn is None + return super().predict_step(batch, batch_idx, dataloader_idx) + + predict(tmpdir, None, None, 1, model=CustomBoringModel()) + + x = torch.zeros(1, requires_grad=True) + assert x.expand_as(x).grad_fn is not None + + @pytest.mark.parametrize('datamodule', [False, True]) def test_trainer_predict_cpu(tmpdir, datamodule): predict(tmpdir, None, None, 1, datamodule=datamodule) From 2b59a49eb0c35acfd5a8c9a05b5f7471efc1aad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Mar 2021 14:05:20 +0100 Subject: [PATCH 10/13] Do not add return dict items to callback_metrics (#6682) --- CHANGELOG.md | 4 + docs/source/ecosystem/asr_nlp_tts.rst | 10 +- docs/source/ecosystem/bolts.rst | 4 +- .../callbacks/model_checkpoint.py | 2 +- .../logger_connector/epoch_result_store.py | 1 - .../logger_connector/logger_connector.py | 67 ++----- pytorch_lightning/trainer/logging.py | 31 +-- pytorch_lightning/trainer/trainer.py | 9 - pytorch_lightning/trainer/training_loop.py | 3 +- tests/base/model_valid_epoch_ends.py | 5 +- 
tests/callbacks/test_early_stopping.py | 30 +-- tests/checkpointing/test_model_checkpoint.py | 10 +- tests/models/test_tpu.py | 1 + .../test_eval_loop_dict_return.py | 176 ------------------ .../test_trainer_steps_dict_return.py | 22 --- .../logging_/test_eval_loop_logging_1_0.py | 25 +-- .../trainer/logging_/test_logger_connector.py | 42 +++++ tests/utilities/test_all_gather_grad.py | 1 + 18 files changed, 101 insertions(+), 342 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 003c321203a9c..a2ea14b23d166 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Changed the behavior of `on_epoch_start` to run at the beginning of validation & test epoch ([#6498](https://github.com/PyTorchLightning/pytorch-lightning/pull/6498)) +### Removed + +- Removed legacy code to include `step` dictionary returns in `callback_metrics`. Use `self.log_dict` instead. ([#6682](https://github.com/PyTorchLightning/pytorch-lightning/pull/6682)) + ### Fixed - Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) diff --git a/docs/source/ecosystem/asr_nlp_tts.rst b/docs/source/ecosystem/asr_nlp_tts.rst index 49bed0a981a6e..af9a7084583f2 100644 --- a/docs/source/ecosystem/asr_nlp_tts.rst +++ b/docs/source/ecosystem/asr_nlp_tts.rst @@ -270,12 +270,12 @@ with PyTorch Lightning since every NeMo model is a Lightning Module. log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len ) wer_num, wer_denom = self._wer(predictions, transcript, transcript_len) - tensorboard_logs = { + self.log_dict({ 'train_loss': loss_value, 'training_batch_wer': wer_num / wer_denom, 'learning_rate': self._optimizer.param_groups[0]['lr'], - } - return {'loss': loss_value, 'log': tensorboard_logs} + }) + return loss_value Neural Types in NeMo ASR ------------------------ @@ -539,8 +539,8 @@ since every NeMo model is a Lightning Module. logits = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) loss = self.loss(logits=logits, labels=labels, loss_mask=loss_mask) - tensorboard_logs = {'train_loss': loss, 'lr': self._optimizer.param_groups[0]['lr']} - return {'loss': loss, 'log': tensorboard_logs} + self.log_dict({'train_loss': loss, 'lr': self._optimizer.param_groups[0]['lr']}) + return loss ... Neural Types in NeMo NLP diff --git a/docs/source/ecosystem/bolts.rst b/docs/source/ecosystem/bolts.rst index 9133176cab912..f3a4ab9c858be 100644 --- a/docs/source/ecosystem/bolts.rst +++ b/docs/source/ecosystem/bolts.rst @@ -68,8 +68,8 @@ you can trust the implementations and use them to bootstrap your research much f loss = self.criterion(logits.view(-1, logits.size(-1)), x.view(-1).long()) - logs = {"loss": loss} - return {"loss": loss, "log": logs} + self.log("loss", loss) + return loss ---------- diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 43f7a66dca313..5dc891795af93 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -490,7 +490,7 @@ def _validate_monitor_key(self, trainer): m = ( f"ModelCheckpoint(monitor='{self.monitor}') not found in the returned metrics:" f" {list(metrics.keys())}. " - f"HINT: Did you call self.log('{self.monitor}', tensor) in the LightningModule?" 
+ f"HINT: Did you call self.log('{self.monitor}', value) in the LightningModule?" ) raise MisconfigurationException(m) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index b40d87c2d9664..8979ce7798eaa 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -346,7 +346,6 @@ def update_logger_connector(self) -> Tuple[Dict, Dict]: # update callback_metrics logger_connector._callback_metrics.update(callback_metrics) - logger_connector._callback_metrics.pop("epoch", None) batch_pbar_metrics.pop("debug_epoch", None) return batch_pbar_metrics, batch_log_metrics diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 8ebec3238e276..b106244f06307 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -78,7 +78,7 @@ def progress_bar_metrics(self, progress_bar_metrics: Dict) -> None: @property def cached_results(self) -> Union[EpochResultStore, None]: - return self._cached_results.get(self.trainer._running_stage) # type: ignore + return self._cached_results.get(self.trainer._running_stage) def get_metrics(self, key: str) -> Dict: metrics_holder = getattr(self, f"_{key}", None) @@ -125,8 +125,6 @@ def cache_logged_metrics(self): def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool): # logging self.configure_logger(logger) - # todo: IDE is complaining, these shall be initialized in the Trainer init at leas as placeholders - # and assign here the desired value self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps self.trainer.log_every_n_steps = log_every_n_steps self.trainer.move_metrics_to_cpu = move_metrics_to_cpu @@ -189,9 +187,6 @@ def cache_training_step_metrics(self, opt_closure_result): batch_log_metrics = opt_closure_result.training_step_output.log_metrics logged_metrics_tmp.update(batch_log_metrics) - callback_metrics = opt_closure_result.training_step_output.callback_metrics - callback_metrics_tmp.update(callback_metrics) - batch_pbar_metrics = opt_closure_result.training_step_output.pbar_on_batch_end pbar_metrics_tmp.update(batch_pbar_metrics) @@ -214,9 +209,6 @@ def log_metrics(self, metrics, grad_norm_dic, step=None): metrics (dict): Metric values grad_norm_dic (dict): Gradient norms step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step` - log_train_step_metrics (bool): Used to track if `log_metrics` function is being called in during training - steps. In training steps, we will log metrics on step: `total_nb_idx` (for accumulated gradients) - and global_step for the rest. 
""" # add gpu memory if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory: @@ -350,27 +342,6 @@ def _track_callback_metrics(self, eval_results): if self.trainer.testing: self.trainer.logger_connector.evaluation_callback_metrics.update(flat) - def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics, callback_metrics): - # eval loop returns all metrics - dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics} - - # add metrics to prog bar - self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics) - - # log metrics - if len(log_metrics) > 0: - self.trainer.logger_connector.log_metrics(log_metrics, {}) - - # track metrics for callbacks (all prog bar, logged and callback metrics) - callback_metrics.update(log_metrics) - callback_metrics.update(prog_bar_metrics) - self.trainer.logger_connector.callback_metrics.update(callback_metrics) - if self.trainer.testing: - self.trainer.logger_connector.evaluation_callback_metrics.update(callback_metrics) - - if len(dataloader_result_metrics) > 0: - self.eval_loop_results.append(dataloader_result_metrics) - def __process_eval_epoch_end_results_and_log_legacy(self, eval_results): if self.trainer.running_sanity_check: return @@ -381,21 +352,21 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results): if not isinstance(eval_results, list): eval_results = [eval_results] - num_loaders: int = self.trainer.evaluation_loop.num_dataloaders - prog_bar_metrics, log_metrics, callback_metrics = {}, {}, {} - for result_idx, result in enumerate(eval_results): - _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.trainer.process_dict_result(result) + _, prog_bar_metrics, log_metrics, _ = self.trainer.process_dict_result(result) + + # eval loop returns all metrics + dataloader_result_metrics = {**prog_bar_metrics, **log_metrics} + + # add metrics to prog bar + self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics) - if num_loaders > 1: - self.__process_eval_epoch_end_results_and_log_legacy_update( - prog_bar_metrics, log_metrics, callback_metrics - ) + # log metrics + if len(log_metrics) > 0: + self.trainer.logger_connector.log_metrics(log_metrics, {}) - if num_loaders == 1: - self.__process_eval_epoch_end_results_and_log_legacy_update( - prog_bar_metrics, log_metrics, callback_metrics - ) + if len(dataloader_result_metrics) > 0: + self.eval_loop_results.append(dataloader_result_metrics) def on_train_epoch_end(self): # inform cached logger connector epoch finished @@ -448,10 +419,9 @@ def log_train_epoch_end_metrics( # TODO: deprecate 1.0 else: - out = self.__run_legacy_training_epoch_end( - num_optimizers, epoch_output, model, is_result_obj, epoch_callback_metrics + epoch_log_metrics, epoch_progress_bar_metrics = self.__run_legacy_training_epoch_end( + num_optimizers, epoch_output, model, is_result_obj ) - epoch_log_metrics, epoch_progress_bar_metrics, epoch_callback_metrics = out # it will perform reduction over epoch and return log metrics cached_epoch_log_metrics = self.cached_results.get_epoch_log_metrics() @@ -503,9 +473,7 @@ def training_epoch_end(self, model, epoch_output, num_optimizers): # capture logging self.trainer.logger_connector.cache_logged_metrics() - def __run_legacy_training_epoch_end( - self, num_optimizers, epoch_output, model, is_result_obj, epoch_callback_metrics - ): + def __run_legacy_training_epoch_end(self, num_optimizers, epoch_output, model, is_result_obj): epoch_log_metrics = {} 
epoch_progress_bar_metrics = {} @@ -536,7 +504,6 @@ def __run_legacy_training_epoch_end( _processed_outputs = self.trainer.process_dict_result(epoch_output) epoch_progress_bar_metrics = _processed_outputs[1] epoch_log_metrics = _processed_outputs[2] - epoch_callback_metrics = _processed_outputs[3] # -------------------------- # Structured Result (auto epoch end) @@ -544,7 +511,7 @@ def __run_legacy_training_epoch_end( elif is_result_obj: epoch_log_metrics, epoch_progress_bar_metrics = self.__auto_reduce_results_on_epoch_end(epoch_output) - return epoch_log_metrics, epoch_progress_bar_metrics, epoch_callback_metrics + return epoch_log_metrics, epoch_progress_bar_metrics def __auto_reduce_results_on_epoch_end(self, epoch_output): epoch_log_metrics = {} diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index 16060f863884c..3f97f4adcf1b0 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -21,6 +21,7 @@ from pytorch_lightning.loggers import LightningLoggerBase from pytorch_lightning.utilities import DeviceType, DistributedType from pytorch_lightning.utilities.distributed import rank_zero_warn +from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach @@ -42,8 +43,14 @@ class TrainerLoggingMixin(ABC): def metrics_to_scalars(self, metrics): new_metrics = {} + # TODO: this is duplicated in MetricsHolder. should be unified for k, v in metrics.items(): if isinstance(v, torch.Tensor): + if v.numel() != 1: + raise MisconfigurationException( + f"The metric `{k}` does not contain a single element" + f" thus it cannot be converted to float. Found `{v}`" + ) v = v.item() if isinstance(v, dict): @@ -81,23 +88,8 @@ def process_dict_result(self, output, train=False): if isinstance(output, torch.Tensor): progress_bar_metrics = {} log_metrics = {} - callback_metrics = {} hiddens = None - return output, progress_bar_metrics, log_metrics, callback_metrics, hiddens - - # --------------- - # EXTRACT CALLBACK KEYS - # --------------- - # all keys not progress_bar or log are candidates for callbacks - callback_metrics = {} - if isinstance(output, Mapping): - for k, v in output.items(): - if k not in ['progress_bar', 'log', 'hiddens']: - callback_metrics[k] = v - - if train and self._distrib_type in (DistributedType.DP, DistributedType.DDP2): - num_gpus = self.num_gpus - callback_metrics = self.reduce_distributed_output(callback_metrics, num_gpus) + return output, progress_bar_metrics, log_metrics, hiddens # --------------- # EXTRACT PROGRESS BAR KEYS @@ -159,17 +151,12 @@ def process_dict_result(self, output, train=False): # --------------- hiddens = output.get('hiddens', None) if isinstance(output, Mapping) else None - # use every metric passed in as a candidate for callback - callback_metrics.update(progress_bar_metrics) - callback_metrics.update(log_metrics) - # detach all metrics for callbacks to prevent memory leaks # no .item() because it will slow things down - callback_metrics = recursive_detach(callback_metrics) progress_bar_metrics = recursive_detach(progress_bar_metrics) log_metrics = recursive_detach(log_metrics) - return loss, progress_bar_metrics, log_metrics, callback_metrics, hiddens + return loss, progress_bar_metrics, log_metrics, hiddens def reduce_distributed_output(self, output, num_gpus): if num_gpus <= 1: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 6af34b71f69c2..82bb858ef6c53 
100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -859,15 +859,6 @@ def run_sanity_check(self, ref_model): # run eval step _, eval_results = self.run_evaluation(max_batches=self.num_sanity_val_batches) - # allow no returns from eval - if eval_results is not None and len(eval_results) > 0: - # when we get a list back, used only the last item - if isinstance(eval_results, list): - eval_results = eval_results[-1] - - _, _, _, callback_metrics, _ = self.process_dict_result(eval_results) - self.logger_connector.callback_metrics = callback_metrics - self.on_sanity_check_end() self.running_sanity_check = False diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 36e1f6799437e..17efecaf98a2f 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -360,8 +360,7 @@ def _process_training_step_output(self, training_step_output, split_batch): batch_loss=training_step_output[0], pbar_on_batch_end=training_step_output[1], log_metrics=training_step_output[2], - callback_metrics=training_step_output[3], - hiddens=training_step_output[4], + hiddens=training_step_output[3], ) # if the user decides to finally reduce things in epoch_end, save raw output without graphs if isinstance(training_step_output_for_epoch_end, torch.Tensor): diff --git a/tests/base/model_valid_epoch_ends.py b/tests/base/model_valid_epoch_ends.py index dd29d355a4a98..7b83670acacef 100644 --- a/tests/base/model_valid_epoch_ends.py +++ b/tests/base/model_valid_epoch_ends.py @@ -43,9 +43,8 @@ def _mean(res, key): val_loss_mean = val_loss_mean.item() val_acc_mean = val_acc_mean.item() - metrics_dict = {'early_stop_on': val_loss_mean, 'val_acc': val_acc_mean} - results = {'progress_bar': metrics_dict, 'log': metrics_dict} - return results + self.log('early_stop_on', val_loss_mean, prog_bar=True) + self.log('val_acc', val_acc_mean, prog_bar=True) def validation_epoch_end__multiple_dataloaders(self, outputs): """ diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 7062fe35bbcb7..643bfb90f2fda 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -127,7 +127,7 @@ class ModelOverrideValidationReturn(BoringModel): def validation_epoch_end(self, outputs): loss = self.validation_return_values[self.current_epoch] - return {"test_val_loss": loss} + self.log("test_val_loss", loss) model = ModelOverrideValidationReturn() early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True) @@ -217,7 +217,7 @@ class CurrentModel(BoringModel): def validation_epoch_end(self, outputs): losses = [8, 4, 2, 3, 4, 5, 8, 10] val_loss = losses[self.current_epoch] - self.log('abc', torch.tensor(val_loss)) + self.log('abc', val_loss) model = CurrentModel() @@ -231,28 +231,6 @@ def validation_epoch_end(self, outputs): assert trainer.current_epoch == 5, 'early_stopping failed' -def test_early_stopping_functionality_arbitrary_key(tmpdir): - """Tests whether early stopping works with a custom key and dictionary results on val step.""" - - class CurrentModel(BoringModel): - - def validation_epoch_end(self, outputs): - losses = [8, 4, 2, 3, 4, 5, 8, 10] - val_loss = losses[self.current_epoch] - return {'jiraffe': torch.tensor(val_loss)} - - model = CurrentModel() - - trainer = Trainer( - default_root_dir=tmpdir, - callbacks=[EarlyStopping(monitor='jiraffe')], - overfit_batches=0.20, - max_epochs=20, - ) - 
trainer.fit(model) - assert trainer.current_epoch >= 5, 'early_stopping failed' - - @pytest.mark.parametrize('step_freeze, min_steps, min_epochs', [(5, 1, 1), (5, 1, 3), (3, 15, 1)]) def test_min_steps_override_early_stopping_functionality(tmpdir, step_freeze, min_steps, min_epochs): """Excepted Behaviour: @@ -269,7 +247,7 @@ def test_min_steps_override_early_stopping_functionality(tmpdir, step_freeze, mi when `early_stopping` is being triggered, THEN the highest between `min_epochs * len(train_dataloader)` and `min_steps` would be reached. - Caviat: IF min_steps is divisible by len(train_dataloader), then it will do min_steps + len(train_dataloader) + Caveat: IF min_steps is divisible by len(train_dataloader), then it will do min_steps + len(train_dataloader) This test validate those expected behaviours """ @@ -306,7 +284,7 @@ def validation_epoch_end(self, outputs): self._count_decrease += 1 self._loss_value -= self._eps self._values.append(_mean) - return {"test_val_loss": _mean} + self.log('test_val_loss', _mean) model = Model(step_freeze) model.training_step_end = None diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 3b4ea00ecb0ba..d87beb9a78aac 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -49,7 +49,6 @@ def training_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): outs = torch.stack([x['x'] for x in outputs]).mean() - self.log('epoch', self.current_epoch) self.log('val_acc', outs) @@ -585,12 +584,7 @@ def test_model_checkpoint_topk_all(tmpdir): seed_everything(1000) epochs = 3 - class CustomModel(LogInTwoMethods): - - def validation_epoch_end(self, outputs): - return {'epoch': self.current_epoch} - - model = CustomModel() + model = BoringModel() checkpoint_callback = ModelCheckpoint( dirpath=tmpdir, filename="{epoch}", @@ -754,7 +748,7 @@ class ExtendedBoringModel(BoringModel): def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) - return {"val_loss": loss} + self.log("val_loss", loss) model = ExtendedBoringModel() model.validation_epoch_end = None diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index fbda891f0065f..2befc5bd7dbd2 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -29,6 +29,7 @@ from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel, RandomDataset +from tests.helpers.runif import RunIf from tests.helpers.utils import pl_multi_process_test if _TPU_AVAILABLE: diff --git a/tests/trainer/legacy_deprecate_flow_log/test_eval_loop_dict_return.py b/tests/trainer/legacy_deprecate_flow_log/test_eval_loop_dict_return.py index 87cab653de6aa..a616acf4d65cf 100644 --- a/tests/trainer/legacy_deprecate_flow_log/test_eval_loop_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log/test_eval_loop_dict_return.py @@ -122,49 +122,6 @@ def test_validation_step_arbitrary_dict_return(tmpdir): assert not model.validation_epoch_end_called -def test_validation_step_dict_return(tmpdir): - """ - Test that val step can return a dict with all the expected keys and they end up - in the correct place - """ - - model = DeterministicModel() - model.training_step = model.training_step__dict_return - model.validation_step = model.validation_step__dict_return - model.validation_step_end = None - model.validation_epoch_end = None - - trainer = 
Trainer( - default_root_dir=tmpdir, - weights_summary=None, - limit_train_batches=2, - limit_val_batches=2, - max_epochs=2, - ) - trainer.fit(model) - - # out are the results of the full loop - # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation() - assert len(callback_metrics) == 1 - assert len(callback_metrics[0]) == 5 - assert len(eval_results) == 2 - assert eval_results[0]['log']['log_acc1'] == 12 - assert eval_results[1]['log']['log_acc1'] == 13 - - for k in ['val_loss', 'log', 'progress_bar']: - assert k in eval_results[0] - assert k in eval_results[1] - - # ensure all the keys ended up as candidates for callbacks - assert len(trainer.logger_connector.callback_metrics) in [7, 8] - - # make sure correct steps were called - assert model.validation_step_called - assert not model.validation_step_end_called - assert not model.validation_epoch_end_called - - def test_val_step_step_end_no_return(tmpdir): """ Test that val step + val step end work (with no return in val step end) @@ -195,136 +152,3 @@ def test_val_step_step_end_no_return(tmpdir): assert model.validation_step_called assert model.validation_step_end_called assert not model.validation_epoch_end_called - - -def test_val_step_step_end(tmpdir): - """ - Test that val step + val step end work - """ - - model = DeterministicModel() - model.training_step = model.training_step__dict_return - model.validation_step = model.validation_step__dict_return - model.validation_step_end = model.validation_step_end - model.validation_epoch_end = None - - trainer = Trainer( - default_root_dir=tmpdir, - weights_summary=None, - limit_train_batches=2, - limit_val_batches=2, - max_epochs=2, - ) - trainer.fit(model) - - # out are the results of the full loop - # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation() - assert len(callback_metrics) == 1 - assert len(callback_metrics[0]) == 6 - - callback_metrics = callback_metrics[0] - assert callback_metrics['val_step_end'] == 1802 - assert len(eval_results) == 2 - assert eval_results[0]['log']['log_acc1'] == 12 - assert eval_results[1]['log']['log_acc1'] == 13 - - for k in ['val_loss', 'log', 'progress_bar']: - assert k in eval_results[0] - assert k in eval_results[1] - - # ensure all the keys ended up as candidates for callbacks - assert len(trainer.logger_connector.callback_metrics) in [8, 9] - - # make sure correct steps were called - assert model.validation_step_called - assert model.validation_step_end_called - assert not model.validation_epoch_end_called - - -def test_no_val_step_end(tmpdir): - """ - Test that val step + val epoch end - """ - - model = DeterministicModel() - model.training_step = model.training_step__dict_return - model.validation_step = model.validation_step__dict_return - model.validation_step_end = None - model.validation_epoch_end = model.validation_epoch_end - - trainer = Trainer( - default_root_dir=tmpdir, - weights_summary=None, - limit_train_batches=2, - limit_val_batches=3, - num_sanity_val_steps=0, - max_epochs=2 - ) - trainer.fit(model) - - # out are the results of the full loop - # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation() - assert len(callback_metrics) == 1 - assert len(callback_metrics[0]) == 6 - assert len(eval_results) == 1 - - eval_results = eval_results[0] - assert 'val_step_end' not in eval_results - assert eval_results['val_epoch_end'] == 1233 - - for k in ['val_loss', 'log', 'progress_bar']: - assert k in eval_results - 
- # ensure all the keys ended up as candidates for callbacks - assert len(trainer.logger_connector.callback_metrics) in [8, 9] - - # make sure correct steps were called - assert model.validation_step_called - assert not model.validation_step_end_called - assert model.validation_epoch_end_called - - -def test_full_val_loop(tmpdir): - """ - Test that val step + val step end + val epoch end - """ - - model = DeterministicModel() - model.training_step = model.training_step__dict_return - model.validation_step = model.validation_step__dict_return - model.validation_step_end = model.validation_step_end - model.validation_epoch_end = model.validation_epoch_end - - trainer = Trainer( - default_root_dir=tmpdir, - weights_summary=None, - limit_train_batches=2, - limit_val_batches=3, - num_sanity_val_steps=0, - max_epochs=2 - ) - trainer.fit(model) - - # out are the results of the full loop - # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation() - assert len(callback_metrics) == 1 - assert len(callback_metrics[0]) == 7 - assert len(eval_results) == 1 - - eval_results = eval_results[0] - assert eval_results['val_step_end'] == 1802 - assert eval_results['val_epoch_end'] == 1233 - - for k in ['val_loss', 'log', 'progress_bar']: - assert k in eval_results - - # ensure all the keys ended up as candidates for callbacks - assert len(trainer.logger_connector.callback_metrics) in [9, 10] - - # make sure correct steps were called - assert model.validation_step_called - assert model.validation_step_end_called - assert model.validation_epoch_end_called diff --git a/tests/trainer/legacy_deprecate_flow_log/test_trainer_steps_dict_return.py b/tests/trainer/legacy_deprecate_flow_log/test_trainer_steps_dict_return.py index 9c114f72080d8..3f60e6060d2ae 100644 --- a/tests/trainer/legacy_deprecate_flow_log/test_trainer_steps_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log/test_trainer_steps_dict_return.py @@ -171,28 +171,6 @@ def test_result_obj_lr_scheduler_epoch(tmpdir): assert len(trainer.dev_debugger.saved_lr_scheduler_updates) == 3 -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -def test_result_obj_lr_scheduler_step(tmpdir): - """ - test that the LR scheduler was called at the correct time with the correct metrics - """ - model = DeterministicModel() - model.training_step = model.training_step__for_step_end_dict - model.training_step_end = model.training_step_end__dict - model.training_epoch_end = model.training_epoch_end__dict - model.val_dataloader = None - model.configure_optimizers = model.configure_optimizers__lr_on_plateau_step - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=2, - weights_summary=None, - ) - trainer.fit(model) - - assert len(trainer.dev_debugger.saved_lr_scheduler_updates) == 8 - - def test_train_step_epoch_end(tmpdir): """ Checks train_step + training_epoch_end (NO training_step_end) diff --git a/tests/trainer/logging_/test_eval_loop_logging_1_0.py b/tests/trainer/logging_/test_eval_loop_logging_1_0.py index 79bdecae46424..e480ee7080b59 100644 --- a/tests/trainer/logging_/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_/test_eval_loop_logging_1_0.py @@ -373,11 +373,10 @@ def test_multi_dataloaders_add_suffix_properly(tmpdir): class TestModel(BoringModel): - def test_step(self, batch, batch_idx, dataloader_idx): + def test_step(self, batch, *args): output = self.layer(batch) loss = self.loss(batch, output) self.log("test_loss", loss, on_step=True, on_epoch=True) - return {"y": loss} def 
test_dataloader(self): return [ @@ -398,22 +397,19 @@ def test_dataloader(self): weights_summary=None, ) results = trainer.test(model) - assert "test_loss_epoch/dataloader_idx_0" in results[0] - assert "test_loss_epoch/dataloader_idx_1" in results[1] + + assert {"test_loss/dataloader_idx_0", "test_loss_epoch/dataloader_idx_0"} == set(results[0]) + assert {"test_loss/dataloader_idx_1", "test_loss_epoch/dataloader_idx_1"} == set(results[1]) def test_single_dataloader_no_suffix_added(tmpdir): class TestModel(BoringModel): - def test_step(self, batch, batch_idx): + def test_step(self, batch, *args): output = self.layer(batch) loss = self.loss(batch, output) self.log("test_loss", loss, on_step=True, on_epoch=True) - return {"y": loss} - - def test_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) model = TestModel() model.test_epoch_end = None @@ -428,9 +424,9 @@ def test_dataloader(self): weights_summary=None, ) results = trainer.test(model) + assert len(results) == 1 - # error : It is wrong there. `y` should equal test_loss_epoch - assert results[0]['test_loss'] == results[0]['y'] + assert {"test_loss", "test_loss_epoch"} == set(results[0]) @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @@ -496,7 +492,7 @@ def on_validation_start(self, trainer, pl_module): ) def on_epoch_start(self, trainer, pl_module): - if trainer.validating: + if trainer.evaluating: self.make_logging( pl_module, 'on_epoch_start', @@ -546,7 +542,7 @@ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, self.count += 1 def on_epoch_end(self, trainer, pl_module): - if trainer.validating: + if trainer.evaluating: self.make_logging( pl_module, 'on_epoch_end', 8, on_steps=[False], on_epochs=self.choices, prob_bars=self.choices ) @@ -860,7 +856,7 @@ def validation_step(self, batch, batch_idx): self.log('valid_loss_1', loss, on_step=False, on_epoch=True) self.log('valid_loss_2', loss, on_step=True, on_epoch=False) self.log('valid_loss_3', loss, on_step=False, on_epoch=False) - return {"val_loss": loss} + return {"val_loss": loss} # not added to callback_metrics def test_step(self, batch, batch_idx): output = self.layer(batch) @@ -937,7 +933,6 @@ def get_metrics_at_idx(idx): 'debug_epoch', 'valid_loss_1', 'test_loss', - 'val_loss', } assert set(trainer.callback_metrics) == expected_callback_metrics assert set(results[0]) == {'test_loss', 'debug_epoch'} diff --git a/tests/trainer/logging_/test_logger_connector.py b/tests/trainer/logging_/test_logger_connector.py index 92eb2c76a8c6b..5efb31b9fb608 100644 --- a/tests/trainer/logging_/test_logger_connector.py +++ b/tests/trainer/logging_/test_logger_connector.py @@ -453,6 +453,48 @@ def is_float(value: Any) -> bool: assert excepted_function(metrics["z"]) +def test_metric_holder_raises(tmpdir): + """Check that an error is raised when trying to convert non-scalar tensors""" + + class TestModel(BoringModel): + + def validation_step(self, batch, *args, **kwargs): + output = self(batch) + self.log('test', output) + + def test_step(self, *args, **kwargs): + return self.validation_step(*args, **kwargs) + + model = TestModel() + model.validation_epoch_end = None + model.test_epoch_end = None + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + + match = "The metric `test` does not contain a single element" + with pytest.raises(MisconfigurationException, match=match): + trainer.test(model) + + +def test_can_return_tensor_with_more_than_one_element(tmpdir): + """Ensure {validation,test}_step return values are not 
included as callback metrics. #6623""" + + class TestModel(BoringModel): + + def test_step(self, batch, *args, **kwargs): + return {"test": torch.tensor([0, 1])} + + def test_epoch_end(self, outputs): + assert len(outputs) == 2 + assert all(list(d) == ["test"] for d in outputs) # check keys + assert all(torch.equal(d["test"], torch.tensor([0, 1])) for d in outputs) # check values + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=2, progress_bar_refresh_rate=0) + trainer.fit(model) + trainer.test(model) + + def test_logging_to_progress_bar_with_reserved_key(tmpdir): """ Test that logging a metric with a reserved name to the progress bar raises a warning. """ diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index 86b977cfff029..94b2ad4263cbc 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -8,6 +8,7 @@ from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.utilities import AllGatherGrad from tests.helpers.boring_model import BoringModel +from tests.helpers.runif import RunIf def setup_ddp(rank, world_size): From d91e36fe0476bba668166ef5a0835751eb10ee78 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 29 Mar 2021 23:25:01 +0200 Subject: [PATCH 11/13] Add 1.8.1 to adjust_versions.py --- requirements/adjust_versions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index c1499cd4ea5ee..d0dfbc59e2352 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -11,6 +11,7 @@ "1.7.0": dict(torchvision="0.8.1", torchtext="0.8"), "1.7.1": dict(torchvision="0.8.2", torchtext="0.8.1"), "1.8.0": dict(torchvision="0.9.0", torchtext="0.9"), + "1.8.1": dict(torchvision="0.9.0", torchtext="0.9"), } From 67d4749b401fbe8f40a660371e19bcd2b2d2295b Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 29 Mar 2021 18:59:20 +0100 Subject: [PATCH 12/13] [TPU] update is_tpu_exists utils internal logic to rely on xmp.spawn (#6719) * update_logic * update * Update tests/utilities/test_xla_device_utils.py * Update pytorch_lightning/utilities/xla_device.py Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> * Update pytorch_lightning/utilities/xla_device.py Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> * update test * Update tests/utilities/test_xla_device_utils.py * update * Apply fix * Docstring * flake8 * update Co-authored-by: Your Name Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 1 + pytorch_lightning/utilities/__init__.py | 1 - pytorch_lightning/utilities/xla_device.py | 54 +++++++++++++---------- tests/plugins/test_custom_plugin.py | 4 ++ tests/utilities/test_xla_device_utils.py | 30 ++++++++----- 5 files changed, 54 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2ea14b23d166..8a20ee5914854 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) - Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) - Fixed a bug where gradients were disabled after calling `Trainer.predict` ([#6657](https://github.com/PyTorchLightning/pytorch-lightning/pull/6657)) +- Fixed bug where no TPUs were detected in a TPU pod env ([#6719](https://github.com/PyTorchLightning/pytorch-lightning/pull/6719)) ## [1.2.5] - 2021-03-23 diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index cf3aa06f305b8..e24e4a0db560a 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -14,7 +14,6 @@ """General utilities""" import numpy - from pytorch_lightning.utilities.apply_func import move_data_to_device # noqa: F401 from pytorch_lightning.utilities.distributed import ( # noqa: F401 AllGatherGrad, diff --git a/pytorch_lightning/utilities/xla_device.py b/pytorch_lightning/utilities/xla_device.py index fcf56e9c679f4..294d3d2c5ec40 100644 --- a/pytorch_lightning/utilities/xla_device.py +++ b/pytorch_lightning/utilities/xla_device.py @@ -12,18 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. import functools +import os import queue as q import traceback from multiprocessing import Process, Queue -import torch +import torch.multiprocessing as mp from pytorch_lightning.utilities.imports import _XLA_AVAILABLE if _XLA_AVAILABLE: import torch_xla.core.xla_model as xm + import torch_xla.distributed.xla_multiprocessing as xmp + #: define waiting time got checking TPU available in sec -TPU_CHECK_TIMEOUT = 100 +TPU_CHECK_TIMEOUT = 25 def inner_f(queue, func, *args, **kwargs): # pragma: no cover @@ -55,23 +58,10 @@ def wrapper(*args, **kwargs): class XLADeviceUtils: """Used to detect the type of XLA device""" - TPU_AVAILABLE = None - - @staticmethod - def _fetch_xla_device_type(device: torch.device) -> str: - """ - Returns XLA device type - - Args: - device: (:class:`~torch.device`): Accepts a torch.device type with a XLA device format i.e xla:0 - - Return: - Returns a str of the device hardware type. 
i.e TPU - """ - if _XLA_AVAILABLE: - return xm.xla_device_hw(device) + _TPU_AVAILABLE = False @staticmethod + @pl_multi_process def _is_device_tpu() -> bool: """ Check if device is TPU @@ -79,10 +69,18 @@ def _is_device_tpu() -> bool: Return: A boolean value indicating if the xla device is a TPU device or not """ - if _XLA_AVAILABLE: - device = xm.xla_device() - device_type = XLADeviceUtils._fetch_xla_device_type(device) - return device_type == "TPU" + + def _fn(_: int, mp_queue): + try: + device = xm.xla_device() + mp_queue.put(device.type == 'xla') + except Exception: + mp_queue.put(False) + + smp = mp.get_context("spawn") + queue = smp.SimpleQueue() + xmp.spawn(_fn, args=(queue, ), nprocs=1) + return queue.get() @staticmethod def xla_available() -> bool: @@ -102,6 +100,14 @@ def tpu_device_exists() -> bool: Return: A boolean value indicating if a TPU device exists on the system """ - if XLADeviceUtils.TPU_AVAILABLE is None and _XLA_AVAILABLE: - XLADeviceUtils.TPU_AVAILABLE = pl_multi_process(XLADeviceUtils._is_device_tpu)() - return XLADeviceUtils.TPU_AVAILABLE + if os.getenv("PL_TPU_AVAILABLE", '0') == "1": + XLADeviceUtils._TPU_AVAILABLE = True + + if XLADeviceUtils.xla_available() and not XLADeviceUtils._TPU_AVAILABLE: + + XLADeviceUtils._TPU_AVAILABLE = XLADeviceUtils._is_device_tpu() + + if XLADeviceUtils._TPU_AVAILABLE: + os.environ["PL_TPU_AVAILABLE"] = '1' + + return XLADeviceUtils._TPU_AVAILABLE diff --git a/tests/plugins/test_custom_plugin.py b/tests/plugins/test_custom_plugin.py index 872b49ef48635..b0407d1fca6b2 100644 --- a/tests/plugins/test_custom_plugin.py +++ b/tests/plugins/test_custom_plugin.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import pytest +import torch + from pytorch_lightning import Trainer from pytorch_lightning.plugins import DDPPlugin from tests.helpers import BoringModel @@ -26,6 +29,7 @@ def __init__(self, **kwargs): @RunIf(skip_windows=True) +@pytest.mark.skipif(torch.cuda.is_available(), reason="RuntimeError: Tensors must be CUDA and dense") def test_sync_batchnorm_set(tmpdir): """Tests if sync_batchnorm is automatically set for custom plugin.""" model = BoringModel() diff --git a/tests/utilities/test_xla_device_utils.py b/tests/utilities/test_xla_device_utils.py index 73b11b48267ce..edca2777b578a 100644 --- a/tests/utilities/test_xla_device_utils.py +++ b/tests/utilities/test_xla_device_utils.py @@ -17,29 +17,37 @@ import pytest import pytorch_lightning.utilities.xla_device as xla_utils -from pytorch_lightning.utilities import _TPU_AVAILABLE, _XLA_AVAILABLE -from tests.helpers.utils import pl_multi_process_test +from pytorch_lightning.utilities import _XLA_AVAILABLE +from tests.helpers.runif import RunIf @pytest.mark.skipif(_XLA_AVAILABLE, reason="test requires torch_xla to be absent") def test_tpu_device_absence(): - """Check tpu_device_exists returns None when torch_xla is not available""" - assert xla_utils.XLADeviceUtils.tpu_device_exists() is None + """Check tpu_device_exists returns False when torch_xla is not available""" + assert not xla_utils.XLADeviceUtils.tpu_device_exists() -@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires torch_xla to be installed") -@pl_multi_process_test +@RunIf(tpu=True) def test_tpu_device_presence(): """Check tpu_device_exists returns True when TPU is available""" - assert xla_utils.XLADeviceUtils.tpu_device_exists() is True + assert xla_utils.XLADeviceUtils.tpu_device_exists() -@patch('pytorch_lightning.utilities.xla_device.TPU_CHECK_TIMEOUT', 10) +def sleep_fn(sleep_time: float) -> bool: + time.sleep(sleep_time) + return True + + +@patch('pytorch_lightning.utilities.xla_device.TPU_CHECK_TIMEOUT', 3) +@pytest.mark.skipif(not _XLA_AVAILABLE, reason="test requires torch_xla to be present") def test_result_returns_within_timeout_seconds(): - """Check that pl_multi_process returns within 10 seconds""" + """Check that pl_multi_process returns within 3 seconds""" + fn = xla_utils.pl_multi_process(sleep_fn) + start = time.time() - result = xla_utils.pl_multi_process(time.sleep)(xla_utils.TPU_CHECK_TIMEOUT * 1.25) + result = fn(xla_utils.TPU_CHECK_TIMEOUT * 0.5) end = time.time() elapsed_time = int(end - start) + assert elapsed_time <= xla_utils.TPU_CHECK_TIMEOUT - assert result is False + assert result From fb50a051c6195e16df3a767950e31e5120df4835 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 29 Mar 2021 10:39:06 +0200 Subject: [PATCH 13/13] remake nvidia docker (#6686) * use latest * remake * examples --- azure-pipelines.yml | 5 ++- dockers/nvidia/Dockerfile | 67 ++++++++++++++++++++++++++------ dockers/release/Dockerfile | 8 ++-- pl_examples/run_ddp-example.sh | 12 ------ pl_examples/run_ddp-examples.sh | 13 +++++++ pl_examples/run_examples-args.sh | 15 +++++++ 6 files changed, 91 insertions(+), 29 deletions(-) delete mode 100644 pl_examples/run_ddp-example.sh create mode 100644 pl_examples/run_ddp-examples.sh create mode 100644 pl_examples/run_examples-args.sh diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f944cad2ae09d..376078ee85335 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -98,8 +98,9 @@ jobs: - script: | set -e python -m pytest pl_examples -v --maxfail=2 --durations=0 - python setup.py install --user --quiet 
- bash pl_examples/run_ddp-example.sh + pip install . --user --quiet + bash pl_examples/run_examples-args.sh --gpus 1 --max_epochs 1 --batch_size 64 --limit_train_batches 5 --limit_val_batches 3 + bash pl_examples/run_ddp-examples.sh --max_epochs 1 --batch_size 32 --limit_train_batches 2 --limit_val_batches 2 # cd pl_examples/basic_examples # bash submit_ddp_job.sh # bash submit_ddp2_job.sh diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile index ea567a5306eed..ad1169c4450dd 100644 --- a/dockers/nvidia/Dockerfile +++ b/dockers/nvidia/Dockerfile @@ -12,26 +12,69 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM nvcr.io/nvidia/pytorch:20.12-py3 +FROM nvcr.io/nvidia/cuda:11.1.1-runtime-ubuntu20.04 MAINTAINER PyTorchLightning ARG LIGHTNING_VERSION="" -COPY ./ ./pytorch-lightning/ +SHELL ["/bin/bash", "-c"] +# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ +ENV \ + DEBIAN_FRONTEND=noninteractive \ + TZ=Europe/Prague \ + PATH="$PATH:/root/.local/bin" \ + CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \ + MKL_THREADING_LAYER=GNU + +RUN apt-get update -qq && \ + apt-get install -y --no-install-recommends \ + build-essential \ + python3 \ + python3-distutils \ + python3-dev \ + pkg-config \ + cmake \ + git \ + wget \ + unzip \ + ca-certificates \ + && \ + +# Cleaning + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /root/.cache && \ + rm -rf /var/lib/apt/lists/* && \ + +# Setup PIP + update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ + wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \ + python get-pip.py && \ + rm get-pip.py && \ + pip --version + +COPY ./ /home/pytorch-lightning/ -# install dependencies RUN \ - # Disable cache - #conda install "pip>20.1" && \ - #pip config set global.cache-dir false && \ - if [ -z $LIGHTNING_VERSION ] ; then \ - pip install ./pytorch-lightning --no-cache-dir ; \ - rm -rf pytorch-lightning ; \ - else \ + cd /home && \ + mv pytorch-lightning/notebooks . && \ + mv pytorch-lightning/pl_examples . && \ + # replace by specific version if asked + if [ ! 
-z "$LIGHTNING_VERSION" ] ; then \ rm -rf pytorch-lightning ; \ - pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --no-cache-dir ; \ - fi + wget https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --progress=bar:force:noscroll ; \ + unzip ${LIGHTNING_VERSION}.zip ; \ + mv pytorch-lightning-*/ pytorch-lightning ; \ + rm *.zip ; \ + fi && \ + +# Installations + python -c "fname = './pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + pip install -r ./pytorch-lightning/requirements/extra.txt -U --no-cache-dir && \ + pip install -r ./pytorch-lightning/requirements/examples.txt -U --no-cache-dir && \ + pip install ./pytorch-lightning --no-cache-dir && \ + rm -rf pytorch-lightning RUN python --version && \ pip --version && \ diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index 3584ee02746e3..5cd53385f660b 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -21,12 +21,14 @@ MAINTAINER PyTorchLightning ARG LIGHTNING_VERSION="" -COPY ./ ./pytorch-lightning/ +COPY ./ /home/pytorch-lightning/ # install dependencies RUN \ - # Disable cache - #conda install "pip>20.1" && \ + cd /home && \ + mv pytorch-lightning/notebooks . && \ + mv pytorch-lightning/pl_examples . && \ + # replace by specific version if asked if [ ! -z "$LIGHTNING_VERSION" ] ; then \ rm -rf pytorch-lightning ; \ wget https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --progress=bar:force:noscroll ; \ diff --git a/pl_examples/run_ddp-example.sh b/pl_examples/run_ddp-example.sh deleted file mode 100644 index f0c7695e766f2..0000000000000 --- a/pl_examples/run_ddp-example.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -ARGS_DEFAULT=" --default_root_dir %(tmpdir)s --max_epochs 1 --batch_size 32 --limit_train_batches 2 --limit_val_batches 2" -ARGS_EXTRA_DDP=" --gpus 2 --accelerator ddp" -ARGS_EXTRA_AMP=" --precision 16" - -python pl_examples/basic_examples/simple_image_classifier.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} -python pl_examples/basic_examples/simple_image_classifier.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} -python pl_examples/basic_examples/backbone_image_classifier.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} -python pl_examples/basic_examples/backbone_image_classifier.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} -python pl_examples/basic_examples/autoencoder.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} -python pl_examples/basic_examples/autoencoder.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} diff --git a/pl_examples/run_ddp-examples.sh b/pl_examples/run_ddp-examples.sh new file mode 100644 index 0000000000000..6cc36364e397d --- /dev/null +++ b/pl_examples/run_ddp-examples.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +ARGS_EXTRA_DDP=" --gpus 2 --accelerator ddp" +ARGS_EXTRA_AMP=" --precision 16" + +python pl_examples/basic_examples/simple_image_classifier.py $@ ${ARGS_EXTRA_DDP} +python pl_examples/basic_examples/simple_image_classifier.py $@ ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} + +python pl_examples/basic_examples/backbone_image_classifier.py $@ ${ARGS_EXTRA_DDP} +python pl_examples/basic_examples/backbone_image_classifier.py $@ ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} + +python pl_examples/basic_examples/autoencoder.py $@ ${ARGS_EXTRA_DDP} +python pl_examples/basic_examples/autoencoder.py $@ ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} diff --git 
a/pl_examples/run_examples-args.sh b/pl_examples/run_examples-args.sh new file mode 100644 index 0000000000000..352869538cb18 --- /dev/null +++ b/pl_examples/run_examples-args.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +echo $@ + +full_path=$(realpath $0) +echo $full_path + +dir_path=$(dirname $full_path) +echo $dir_path + +python ${dir_path}/basic_examples/simple_image_classifier.py $@ + +python ${dir_path}/basic_examples/backbone_image_classifier.py $@ + +python ${dir_path}/basic_examples/autoencoder.py $@
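
For orientation, the logging convention this series migrates the docs and tests toward (calling self.log / self.log_dict instead of returning {'loss': ..., 'log': {...}} dictionaries from steps) looks roughly like the minimal sketch below. LitSketch is a hypothetical module written for illustration only — it is not part of any patch above — and it assumes batches are plain float tensors of shape (N, 32), in the style of the BoringModel used by the tests.

import torch
from pytorch_lightning import LightningModule


class LitSketch(LightningModule):
    """Hypothetical module illustrating self.log / self.log_dict usage (not part of the patches)."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        loss = self.layer(batch).sum()
        # log metrics directly instead of returning them under a 'log' key
        self.log_dict({"train_loss": loss})
        return loss

    def validation_step(self, batch, batch_idx):
        val_loss = self.layer(batch).sum()
        # logged keys end up in trainer.callback_metrics, which is what
        # EarlyStopping(monitor=...) and ModelCheckpoint(monitor=...) read
        self.log("val_loss", val_loss, prog_bar=True, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

With metrics routed through self.log, the trainer no longer needs to harvest arbitrary keys from step return values, which is the plumbing removed from process_dict_result and the logger connector in #6682.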