diff --git a/.drone.yml b/.drone.yml
index c87130844c040..b0b6c3df1b699 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -33,11 +33,10 @@ steps:
     - python --version
     - pip --version
     - nvidia-smi
-    - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir
-    - pip install git+https://${AUTH_TOKEN}@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 -v --no-cache-dir
+    - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed --no-cache-dir
+    - pip install git+https://${AUTH_TOKEN}@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir
     # when Image has defined CUDa version we can switch to this package spec "nvidia-dali-cuda${CUDA_VERSION%%.*}0"
-    # todo: temprarl fix till https://github.com/PyTorchLightning/pytorch-lightning/pull/4922 is resolved
-    - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda100<0.27" --upgrade-strategy only-if-needed
+    - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 --upgrade-strategy only-if-needed
     - pip list
     - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=25 # --flake8
     # Running special tests
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index b8ca5d8723b39..3543891cf7698 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -26,7 +26,7 @@ jobs:
       - name: Get release version
         if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'release'
         id: get_version
-        run: echo ::set-env name=RELEASE_VERSION::$(echo ${GITHUB_REF##*/})
+        run: echo "RELEASE_VERSION=${GITHUB_REF##*/}" >> "$GITHUB_OUTPUT"  # step output; `::set-output` is deprecated
 
       - name: Publish Releases to Docker
         # only on releases
@@ -37,6 +37,6 @@ jobs:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
           dockerfile: dockers/release/Dockerfile
-          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }}
-          tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
+          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
+          tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
         timeout-minutes: 55
diff --git a/.mergify.yml b/.mergify.yml
index 44c48f2ddced5..cb5ef3ec7519a 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -12,59 +12,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-pull_request_rules:
-
-  - name: Automatic merge on approval
-    conditions:
-      - base=master
-      # number of review approvals
-      - "#approved-reviews-by>=3"
-      # no waiting or assigned review
-      - "#review-requested=0"
-      # no requested chnages from any reviewer
-      - "#changes-requested-reviews-by=0"
-      # this serves as ALL check has to pass as we have actually around 40 tests in total
-      - "#status-success>=54"
-      # this is just in case since we rely on GPU tests (note: redundand to the above)
-      - status-success=continuous-integration/drone/pr
-      - "status-success=ci/circleci: TPU-tests"
-      # this is patter-like, unofrunatly serves as `any(...)` (note: redundand to the above)
-      #- "status-success~=^ci/circleci:"
-      # no conflict with master branch
-      - -conflict
-      # was not closed yet
-      - -closed
-      # filter-out GH draft PRs
-      - -draft
-    actions:
-      delete_head_branch: {}
-      merge:
-        # https://doc.mergify.io/merge-action.html#strict-merge
-        # (on head branch) $ git merge --no-ff base
-        # (on head branch) # Wait for CI to go green
-        # (on head branch) # Squash all commits
-        # (on base branch) $ git merge --ff head
-        strict: true
-        method: squash
-      comment:
-        message: Great job! =)
-
-  - name: warn on conflicts
-    conditions:
-      - conflict
-      # filter-out GH draft PRs
-      - -draft
-    actions:
-      comment:
-        message: This pull request is now in conflict... :(
-
-  - name: add core reviewer
-    conditions:
-      # filter-out GH draft PRs
-      - -draft
-      # number of review approvals
-      - "#approved-reviews-by<3"
-    actions:
-      request_reviews:
-        teams:
-          - core-contributors
+#pull_request_rules:
+#
+#  - name: Automatic merge on approval
+#    conditions:
+#      - base=master
+#      # number of review approvals
+#      - "#approved-reviews-by>=3"
+#      # no waiting or assigned review
+#      - "#review-requested=0"
+#      # no requested chnages from any reviewer
+#      - "#changes-requested-reviews-by=0"
+#      # this serves as ALL check has to pass as we have actually around 40 tests in total
+#      - "#status-success>=54"
+#      # this is just in case since we rely on GPU tests (note: redundand to the above)
+#      - status-success=continuous-integration/drone/pr
+#      - "status-success=ci/circleci: TPU-tests"
+#      # this is patter-like, unofrunatly serves as `any(...)` (note: redundand to the above)
+#      #- "status-success~=^ci/circleci:"
+#      # no conflict with master branch
+#      - -conflict
+#      # was not closed yet
+#      - -closed
+#      # filter-out GH draft PRs
+#      - -draft
+#    actions:
+#      delete_head_branch: {}
+#      merge:
+#        # https://doc.mergify.io/merge-action.html#strict-merge
+#        # (on head branch) $ git merge --no-ff base
+#        # (on head branch) # Wait for CI to go green
+#        # (on head branch) # Squash all commits
+#        # (on base branch) $ git merge --ff head
+#        strict: true
+#        method: squash
+#      comment:
+#        message: Great job! =)
+#
+#  - name: warn on conflicts
+#    conditions:
+#      - conflict
+#      # filter-out GH draft PRs
+#      - -draft
+#    actions:
+#      comment:
+#        message: This pull request is now in conflict... :(
+#
+#  - name: add core reviewer
+#    conditions:
+#      # filter-out GH draft PRs
+#      - -draft
+#      # number of review approvals
+#      - "#approved-reviews-by<3"
+#    actions:
+#      request_reviews:
+#        teams:
+#          - core-contributors
diff --git a/.update.sh b/.update.sh
deleted file mode 100644
index 40fcc22d6b79b..0000000000000
--- a/.update.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-version=$1
-
-git commit -am "release v$version"
-git tag $version -m "test_tube v$version"
-git push --tags origin master
-
-# push to pypi
-rm -rf ./dist/*
-python3 setup.py sdist
-twine upload dist/*
-
-# to update docs
-# cd to root dir
-# mkdocs gh-deploy
-
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 233d54476889b..04cc5a71d728f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,64 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed distributed setting and `ddp_cpu` only with `num_processes>1` ([#5297](https://github.com/PyTorchLightning/pytorch-lightning/pull/5297))
 
 
+- Fixed the saved filename in `ModelCheckpoint` when it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861))
+
+
+- Fixed `DDPHPCAccelerator` hangs in DDP construction by calling `init_device` ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157))
+
+
+## [1.1.2] - 2020-12-23
+
+### Added
+
+- Support number for logging with `sync_dist=True` ([#5080](https://github.com/PyTorchLightning/pytorch-lightning/pull/5080))
+- Added offset logging step when resuming for Wandb logger ([#5050](https://github.com/PyTorchLightning/pytorch-lightning/pull/5050))
+
+### Removed
+
+- `enable_pl_optimizer=False` by default to temporarily fix AMP issues ([#5163](https://github.com/PyTorchLightning/pytorch-lightning/pull/5163))
+
+### Fixed
+
+- Metric reduction with Logging ([#5150](https://github.com/PyTorchLightning/pytorch-lightning/pull/5150))
+- Remove nan loss in manual optimization ([#5121](https://github.com/PyTorchLightning/pytorch-lightning/pull/5121))
+- Un-balanced logging properly supported ([#5119](https://github.com/PyTorchLightning/pytorch-lightning/pull/5119))
+- Fix hanging in DDP HPC accelerators ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157))
+- Fix saved filename in `ModelCheckpoint` if it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861))
+- Fix reset `TensorRunningAccum` ([#5106](https://github.com/PyTorchLightning/pytorch-lightning/pull/5106))
+- Updated `DALIClassificationLoader` to not use deprecated arguments ([#4925](https://github.com/PyTorchLightning/pytorch-lightning/pull/4925))
+- Corrected call to `torch.no_grad` ([#5124](https://github.com/PyTorchLightning/pytorch-lightning/pull/5124))
+
+
+## [1.1.1] - 2020-12-15
+
+### Added
+
+- Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning ([#4818](https://github.com/PyTorchLightning/pytorch-lightning/pull/4818))
+
+### Changed
+
+- Simplify accelerator steps ([#5015](https://github.com/PyTorchLightning/pytorch-lightning/pull/5015))
+- Refactor load in checkpoint connector ([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593))
+
+### Removed
+
+- Drop duplicate metrics ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014))
+- Remove beta arg from F1 class and functional ([#5076](https://github.com/PyTorchLightning/pytorch-lightning/pull/5076))
+
+### Fixed
+
+- Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/PyTorchLightning/pytorch-lightning/pull/4915))
+- Fixed `LightningOptimizer` to expose optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095))
+- Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057))
+- Check if optimizer supports closure ([#4981](https://github.com/PyTorchLightning/pytorch-lightning/pull/4981))
+- Extend LightningOptimizer to expose the underlying Optimizer attributes + update doc ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095))
+- Add deprecated metric utility functions back to functional (
+    [#5067](https://github.com/PyTorchLightning/pytorch-lightning/pull/5067),
+    [#5068](https://github.com/PyTorchLightning/pytorch-lightning/pull/5068))
+- Allow any input in `to_onnx` and `to_torchscript` ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378))
+- Do not warn when the name key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057))
+
 
 ## [1.1.0] - 2020-12-09
 
@@ -65,8 +123,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added logging using `self.log` in train and evaluation for epoch end hooks (
     [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552),
     [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495),
-    [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439))
-    [#4684](https://github.com/PyTorchLightning/pytorch-lightning/pull/4684))
+    [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439),
+    [#4684](https://github.com/PyTorchLightning/pytorch-lightning/pull/4684),
     [#4913](https://github.com/PyTorchLightning/pytorch-lightning/pull/4913))
 - Added ability for DDP plugin to modify optimizer state saving ([#4675](https://github.com/PyTorchLightning/pytorch-lightning/pull/4675))
 - Added casting to python types for numpy scalars when logging hparams ([#4647](https://github.com/PyTorchLightning/pytorch-lightning/pull/4647))
diff --git a/Makefile b/Makefile
index 76e8bac4e3748..55a95f0b14af2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: test
+.PHONY: test clean
 
 test:
 	# install APEX, see https://github.com/NVIDIA/apex#linux
@@ -13,3 +13,7 @@ test:
 
 	# specific file
 	# python -m coverage run --source pytorch_lightning -m py.test --flake8 --durations=0 -v -k
+
+clean:
+	# clean all temp runs
+	rm -rf $(shell find . -name "mlruns" )
diff --git a/README.md b/README.md
index a5c6bbb244730..84d9571395519 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,11 @@ Scale your models, not the boilerplate.**
 
 ---
 
+## NEWS
+[Dec 2020 - Read about how Facebook uses Lightning to standardize deep learning across research and production teams](https://ai.facebook.com/blog/reengineering-facebook-ais-deep-learning-platforms-for-interoperability)
+
+---
+
 ## PyTorch Lightning is just organized PyTorch
 Lightning disentangles PyTorch code to decouple the science from the engineering.
 ![PT to PL](docs/source/_images/general/pl_quick_start_full_compressed.gif)
@@ -73,19 +78,6 @@ Lightning can automatically export to ONNX or TorchScript for those cases.
 
 ---
 
-## Trending contributors
-
-[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/0)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/0)
-[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/1)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/1)
-[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/2)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/2)
-[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/3)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/3)
-[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/4)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/4)
-[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/5)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/5)
-[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/6)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/6)
-[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/7)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/7)
-
----
-
 ## Continuous Integration
 <center>
 
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index e69de29bb2d1d..734288b07235d 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -0,0 +1,17 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+BENCHMARK_ROOT = os.path.dirname(__file__)
+PROJECT_ROOT = os.path.dirname(BENCHMARK_ROOT)
diff --git a/benchmarks/generate_comparison.py b/benchmarks/generate_comparison.py
new file mode 100644
index 0000000000000..69eb47cb7e759
--- /dev/null
+++ b/benchmarks/generate_comparison.py
@@ -0,0 +1,60 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import matplotlib.pylab as plt
+import pandas as pd
+
+from benchmarks.test_basic_parity import lightning_loop, vanilla_loop
+from tests.base.models import ParityModuleMNIST, ParityModuleRNN
+
+NUM_EPOCHS = 20
+NUM_RUNS = 50
+MODEL_CLASSES = (ParityModuleRNN, ParityModuleMNIST)
+PATH_HERE = os.path.dirname(__file__)
+FIGURE_EXTENSION = '.png'
+
+
+def _main():
+    """Plot per-model histograms of vanilla PyTorch vs. Lightning run durations and save the figure.
+
+    Timings are read from a cached ``dump-times_<Model>.csv`` next to this script when it exists;
+    otherwise both loops are executed and the CSV is written for later reuse.
+    """
+    # squeeze=False keeps `axarr` a 2D array even when only one model class is configured
+    fig, axarr = plt.subplots(nrows=len(MODEL_CLASSES), squeeze=False)
+
+    for i, cls_model in enumerate(MODEL_CLASSES):
+        ax = axarr[i, 0]
+        path_csv = os.path.join(PATH_HERE, f'dump-times_{cls_model.__name__}.csv')
+        if os.path.isfile(path_csv):
+            df_time = pd.read_csv(path_csv, index_col=0)
+        else:
+            vanilla = vanilla_loop(cls_model, num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS)
+            lightning = lightning_loop(cls_model, num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS)
+
+            # the first run of each loop is dropped, as in test_basic_parity (it includes dataset setup)
+            df_time = pd.DataFrame({'vanilla PT': vanilla['durations'][1:], 'PT Lightning': lightning['durations'][1:]})
+            df_time /= NUM_RUNS
+            df_time.to_csv(path_csv)
+        # todo: add also relative X-axis ticks to see both: relative and absolute time differences
+        df_time.plot.hist(ax=ax, bins=20, alpha=0.5, title=cls_model.__name__, legend=True, grid=True)
+        ax.set(xlabel='time [seconds]')
+
+    path_fig = os.path.join(PATH_HERE, f'figure-parity-times{FIGURE_EXTENSION}')
+    fig.tight_layout()
+    fig.savefig(path_fig)
+
+if __name__ == '__main__':
+    _main()
diff --git a/benchmarks/test_parity.py b/benchmarks/test_basic_parity.py
similarity index 60%
rename from benchmarks/test_parity.py
rename to benchmarks/test_basic_parity.py
index 41bba9533e10d..c85984b092b9d 100644
--- a/benchmarks/test_parity.py
+++ b/benchmarks/test_basic_parity.py
@@ -1,11 +1,26 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 
 import numpy as np
 import pytest
 import torch
+from tqdm import tqdm
 
+from pytorch_lightning import seed_everything, Trainer
 import tests.base.develop_utils as tutils
-from pytorch_lightning import Trainer, seed_everything
 from tests.base.models import ParityModuleMNIST, ParityModuleRNN
 
 
@@ -15,34 +30,33 @@
     (ParityModuleMNIST, 0.25),  # todo: lower this thr
 ])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
-def test_pytorch_parity(tmpdir, cls_model, max_diff):
+def test_pytorch_parity(tmpdir, cls_model, max_diff: float, num_epochs: int = 4, num_runs: int = 3):
     """
     Verify that the same  pytorch and lightning models achieve the same results
     """
-    num_epochs = 4
-    num_rums = 3
-    lightning_outs, pl_times = lightning_loop(cls_model, num_rums, num_epochs)
-    manual_outs, pt_times = vanilla_loop(cls_model, num_rums, num_epochs)
+    lightning = lightning_loop(cls_model, num_runs, num_epochs)
+    vanilla = vanilla_loop(cls_model, num_runs, num_epochs)
 
     # make sure the losses match exactly  to 5 decimal places
-    for pl_out, pt_out in zip(lightning_outs, manual_outs):
+    for pl_out, pt_out in zip(lightning['losses'], vanilla['losses']):
         np.testing.assert_almost_equal(pl_out, pt_out, 5)
 
     # the fist run initialize dataset (download & filter)
-    tutils.assert_speed_parity_absolute(pl_times[1:], pt_times[1:],
-                                        nb_epochs=num_epochs, max_diff=max_diff)
+    tutils.assert_speed_parity_absolute(
+        lightning['durations'][1:], vanilla['durations'][1:], nb_epochs=num_epochs, max_diff=max_diff
+    )
 
 
 def vanilla_loop(cls_model, num_runs=10, num_epochs=10):
     """
     Returns an array with the last loss from each epoch for each run
     """
-    device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
-    errors = []
-    times = []
+    hist_losses = []
+    hist_durations = []
 
+    device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
     torch.backends.cudnn.deterministic = True
-    for i in range(num_runs):
+    for i in tqdm(range(num_runs), desc=f'Vanilla PT with {cls_model.__name__}'):
         time_start = time.perf_counter()
 
         # set seed
@@ -74,18 +88,21 @@ def vanilla_loop(cls_model, num_runs=10, num_epochs=10):
             epoch_losses.append(loss.item())
 
         time_end = time.perf_counter()
-        times.append(time_end - time_start)
+        hist_durations.append(time_end - time_start)
 
-        errors.append(epoch_losses[-1])
+        hist_losses.append(epoch_losses[-1])
 
-    return errors, times
+    return {
+        'losses': hist_losses,
+        'durations': hist_durations,
+    }
 
 
 def lightning_loop(cls_model, num_runs=10, num_epochs=10):
-    errors = []
-    times = []
+    hist_losses = []
+    hist_durations = []
 
-    for i in range(num_runs):
+    for i in tqdm(range(num_runs), desc=f'PT Lightning with {cls_model.__name__}'):
         time_start = time.perf_counter()
 
         # set seed
@@ -108,9 +125,12 @@ def lightning_loop(cls_model, num_runs=10, num_epochs=10):
         trainer.fit(model)
 
         final_loss = trainer.train_loop.running_loss.last().item()
-        errors.append(final_loss)
+        hist_losses.append(final_loss)
 
         time_end = time.perf_counter()
-        times.append(time_end - time_start)
+        hist_durations.append(time_end - time_start)
 
-    return errors, times
+    return {
+        'losses': hist_losses,
+        'durations': hist_durations,
+    }
diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py
index 0f58cb882bcf9..c3273b6956698 100644
--- a/benchmarks/test_sharded_parity.py
+++ b/benchmarks/test_sharded_parity.py
@@ -1,3 +1,17 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import platform
 import time
@@ -6,7 +20,7 @@
 import pytest
 import torch
 
-from pytorch_lightning import Trainer, seed_everything
+from pytorch_lightning import seed_everything, Trainer
 from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin
 from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE
diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile
index 8eb093295c37b..5dfeac8c9e86e 100644
--- a/dockers/base-xla/Dockerfile
+++ b/dockers/base-xla/Dockerfile
@@ -97,6 +97,8 @@ RUN \
     python -c "fname = 'requirements.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torch')] ; open(fname, 'w').writelines(lines)" && \
     # drop Horovod as it is not needed
     python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
+    # drop fairscale as it is not needed
+    python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \
     # drop TorchVision as it was installed with XLA
     python -c "fname = 'requirements/examples.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torchvision')] ; open(fname, 'w').writelines(lines)" && \
     pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed && \
diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile
index a514b1c3d35fe..464f7fd8f309e 100644
--- a/dockers/tpu-tests/Dockerfile
+++ b/dockers/tpu-tests/Dockerfile
@@ -27,8 +27,10 @@ COPY ./ ./pytorch-lightning/
 RUN \
     # Install pytorch-lightning at the current PR, plus dependencies.
     #pip install -r pytorch-lightning/requirements.txt --no-cache-dir && \
-    # drop Horovod
+    # drop Horovod as it is not needed
     python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
+    # drop fairscale as it is not needed
+    python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \
     pip install -r pytorch-lightning/requirements/devel.txt --no-cache-dir --upgrade-strategy only-if-needed
 
 #RUN python -c "import pytorch_lightning as pl; print(pl.__version__)"
diff --git a/docs/source/_images/benchmarks/figure-parity-times.png b/docs/source/_images/benchmarks/figure-parity-times.png
new file mode 100644
index 0000000000000..2e8c5899020d9
Binary files /dev/null and b/docs/source/_images/benchmarks/figure-parity-times.png differ
diff --git a/docs/source/benchmarking.rst b/docs/source/benchmarking.rst
new file mode 100644
index 0000000000000..8dec7ca8c123c
--- /dev/null
+++ b/docs/source/benchmarking.rst
@@ -0,0 +1,14 @@
+Benchmark with vanilla PyTorch
+==============================
+
+This section lays the groundwork for comparing vanilla PyTorch and PyTorch Lightning in the most common scenarios.
+
+Time comparison
+---------------
+
+We regularly benchmark against a vanilla PyTorch training loop with an RNN and a simple MNIST classifier as part of our CI.
+On average, for the simple MNIST CNN classifier we are only about 0.06s slower per epoch; see the detailed chart below.
+
+.. figure:: _images/benchmarks/figure-parity-times.png
+   :alt: Speed parity to vanilla PT, created on 2020-12-16
+   :width: 500
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1049a6d16a75d..2b7d9c3b58e26 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -24,6 +24,7 @@ PyTorch Lightning Documentation
    style_guide
    performance
    Lightning project template<https://github.com/PyTorchLightning/pytorch-lightning-conference-seed>
+   benchmarking
 
 
 .. toctree::
diff --git a/notebooks/04-transformers-text-classification.ipynb b/notebooks/04-transformers-text-classification.ipynb
index 037b24e4ddd9d..d52af84a76d97 100644
--- a/notebooks/04-transformers-text-classification.ipynb
+++ b/notebooks/04-transformers-text-classification.ipynb
@@ -1,5 +1,12 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb
index 6413e8239bb2e..da044a9c9b5c6 100644
--- a/notebooks/05-trainer-flags-overview.ipynb
+++ b/notebooks/05-trainer-flags-overview.ipynb
@@ -1,5 +1,12 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/05-trainer-flags-overview.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py
index d7cec9fc1bc3a..860d8b48d4393 100644
--- a/pl_examples/__init__.py
+++ b/pl_examples/__init__.py
@@ -2,9 +2,46 @@
 
 from pytorch_lightning.utilities import _module_available
 
-EXAMPLES_ROOT = os.path.dirname(__file__)
-PACKAGE_ROOT = os.path.dirname(EXAMPLES_ROOT)
-DATASETS_PATH = os.path.join(PACKAGE_ROOT, 'Datasets')
+_EXAMPLES_ROOT = os.path.dirname(__file__)
+_PACKAGE_ROOT = os.path.dirname(_EXAMPLES_ROOT)
+_DATASETS_PATH = os.path.join(_PACKAGE_ROOT, 'Datasets')
 
-TORCHVISION_AVAILABLE = _module_available("torchvision")
-DALI_AVAILABLE = _module_available("nvidia.dali")
+_TORCHVISION_AVAILABLE = _module_available("torchvision")
+_DALI_AVAILABLE = _module_available("nvidia.dali")
+
+
+LIGHTNING_LOGO = """
+                    ####
+                ###########
+             ####################
+         ############################
+    #####################################
+##############################################
+#########################  ###################
+#######################    ###################
+####################      ####################
+##################       #####################
+################        ######################
+#####################        #################
+######################     ###################
+#####################    #####################
+####################   #######################
+###################  #########################
+##############################################
+    #####################################
+         ############################
+             ####################
+                  ##########
+                     ####
+"""
+
+
+def nice_print(msg, last=False):
+    print()
+    print("\033[0;35m" + msg + "\033[0m")
+    if last:
+        print()
+
+
+def cli_lightning_logo():
+    nice_print(LIGHTNING_LOGO)
diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py
index 58a117a648458..eb540d16bf2cf 100644
--- a/pl_examples/basic_examples/autoencoder.py
+++ b/pl_examples/basic_examples/autoencoder.py
@@ -21,9 +21,9 @@
 from torch.utils.data import random_split
 
 import pytorch_lightning as pl
-from pl_examples import TORCHVISION_AVAILABLE
+from pl_examples import _TORCHVISION_AVAILABLE, cli_lightning_logo
 
-if TORCHVISION_AVAILABLE:
+if _TORCHVISION_AVAILABLE:
     from torchvision.datasets.mnist import MNIST
     from torchvision import transforms
 else:
@@ -31,6 +31,13 @@
 
 
 class LitAutoEncoder(pl.LightningModule):
+    """
+    >>> LitAutoEncoder()  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    LitAutoEncoder(
+      (encoder): ...
+      (decoder): ...
+    )
+    """
 
     def __init__(self):
         super().__init__()
@@ -105,4 +112,5 @@ def cli_main():
 
 
 if __name__ == '__main__':
+    cli_lightning_logo()
     cli_main()
diff --git a/pl_examples/basic_examples/backbone_image_classifier.py b/pl_examples/basic_examples/backbone_image_classifier.py
index 91a8481de7fd9..63517dfc9ed08 100644
--- a/pl_examples/basic_examples/backbone_image_classifier.py
+++ b/pl_examples/basic_examples/backbone_image_classifier.py
@@ -19,9 +19,9 @@
 from torch.utils.data import DataLoader, random_split
 
 import pytorch_lightning as pl
-from pl_examples import DATASETS_PATH, TORCHVISION_AVAILABLE
+from pl_examples import _DATASETS_PATH, _TORCHVISION_AVAILABLE, cli_lightning_logo
 
-if TORCHVISION_AVAILABLE:
+if _TORCHVISION_AVAILABLE:
     from torchvision.datasets.mnist import MNIST
     from torchvision import transforms
 else:
@@ -29,6 +29,13 @@
 
 
 class Backbone(torch.nn.Module):
+    """
+    >>> Backbone()  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    Backbone(
+      (l1): Linear(...)
+      (l2): Linear(...)
+    )
+    """
     def __init__(self, hidden_dim=128):
         super().__init__()
         self.l1 = torch.nn.Linear(28 * 28, hidden_dim)
@@ -42,6 +49,12 @@ def forward(self, x):
 
 
 class LitClassifier(pl.LightningModule):
+    """
+    >>> LitClassifier(Backbone())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    LitClassifier(
+      (backbone): ...
+    )
+    """
     def __init__(self, backbone, learning_rate=1e-3):
         super().__init__()
         self.save_hyperparameters()
@@ -98,8 +111,8 @@ def cli_main():
     # ------------
     # data
     # ------------
-    dataset = MNIST(DATASETS_PATH, train=True, download=True, transform=transforms.ToTensor())
-    mnist_test = MNIST(DATASETS_PATH, train=False, download=True, transform=transforms.ToTensor())
+    dataset = MNIST(_DATASETS_PATH, train=True, download=True, transform=transforms.ToTensor())
+    mnist_test = MNIST(_DATASETS_PATH, train=False, download=True, transform=transforms.ToTensor())
     mnist_train, mnist_val = random_split(dataset, [55000, 5000])
 
     train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
@@ -125,4 +138,5 @@ def cli_main():
 
 
 if __name__ == '__main__':
+    cli_lightning_logo()
     cli_main()
diff --git a/pl_examples/basic_examples/conv_sequential_example.py b/pl_examples/basic_examples/conv_sequential_example.py
index 4c2986701b27c..84efb4bea7670 100644
--- a/pl_examples/basic_examples/conv_sequential_example.py
+++ b/pl_examples/basic_examples/conv_sequential_example.py
@@ -29,6 +29,7 @@
 import torchvision
 
 import pytorch_lightning as pl
+from pl_examples import cli_lightning_logo
 from pytorch_lightning import Trainer
 from pytorch_lightning.metrics.functional import accuracy
 from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin
@@ -54,6 +55,12 @@ def forward(self, x):
 
 
 class LitResnet(pl.LightningModule):
+    """
+    >>> LitResnet()  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    LitResnet(
+      (sequential_module): Sequential(...)
+    )
+    """
     def __init__(self, lr=0.05, batch_size=32, manual_optimization=False):
         super().__init__()
 
@@ -190,6 +197,7 @@ def instantiate_datamodule(args):
 
 
 if __name__ == "__main__":
+    cli_lightning_logo()
     parser = ArgumentParser(description="Pipe Example")
     parser.add_argument("--use_ddp_sequential", action="store_true")
     parser = Trainer.add_argparse_args(parser)
diff --git a/pl_examples/basic_examples/dali_image_classifier.py b/pl_examples/basic_examples/dali_image_classifier.py
index 291490d6f93e0..e163cb4a6f156 100644
--- a/pl_examples/basic_examples/dali_image_classifier.py
+++ b/pl_examples/basic_examples/dali_image_classifier.py
@@ -15,6 +15,7 @@
 from argparse import ArgumentParser
 from random import shuffle
 from warnings import warn
+from distutils.version import LooseVersion
 
 import numpy as np
 import torch
@@ -22,21 +23,26 @@
 from torch.utils.data import random_split
 
 import pytorch_lightning as pl
-from pl_examples import TORCHVISION_AVAILABLE, DALI_AVAILABLE
+from pl_examples import _TORCHVISION_AVAILABLE, _DALI_AVAILABLE, cli_lightning_logo
 
-if TORCHVISION_AVAILABLE:
+if _TORCHVISION_AVAILABLE:
     from torchvision.datasets.mnist import MNIST
     from torchvision import transforms
 else:
     from tests.base.datasets import MNIST
 
-if DALI_AVAILABLE:
-    import nvidia.dali.ops as ops
+if _DALI_AVAILABLE:
+    from nvidia.dali import ops
     from nvidia.dali.pipeline import Pipeline
     from nvidia.dali.plugin.pytorch import DALIClassificationIterator
+    from nvidia.dali import __version__ as dali_version
+
+    NEW_DALI_API = LooseVersion(dali_version) >= LooseVersion('0.28.0')
+    if NEW_DALI_API:
+        from nvidia.dali.plugin.base_iterator import LastBatchPolicy
 else:
     warn('NVIDIA DALI is not available')
-    ops, Pipeline, DALIClassificationIterator = ..., ABC, ABC
+    ops, Pipeline, DALIClassificationIterator, LastBatchPolicy = ..., ABC, ABC, ABC
 
 
 class ExternalMNISTInputIterator(object):
@@ -98,11 +104,18 @@ def __init__(
             dynamic_shape=False,
             last_batch_padded=False,
     ):
-        super().__init__(pipelines, size, reader_name, auto_reset, fill_last_batch, dynamic_shape, last_batch_padded)
+        if NEW_DALI_API:
+            last_batch_policy = LastBatchPolicy.FILL if fill_last_batch else LastBatchPolicy.DROP
+            super().__init__(pipelines, size, reader_name, auto_reset, dynamic_shape,
+                             last_batch_policy=last_batch_policy, last_batch_padded=last_batch_padded)
+        else:
+            super().__init__(pipelines, size, reader_name, auto_reset, fill_last_batch,
+                             dynamic_shape, last_batch_padded)
+        self._fill_last_batch = fill_last_batch
 
     def __len__(self):
         batch_count = self._size // (self._num_gpus * self.batch_size)
-        last_batch = 1 if self._fill_last_batch else 0
+        last_batch = 1 if self._fill_last_batch else 0  # not filling -> no extra batch (LastBatchPolicy.DROP on new DALI)
         return batch_count + last_batch
 
 
@@ -153,7 +166,7 @@ def add_model_specific_args(parent_parser):
 
 
 def cli_main():
-    if not DALI_AVAILABLE:
+    if not _DALI_AVAILABLE:
         return
 
     pl.seed_everything(1234)
@@ -179,7 +192,7 @@ def cli_main():
     eii_test = ExternalMNISTInputIterator(mnist_test, args.batch_size)
 
     pipe_train = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_train, num_threads=2, device_id=0)
-    train_loader = DALIClassificationLoader(pipe_train, size=len(mnist_train), auto_reset=True, fill_last_batch=False)
+    train_loader = DALIClassificationLoader(pipe_train, size=len(mnist_train), auto_reset=True, fill_last_batch=True)
 
     pipe_val = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_val, num_threads=2, device_id=0)
     val_loader = DALIClassificationLoader(pipe_val, size=len(mnist_val), auto_reset=True, fill_last_batch=False)
@@ -205,4 +218,5 @@ def cli_main():
 
 
 if __name__ == "__main__":
+    cli_lightning_logo()
     cli_main()
diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py
index eb1415cf8b981..6c33e4f1b77d9 100644
--- a/pl_examples/basic_examples/mnist_datamodule.py
+++ b/pl_examples/basic_examples/mnist_datamodule.py
@@ -16,10 +16,10 @@
 
 from torch.utils.data import DataLoader, random_split
 
-from pl_examples import DATASETS_PATH, TORCHVISION_AVAILABLE
+from pl_examples import _DATASETS_PATH, _TORCHVISION_AVAILABLE
 from pytorch_lightning import LightningDataModule
 
-if TORCHVISION_AVAILABLE:
+if _TORCHVISION_AVAILABLE:
     from torchvision import transforms as transform_lib
     from torchvision.datasets import MNIST
 else:
@@ -29,13 +29,16 @@
 class MNISTDataModule(LightningDataModule):
     """
     Standard MNIST, train, val, test splits and transforms
+
+    >>> MNISTDataModule()  # doctest: +ELLIPSIS
+    <...mnist_datamodule.MNISTDataModule object at ...>
     """
 
     name = "mnist"
 
     def __init__(
         self,
-        data_dir: str = DATASETS_PATH,
+        data_dir: str = _DATASETS_PATH,
         val_split: int = 5000,
         num_workers: int = 16,
         normalize: bool = False,
@@ -120,7 +123,7 @@ def test_dataloader(self):
 
     @property
     def default_transforms(self):
-        if not TORCHVISION_AVAILABLE:
+        if not _TORCHVISION_AVAILABLE:
             return None
         if self.normalize:
             mnist_transforms = transform_lib.Compose(
diff --git a/pl_examples/basic_examples/simple_image_classifier.py b/pl_examples/basic_examples/simple_image_classifier.py
index a341728554d31..894eeea619ba9 100644
--- a/pl_examples/basic_examples/simple_image_classifier.py
+++ b/pl_examples/basic_examples/simple_image_classifier.py
@@ -19,10 +19,18 @@
 from torch.nn import functional as F
 
 import pytorch_lightning as pl
+from pl_examples import cli_lightning_logo
 from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule
 
 
 class LitClassifier(pl.LightningModule):
+    """
+    >>> LitClassifier()  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    LitClassifier(
+      (l1): Linear(...)
+      (l2): Linear(...)
+    )
+    """
     def __init__(self, hidden_dim=128, learning_rate=1e-3):
         super().__init__()
         self.save_hyperparameters()
@@ -103,4 +111,5 @@ def cli_main():
 
 
 if __name__ == '__main__':
+    cli_lightning_logo()
     cli_main()
diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py
index dbea2013d1110..30345122e251f 100644
--- a/pl_examples/bug_report_model.py
+++ b/pl_examples/bug_report_model.py
@@ -22,10 +22,16 @@
 import os
 import torch
 from torch.utils.data import Dataset
+
+from pl_examples import cli_lightning_logo
 from pytorch_lightning import Trainer, LightningModule
 
 
 class RandomDataset(Dataset):
+    """
+    >>> RandomDataset(size=10, length=20)  # doctest: +ELLIPSIS
+    <...bug_report_model.RandomDataset object at ...>
+    """
     def __init__(self, size, length):
         self.len = length
         self.data = torch.randn(length, size)
@@ -38,6 +44,12 @@ def __len__(self):
 
 
 class BoringModel(LightningModule):
+    """
+    >>> BoringModel()  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    BoringModel(
+      (layer): Linear(...)
+    )
+    """
 
     def __init__(self):
         """
@@ -111,10 +123,9 @@ def configure_optimizers(self):
 #  parser = ArgumentParser()
 #  args = parser.parse_args(opt)
 
-def run_test():
+def test_run():
 
     class TestModel(BoringModel):
-
         def on_train_epoch_start(self) -> None:
             print('override any method to prove your bug')
 
@@ -137,4 +148,5 @@ def on_train_epoch_start(self) -> None:
 
 
 if __name__ == '__main__':
-    run_test()
+    cli_lightning_logo()
+    test_run()
diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py
index 21f6644b09a5b..4392ac47e837f 100644
--- a/pl_examples/domain_templates/computer_vision_fine_tuning.py
+++ b/pl_examples/domain_templates/computer_vision_fine_tuning.py
@@ -1,3 +1,16 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Computer vision example on Transfer Learning.
 
 This computer vision example illustrates how one could fine-tune a pre-trained
@@ -40,6 +53,7 @@
 from torchvision.datasets.utils import download_and_extract_archive
 
 import pytorch_lightning as pl
+from pl_examples import cli_lightning_logo
 from pytorch_lightning import _logger as log
 
 BN_TYPES = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)
@@ -145,20 +159,30 @@ def _unfreeze_and_add_param_group(module: Module,
 class TransferLearningModel(pl.LightningModule):
     """Transfer Learning with pre-trained ResNet50.
 
-    Args:
-        hparams: Model hyperparameters
-        dl_path: Path where the data will be downloaded
+    >>> with TemporaryDirectory(dir='.') as tmp_dir:
+    ...     TransferLearningModel(tmp_dir)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    TransferLearningModel(
+      (feature_extractor): Sequential(...)
+      (fc): Sequential(...)
+    )
     """
-    def __init__(self,
-                 dl_path: Union[str, Path],
-                 backbone: str = 'resnet50',
-                 train_bn: bool = True,
-                 milestones: tuple = (5, 10),
-                 batch_size: int = 8,
-                 lr: float = 1e-2,
-                 lr_scheduler_gamma: float = 1e-1,
-                 num_workers: int = 6, **kwargs) -> None:
-        super().__init__()
+    def __init__(
+            self,
+            dl_path: Union[str, Path],
+            backbone: str = 'resnet50',
+            train_bn: bool = True,
+            milestones: tuple = (5, 10),
+            batch_size: int = 8,
+            lr: float = 1e-2,
+            lr_scheduler_gamma: float = 1e-1,
+            num_workers: int = 6,
+            **kwargs,
+    ) -> None:
+        """
+        Args:
+            dl_path: Path where the data will be downloaded
+        """
+        super().__init__(**kwargs)
         self.dl_path = dl_path
         self.backbone = backbone
         self.train_bn = train_bn
@@ -451,4 +475,5 @@ def get_args() -> argparse.Namespace:
 
 
 if __name__ == '__main__':
+    cli_lightning_logo()
     main(get_args())
diff --git a/pl_examples/domain_templates/generative_adversarial_net.py b/pl_examples/domain_templates/generative_adversarial_net.py
index 088b625e31d01..b0c324c193574 100644
--- a/pl_examples/domain_templates/generative_adversarial_net.py
+++ b/pl_examples/domain_templates/generative_adversarial_net.py
@@ -1,3 +1,16 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 To run this template just do:
 python generative_adversarial_net.py
@@ -18,12 +31,19 @@
 from torch.utils.data import DataLoader
 from torchvision.datasets import MNIST
 
+from pl_examples import cli_lightning_logo
 from pytorch_lightning.core import LightningModule, LightningDataModule
 from pytorch_lightning.trainer import Trainer
 
 
 class Generator(nn.Module):
-    def __init__(self, latent_dim, img_shape):
+    """
+    >>> Generator(img_shape=(1, 8, 8))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    Generator(
+      (model): Sequential(...)
+    )
+    """
+    def __init__(self, latent_dim: int = 100, img_shape: tuple = (1, 28, 28)):
         super().__init__()
         self.img_shape = img_shape
 
@@ -50,6 +70,12 @@ def forward(self, z):
 
 
 class Discriminator(nn.Module):
+    """
+    >>> Discriminator(img_shape=(1, 28, 28))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    Discriminator(
+      (model): Sequential(...)
+    )
+    """
     def __init__(self, img_shape):
         super().__init__()
 
@@ -69,6 +95,37 @@ def forward(self, img):
 
 
 class GAN(LightningModule):
+    """
+    >>> GAN(img_shape=(1, 8, 8))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    GAN(
+      (generator): Generator(
+        (model): Sequential(...)
+      )
+      (discriminator): Discriminator(
+        (model): Sequential(...)
+      )
+    )
+    """
+    def __init__(
+            self,
+            img_shape: tuple = (1, 28, 28),
+            lr: float = 0.0002,
+            b1: float = 0.5,
+            b2: float = 0.999,
+            latent_dim: int = 100,
+    ):
+        super().__init__()
+        # make the constructor arguments available as ``self.hparams`` (read back just below)
+        self.save_hyperparameters()
+
+        # networks
+        self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=img_shape)
+        self.discriminator = Discriminator(img_shape=img_shape)
+        # 8 random latent vectors, drawn once at construction time
+        self.validation_z = torch.randn(8, self.hparams.latent_dim)
+        # all-zero sample input of shape (2, latent_dim)
+        self.example_input_array = torch.zeros(2, self.hparams.latent_dim)
+
     @staticmethod
     def add_argparse_args(parent_parser: ArgumentParser):
         parser = ArgumentParser(parents=[parent_parser], add_help=False)
@@ -82,20 +139,6 @@ def add_argparse_args(parent_parser: ArgumentParser):
 
         return parser
 
-    def __init__(self, hparams: Namespace):
-        super().__init__()
-
-        self.hparams = hparams
-
-        # networks
-        mnist_shape = (1, 28, 28)
-        self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=mnist_shape)
-        self.discriminator = Discriminator(img_shape=mnist_shape)
-
-        self.validation_z = torch.randn(8, self.hparams.latent_dim)
-
-        self.example_input_array = torch.zeros(2, self.hparams.latent_dim)
-
     def forward(self, z):
         return self.generator(z)
 
@@ -166,6 +209,10 @@ def on_epoch_end(self):
 
 
 class MNISTDataModule(LightningDataModule):
+    """
+    >>> MNISTDataModule()  # doctest: +ELLIPSIS
+    <...generative_adversarial_net.MNISTDataModule object at ...>
+    """
     def __init__(self, batch_size: int = 64, data_path: str = os.getcwd(), num_workers: int = 4):
         super().__init__()
         self.batch_size = batch_size
@@ -211,6 +258,7 @@ def main(args: Namespace) -> None:
 
 
 if __name__ == '__main__':
+    cli_lightning_logo()
     parser = ArgumentParser()
 
     # Add program level args, if any.
diff --git a/pl_examples/domain_templates/imagenet.py b/pl_examples/domain_templates/imagenet.py
index b7116547d389b..cc36f3542a1c8 100644
--- a/pl_examples/domain_templates/imagenet.py
+++ b/pl_examples/domain_templates/imagenet.py
@@ -1,3 +1,16 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 This example is largely adapted from https://github.com/pytorch/examples/blob/master/imagenet/main.py
 
@@ -32,10 +45,17 @@
 import torchvision.transforms as transforms
 
 import pytorch_lightning as pl
+from pl_examples import cli_lightning_logo
 from pytorch_lightning.core import LightningModule
 
 
 class ImageNetLightningModel(LightningModule):
+    """
+    >>> ImageNetLightningModel(data_path='missing')  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    ImageNetLightningModel(
+      (model): ResNet(...)
+    )
+    """
     # pull out resnet names from torchvision models
     MODEL_NAMES = sorted(
         name for name in models.__dict__
@@ -44,14 +64,14 @@ class ImageNetLightningModel(LightningModule):
 
     def __init__(
             self,
-            arch: str,
-            pretrained: bool,
-            lr: float,
-            momentum: float,
-            weight_decay: int,
             data_path: str,
-            batch_size: int,
-            workers: int,
+            arch: str = 'resnet18',
+            pretrained: bool = False,
+            lr: float = 0.1,
+            momentum: float = 0.9,
+            weight_decay: float = 1e-4,
+            batch_size: int = 4,
+            workers: int = 2,
             **kwargs,
     ):
         super().__init__()
@@ -246,4 +266,5 @@ def run_cli():
 
 
 if __name__ == '__main__':
+    cli_lightning_logo()
     run_cli()
diff --git a/pl_examples/domain_templates/reinforce_learn_Qnet.py b/pl_examples/domain_templates/reinforce_learn_Qnet.py
index 4b01f83e36639..6aee8bb6038c1 100644
--- a/pl_examples/domain_templates/reinforce_learn_Qnet.py
+++ b/pl_examples/domain_templates/reinforce_learn_Qnet.py
@@ -1,3 +1,16 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 Deep Reinforcement Learning: Deep Q-network (DQN)
 
@@ -33,19 +46,26 @@
 from torch.utils.data.dataset import IterableDataset
 
 import pytorch_lightning as pl
+from pl_examples import cli_lightning_logo
 
 
 class DQN(nn.Module):
     """
     Simple MLP network
 
-    Args:
-        obs_size: observation/state size of the environment
-        n_actions: number of discrete actions available in the environment
-        hidden_size: size of hidden layers
+    >>> DQN(10, 5)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    DQN(
+      (net): Sequential(...)
+    )
     """
 
     def __init__(self, obs_size: int, n_actions: int, hidden_size: int = 128):
+        """
+        Args:
+            obs_size: observation/state size of the environment
+            n_actions: number of discrete actions available in the environment
+            hidden_size: size of hidden layers
+        """
         super(DQN, self).__init__()
         self.net = nn.Sequential(
             nn.Linear(obs_size, hidden_size),
@@ -67,11 +87,15 @@ class ReplayBuffer:
     """
     Replay Buffer for storing past experiences allowing the agent to learn from them
 
-    Args:
-        capacity: size of the buffer
+    >>> ReplayBuffer(5)  # doctest: +ELLIPSIS
+    <...reinforce_learn_Qnet.ReplayBuffer object at ...>
     """
 
     def __init__(self, capacity: int) -> None:
+        """
+        Args:
+            capacity: size of the buffer
+        """
         self.buffer = deque(maxlen=capacity)
 
     def __len__(self) -> int:
@@ -99,12 +123,16 @@ class RLDataset(IterableDataset):
     Iterable Dataset containing the ExperienceBuffer
     which will be updated with new experiences during training
 
-    Args:
-        buffer: replay buffer
-        sample_size: number of experiences to sample at a time
+    >>> RLDataset(ReplayBuffer(5))  # doctest: +ELLIPSIS
+    <...reinforce_learn_Qnet.RLDataset object at ...>
     """
 
     def __init__(self, buffer: ReplayBuffer, sample_size: int = 200) -> None:
+        """
+        Args:
+            buffer: replay buffer
+            sample_size: number of experiences to sample at a time
+        """
         self.buffer = buffer
         self.sample_size = sample_size
 
@@ -118,12 +146,18 @@ class Agent:
     """
     Base Agent class handling the interaction with the environment
 
-    Args:
-        env: training environment
-        replay_buffer: replay buffer storing experiences
+    >>> env = gym.make("CartPole-v0")
+    >>> buffer = ReplayBuffer(10)
+    >>> Agent(env, buffer)  # doctest: +ELLIPSIS
+    <...reinforce_learn_Qnet.Agent object at ...>
     """
 
     def __init__(self, env: gym.Env, replay_buffer: ReplayBuffer) -> None:
+        """
+        Args:
+            env: training environment
+            replay_buffer: replay buffer storing experiences
+        """
         self.env = env
         self.replay_buffer = replay_buffer
         self.reset()
@@ -190,20 +224,34 @@ def play_step(self, net: nn.Module, epsilon: float = 0.0, device: str = 'cpu') -
 
 
 class DQNLightning(pl.LightningModule):
-    """ Basic DQN Model """
-
-    def __init__(self,
-                 replay_size,
-                 warm_start_steps: int,
-                 gamma: float,
-                 eps_start: int,
-                 eps_end: int,
-                 eps_last_frame: int,
-                 sync_rate,
-                 lr: float,
-                 episode_length,
-                 batch_size, **kwargs) -> None:
-        super().__init__()
+    """ Basic DQN Model
+
+    >>> DQNLightning(env="CartPole-v0")  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    DQNLightning(
+      (net): DQN(
+        (net): Sequential(...)
+      )
+      (target_net): DQN(
+        (net): Sequential(...)
+      )
+    )
+    """
+    def __init__(
+            self,
+            env: str,
+            replay_size: int = 200,
+            warm_start_steps: int = 200,
+            gamma: float = 0.99,
+            eps_start: float = 1.0,
+            eps_end: float = 0.01,
+            eps_last_frame: int = 200,
+            sync_rate: int = 10,
+            lr: float = 1e-2,
+            episode_length: int = 50,
+            batch_size: int = 4,
+            **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
         self.replay_size = replay_size
         self.warm_start_steps = warm_start_steps
         self.gamma = gamma
@@ -215,7 +263,7 @@ def __init__(self,
         self.episode_length = episode_length
         self.batch_size = batch_size
 
-        self.env = gym.make(self.env)
+        self.env = gym.make(env)
         obs_size = self.env.observation_space.shape[0]
         n_actions = self.env.action_space.n
 
@@ -288,8 +336,7 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], nb_batch) -> O
             Training loss and log metrics
         """
         device = self.get_device(batch)
-        epsilon = max(self.eps_end, self.eps_start -
-                      self.global_step + 1 / self.eps_last_frame)
+        epsilon = max(self.eps_end, self.eps_start - self.global_step + 1 / self.eps_last_frame)
 
         # step through environment with agent
         reward, done = self.agent.play_step(self.net, epsilon, device)
@@ -335,6 +382,30 @@ def get_device(self, batch) -> str:
         """Retrieve device currently being used by minibatch"""
         return batch[0].device.index if self.on_gpu else 'cpu'
 
+    @staticmethod  # builds a child parser that adds the DQN command-line options on top of ``parent_parser``
+    def add_model_specific_args(parent_parser):  # pragma: no-cover
+        parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False)  # -h/--help comes from the parent
+        parser.add_argument("--batch_size", type=int, default=16, help="size of the batches")
+        parser.add_argument("--lr", type=float, default=1e-2, help="learning rate")
+        parser.add_argument("--env", type=str, default="CartPole-v0", help="gym environment tag")
+        parser.add_argument("--gamma", type=float, default=0.99, help="discount factor")
+        parser.add_argument("--sync_rate", type=int, default=10,
+                            help="how many frames do we update the target network")
+        parser.add_argument("--replay_size", type=int, default=1000,
+                            help="capacity of the replay buffer")
+        parser.add_argument("--warm_start_size", type=int, default=1000,
+                            help="how many samples do we use to fill our buffer at the start of training")
+        parser.add_argument("--eps_last_frame", type=int, default=1000,
+                            help="what frame should epsilon stop decaying")
+        parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon")
+        parser.add_argument("--eps_end", type=float, default=0.01, help="final value of epsilon")
+        parser.add_argument("--episode_length", type=int, default=200, help="max length of an episode")
+        parser.add_argument("--max_episode_reward", type=int, default=200,
+                            help="max episode reward in the environment")
+        parser.add_argument("--warm_start_steps", type=int, default=1000,
+                            help="max episode reward in the environment")  # NOTE(review): help text looks copy-pasted
+        return parser
+
 
 def main(args) -> None:
     model = DQNLightning(**vars(args))
@@ -349,30 +420,12 @@ def main(args) -> None:
 
 
 if __name__ == '__main__':
+    cli_lightning_logo()
     torch.manual_seed(0)
     np.random.seed(0)
 
     parser = argparse.ArgumentParser()
-    parser.add_argument("--batch_size", type=int, default=16, help="size of the batches")
-    parser.add_argument("--lr", type=float, default=1e-2, help="learning rate")
-    parser.add_argument("--env", type=str, default="CartPole-v0", help="gym environment tag")
-    parser.add_argument("--gamma", type=float, default=0.99, help="discount factor")
-    parser.add_argument("--sync_rate", type=int, default=10,
-                        help="how many frames do we update the target network")
-    parser.add_argument("--replay_size", type=int, default=1000,
-                        help="capacity of the replay buffer")
-    parser.add_argument("--warm_start_size", type=int, default=1000,
-                        help="how many samples do we use to fill our buffer at the start of training")
-    parser.add_argument("--eps_last_frame", type=int, default=1000,
-                        help="what frame should epsilon stop decaying")
-    parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon")
-    parser.add_argument("--eps_end", type=float, default=0.01, help="final value of epsilon")
-    parser.add_argument("--episode_length", type=int, default=200, help="max length of an episode")
-    parser.add_argument("--max_episode_reward", type=int, default=200,
-                        help="max episode reward in the environment")
-    parser.add_argument("--warm_start_steps", type=int, default=1000,
-                        help="max episode reward in the environment")
-
+    parser = DQNLightning.add_model_specific_args(parser)
     args = parser.parse_args()
 
     main(args)
diff --git a/pl_examples/domain_templates/semantic_segmentation.py b/pl_examples/domain_templates/semantic_segmentation.py
index 4ca1ebc2aec76..507efc78e0f8d 100644
--- a/pl_examples/domain_templates/semantic_segmentation.py
+++ b/pl_examples/domain_templates/semantic_segmentation.py
@@ -1,3 +1,17 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import random
 from argparse import ArgumentParser, Namespace
@@ -10,6 +24,7 @@
 from torch.utils.data import DataLoader, Dataset
 
 import pytorch_lightning as pl
+from pl_examples import cli_lightning_logo
 from pl_examples.domain_templates.unet import UNet
 from pytorch_lightning.loggers import WandbLogger
 
@@ -17,6 +32,19 @@
 DEFAULT_VALID_LABELS = (7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33)
 
 
+def _create_synth_kitti_dataset(path_dir: str, image_dims: tuple = (1024, 512)):
+    """Populate ``path_dir`` with three placeholder image/mask PNG pairs to mimic an already downloaded dataset."""
+    images_root = os.path.join(path_dir, KITTI.IMAGE_PATH)
+    masks_root = os.path.join(path_dir, KITTI.MASK_PATH)
+    os.makedirs(images_root, exist_ok=True)
+    os.makedirs(masks_root, exist_ok=True)
+    for idx in range(3):
+        file_name = f'dummy_kitti_{idx}.png'
+        # image and mask share a file name and size; only the mode ('RGB' vs 'L') differs
+        Image.new('RGB', image_dims).save(os.path.join(images_root, file_name))
+        Image.new('L', image_dims).save(os.path.join(masks_root, file_name))
+
+
 class KITTI(Dataset):
     """
     Class for KITTI Semantic Segmentation Benchmark dataset
@@ -38,6 +66,12 @@ class KITTI(Dataset):
     In the `get_item` function, images and masks are resized to the given `img_size`, masks are
     encoded using `encode_segmap`, and given `transform` (if any) are applied to the image only
     (mask does not usually require transforms, but they can be implemented in a similar way).
+
+    >>> from pl_examples import _DATASETS_PATH
+    >>> dataset_path = os.path.join(_DATASETS_PATH, "Kitti")
+    >>> _create_synth_kitti_dataset(dataset_path, image_dims=(1024, 512))
+    >>> KITTI(dataset_path, 'train')  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    <...semantic_segmentation.KITTI object at ...>
     """
     IMAGE_PATH = os.path.join('training', 'image_2')
     MASK_PATH = os.path.join('training', 'semantic')
@@ -126,16 +160,35 @@ class SegModel(pl.LightningModule):
     It uses the FCN ResNet50 model as an example.
 
     Adam optimizer is used along with Cosine Annealing learning rate scheduler.
-    """
 
-    def __init__(self,
-                 data_path: str,
-                 batch_size: int,
-                 lr: float,
-                 num_layers: int,
-                 features_start: int,
-                 bilinear: bool, **kwargs):
-        super().__init__()
+    >>> from pl_examples import _DATASETS_PATH
+    >>> dataset_path = os.path.join(_DATASETS_PATH, "Kitti")
+    >>> _create_synth_kitti_dataset(dataset_path, image_dims=(1024, 512))
+    >>> SegModel(dataset_path)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    SegModel(
+      (net): UNet(
+        (layers): ModuleList(
+          (0): DoubleConv(...)
+          (1): Down(...)
+          (2): Down(...)
+          (3): Up(...)
+          (4): Up(...)
+          (5): Conv2d(64, 19, kernel_size=(1, 1), stride=(1, 1))
+        )
+      )
+    )
+    """
+    def __init__(
+            self,
+            data_path: str,
+            batch_size: int = 4,
+            lr: float = 1e-3,
+            num_layers: int = 3,
+            features_start: int = 64,
+            bilinear: bool = False,
+            **kwargs,
+    ):
+        super().__init__(**kwargs)
         self.data_path = data_path
         self.batch_size = batch_size
         self.lr = lr
@@ -189,6 +242,18 @@ def train_dataloader(self):
     def val_dataloader(self):
         return DataLoader(self.validset, batch_size=self.batch_size, shuffle=False)
 
+    @staticmethod
+    def add_model_specific_args(parent_parser):  # pragma: no-cover
+        parser = ArgumentParser(parents=[parent_parser])
+        parser.add_argument("--data_path", type=str, help="path where dataset is stored")
+        parser.add_argument("--batch_size", type=int, default=16, help="size of the batches")
+        parser.add_argument("--lr", type=float, default=0.001, help="adam: learning rate")
+        parser.add_argument("--num_layers", type=int, default=5, help="number of layers on u-net")
+        parser.add_argument("--features_start", type=int, default=64, help="number of features in first layer")
+        parser.add_argument("--bilinear", action='store_true', default=False,
+                            help="whether to use bilinear interpolation or transposed")
+        return parser
+
 
 def main(hparams: Namespace):
     # ------------------------
@@ -209,14 +274,7 @@ def main(hparams: Namespace):
     # ------------------------
     # 3 INIT TRAINER
     # ------------------------
-    trainer = pl.Trainer(
-        gpus=hparams.gpus,
-        logger=logger,
-        max_epochs=hparams.epochs,
-        accumulate_grad_batches=hparams.grad_batches,
-        accelerator=hparams.accelerator,
-        precision=16 if hparams.use_amp else 32,
-    )
+    trainer = pl.Trainer.from_argparse_args(hparams)
 
     # ------------------------
     # 5 START TRAINING
@@ -225,22 +283,9 @@ def main(hparams: Namespace):
 
 
 if __name__ == '__main__':
+    cli_lightning_logo()
     parser = ArgumentParser()
-    parser.add_argument("--data_path", type=str, help="path where dataset is stored")
-    parser.add_argument("--gpus", type=int, default=-1, help="number of available GPUs")
-    parser.add_argument('--distributed-backend', type=str, default='dp', choices=('dp', 'ddp', 'ddp2'),
-                        help='supports three options dp, ddp, ddp2')
-    parser.add_argument('--use_amp', action='store_true', help='if true uses 16 bit precision')
-    parser.add_argument("--batch_size", type=int, default=4, help="size of the batches")
-    parser.add_argument("--lr", type=float, default=0.001, help="adam: learning rate")
-    parser.add_argument("--num_layers", type=int, default=5, help="number of layers on u-net")
-    parser.add_argument("--features_start", type=float, default=64, help="number of features in first layer")
-    parser.add_argument("--bilinear", action='store_true', default=False,
-                        help="whether to use bilinear interpolation or transposed")
-    parser.add_argument("--grad_batches", type=int, default=1, help="number of batches to accumulate")
-    parser.add_argument("--epochs", type=int, default=20, help="number of epochs to train")
-    parser.add_argument("--log_wandb", action='store_true', help="log training on Weights & Biases")
-
+    parser = SegModel.add_model_specific_args(parser)
     hparams = parser.parse_args()
 
     main(hparams)
diff --git a/pl_examples/domain_templates/unet.py b/pl_examples/domain_templates/unet.py
index 6117447e5ed33..2314e19ddbfc9 100644
--- a/pl_examples/domain_templates/unet.py
+++ b/pl_examples/domain_templates/unet.py
@@ -1,3 +1,17 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -8,20 +22,33 @@ class UNet(nn.Module):
     Architecture based on U-Net: Convolutional Networks for Biomedical Image Segmentation
     Link - https://arxiv.org/abs/1505.04597
 
-    Parameters:
-        num_classes: Number of output classes required (default 19 for KITTI dataset)
-        num_layers: Number of layers in each side of U-net
-        features_start: Number of features in first layer
-        bilinear: Whether to use bilinear interpolation or transposed
-            convolutions for upsampling.
+    >>> UNet(num_classes=2, num_layers=3)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    UNet(
+      (layers): ModuleList(
+        (0): DoubleConv(...)
+        (1): Down(...)
+        (2): Down(...)
+        (3): Up(...)
+        (4): Up(...)
+        (5): Conv2d(64, 2, kernel_size=(1, 1), stride=(1, 1))
+      )
+    )
     """
 
     def __init__(
-            self, num_classes: int = 19,
+            self,
+            num_classes: int = 19,
             num_layers: int = 5,
             features_start: int = 64,
-            bilinear: bool = False
+            bilinear: bool = False,
     ):
+        """
+        Args:
+            num_classes: Number of output classes required (default 19 for KITTI dataset)
+            num_layers: Number of layers in each side of U-net
+            features_start: Number of features in first layer
+            bilinear: Whether to use bilinear interpolation or transposed convolutions for upsampling.
+        """
         super().__init__()
         self.num_layers = num_layers
 
@@ -55,6 +82,11 @@ class DoubleConv(nn.Module):
     """
     Double Convolution and BN and ReLU
     (3x3 conv -> BN -> ReLU) ** 2
+
+    >>> DoubleConv(4, 4)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    DoubleConv(
+      (net): Sequential(...)
+    )
     """
 
     def __init__(self, in_ch: int, out_ch: int):
@@ -75,6 +107,16 @@ def forward(self, x):
 class Down(nn.Module):
     """
     Combination of MaxPool2d and DoubleConv in series
+
+    >>> Down(4, 8)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    Down(
+      (net): Sequential(
+        (0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
+        (1): DoubleConv(
+          (net): Sequential(...)
+        )
+      )
+    )
     """
 
     def __init__(self, in_ch: int, out_ch: int):
@@ -93,6 +135,14 @@ class Up(nn.Module):
     Upsampling (by either bilinear interpolation or transpose convolutions)
     followed by concatenation of feature map from contracting path,
     followed by double 3x3 convolution.
+
+    >>> Up(8, 4)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    Up(
+      (upsample): ConvTranspose2d(8, 4, kernel_size=(2, 2), stride=(2, 2))
+      (conv): DoubleConv(
+        (net): Sequential(...)
+      )
+    )
     """
 
     def __init__(self, in_ch: int, out_ch: int, bilinear: bool = False):
diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/README.md b/pl_examples/pytorch_ecosystem/pytorch_geometric/README.md
deleted file mode 100644
index 5c9a42d5a8942..0000000000000
--- a/pl_examples/pytorch_ecosystem/pytorch_geometric/README.md
+++ /dev/null
@@ -1,38 +0,0 @@
-#  [Pytorch Geometric](https://github.com/rusty1s/pytorch_geometric) examples with Lighting
-
-### Introduction
-
-PyTorch Geometric (PyG) is a geometric deep learning extension library for PyTorch. It relies on lower level libraries such as
-
-* PyTorch Cluster: A package consists of a small extension library of highly optimized graph cluster algorithms in Pytorch
-* PyTorch Sparse: A package consists of a small extension library of optimized sparse matrix operations with autograd support in Pytorch
-* PyTorch Scatter: A package consists of a small extension library of highly optimized sparse update (scatter and segment) operations for the use in PyTorch
-
-## Setup
-
-```
-pyenv install 3.7.8
-pyenv local 3.7.8
-python -m venv
-source .venv/bin/activate
-poetry install
-```
-
-Run example
-
-```
-python cora_dna.py
-```
-
-## Current example lists
-
-| `DATASET` | `MODEL` | `TASK` | DATASET DESCRIPTION | MODEL DESCRIPTION                                                                                                                                                                   |                                                                                                                                                                     |
-| :---: | :---: | :---: | :---: | :---: | :---: |
-| Cora | DNA | Node Classification | The citation network datasets "Cora", "CiteSeer" and "PubMed" from the "Revisiting Semi-Supervised Learning with Graph Embeddings" <https://arxiv.org/abs/1603.08861> | The dynamic neighborhood aggregation operator from the "Just Jump: Towards Dynamic Neighborhood Aggregation in Graph Neural Networks"
-
-
-## DATASET SIZES
-
-```
- 16M    ./cora
-```
diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/__init__.py b/pl_examples/pytorch_ecosystem/pytorch_geometric/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py b/pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py
deleted file mode 100644
index 84df5e1edf31a..0000000000000
--- a/pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py
+++ /dev/null
@@ -1,370 +0,0 @@
-"""Graph Convolution Example using Pytorch Geometric
-
-This example illustrates how one could train a graph convolution model with DNA Conv
-on Cora Dataset using pytorch-lightning. This example will also demonstrate how this
-model can be easily torch-scripted, thanks to Pytorch Geometric.
-"""
-# python imports
-import os.path as osp
-from collections import namedtuple
-from argparse import ArgumentParser
-from typing import List, Optional, NamedTuple
-
-# thrid parties libraries
-from torch import nn
-import torch
-from torch import Tensor
-from torch.optim import Adam
-import torch.nn.functional as F
-
-# Lightning imports
-from pytorch_lightning import (
-    Trainer,
-    LightningDataModule,
-    LightningModule
-)
-from pytorch_lightning.metrics import Accuracy
-
-try:
-    # Pytorch Geometric imports
-    from torch_geometric.nn import DNAConv, MessagePassing
-    from torch_geometric.datasets import Planetoid
-    import torch_geometric.transforms as T
-    from torch_geometric.data import NeighborSampler
-    from lightning import lightning_logo, nice_print
-except Exception:
-    HAS_PYTORCH_GEOMETRIC = False
-else:
-    HAS_PYTORCH_GEOMETRIC = True
-
-
-# use to make model jittable
-OptTensor = Optional[Tensor]
-ListTensor = List[Tensor]
-
-
-class TensorBatch(NamedTuple):
-    x: Tensor
-    edge_index: ListTensor
-    edge_attr: OptTensor
-    batch: OptTensor
-
-###################################
-#       LightningDataModule       #
-###################################
-
-
-class CoraDataset(LightningDataModule):
-
-    r"""The citation network datasets "Cora", "CiteSeer" and "PubMed" from the
-    `"Revisiting Semi-Supervised Learning with Graph Embeddings"
-    <https://arxiv.org/abs/1603.08861>`_ paper.
-    Nodes represent documents and edges represent citation links.
-    Training, validation and test splits are given by binary masks.
-    c.f https://github.com/rusty1s/pytorch_geometric/blob/master/torch_geometric/datasets/planetoid.py
-    """
-
-    NAME = "cora"
-
-    def __init__(self,
-                 num_workers: int = 1,
-                 batch_size: int = 8,
-                 drop_last: bool = True,
-                 pin_memory: bool = True,
-                 num_layers: int = None):
-        super().__init__()
-
-        assert num_layers is not None
-
-        self._num_workers = num_workers
-        self._batch_size = batch_size
-        self._drop_last = drop_last
-        self._pin_memory = pin_memory
-        self._num_layers = num_layers
-
-        self._transform = T.NormalizeFeatures()
-
-    @property
-    def num_features(self):
-        return 1433
-
-    @property
-    def num_classes(self):
-        return 7
-
-    @property
-    def hyper_parameters(self):
-        # used to inform the model the dataset specifications
-        return {"num_features": self.num_features, "num_classes": self.num_classes}
-
-    def prepare_data(self):
-        path = osp.join(
-            osp.dirname(osp.realpath(__file__)), "..", "..", "data", self.NAME
-        )
-        self.dataset = Planetoid(path, self.NAME, transform=self._transform)
-        self.data = self.dataset[0]
-
-    def create_neighbor_sampler(self, batch_size=2, stage=None):
-        # https://github.com/rusty1s/pytorch_geometric/tree/master/torch_geometric/data/sampler.py#L18
-        return NeighborSampler(
-            self.data.edge_index,
-            # the nodes that should be considered for sampling.
-            node_idx=getattr(self.data, f"{stage}_mask"),
-            # -1 indicates all neighbors will be selected
-            sizes=[self._num_layers, -1],
-            num_workers=self._num_workers,
-            drop_last=self._drop_last,
-            pin_memory=self._pin_memory,
-        )
-
-    def train_dataloader(self):
-        return self.create_neighbor_sampler(stage="train")
-
-    def validation_dataloader(self):
-        return self.create_neighbor_sampler(stage="val")
-
-    def test_dataloader(self):
-        return self.create_neighbor_sampler(stage="test")
-
-    def gather_data_and_convert_to_namedtuple(self, batch, batch_nb):
-        """
-        This function will select features using node_idx
-        and create a NamedTuple Object.
-        """
-
-        usual_keys = ["x", "edge_index", "edge_attr", "batch"]
-        Batch: TensorBatch = namedtuple("Batch", usual_keys)
-        return (
-            Batch(
-                self.data.x[batch[1]],
-                [e.edge_index for e in batch[2]],
-                None,
-                None,
-            ),
-            self.data.y[batch[1]],
-        )
-
-    @staticmethod
-    def add_argparse_args(parser):
-        parser.add_argument("--num_workers", type=int, default=1)
-        parser.add_argument("--batch_size", type=int, default=2)
-        parser.add_argument("--drop_last", default=True)
-        parser.add_argument("--pin_memory", default=True)
-        return parser
-
-
-###############################
-#       LightningModule       #
-###############################
-
-
-class DNAConvNet(LightningModule):
-
-    r"""The dynamic neighborhood aggregation operator from the `"Just Jump:
-    Towards Dynamic Neighborhood Aggregation in Graph Neural Networks"
-    <https://arxiv.org/abs/1904.04849>`_ paper
-    c.f https://github.com/rusty1s/pytorch_geometric/blob/master/torch_geometric/nn/conv/dna_conv.py#L172
-    """
-
-    def __init__(self,
-                 num_layers: int = 2,
-                 hidden_channels: int = 128,
-                 heads: int = 8,
-                 groups: int = 16,
-                 dropout: float = 0.8,
-                 cached: bool = False,
-                 num_features: int = None,
-                 num_classes: int = None,
-                 ):
-        super().__init__()
-
-        assert num_features is not None
-        assert num_classes is not None
-
-        # utils from Lightning to save __init__ arguments
-        self.save_hyperparameters()
-        hparams = self.hparams
-
-        # Instantiate metrics
-        self.val_acc = Accuracy(hparams["num_classes"])
-        self.test_acc = Accuracy(hparams["num_classes"])
-
-        # Define DNA graph convolution model
-        self.hidden_channels = hparams["hidden_channels"]
-        self.lin1 = nn.Linear(hparams["num_features"], hparams["hidden_channels"])
-
-        # Create ModuleList to hold all convolutions
-        self.convs = nn.ModuleList()
-
-        # Iterate through the number of layers
-        for _ in range(hparams["num_layers"]):
-
-            # Create a DNA Convolution - This graph convolution relies on MultiHead Attention mechanism
-            # to route information similar to Transformers.
-            # https://github.com/rusty1s/pytorch_geometric/blob/master/torch_geometric/nn/conv/dna_conv.py#L172
-            self.convs.append(
-                DNAConv(
-                    hparams["hidden_channels"],
-                    hparams["heads"],
-                    hparams["groups"],
-                    dropout=hparams["dropout"],
-                    cached=False,
-                )
-            )
-        # classification MLP
-        self.lin2 = nn.Linear(hparams["hidden_channels"], hparams["num_classes"], bias=False)
-
-    def forward(self, batch: TensorBatch):
-        # batch needs to be typed for making this model jittable.
-        x = batch.x
-        x = F.relu(self.lin1(x))
-        x = F.dropout(x, p=0.5, training=self.training)
-        x_all = x.view(-1, 1, self.hidden_channels)
-
-        # iterate over all convolutions
-        for idx, conv in enumerate(self.convs):
-            # perform convolution using previously concatenated embedding
-            # through edge_index
-            x = F.relu(conv(x_all, batch.edge_index[idx]))
-            x = x.view(-1, 1, self.hidden_channels)
-
-            # concatenate with previously computed embedding
-            x_all = torch.cat([x_all, x], dim=1)
-
-        # extra latest layer embedding
-        x = x_all[:, -1]
-
-        x = F.dropout(x, p=0.5, training=self.training)
-
-        # return logits per nodes
-        return F.log_softmax(self.lin2(x), -1)
-
-    def step(self, batch, batch_nb):
-        typed_batch, targets = self.gather_data_and_convert_to_namedtuple(batch, batch_nb)
-        logits = self(typed_batch)
-        return logits, targets
-
-    def training_step(self, batch, batch_nb):
-        logits, targets = self.step(batch, batch_nb)
-        train_loss = F.nll_loss(logits, targets)
-        self.log("train_loss", train_loss, on_step=True, on_epoch=True, prog_bar=True)
-        return train_loss
-
-    def validation_step(self, batch, batch_nb):
-        logits, targets = self.step(batch, batch_nb)
-        val_loss = F.nll_loss(logits, targets)
-        self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True)
-        self.log("val_acc", self.val_acc(logits, targets), on_step=False, on_epoch=True, prog_bar=True)
-
-    def test_step(self, batch, batch_nb):
-        logits, targets = self.step(batch, batch_nb)
-        test_loss = F.nll_loss(logits, targets)
-        self.log("test_loss", test_loss, on_step=False, on_epoch=True, prog_bar=True)
-        self.log("test_acc", self.test_acc(logits, targets), on_step=False, on_epoch=True, prog_bar=True)
-
-    # Use for jittable demonstration.
-
-    def _convert_to_jittable(self, module):
-        for key, m in module._modules.items():
-            if isinstance(m, MessagePassing) and m.jittable is not None:
-                # Pytorch Geometric MessagePassing implements a `.jittable` function
-                # which converts the current module into its jittable version.
-                module._modules[key] = m.jittable()
-            else:
-                self._convert_to_jittable(m)
-        return module
-
-    def jittable(self):
-        for key, m in self._modules.items():
-            self._modules[key] = self._convert_to_jittable(m)
-
-    def configure_optimizers(self):
-        return Adam(self.parameters(), lr=1e-3)
-
-    @staticmethod
-    def add_argparse_args(parser):
-        parser.add_argument("--num_layers", type=int, default=2)
-        parser.add_argument("--hidden_channels", type=int, default=128)
-        parser.add_argument("--heads", type=int, default=8)
-        parser.add_argument("--groups", type=int, default=16)
-        parser.add_argument("--dropout", type=float, default=0.8)
-        parser.add_argument("--cached", type=int, default=0)
-        parser.add_argument("--jit", default=True)
-        return parser
-
-#################################
-#     Instantiate Functions     #
-#################################
-
-
-def instantiate_datamodule(args):
-    datamodule = CoraDataset(
-        num_workers=args.num_workers,
-        batch_size=args.batch_size,
-        drop_last=args.drop_last,
-        pin_memory=args.pin_memory,
-        num_layers=args.num_layers,
-    )
-    return datamodule
-
-
-def instantiate_model(args, datamodule):
-    model = DNAConvNet(
-        num_layers=args.num_layers,
-        hidden_channels=args.hidden_channels,
-        heads=args.heads,
-        groups=args.groups,
-        dropout=args.dropout,
-        # provide dataset specific arguments
-        **datamodule.hyper_parameters,
-    )
-    if args.jit:
-        model.jittable()
-
-    # Attached datamodule function to model
-    model.gather_data_and_convert_to_namedtuple = datamodule.gather_data_and_convert_to_namedtuple
-    return model
-
-
-def get_single_batch(datamodule):
-    for batch in datamodule.test_dataloader():
-        return datamodule.gather_data_and_convert_to_namedtuple(batch, 0)
-
-#######################
-#     Trainer Run     #
-#######################
-
-
-def run(args):
-
-    nice_print("You are about to train a TorchScripted Pytorch Geometric Lightning model !")
-    nice_print(lightning_logo)
-
-    datamodule: LightningDataModule = instantiate_datamodule(args)
-    model: LightningModule = instantiate_model(args, datamodule)
-    trainer = Trainer.from_argparse_args(args)
-    trainer.fit(model, datamodule)
-    trainer.test()
-
-    batch = get_single_batch(datamodule)
-    model.to_torchscript(file_path="model_trace.pt",
-                         method='script',
-                         example_inputs=batch)
-
-    nice_print("Congratulations !")
-    nice_print("You trained your first TorchScripted Pytorch Geometric Lightning model !", last=True)
-
-
-if __name__ == "__main__":
-    if not HAS_PYTORCH_GEOMETRIC:
-        print("Skip training. Pytorch Geometric isn't installed. Please, check README.md !")
-
-    else:
-        parser = ArgumentParser(description="Pytorch Geometric Example")
-        parser = Trainer.add_argparse_args(parser)
-        parser = CoraDataset.add_argparse_args(parser)
-        parser = DNAConvNet.add_argparse_args(parser)
-
-        cmd_line = '--max_epochs 1'.split(' ')
-
-        run(parser.parse_args(cmd_line))
diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py b/pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py
deleted file mode 100644
index 2c765d1449c57..0000000000000
--- a/pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py
+++ /dev/null
@@ -1,31 +0,0 @@
-def nice_print(msg, last=False):
-    print()
-    print("\033[0;35m" + msg + "\033[0m")
-    if last:
-        print()
-
-
-lightning_logo = """
-                    ####
-                ###########
-             ####################
-         ############################
-    #####################################
-##############################################
-#########################  ###################
-#######################    ###################
-####################      ####################
-##################       #####################
-################        ######################
-#####################        #################
-######################     ###################
-#####################    #####################
-####################   #######################
-###################  #########################
-##############################################
-    #####################################
-         ############################
-             ####################
-                  ##########
-                     ####
-"""
diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml b/pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml
deleted file mode 100644
index 99f516323e976..0000000000000
--- a/pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml
+++ /dev/null
@@ -1,25 +0,0 @@
-[tool.poetry]
-name = "lightning-geometric"
-version = "0.1.0"
-description = "TorchScripted Pytorch Geometric Examples with Pytorch Lightning"
-authors = ["Thomas Chaton <thomas.ai@grid.com>"]
-
-[tool.poetry.dependencies]
-python = "3.7.8"
-torch = "^1.6.0"
-torch-cluster = "^1.5.7"
-torch-sparse = "^0.6.7"
-torch-scatter = "^2.0.5"
-torch-geometric = "^1.6.1"
-pytorch-lightning = "^ 1.0.5"
-openmesh = "^1.1.4"
-torch-spline-conv = "^1.2.0"
-tqdm = "^4.50.0"
-pytest = "^6.1.0"
-
-[tool.poetry.dev-dependencies]
-black = {version = "^20.8b1", allow-prereleases = true}
-
-[build-system]
-requires = ["poetry>=0.12"]
-build-backend = "poetry.masonry.api"
diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py
index da21384190163..5f92399671b37 100644
--- a/pl_examples/test_examples.py
+++ b/pl_examples/test_examples.py
@@ -1,3 +1,17 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import importlib
 import platform
 from unittest import mock
@@ -5,7 +19,7 @@
 import pytest
 import torch
 
-from pl_examples import DALI_AVAILABLE
+from pl_examples import _DALI_AVAILABLE
 
 ARGS_DEFAULT = """
 --default_root_dir %(tmpdir)s \
@@ -90,7 +104,7 @@ def test_examples_cpu(tmpdir, import_cli, cli_args):
         module.cli_main()
 
 
-@pytest.mark.skipif(not DALI_AVAILABLE, reason="Nvidia DALI required")
+@pytest.mark.skipif(not _DALI_AVAILABLE, reason="Nvidia DALI required")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 @pytest.mark.skipif(platform.system() != 'Linux', reason='Only applies to Linux platform.')
 @pytest.mark.parametrize('cli_args', [ARGS_GPU])
diff --git a/pyproject.toml b/pyproject.toml
index 760421a56ece8..01e416aa51d8b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ exclude = "(.eggs|.git|.hg|.mypy_cache|.nox|.tox|.venv|.svn|_build|buck-out|buil
 
 [tool.isort]
 known_first_party = [
-    "bencharmks",
+    "benchmarks",
     "docs",
     "pl_examples",
     "pytorch_lightning",
@@ -52,3 +52,5 @@ skip_glob = [
 ]
 profile = "black"
 line_length = 120
+force_sort_within_sections = true
+order_by_type = false
diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py
index 595750af0709a..890db586b2084 100644
--- a/pytorch_lightning/__init__.py
+++ b/pytorch_lightning/__init__.py
@@ -40,8 +40,8 @@
 _logger.addHandler(python_logging.StreamHandler())
 _logger.setLevel(python_logging.INFO)
 
-PACKAGE_ROOT = os.path.dirname(__file__)
-PROJECT_ROOT = os.path.dirname(PACKAGE_ROOT)
+_PACKAGE_ROOT = os.path.dirname(__file__)
+_PROJECT_ROOT = os.path.dirname(_PACKAGE_ROOT)
 
 try:
     # This variable is injected in the __builtins__ by the build
diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py
index 4f45b7456cc9c..7db8e3defdb21 100644
--- a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py
@@ -43,3 +43,6 @@ def model_to_device(self, model, process_idx):
     def get_device_ids(self):
         device_ids = None
         return device_ids
+
+    def init_device(self, process_idx):
+        pass
diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py
index a32e3d6c2f1fe..47c1b736fd8b4 100644
--- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py
@@ -121,6 +121,7 @@ def ddp_train(self, process_idx, model):
         """
         # determine which process we are and world size
         self.set_world_ranks(process_idx)
+        self.init_device(process_idx)
 
         # toggle prog bar
         if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None:
diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 066effc68a03c..670ad1813467a 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -19,12 +19,14 @@
 Monitor a metric and stop training when it stops improving.
 
 """
+import numbers
 
 import numpy as np
 import torch
 
 from pytorch_lightning.callbacks.base import Callback
-from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn, _TPU_AVAILABLE
+from pytorch_lightning.metrics.metric import Metric
+from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_info, rank_zero_warn
 
 
 class EarlyStopping(Callback):
@@ -199,8 +201,11 @@ def _run_early_stopping_check(self, trainer, pl_module):
         # when in dev debugging
         trainer.dev_debugger.track_early_stopping_history(self, current)
 
-        if not isinstance(current, torch.Tensor):
-            current = torch.tensor(current, device=pl_module.device)
+        if current is not None:
+            if isinstance(current, Metric):
+                current = current.compute()
+            elif isinstance(current, numbers.Number):
+                current = torch.tensor(current, device=pl_module.device, dtype=torch.float)
 
         if trainer.use_tpu and _TPU_AVAILABLE:
             current = current.cpu()
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 2a22c78b51a85..24e518fb7aa0a 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -20,10 +20,11 @@
 
 """
 
-import os
-import re
 from copy import deepcopy
+import numbers
+import os
 from pathlib import Path
+import re
 from typing import Any, Dict, Optional, Union
 
 import numpy as np
@@ -32,6 +33,7 @@
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.callbacks.base import Callback
+from pytorch_lightning.metrics.metric import Metric
 from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -230,17 +232,14 @@ def save_checkpoint(self, trainer, pl_module):
         # what can be monitored
         monitor_candidates = self._monitor_candidates(trainer)
 
-        # ie: path/val_loss=0.5.ckpt
-        filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, epoch, global_step)
-
         # callback supports multiple simultaneous modes
         # here we call each mode sequentially
         # Mode 1: save all checkpoints OR only the top k
         if self.save_top_k:
-            self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, filepath)
+            self._save_top_k_checkpoints(trainer, pl_module, monitor_candidates)
 
         # Mode 2: save the last checkpoint
-        self._save_last_checkpoint(trainer, pl_module, monitor_candidates, filepath)
+        self._save_last_checkpoint(trainer, pl_module, monitor_candidates)
 
     def __validate_init_configuration(self):
         if self.save_top_k is not None and self.save_top_k < -1:
@@ -272,8 +271,7 @@ def __init_ckpt_dir(self, dirpath, filename, save_top_k):
             and len(self._fs.ls(dirpath)) > 0
         ):
             rank_zero_warn(
-                f"Checkpoint directory {dirpath} exists and is not empty. With save_top_k={save_top_k},"
-                " all files in this directory will be deleted when a checkpoint is saved!"
+                f"Checkpoint directory {dirpath} exists and is not empty."
             )
 
         if dirpath and self._fs.protocol == 'file':
@@ -414,6 +412,7 @@ def format_checkpoint_name(
         )
         if ver is not None:
             filename = self.CHECKPOINT_JOIN_CHAR.join((filename, f"v{ver}"))
+
         ckpt_name = f"{filename}{self.FILE_EXTENSION}"
         return os.path.join(self.dirpath, ckpt_name) if self.dirpath else ckpt_name
 
@@ -486,13 +485,20 @@ def _validate_monitor_key(self, trainer):
             )
             raise MisconfigurationException(m)
 
-    def _get_metric_interpolated_filepath_name(self, ckpt_name_metrics: Dict[str, Any], epoch: int, step: int):
+    def _get_metric_interpolated_filepath_name(
+        self,
+        ckpt_name_metrics: Dict[str, Any],
+        epoch: int,
+        step: int,
+        del_filepath: Optional[str] = None  # path that may be reused even though it still exists on disk
+    ) -> str:
         filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics)
+        # append v0, v1, ... until the name is unused or equals `del_filepath`
         version_cnt = 0
-        while self._fs.exists(filepath):
+        while self._fs.exists(filepath) and filepath != del_filepath:
             filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt)
-            # this epoch called before
             version_cnt += 1
+
         return filepath
 
     def _monitor_candidates(self, trainer):
@@ -502,13 +508,11 @@ def _monitor_candidates(self, trainer):
         ckpt_name_metrics.update({"step": trainer.global_step, "epoch": trainer.current_epoch})
         return ckpt_name_metrics
 
-    def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath):
+    def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics):
         should_save_last = self.monitor is None or self.save_last
         if not should_save_last:
             return
 
-        last_filepath = filepath
-
         # when user ALSO asked for the 'last.ckpt' change the name
         if self.save_last:
             last_filepath = self._format_checkpoint_name(
@@ -519,6 +523,10 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath)
                 prefix=self.prefix
             )
             last_filepath = os.path.join(self.dirpath, f"{last_filepath}{self.FILE_EXTENSION}")
+        else:
+            last_filepath = self._get_metric_interpolated_filepath_name(
+                ckpt_name_metrics, trainer.current_epoch, trainer.global_step
+            )
 
         accelerator_backend = trainer.accelerator_backend
 
@@ -539,16 +547,19 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath)
         if self.monitor is None:
             self.best_model_path = self.last_model_path
 
-    def _save_top_k_checkpoints(self, metrics, trainer, pl_module, filepath):
+    def _save_top_k_checkpoints(self, trainer, pl_module, metrics):
         current = metrics.get(self.monitor)
         epoch = metrics.get("epoch")
         step = metrics.get("step")
 
-        if not isinstance(current, torch.Tensor) and current is not None:
-            current = torch.tensor(current, device=pl_module.device)
+        if current is not None:
+            if isinstance(current, Metric):
+                current = current.compute()
+            elif isinstance(current, numbers.Number):
+                current = torch.tensor(current, device=pl_module.device, dtype=torch.float)
 
         if self.check_monitor_top_k(current):
-            self._update_best_and_save(filepath, current, epoch, step, trainer, pl_module)
+            self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics)
         elif self.verbose:
             rank_zero_info(
                 f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}"
@@ -559,25 +570,26 @@ def _is_valid_monitor_key(self, metrics):
 
     def _update_best_and_save(
         self,
-        filepath: str,
         current: torch.Tensor,
         epoch: int,
         step: int,
         trainer,
         pl_module,
+        ckpt_name_metrics
     ):
         k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k
 
-        del_list = []
+        del_filepath = None
         if len(self.best_k_models) == k and k > 0:
-            delpath = self.kth_best_model_path
-            self.best_k_models.pop(self.kth_best_model_path)
-            del_list.append(delpath)
+            del_filepath = self.kth_best_model_path
+            self.best_k_models.pop(del_filepath)
 
         # do not save nan, replace with +/- inf
         if torch.isnan(current):
             current = torch.tensor(float('inf' if self.mode == "min" else '-inf'))
 
+        filepath = self._get_metric_interpolated_filepath_name(ckpt_name_metrics, epoch, step, del_filepath)
+
         # save the current score
         self.current_score = current
         self.best_k_models[filepath] = current
@@ -601,9 +613,8 @@ def _update_best_and_save(
             )
         self._save_model(filepath, trainer, pl_module)
 
-        for cur_path in del_list:
-            if cur_path != filepath:
-                self._del_model(cur_path)
+        if del_filepath is not None and filepath != del_filepath:
+            self._del_model(del_filepath)
 
     def to_yaml(self, filepath: Optional[Union[str, Path]] = None):
         """
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index 6a2f75b5b2d02..e8c19ec269366 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -276,6 +276,7 @@ def log(
                 sync_dist_group,
                 accelerator.sync_tensor,
                 self._current_dataloader_idx,
+                self.device,
             )
 
     def log_dict(
@@ -1391,12 +1392,15 @@ def get_progress_bar_dict(self):
         """
         # call .item() only once but store elements without graphs
         running_train_loss = self.trainer.train_loop.running_loss.mean()
-        avg_training_loss = (
-            running_train_loss.cpu().item()
-            if running_train_loss is not None
-            else float("NaN")
-        )
-        tqdm_dict = {"loss": "{:.3g}".format(avg_training_loss)}
+        avg_training_loss = None
+        if running_train_loss is not None:
+            avg_training_loss = running_train_loss.cpu().item()
+        elif self.trainer.train_loop.automatic_optimization:
+            avg_training_loss = float('NaN')
+
+        tqdm_dict = {}
+        if avg_training_loss is not None:
+            tqdm_dict["loss"] = f"{avg_training_loss:.3g}"
 
         if self.trainer.truncated_bptt_steps is not None:
             tqdm_dict["split_idx"] = self.trainer.split_idx
diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py
index c406fe883db01..091f9a789efda 100644
--- a/pytorch_lightning/core/step_result.py
+++ b/pytorch_lightning/core/step_result.py
@@ -15,15 +15,15 @@
 """[Train, Eval]Result for easier logging, checkpointing, early stopping, epoch-wise reduction."""
 
 import numbers
+import os
 from copy import copy
-from typing import Optional, Dict, Union, Sequence, Callable, MutableMapping, Any, List, Tuple, Iterable
+from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Sequence, Tuple, Union
 
 import torch
 from torch import Tensor
-import os
 
-from pytorch_lightning.utilities.distributed import sync_ddp_if_available
 from pytorch_lightning.metrics import Metric
+from pytorch_lightning.utilities.distributed import sync_ddp_if_available
 
 
 class Result(Dict):
@@ -128,6 +128,7 @@ def log(
         sync_dist_group: Optional[Any] = None,
         sync_fn: Callable = None,
         dataloader_idx: Optional[int] = None,
+        device: torch.device = None,
     ):
         # no metrics should be logged with graphs
         if not enable_graph and isinstance(value, torch.Tensor):
@@ -138,7 +139,10 @@ def log(
         if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)):
             is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized()
             # TODO: Find a way to make the reduction only once, so we don't need to clone.
-            value = value.clone() if is_dist_initialized else value
+            if isinstance(value, torch.Tensor):
+                value = value.clone() if is_dist_initialized else value  # keep dtype, device and graph
+            else:
+                value = torch.tensor(value, device=device, dtype=torch.float)  # numbers -> tensor for sync_fn
             value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op)
 
         if 'meta' not in self:
@@ -367,7 +371,10 @@ def get_forked_metrics(self, add_dataloader_idx=False):
             dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx)
 
             if options['forked']:
-                result[dl_key] = self[k]
+                if isinstance(self[k], Metric):
+                    result[dl_key] = self[k].compute().detach()
+                else:
+                    result[dl_key] = self[k]
 
         return result
 
diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py
index 5c09cda5666fb..0d147adee5ed4 100644
--- a/pytorch_lightning/loggers/wandb.py
+++ b/pytorch_lightning/loggers/wandb.py
@@ -31,6 +31,7 @@
 
 from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment
 from pytorch_lightning.utilities import rank_zero_only
+from pytorch_lightning.utilities.warning_utils import WarningCache
 
 
 class WandbLogger(LightningLoggerBase):
@@ -59,13 +60,16 @@ class WandbLogger(LightningLoggerBase):
 
     Example::
 
-    .. code::
+    .. code-block:: python
 
         from pytorch_lightning.loggers import WandbLogger
         from pytorch_lightning import Trainer
         wandb_logger = WandbLogger()
         trainer = Trainer(logger=wandb_logger)
 
+    Note: When logging manually through `wandb.log` or `trainer.logger.experiment.log`,
+    make sure to use `commit=False` so the logging step does not increase.
+
     See Also:
         - `Tutorial <https://app.wandb.ai/cayush/pytorchlightning/reports/
           Use-Pytorch-Lightning-with-Weights-%26-Biases--Vmlldzo2NjQ1Mw>`__
@@ -103,8 +107,9 @@ def __init__(
         self._log_model = log_model
         self._prefix = prefix
         self._kwargs = kwargs
-        # logging multiple Trainer on a single W&B run (k-fold, etc)
+        # logging multiple Trainer on a single W&B run (k-fold, resuming, etc)
         self._step_offset = 0
+        self.warning_cache = WarningCache()
 
     def __getstate__(self):
         state = self.__dict__.copy()
@@ -134,6 +139,8 @@ def experiment(self) -> Run:
             self._experiment = wandb.init(
                 name=self._name, dir=self._save_dir, project=self._project, anonymous=self._anonymous,
                 id=self._id, resume='allow', **self._kwargs) if wandb.run is None else wandb.run
+            # offset logging step when resuming a run
+            self._step_offset = self._experiment.step
             # save checkpoints in wandb dir to upload on W&B servers
             if self._log_model:
                 self._save_dir = self._experiment.dir
@@ -154,6 +161,10 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) ->
         assert rank_zero_only.rank == 0, 'experiment tried to log from global_rank != 0'
 
         metrics = self._add_prefix(metrics)
+        if step is not None and step + self._step_offset < self.experiment.step:
+            self.warning_cache.warn(
+                'Trying to log at a previous step. Use `commit=False` when logging metrics manually.'
+            )
         self.experiment.log(metrics, step=(step + self._step_offset) if step is not None else None)
 
     @property
diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py
index 26a607a2955b8..07f5545df8a54 100644
--- a/pytorch_lightning/setup_tools.py
+++ b/pytorch_lightning/setup_tools.py
@@ -14,12 +14,12 @@
 # limitations under the License.
 import os
 import re
-import warnings
 from typing import Iterable, List
 from urllib.error import HTTPError, URLError
 from urllib.request import Request, urlopen
+import warnings
 
-from pytorch_lightning import PROJECT_ROOT, __homepage__, __version__
+from pytorch_lightning import __homepage__, __version__, _PROJECT_ROOT
 
 _PATH_BADGES = os.path.join('.', 'docs', 'source', '_images', 'badges')
 # badge to download
@@ -37,7 +37,7 @@
 def _load_requirements(path_dir: str , file_name: str = 'requirements.txt', comment_char: str = '#') -> List[str]:
     """Load requirements from a file
 
-    >>> _load_requirements(PROJECT_ROOT)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    >>> _load_requirements(_PROJECT_ROOT)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     ['numpy...', 'torch...', ...]
     """
     with open(os.path.join(path_dir, file_name), 'r') as file:
@@ -155,7 +155,7 @@ def _download_badge(url_badge: str, badge_name: str, target_dir: str) -> str:
 def _load_long_description(path_dir: str) -> str:
     """Load readme as decribtion
 
-    >>> _load_long_description(PROJECT_ROOT)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    >>> _load_long_description(_PROJECT_ROOT)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     '<div align="center">...'
     """
     path_readme = os.path.join(path_dir, "README.md")
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
index 28025859814cc..6d206f3dd929e 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
@@ -91,11 +91,13 @@ def check_dataloader_idx(self, result: Result) -> bool:
         random_key = list(result.keys())[-1]
         return result["meta"][random_key]["dataloader_idx"] is not None
 
-    def get_latest_from_func_name(self, latest_result, func_name: str, *args, **kwargs) -> Dict:
+    def get_latest_from_func_name(self, latest_result_opt, func_name: str, *args, **kwargs) -> Dict:
         results = {}
-        add_dataloader_idx = self.check_dataloader_idx(latest_result)
-        func = getattr(latest_result, func_name)
-        results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs))
+        for opt_idx in latest_result_opt:  # dict keyed by optimizer index
+            latest_result = latest_result_opt[opt_idx]
+            add_dataloader_idx = self.check_dataloader_idx(latest_result)
+            func = getattr(latest_result, func_name)  # look the extractor up by name on the Result
+            results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs))  # later keys overwrite
         return results
 
     def run_latest_batch_metrics_with_func_name(self, func_name, *args, **kwargs) -> List[Dict]:
@@ -156,6 +158,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio
         assert isinstance(result, Result)
         if dataloader_idx is None:
             dataloader_idx = 0
+
         if extra_info is None:
             extra_info = {}
 
@@ -166,6 +169,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio
             if dataloader_idx not in self._internals:
                 self._internals[dataloader_idx] = {}
                 self._internals_reduced[dataloader_idx] = defaultdict(dict)
+                self._latest_ref[dataloader_idx] = {}
 
             # extract infos
             opt_idx = extra_info["opt_idx"]
@@ -173,7 +177,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio
 
             self._append_to_structure(self._internals[dataloader_idx], opt_idx, batch_idx, result)
 
-            self._latest_ref[dataloader_idx] = result
+            self._latest_ref[dataloader_idx][opt_idx] = result
 
         # [dataloader_idx] is a list
         else:
@@ -181,7 +185,11 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio
             self._internals.setdefault(dataloader_idx, [])
             self._internals[dataloader_idx].append(result)
 
-            self._latest_ref[dataloader_idx] = result
+            # this branch has no opt_idx: use slot 0 so `_latest_ref` stays {dataloader_idx: {opt_idx: result}}
+            if dataloader_idx not in self._latest_ref:
+                self._latest_ref[dataloader_idx] = {}
+
+            self._latest_ref[dataloader_idx][0] = result
 
     def auto_reduce_results_on_epoch_end(self) -> None:
         """
@@ -206,13 +214,9 @@ def auto_reduce_results_on_epoch_end(self) -> None:
                     # TODO: How to start training in middle of epoch
                     opt_outputs = epoch_metrics[opt_idx]
 
-                    num_batch_idx = len(self._internals[dl_idx][num_opt_idx]) - 1
-                    assert num_batch_idx >= 0
-                    batch_indexes = self._internals[dl_idx][num_opt_idx].keys()
-
                     # reduce across time first
                     time_reduced_outputs = []
-                    for batch_idx in batch_indexes:
+                    for batch_idx in opt_outputs.keys():
                         tbptt_outs = opt_outputs[batch_idx]
                         tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs)
                         if len(tbptt_outs) > 1:
diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py
index db51fb8014de0..04fa3f4cc842b 100644
--- a/pytorch_lightning/trainer/supporters.py
+++ b/pytorch_lightning/trainer/supporters.py
@@ -56,7 +56,7 @@ def __init__(self, window_length: int):
 
     def reset(self) -> None:
         """Empty the accumulator."""
-        self = TensorRunningAccum(self.window_length)
+        self.__init__(self.window_length)  # re-init in place; rebinding `self` did not modify this object
 
     def last(self):
         """Get the last added element."""
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 014e0a62679dd..06cdc43674d1b 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -133,7 +133,7 @@ def __init__(
         distributed_backend: Optional[str] = None,
         automatic_optimization: Optional[bool] = None,
         move_metrics_to_cpu: bool = False,
-        enable_pl_optimizer: bool = True,
+        enable_pl_optimizer: bool = False,
         multiple_trainloader_mode: str = 'max_size_cycle',
     ):
         r"""
diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py
index 9724f05247c00..be5d781939c04 100644
--- a/pytorch_lightning/utilities/distributed.py
+++ b/pytorch_lightning/utilities/distributed.py
@@ -15,14 +15,14 @@
 import os
 import warnings
 from functools import wraps
+from typing import Any, Optional, Union
 
 import torch
+
 from pytorch_lightning import _logger as log
-from typing import Union, Optional, Any
 
 if torch.distributed.is_available():
-    from torch.distributed import ReduceOp
-    from torch.distributed import group
+    from torch.distributed import ReduceOp, group
 else:
     class ReduceOp:
         SUM = None
@@ -145,15 +145,14 @@ def sync_ddp(
     if group is None:
         group = torch.distributed.group.WORLD
 
-    if reduce_op is None:
-        reduce_op = torch.distributed.ReduceOp.SUM
-    elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"):
-        reduce_op = torch.distributed.ReduceOp.SUM
+    op = reduce_op if isinstance(reduce_op, ReduceOp) else ReduceOp.SUM
+
+    if isinstance(reduce_op, str) and reduce_op.lower() in ("avg", "mean"):
         divide_by_world_size = True
 
     # sync all processes before reduction
     torch.distributed.barrier(group=group)
-    torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False)
+    torch.distributed.all_reduce(result, op=op, group=group, async_op=False)
 
     if divide_by_world_size:
         result = result / torch.distributed.get_world_size(group)
@@ -207,6 +206,6 @@ def all_gather_ddp_if_available(
         if sync_grads:
             return AllGatherGrad.apply(tensor, group)
         else:
-            with torch.no_grad:
+            with torch.no_grad():
                 return AllGatherGrad.apply(tensor, group)
     return tensor
diff --git a/requirements/examples.txt b/requirements/examples.txt
index 6e48778cb222a..c87d10a39346f 100644
--- a/requirements/examples.txt
+++ b/requirements/examples.txt
@@ -1,2 +1,2 @@
 torchvision>=0.4.1
-gym>=0.17.0
+gym>=0.17.0
\ No newline at end of file
diff --git a/requirements/test.txt b/requirements/test.txt
index 3cb538a98d7c8..632f40e0287b4 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -17,3 +17,4 @@ pre-commit>=1.0
 
 cloudpickle>=1.3
 nltk>=3.3
+pandas  # needed in benchmarks
diff --git a/setup.cfg b/setup.cfg
index 4475fb11266d0..7b685fb8dc0e5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -99,6 +99,10 @@ max-line-length = 120
 files = pytorch_lightning, pl_examples, benchmarks, tests
 disallow_untyped_defs = True
 ignore_missing_imports = True
+show_error_codes = True
+warn_redundant_casts = True
+warn_unused_configs = True
+warn_unused_ignores = True
 
 # todo: add proper typing to this module...
 [mypy-pytorch_lightning.callbacks.*]
diff --git a/tests/__init__.py b/tests/__init__.py
index 981d685430da9..e0ec83a2efbca 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,18 +1,31 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 
 import numpy as np
 
-TEST_ROOT = os.path.dirname(__file__)
-PROJECT_ROOT = os.path.dirname(TEST_ROOT)
-TEMP_PATH = os.path.join(PROJECT_ROOT, 'test_temp')
+_TEST_ROOT = os.path.dirname(__file__)
+_PROJECT_ROOT = os.path.dirname(_TEST_ROOT)
+_TEMP_PATH = os.path.join(_PROJECT_ROOT, 'test_temp')
 
 # todo: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages
-if PROJECT_ROOT not in os.getenv('PYTHONPATH', ""):
+if _PROJECT_ROOT not in os.getenv('PYTHONPATH', ""):
     splitter = ":" if os.environ.get("PYTHONPATH", "") else ""
-    os.environ['PYTHONPATH'] = f'{PROJECT_ROOT}{splitter}{os.environ.get("PYTHONPATH", "")}'
+    os.environ['PYTHONPATH'] = f'{_PROJECT_ROOT}{splitter}{os.environ.get("PYTHONPATH", "")}'
 
 # generate a list of random seeds for each test
 RANDOM_PORTS = list(np.random.randint(12000, 19000, 1000))
 
-if not os.path.isdir(TEMP_PATH):
-    os.mkdir(TEMP_PATH)
+if not os.path.isdir(_TEMP_PATH):
+    os.mkdir(_TEMP_PATH)
diff --git a/tests/base/datasets.py b/tests/base/datasets.py
index 854d69b54eaf8..33d3801c432ab 100644
--- a/tests/base/datasets.py
+++ b/tests/base/datasets.py
@@ -22,10 +22,10 @@
 from torch import Tensor
 from torch.utils.data import Dataset
 
-from tests import PROJECT_ROOT
+from tests import _PROJECT_ROOT
 
 #: local path to test datasets
-PATH_DATASETS = os.path.join(PROJECT_ROOT, 'Datasets')
+PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')
 
 
 class MNIST(Dataset):
@@ -63,8 +63,13 @@ class MNIST(Dataset):
     TEST_FILE_NAME = 'test.pt'
     cache_folder_name = 'complete'
 
-    def __init__(self, root: str = PATH_DATASETS, train: bool = True,
-                 normalize: tuple = (0.5, 1.0), download: bool = True):
+    def __init__(
+            self,
+            root: str = PATH_DATASETS,
+            train: bool = True,
+            normalize: tuple = (0.5, 1.0),
+            download: bool = True,
+    ):
         super().__init__()
         self.root = root
         self.train = train  # training set or test set
diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py
index 3db8eb022288a..6eb19d3c4b1e4 100644
--- a/tests/base/develop_utils.py
+++ b/tests/base/develop_utils.py
@@ -19,7 +19,7 @@
 from pytorch_lightning import seed_everything
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger
-from tests import TEMP_PATH, RANDOM_PORTS
+from tests import _TEMP_PATH, RANDOM_PORTS
 from tests.base.model_template import EvalModelTemplate
 
 
@@ -63,7 +63,7 @@ def get_data_path(expt_logger, path_dir=None):
         if hasattr(expt_logger, 'save_dir') and expt_logger.save_dir:
             path_dir = expt_logger.save_dir
         else:
-            path_dir = TEMP_PATH
+            path_dir = _TEMP_PATH
     path_expt = os.path.join(path_dir, name, 'version_%s' % version)
 
     # try if the new sub-folder exists, typical case for test-tube
diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index c00c712bb3b13..070bb4e9f6989 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -33,6 +33,8 @@ def test_trainer_callback_system(torch_save):
         limit_train_batches=3,
         limit_test_batches=2,
         progress_bar_refresh_rate=0,
+        # todo: enabled since internally we wrap the model for optimizer step, this should be fixed
+        enable_pl_optimizer=True
     )
 
     # no call yet
diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py
index 27f484c63d87c..1f3e44f58173e 100644
--- a/tests/checkpointing/test_model_checkpoint.py
+++ b/tests/checkpointing/test_model_checkpoint.py
@@ -905,3 +905,42 @@ def __init__(self, hparams):
     else:
         # make sure it's not AttributeDict
         assert type(ckpt[model.CHECKPOINT_HYPER_PARAMS_KEY]) == hparams_type
+
+
+@pytest.mark.parametrize('max_epochs', [3, 4])
+@pytest.mark.parametrize(
+    'save_top_k, expected',
+    [
+        (1, ['curr_epoch.ckpt']),
+        (2, ['curr_epoch.ckpt', 'curr_epoch-v0.ckpt']),
+    ]
+)
+def test_model_checkpoint_file_already_exists(tmpdir, max_epochs, save_top_k, expected):
+    """
+    Test that a version suffix is added to the filename only when a checkpoint with that name already exists.
+    """
+    model_checkpoint = ModelCheckpoint(
+        dirpath=tmpdir,
+        filename='curr_epoch',  # constant name, so every save collides with an earlier checkpoint
+        save_top_k=save_top_k,
+        monitor='epoch',  # with mode='max' each later epoch scores higher than the previous one
+        mode='max',
+    )
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        callbacks=[model_checkpoint],
+        max_epochs=max_epochs,
+        limit_train_batches=2,
+        limit_val_batches=2,
+        logger=None,
+        weights_summary=None,
+        progress_bar_refresh_rate=0,
+    )
+
+    model = BoringModel()
+    trainer.fit(model)
+    ckpt_files = os.listdir(tmpdir)
+    assert set(ckpt_files) == set(expected)  # no stale or extra versioned files may remain
+    # NOTE(review): assumes the stored 'epoch' is one ahead of the epoch that wrote the file - confirm
+    epochs_in_ckpt_files = [pl_load(os.path.join(tmpdir, f))['epoch'] - 1 for f in ckpt_files]
+    assert sorted(epochs_in_ckpt_files) == list(range(max_epochs - save_top_k, max_epochs))  # last k epochs kept
diff --git a/tests/collect_env_details.py b/tests/collect_env_details.py
index 1d443795d2876..2b8c4b3fafeed 100644
--- a/tests/collect_env_details.py
+++ b/tests/collect_env_details.py
@@ -1,3 +1,16 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Diagnose your system and show basic information
 
 This server mainly to get detail info for better bug reporting.
diff --git a/tests/conftest.py b/tests/conftest.py
index ad4b7169456a8..c6a14a99b2478 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,21 @@
-import sys
-import threading
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from functools import partial, wraps
 from http.server import SimpleHTTPRequestHandler
+import sys
+import threading
 
 import pytest
 import torch.multiprocessing as mp
diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py
index ea40814b18861..0aec3b22f74a9 100644
--- a/tests/loggers/test_all.py
+++ b/tests/loggers/test_all.py
@@ -74,7 +74,9 @@ def test_loggers_fit_test_all(tmpdir, monkeypatch):
     with mock.patch('pytorch_lightning.loggers.test_tube.Experiment'):
         _test_loggers_fit_test(tmpdir, TestTubeLogger)
 
-    with mock.patch('pytorch_lightning.loggers.wandb.wandb'):
+    with mock.patch('pytorch_lightning.loggers.wandb.wandb') as wandb:
+        wandb.run = None
+        wandb.init().step = 0
         _test_loggers_fit_test(tmpdir, WandbLogger)
 
 
@@ -366,7 +368,9 @@ def test_logger_with_prefix_all(tmpdir, monkeypatch):
         logger.experiment.log.assert_called_once_with({"tmp-test": 1.0}, global_step=0)
 
     # WandB
-    with mock.patch('pytorch_lightning.loggers.wandb.wandb'):
+    with mock.patch('pytorch_lightning.loggers.wandb.wandb') as wandb:
         logger = _instantiate_logger(WandbLogger, save_idr=tmpdir, prefix=prefix)
+        wandb.run = None
+        wandb.init().step = 0
         logger.log_metrics({"test": 1.0}, step=0)
         logger.experiment.log.assert_called_once_with({'tmp-test': 1.0}, step=0)
diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py
index fa503f5d8eeb1..398ee45ef4aa0 100644
--- a/tests/loggers/test_wandb.py
+++ b/tests/loggers/test_wandb.py
@@ -22,8 +22,14 @@
 from tests.base import EvalModelTemplate
 
 
+def get_warnings(recwarn):  # newline-joined text of all recorded warnings; empties the recorder
+    messages = [str(record.message) for record in recwarn.list]
+    recwarn.clear()
+    return '\n'.join(messages)
+
+
 @mock.patch('pytorch_lightning.loggers.wandb.wandb')
-def test_wandb_logger_init(wandb):
+def test_wandb_logger_init(wandb, recwarn):
     """Verify that basic functionality of wandb logger works.
     Wandb doesn't work well with pytest so we have to mock it out here."""
 
@@ -34,6 +40,9 @@ def test_wandb_logger_init(wandb):
     wandb.init.assert_called_once()
     wandb.init().log.assert_called_once_with({'acc': 1.0}, step=None)
 
+    # mock wandb step
+    wandb.init().step = 0
+
     # test wandb.init not called if there is a W&B run
     wandb.init().log.reset_mock()
     wandb.init.reset_mock()
@@ -49,15 +58,28 @@ def test_wandb_logger_init(wandb):
     logger.log_metrics({'acc': 1.0}, step=3)
     wandb.init().log.assert_called_with({'acc': 1.0}, step=6)
 
+    # log hyper parameters
     logger.log_hyperparams({'test': None, 'nested': {'a': 1}, 'b': [2, 3, 4]})
     wandb.init().config.update.assert_called_once_with(
         {'test': 'None', 'nested/a': 1, 'b': [2, 3, 4]},
         allow_val_change=True,
     )
 
+    # watch a model
     logger.watch('model', 'log', 10)
     wandb.init().watch.assert_called_once_with('model', log='log', log_freq=10)
 
+    # verify warning for logging at a previous step
+    assert 'Trying to log at a previous step' not in get_warnings(recwarn)
+    # current step from wandb should be 6 (last logged step)
+    logger.experiment.step = 6
+    # logging at step 2 should raise a warning (step_offset is still 3)
+    logger.log_metrics({'acc': 1.0}, step=2)
+    assert 'Trying to log at a previous step' in get_warnings(recwarn)
+    # logging again at step 2 should not display the same warning again
+    logger.log_metrics({'acc': 1.0}, step=2)
+    assert 'Trying to log at a previous step' not in get_warnings(recwarn)
+
     assert logger.name == wandb.init().project_name()
     assert logger.version == wandb.init().id
 
@@ -71,6 +93,7 @@ def test_wandb_pickle(wandb, tmpdir):
     class Experiment:
         """ """
         id = 'the_id'
+        step = 0
 
         def project_name(self):
             return 'the_project_name'
@@ -108,8 +131,11 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir):
     assert logger.name is None
 
     # mock return values of experiment
+    wandb.run = None
+    wandb.init().step = 0
     logger.experiment.id = '1'
     logger.experiment.project_name.return_value = 'project'
+    logger.experiment.step = 0
 
     for _ in range(2):
         _ = logger.experiment
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index cd61da7c008bc..169552ce1bd75 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -47,7 +47,7 @@ def test_multi_gpu_none_backend(tmpdir):
     tpipes.run_model_test(trainer_options, model)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 @pytest.mark.parametrize('gpus', [1, [0], [1]])
 def test_single_gpu_model(tmpdir, gpus):
     """Make sure single GPU works (DP mode)."""
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 3a2ae8750443f..2f11c7df5f26f 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -44,9 +44,9 @@
     from horovod.common.util import nccl_built
     nccl_built()
 except (ImportError, ModuleNotFoundError, AttributeError):
-    HOROVOD_NCCL_AVAILABLE = False
+    _HOROVOD_NCCL_AVAILABLE = False
-finally:
-    HOROVOD_NCCL_AVAILABLE = True
+else:  # reached only when the import and the nccl_built() call raised nothing; `finally` always overwrote the flag
+    _HOROVOD_NCCL_AVAILABLE = True
 
 
 def _run_horovod(trainer_options, on_gpu=False):
@@ -105,7 +105,7 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir):
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_horovod_multi_gpu(tmpdir):
     """Test Horovod with multi-GPU support."""
@@ -125,7 +125,7 @@ def test_horovod_multi_gpu(tmpdir):
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 @pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
 def test_horovod_apex(tmpdir):
@@ -149,7 +149,7 @@ def test_horovod_apex(tmpdir):
 
 @pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp")
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires torch.cuda.amp")
 def test_horovod_amp(tmpdir):
@@ -172,7 +172,7 @@ def test_horovod_amp(tmpdir):
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 def test_horovod_transfer_batch_to_gpu(tmpdir):
     class TestTrainingStepModel(EvalModelTemplate):
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index f7cb581951783..950e3776bbc7f 100644
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -19,4 +19,4 @@ python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp
 python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual
 python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual_amp
 python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_automatic
-# python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance
+python ${DEFAULTS} tests/trainer/logging_tests/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp
diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index 3bce379c1115c..91a8631a73287 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -1,6 +1,20 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
-import time
 from pathlib import Path
+import time
 
 import numpy as np
 import pytest
diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
index d5a985489a909..f418db2bd72a5 100644
--- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
+++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
@@ -26,8 +26,8 @@
 from torch.utils.data import Dataset
 
 import pytorch_lightning as pl
-from pytorch_lightning import Trainer, callbacks
-from pytorch_lightning.callbacks import ModelCheckpoint
+from pytorch_lightning import callbacks, Trainer
+from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
 from pytorch_lightning.core.lightning import LightningModule
 from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset
 from tests.base.deterministic_model import DeterministicModel
@@ -687,6 +687,7 @@ class TestModel(BoringModel):
         def training_step(self, batch, batch_idx):
             acc = self.step(batch[0])
             self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum')
+            self.log('foo_2', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum')
             return acc
 
         def validation_step(self, batch, batch_idx):
@@ -706,9 +707,46 @@ def validation_step(self, batch, batch_idx):
     trainer.fit(model)
 
     assert trainer.logged_metrics['foo'] == fake_result
+    assert trainer.logged_metrics['foo_2'] == 2
     assert trainer.logged_metrics['bar'] == fake_result
 
 
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+@pytest.mark.skipif(
+    os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') != '1', reason="test should be run outside of pytest")
+def test_logging_sync_dist_true_ddp(tmpdir):
+    """Check that values logged with ``sync_dist=True`` are reduced when training with DDP on 2 GPUs."""
+    class TestLoggingSyncDistModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            step_output = self.step(batch[0])
+            # every process logs 1, so the 'SUM' over the 2 GPUs is expected to be 2
+            self.log('foo', 1, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='SUM')
+            return step_output
+
+        def validation_step(self, batch, batch_idx):
+            self.training_step_called = True
+            val_loss = self.loss(batch, self.layer(batch))
+            # every process logs 2, so the 'AVG' over the 2 GPUs is expected to stay 2
+            self.log('bar', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='AVG')
+            return {"x": val_loss}
+
+    model = TestLoggingSyncDistModel()
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        max_epochs=2,
+        weights_summary=None,
+        accelerator="ddp",
+        gpus=2,
+    )
+    trainer = Trainer(**trainer_options)
+    trainer.fit(model)
+
+    assert trainer.logged_metrics['foo'] == 2
+    assert trainer.logged_metrics['bar'] == 2
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 def test_logging_sync_dist_true_gpu(tmpdir):
     """
@@ -818,3 +856,47 @@ def on_train_epoch_end(self, trainer, pl_module, outputs):
         'on_epoch_end': 5,
         'on_train_epoch_end': 6}
     assert trainer.callback_metrics == expected
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
+def test_metric_are_properly_reduced(tmpdir):
+    """Check that a metric object logged on step and on epoch is correctly reduced in ``callback_metrics``."""
+    class TestingModel(BoringModel):
+        def __init__(self, *args, **kwargs):
+            super().__init__()
+            self.val_acc = pl.metrics.Accuracy()
+
+        def training_step(self, batch, batch_idx):
+            step_result = super().training_step(batch, batch_idx)
+            self.log("train_loss", step_result["loss"])
+            return step_result
+
+        def validation_step(self, batch, batch_idx):
+            # the target is always class 1; only the first 8 batches give class 1 the highest score
+            scores = [[0.1, 0.9]] if batch_idx < 8 else [[0.9, 0.1]]
+            preds = torch.tensor(scores, device=self.device)
+            targets = torch.tensor([1], device=self.device)
+            self.val_acc(preds, targets)
+            self.log('val_acc', self.val_acc, on_step=True, on_epoch=True)
+            return super().validation_step(batch, batch_idx)
+
+    # both callbacks monitor the metric logged in ``validation_step``
+    early_stop = EarlyStopping(monitor='val_acc', mode='max')
+    checkpoint = ModelCheckpoint(monitor='val_acc', save_last=True, save_top_k=2, mode='max')
+
+    model = TestingModel()
+    trainer_kwargs = dict(
+        default_root_dir=tmpdir,
+        gpus=1,
+        max_epochs=2,
+        limit_train_batches=5,
+        limit_val_batches=32,
+        callbacks=[early_stop, checkpoint],
+    )
+    trainer = Trainer(**trainer_kwargs)
+    trainer.fit(model)
+
+    # 8 of the 32 validation batches score class 1 highest, hence the expected 8 / 32
+    results = trainer.callback_metrics
+    assert results["val_acc"] == 8 / 32.
+    assert "train_loss" in results
diff --git a/tests/trainer/optimization/test_multiple_optimizers.py b/tests/trainer/optimization/test_multiple_optimizers.py
new file mode 100644
index 0000000000000..78b6f8f7ff84a
--- /dev/null
+++ b/tests/trainer/optimization/test_multiple_optimizers.py
@@ -0,0 +1,63 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Tests to ensure that the behaviours related to multiple optimizers work
+"""
+import torch
+
+import pytorch_lightning as pl
+from tests.base.boring_model import BoringModel
+
+
+def test_unbalanced_logging_with_multiple_optimizers(tmpdir):
+    """Ensure metric reduction works when the two optimizers log an unequal number of values."""
+    class TestModel(BoringModel):
+        # every value passed to ``self.log`` is also stored here as the reference for the asserts below
+        loss_1 = []
+        loss_2 = []
+
+        def training_step(self, batch, batch_idx, optimizer_idx):
+            loss = self.loss(batch, self.layer(batch))
+            # optimizer 0 only logs once global_step exceeds 10; optimizer 1 logs on each of its calls
+            first_optimizer_logs = optimizer_idx == 0 and self.trainer.global_step > 10
+            if first_optimizer_logs:
+                self.log("loss_1", loss, on_epoch=True, prog_bar=True)
+                self.loss_1.append(loss.detach().clone())
+            elif optimizer_idx == 1:
+                self.log("loss_2", loss, on_epoch=True, prog_bar=True)
+                self.loss_2.append(loss.detach().clone())
+            return {"loss": loss}
+
+        def configure_optimizers(self):
+            # two identically configured SGD optimizers over the same layer parameters
+            return [torch.optim.SGD(self.layer.parameters(), lr=0.001) for _ in range(2)]
+
+    model = TestModel()
+    model.training_epoch_end = None
+
+    trainer = pl.Trainer(default_root_dir=tmpdir, max_epochs=1)
+    trainer.fit(model)
+
+    metrics = trainer.callback_metrics
+    logged_1, logged_2 = model.loss_1, model.loss_2
+
+    # the ``*_step`` entries must equal the last value each optimizer logged
+    assert torch.equal(metrics["loss_2_step"], logged_2[-1])
+    assert torch.equal(metrics["loss_1_step"], logged_1[-1])
+
+    # the ``*_epoch`` entries must equal the mean of everything each optimizer logged
+    mean_2 = torch.FloatTensor(logged_2).mean()
+    mean_1 = torch.FloatTensor(logged_1).mean()
+    assert torch.abs(metrics["loss_2_epoch"] - mean_2) < 1e-6
+    assert torch.abs(metrics["loss_1_epoch"] - mean_1) < 1e-6
diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py
index 6195d7ddeb0b0..1a1203e8f2dd6 100644
--- a/tests/trainer/test_supporters.py
+++ b/tests/trainer/test_supporters.py
@@ -17,10 +17,32 @@
 import torch
 
 from torch.utils.data import TensorDataset
-from pytorch_lightning.trainer.supporters import CycleIterator, CombinedLoader, CombinedDataset, CombinedLoaderIterator
+from pytorch_lightning.trainer.supporters import (
+    CycleIterator, CombinedLoader, CombinedDataset, CombinedLoaderIterator, TensorRunningAccum)
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
+def test_tensor_running_accum_reset():
+    """Check that ``reset`` puts every attribute of a ``TensorRunningAccum`` back to its initial state."""
+    size = 10
+    running = TensorRunningAccum(window_length=size)
+    # nothing has been appended yet, so there is neither a last value nor a mean
+    assert running.last() is None
+    assert running.mean() is None
+
+    expected = torch.tensor(1.5)
+    running.append(torch.tensor(1.5))
+    assert running.last() == expected
+    assert running.mean() == expected
+
+    running.reset()
+    assert running.window_length == size
+    assert running.memory is None
+    assert running.last_idx is None
+    assert running.current_idx == 0
+    assert not running.rotated
+
+
 def test_cycle_iterator():
     """Test the cycling function of `CycleIterator`"""
     iterator = CycleIterator(range(100), 1000)