diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure-pipelines/gpu-benchmark.yml index a63c9e8640bc8..6d45cc2f4566a 100644 --- a/.azure-pipelines/gpu-benchmark.yml +++ b/.azure-pipelines/gpu-benchmark.yml @@ -28,15 +28,15 @@ jobs: cancelTimeoutInMinutes: "2" pool: gridai-spot-pool container: - # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.8" + # should match the one in '.azure-pipelines/gpu-benchmark.yml' + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all steps: - bash: | - python -m pytest benchmarks -v --durations=0 + python -m pytest tests/benchmarks -v --durations=0 displayName: 'Testing: benchmarks' env: PL_RUNNING_BENCHMARKS: 1 diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index f1af36a6090b9..ca8c54a61479e 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -51,7 +51,7 @@ jobs: - bash: | python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" pip install fairscale==0.4.0 - pip install deepspeed==0.5.4 + pip install deepspeed==0.5.7 pip install . --requirement requirements/devel.txt pip list displayName: 'Install dependencies' @@ -68,14 +68,14 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests --ignore tests/benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | - bash tests/special_tests.sh + bash tests/standalone_tests.sh env: PL_USE_MOCKED_MNIST: "1" - displayName: 'Testing: special' + displayName: 'Testing: standalone' - bash: | python -m coverage report @@ -113,5 +113,5 @@ jobs: displayName: 'Testing: examples' - bash: | - python -m pytest benchmarks -v --maxfail=2 --durations=0 + python -m pytest tests/benchmarks -v --maxfail=2 --durations=0 displayName: 'Testing: benchmarks' diff --git a/.circleci/config.yml b/.circleci/config.yml index ecebb1e9d94b5..8758310be9b6b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,6 +5,19 @@ orbs: go: circleci/go@1.3.0 codecov: codecov/codecov@1.1.0 +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" +pr: + - "master" + - "release/*" + # Workflow Steps: # 1. Checkout # 2. Install GO diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 3a94ef6758910..7719cf2812558 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -16,11 +16,11 @@ assignees: '' Please reproduce using the BoringModel! You can use the following Colab link: -https://colab.research.google.com/drive/1HvWVVTK8j2Nj52qU4Q4YCyzOm0_aLQF3?usp=sharing +https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/pl_examples/bug_report/bug_report_model.ipynb IMPORTANT: has to be public. 
or this simple template: -https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pl_examples/bug_report_model.py +https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pl_examples/bug_report/bug_report_model.py If you could not reproduce using the BoringModel and still think there's a bug, please post here but remember, bugs with code are fixed faster! @@ -46,9 +46,9 @@ python collect_env_details.py You can also fill out the list below manually. --> -- PyTorch Lightning Version (e.g., 1.3.0): -- PyTorch Version (e.g., 1.8) -- Python version: +- PyTorch Lightning Version (e.g., 1.5.0): +- PyTorch Version (e.g., 1.10): +- Python version (e.g., 3.9): - OS (e.g., Linux): - CUDA/cuDNN version: - GPU models and configuration: diff --git a/.github/set-min-requirements.py b/.github/set-min-requirements.py index b67ba224662ab..e5162293e2b4c 100644 --- a/.github/set-min-requirements.py +++ b/.github/set-min-requirements.py @@ -2,8 +2,8 @@ "requirements.txt", "requirements/extra.txt", "requirements/loggers.txt", - "requirements/test.txt", "requirements/examples.txt", + # "requirements/test.txt", # Don't use old testing packages ) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 5b5a140e7791a..bd45247e15df2 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -1,4 +1,4 @@ -name: CI build Docker +name: Docker # https://www.docker.com/blog/first-docker-github-action-is-here # https://github.com/docker/build-push-action # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows @@ -23,9 +23,9 @@ jobs: strategy: fail-fast: false matrix: - # should be the config used in '.github/workflows/release-docker.yml', but we just keep one to check. - python_version: ["3.9"] - pytorch_version: ["1.9"] + # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image + python_version: ["3.7"] + pytorch_version: ["1.8"] steps: - name: Checkout uses: actions/checkout@v2 @@ -92,8 +92,8 @@ jobs: fail-fast: false matrix: # the config used in '.github/workflows/ci_test-conda.yml' - python_version: ["3.7"] - pytorch_version: ["1.6", "1.7", "1.8", "1.9", "1.10"] + python_version: ["3.8"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout uses: actions/checkout@v2 @@ -119,7 +119,7 @@ jobs: fail-fast: false matrix: # the config used in 'dockers/ipu-ci-runner/Dockerfile' - python_version: ["3.8"] # latest + python_version: ["3.9"] # latest # TODO: upgrade - PopTorch 2.2 uses torch 1.9, see: # https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/installation.html#version-compatibility pytorch_version: ["1.7"] diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index 12f3976d078e4..bf7de876d157e 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -1,4 +1,4 @@ -name: Install pkg +name: Package # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch @@ -9,7 +9,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra jobs: - pkg-install: + install: runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -26,12 +26,11 @@ jobs: - name: Prepare env run: | - pip install check-manifest "twine==3.2" setuptools wheel + pip install "twine==3.2" setuptools wheel - name: Create package run: | - check-manifest - # python setup.py check --metadata 
--strict + python setup.py check --metadata --strict python setup.py sdist bdist_wheel - name: Check package diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci_schema.yml index 51c4400666fd0..d635285fae39a 100644 --- a/.github/workflows/ci_schema.yml +++ b/.github/workflows/ci_schema.yml @@ -1,11 +1,11 @@ -name: CI action schema +name: Schema on: # Trigger the workflow on push or pull request, but only for the master branch push: {} pull_request: branches: [master, "release/*"] jobs: - validate-schema: + check: runs-on: ubuntu-20.04 steps: - name: Checkout diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index e92249cab4030..c2f1d370e2d1a 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -1,6 +1,6 @@ # this jobs runs `pytest` over the source directory. It does not install any extra dependencies. # this is useful to catch errors where an import has been added which is not part of the basic dependencies. -name: CI basic testing +name: Test # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch @@ -10,8 +10,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] jobs: - doctest: - + source: runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -20,7 +19,7 @@ jobs: # this will install stable torch python-version: [3.9] - # Timeout: https://stackoverflow.com/a/59076067/4521646 + # lower timeout as this should run very quickly timeout-minutes: 20 steps: - uses: actions/checkout@v2 @@ -60,7 +59,6 @@ jobs: - name: Test Package [only] run: | - # NOTE: run coverage on tests does not propagate failure status for Win, https://github.com/nedbat/coveragepy/issues/1003 coverage run --source pytorch_lightning -m pytest pytorch_lightning -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Upload pytest test results diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index cdd18d13e909a..fa366e645f1d9 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -1,4 +1,4 @@ -name: PyTorch & Conda +name: Test # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch @@ -14,11 +14,10 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7"] - pytorch-version: ["1.6", "1.7", "1.8", "1.9", "1.10"] + python-version: ["3.8"] # previous to last Python version as that one is already used in test-full + pytorch-version: ["1.7", "1.8", "1.9", "1.10"] # nightly: add when there's a release candidate - # Timeout: https://stackoverflow.com/a/59076067/4521646 - timeout-minutes: 35 + timeout-minutes: 30 steps: - uses: actions/checkout@v2 @@ -30,7 +29,8 @@ jobs: python ./requirements/adjust_versions.py requirements/extra.txt python ./requirements/adjust_versions.py requirements/examples.txt pip install --requirement requirements/devel.txt --find-links https://download.pytorch.org/whl/nightly/torch_nightly.html - pip install pytest-random-order + # set a per-test timeout of 2.5 minutes to fail sooner. 
this aids with hanging tests + pip install pytest-timeout pip list - name: Pull checkpoints from S3 @@ -43,8 +43,7 @@ jobs: - name: Tests run: | - # NOTE: run coverage on tests does not propagate failure status for Win, https://github.com/nedbat/coveragepy/issues/1003 - coverage run --source pytorch_lightning -m pytest --random-order-seed=1 pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml + coverage run --source pytorch_lightning -m pytest --timeout 150 pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml shell: bash -l {0} - name: Upload pytest results diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 5cc4827d84888..ad8e27e263c2d 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -1,4 +1,4 @@ -name: CI complete testing +name: Test # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch @@ -10,7 +10,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra jobs: - pytest: + cpu: runs-on: ${{ matrix.os }} if: github.event.pull_request.draft == false @@ -18,20 +18,20 @@ jobs: fail-fast: false matrix: os: [ubuntu-18.04, windows-2019, macOS-10.15] - python-version: [3.6, 3.8, 3.9] - requires: ['minimal', 'latest'] - release: ['stable'] - exclude: - - python-version: 3.9 - requires: 'minimal' + python-version: ["3.7", "3.9"] # minimum, maximum + requires: ["oldest", "latest"] + release: ["stable"] include: - - os: ubuntu-20.04 - python-version: 3.9 - requires: 'latest' - release: 'pre' + # test 3.6 only on oldest until EOL: https://github.com/PyTorchLightning/pytorch-lightning/issues/9981 + - {os: ubuntu-18.04, python-version: "3.6", requires: "oldest", release: "stable"} + - {os: windows-2019, python-version: "3.6", requires: "oldest", release: "stable"} + - {os: macOS-10.15, python-version: "3.6", requires: "oldest", release: "stable"} + # nightly: add when there's a release candidate + #- {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} + exclude: + # Skip if torch<1.8 and py3.9 on Linux: https://github.com/pytorch/pytorch/issues/50014 + - {os: ubuntu-18.04, python-version: "3.9", requires: "oldest", release: "stable"} - # Timeout: https://stackoverflow.com/a/59076067/4521646 - # TODO: the macOS is taking too long, probably caching did not work... timeout-minutes: 40 steps: @@ -64,7 +64,7 @@ jobs: python .github/prune-packages.py requirements/extra.txt "horovod" - name: Set min. 
dependencies - if: matrix.requires == 'minimal' + if: matrix.requires == 'oldest' run: | python .github/set-min-requirements.py diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 1cedf2c360306..e99863dc794d4 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -1,4 +1,4 @@ -name: "Check code" +name: Test on: # Trigger the workflow on push or pull request, but only for the master branch push: @@ -7,8 +7,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] jobs: - python-typing-mypy: - name: Python typing Mypy + mypy: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@master diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 9d6b660a168f8..841f9128da8b1 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -1,4 +1,4 @@ -name: "Docs check" +name: Test # https://github.com/marketplace/actions/sphinx-build on: # Trigger the workflow on push or pull request, but only for the master branch @@ -8,7 +8,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] jobs: - test-docs: + doctest: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 4afcec0496abc..7c2075ce5b440 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -1,4 +1,4 @@ -name: Nightly events +name: Nightly # https://jasonet.co/posts/scheduled-actions/ # https://github.community/t/distinct-job-for-each-schedule/17811/2 @@ -122,8 +122,8 @@ jobs: fail-fast: false matrix: # the config used in '.github/workflows/ci_test-conda.yml' - python_version: ["3.7"] - pytorch_version: ["1.6", "1.7", "1.8", "1.9", "1.10"] + python_version: ["3.8"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout @@ -163,7 +163,7 @@ jobs: matrix: # the config used in 'dockers/ipu-ci-runner/Dockerfile' include: - - python_version: "3.8" + - python_version: "3.9" pytorch_version: "1.7" steps: diff --git a/.github/workflows/events-recurrent.yml b/.github/workflows/events-recurrent.yml index d7f1872fde732..834adc6c169fa 100644 --- a/.github/workflows/events-recurrent.yml +++ b/.github/workflows/events-recurrent.yml @@ -1,4 +1,4 @@ -name: Recurrent events +name: Recurrent # https://jasonet.co/posts/scheduled-actions/ # https://github.community/t/distinct-job-for-each-schedule/17811/2 diff --git a/.github/workflows/probot-auto-cc.yml b/.github/workflows/probot-auto-cc.yml new file mode 100644 index 0000000000000..5c6de911cd00e --- /dev/null +++ b/.github/workflows/probot-auto-cc.yml @@ -0,0 +1,16 @@ +name: Probot + +on: + issues: + types: [labeled] + pull_request: + types: [labeled, ready_for_review] + +jobs: + auto-cc: + runs-on: ubuntu-latest + if: github.event_name == 'issue' || github.event.pull_request.draft == false + steps: + - uses: carmocca/probot@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 92bf62d3c1ead..169e01edd8d48 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -1,4 +1,4 @@ -name: Publish Docker Releases +name: Docker # https://www.docker.com/blog/first-docker-github-action-is-here # https://github.com/docker/build-push-action on: @@ -8,7 +8,7 @@ on: types: [published] jobs: - 
cuda-PL: + publish: runs-on: ubuntu-20.04 # only on releases if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python_version: ["3.6", "3.7", "3.8", "3.9"] - pytorch_version: ["1.6", "1.7", "1.8", "1.9"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout uses: actions/checkout@v2 @@ -39,7 +39,7 @@ jobs: - name: Publish Latest to Docker uses: docker/build-push-action@v1.1.0 # only on releases and latest Python and PyTorch - if: matrix.python_version == 3.9 && matrix.pytorch_version == 1.9 + if: matrix.python_version == "3.9" && matrix.pytorch_version == "1.10" with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index a91837cab3340..09afd4db893d3 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -1,4 +1,4 @@ -name: PyPI Release +name: PyPI # https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch diff --git a/CHANGELOG.md b/CHANGELOG.md index b9c808464dadc..74b19da46d05b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,152 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.5.10] - 2021-02-08 + +### Fixed + +- + + +- + + +## [1.5.9] - 2022-01-18 + +### Fixed + +- Pin sphinx-autodoc-typehints with 0` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870)) +- Fixed an issue with item assignment on the logger on rank > 0 for those who support it ([#10917](https://github.com/PyTorchLightning/pytorch-lightning/pull/10917)) +- Fixed importing `torch_xla.debug` for `torch-xla<1.8` ([#10836](https://github.com/PyTorchLightning/pytorch-lightning/pull/10836)) +- Fixed an issue with `DDPSpawnPlugin` and related plugins leaving a temporary checkpoint behind ([#10934](https://github.com/PyTorchLightning/pytorch-lightning/pull/10934)) +- Fixed a `TypeError` occuring in the `SingalConnector.teardown()` method ([#10961](https://github.com/PyTorchLightning/pytorch-lightning/pull/10961)) + + +## [1.5.4] - 2021-11-30 + +### Fixed + +- Fixed support for `--key.help=class` with the `LightningCLI` ([#10767](https://github.com/PyTorchLightning/pytorch-lightning/pull/10767)) +- Fixed `_compare_version` for python packages ([#10762](https://github.com/PyTorchLightning/pytorch-lightning/pull/10762)) +- Fixed TensorBoardLogger `SummaryWriter` not close before spawning the processes ([#10777](https://github.com/PyTorchLightning/pytorch-lightning/pull/10777)) +- Fixed a consolidation error in Lite when attempting to save the state dict of a sharded optimizer ([#10746](https://github.com/PyTorchLightning/pytorch-lightning/pull/10746)) +- Fixed the default logging level for batch hooks associated with training from `on_step=False, on_epoch=True` to `on_step=True, on_epoch=False` ([#10756](https://github.com/PyTorchLightning/pytorch-lightning/pull/10756)) + + +### Removed + +- Removed PyTorch 1.6 support ([#10367](https://github.com/PyTorchLightning/pytorch-lightning/pull/10367), [#10738](https://github.com/PyTorchLightning/pytorch-lightning/pull/10738)) + + +## [1.5.3] - 2021-11-24 + +### Fixed + +- Fixed `ShardedTensor` state dict hook registration to check if torch distributed is available 
([#10621](https://github.com/PyTorchLightning/pytorch-lightning/pull/10621)) +- Fixed an issue with `self.log` not respecting a tensor's `dtype` when applying computations ([#10076](https://github.com/PyTorchLightning/pytorch-lightning/pull/10076)) +- Fixed LigtningLite `_wrap_init` popping unexisting keys from DataLoader signature parameters ([#10613](https://github.com/PyTorchLightning/pytorch-lightning/pull/10613)) +- Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) +- Fixed an issue that caused Lightning to extract the batch size even though it was set by the user in `LightningModule.log` ([#10408](https://github.com/PyTorchLightning/pytorch-lightning/pull/10408)) +- Fixed `Trainer(move_metrics_to_cpu=True)` not moving the evaluation logged results to CPU ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631)) +- Fixed the `{validation,test}_step` outputs getting moved to CPU with `Trainer(move_metrics_to_cpu=True)` ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631)) +- Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) +- Fixed an issue with collecting logged test results with multiple dataloaders ([#10522](https://github.com/PyTorchLightning/pytorch-lightning/pull/10522)) + + +## [1.5.2] - 2021-11-16 + +### Fixed + +- Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/PyTorchLightning/pytorch-lightning/issues/10374)) +- Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in `utilities.apply_to_collection` ([#9702](https://github.com/PyTorchLightning/pytorch-lightning/issues/9702)) +- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) +- Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463)) +- Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/PyTorchLightning/pytorch-lightning/issues/10461)) +- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) +- Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/pull/10470), [#10555](https://github.com/PyTorchLightning/pytorch-lightning/pull/10555)) +- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/issues/10438)) +- Fixed propagation of device and dtype information to submodules of LightningLite when they inherit from `DeviceDtypeModuleMixin` ([#10559](https://github.com/PyTorchLightning/pytorch-lightning/issues/10559)) + + +## [1.5.1] - 2021-11-09 + +### Fixed + +- Fixed `apply_to_collection(defaultdict)` ([#10316](https://github.com/PyTorchLightning/pytorch-lightning/issues/10316)) +- Fixed failure when `DataLoader(batch_size=None)` is passed ([#10345](https://github.com/PyTorchLightning/pytorch-lightning/issues/10345)) +- Fixed interception of `__init__` arguments for sub-classed DataLoader re-instantiation in Lite 
([#10334](https://github.com/PyTorchLightning/pytorch-lightning/issues/10334)) +- Fixed issue with pickling `CSVLogger` after a call to `CSVLogger.save` ([#10388](https://github.com/PyTorchLightning/pytorch-lightning/pull/10388)) +- Fixed an import error being caused by `PostLocalSGD` when `torch.distributed` not available ([#10359](https://github.com/PyTorchLightning/pytorch-lightning/pull/10359)) +- Fixed the logging with `on_step=True` in epoch-level hooks causing unintended side-effects. Logging with `on_step=True` in epoch-level hooks will now correctly raise an error ([#10409](https://github.com/PyTorchLightning/pytorch-lightning/pull/10409)) +- Fixed deadlocks for distributed training with `RichProgressBar` ([#10428](https://github.com/PyTorchLightning/pytorch-lightning/pull/10428)) +- Fixed an issue where the model wrapper in Lite converted non-floating point tensors to float ([#10429](https://github.com/PyTorchLightning/pytorch-lightning/pull/10429)) +- Fixed an issue with inferring the dataset type in fault-tolerant training ([#10432](https://github.com/PyTorchLightning/pytorch-lightning/pull/10432)) +- Fixed dataloader workers with `persistent_workers` being deleted on every iteration ([#10434](https://github.com/PyTorchLightning/pytorch-lightning/pull/10434)) + ## [1.5.0] - 2021-11-02 @@ -132,7 +278,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for empty `gpus` list to run on CPU ([#10246](https://github.com/PyTorchLightning/pytorch-lightning/pull/10246)) - Added a warning if multiple batch sizes are found from ambiguous batch ([#10247](https://github.com/PyTorchLightning/pytorch-lightning/pull/10247)) - ### Changed - Trainer now raises a `MisconfigurationException` when its methods are called with `ckpt_path="best"` but a checkpoint callback isn't configured ([#9841](https://github.com/PyTorchLightning/pytorch-lightning/pull/9841)) @@ -184,7 +329,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Enabled `on_load_checkpoint` for `LightningDataModule` for all `trainer_fn` ([#10238](https://github.com/PyTorchLightning/pytorch-lightning/pull/10238)) - Allowed separate config files for parameters with class type when LightningCLI is in `subclass_mode=False` ([#10286](https://github.com/PyTorchLightning/pytorch-lightning/pull/10286)) - ### Deprecated - Deprecated Trainer argument `terminate_on_nan` in favor of `detect_anomaly`([#9175](https://github.com/PyTorchLightning/pytorch-lightning/pull/9175)) @@ -220,7 +364,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `lr_sch_names` from `LearningRateMonitor` ([#10066](https://github.com/PyTorchLightning/pytorch-lightning/pull/10066)) - Deprecated `ProgressBar` callback in favor of `TQDMProgressBar` ([#10134](https://github.com/PyTorchLightning/pytorch-lightning/pull/10134)) - ### Removed - Removed deprecated `metrics` ([#8586](https://github.com/PyTorchLightning/pytorch-lightning/pull/8586/)) @@ -264,7 +407,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed automatic patching of `{train,val,test,predict}_dataloader()` on the `LightningModule` ([#9764](https://github.com/PyTorchLightning/pytorch-lightning/pull/9764)) - Removed `pytorch_lightning.trainer.connectors.OptimizerConnector` ([#10120](https://github.com/PyTorchLightning/pytorch-lightning/pull/10120)) - ### Fixed - Fixed ImageNet evaluation in example ([#10179](https://github.com/PyTorchLightning/pytorch-lightning/pull/10179)) @@ -473,7 +615,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added private `prevent_trainer_and_dataloaders_deepcopy` context manager on the `LightningModule` ([#8472](https://github.com/PyTorchLightning/pytorch-lightning/pull/8472)) - Added support for providing callables to the Lightning CLI instead of types ([#8400](https://github.com/PyTorchLightning/pytorch-lightning/pull/8400)) - ### Changed - Decoupled device parsing logic from Accelerator connector to Trainer ([#8180](https://github.com/PyTorchLightning/pytorch-lightning/pull/8180)) diff --git a/MANIFEST.in b/MANIFEST.in index b810937f1a495..a68fc82474e70 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -11,69 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -# Manifest syntax https://docs.python.org/2/distutils/sourcedist.html -graft wheelhouse - -recursive-exclude __pycache__ *.py[cod] *.orig - -# Include the README and CHANGELOG -include *.md - -# Include the license file -include LICENSE - -# Include the citation info -include *.cff - -exclude *.sh -exclude *.svg -recursive-include pytorch_lightning *.py - -# Include marker file for PEP 561 -include pytorch_lightning/py.typed - -# include examples -recursive-include pl_examples *.py *.md *.sh *.txt *.toml - -# exclude tests from package -recursive-exclude tests * -recursive-exclude site * -exclude tests - -# Exclude the documentation files -recursive-exclude docs * -exclude docs -recursive-include docs/source/_static/images/logos/ * -recursive-include docs/source/_static/images/general/ pl_overview* tf_* tutorial_* PTL101_* - -# Include the Requirements +include pytorch_lightning/py.typed # marker file for PEP 561 +include CHANGELOG.md recursive-include requirements *.txt -recursive-exclude requirements *.sh *.py include requirements.txt -include pyproject.toml - -# Exclude build configs -exclude *.yml -exclude *.yaml -exclude *.toml -exclude *.jsonnet - -# Exclude pyright config -exclude .pyrightconfig.json - -# Exclude submodules -exclude .gitmodules -exclude _notebooks - -# Exclude Makefile -exclude Makefile - -prune .git -prune .github -prune .circleci -prune temp* -prune test* -prune benchmark* -prune dockers -prune legacy +include *.cff # citation info diff --git a/_notebooks b/_notebooks index a2fb6468112b7..0c325829101d5 160000 --- a/_notebooks +++ b/_notebooks @@ -1 +1 @@ -Subproject commit a2fb6468112b7e1dad501c3b6a17533a4adfeabc +Subproject commit 0c325829101d5a6ebf32ed99bbf5b09badf04a59 diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py deleted file mode 100644 index b4a3da40d40d0..0000000000000 --- a/benchmarks/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -_BENCHMARK_ROOT = os.path.dirname(__file__) -_PROJECT_ROOT = os.path.dirname(_BENCHMARK_ROOT) -_PATH_DATASETS = os.path.join(_PROJECT_ROOT, "Datasets") diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 978f8bcdf100e..d70761cbdd37a 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -16,8 +16,8 @@ ARG CUDA_VERSION=10.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 -ARG PYTHON_VERSION=3.8 -ARG PYTORCH_VERSION=1.6 +ARG PYTHON_VERSION=3.9 +ARG PYTORCH_VERSION=1.8 SHELL ["/bin/bash", "-c"] # https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ @@ -112,7 +112,7 @@ RUN \ RUN \ # install DeepSpeed - pip install deepspeed==0.5.4 + pip install deepspeed==0.5.7 RUN \ # Show what we have diff --git a/dockers/base-ipu/Dockerfile b/dockers/base-ipu/Dockerfile index 01b5920d88fd1..e91a0dc4a0a1e 100644 --- a/dockers/base-ipu/Dockerfile +++ b/dockers/base-ipu/Dockerfile @@ -16,8 +16,7 @@ FROM ubuntu:20.04 LABEL maintainer="PyTorchLightning " -ARG PYTHON_VERSION=3.8 -ARG PYTORCH_VERSION=1.7 +ARG PYTHON_VERSION=3.9 ARG CONDA_VERSION=4.9.2 SHELL ["/bin/bash", "-c"] @@ -41,7 +40,7 @@ RUN apt-get update -qq && \ && \ # Install conda and python. # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ rm ~/miniconda.sh && \ diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 1a2554b6b94b2..e293343614927 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -16,10 +16,10 @@ FROM google/cloud-sdk:slim LABEL maintainer="PyTorchLightning " -# CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . --build-arg PYTHON_VERSION=3.6 -ARG PYTHON_VERSION=3.7 +# CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . --build-arg PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.9 ARG CONDA_VERSION=4.9.2 -ARG XLA_VERSION=1.6 +ARG XLA_VERSION=1.8 SHELL ["/bin/bash", "-c"] # for skipping configurations @@ -42,7 +42,7 @@ RUN apt-get update -qq && \ && \ # Install conda and python. # NOTE new Conda does not forward the exit status... 
https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ rm ~/miniconda.sh && \ diff --git a/dockers/ipu-ci-runner/Dockerfile b/dockers/ipu-ci-runner/Dockerfile index aa8672a34a376..98f769f78fe8f 100644 --- a/dockers/ipu-ci-runner/Dockerfile +++ b/dockers/ipu-ci-runner/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.7 FROM pytorchlightning/pytorch_lightning:base-ipu-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index bea977899ce50..f4083f2dd42fc 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTHON_VERSION=3.7 -ARG PYTORCH_VERSION=1.6 +ARG PYTHON_VERSION=3.9 +ARG PYTORCH_VERSION=1.8 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index 3fc703edb2e0d..6605b9abbaadc 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTHON_VERSION=3.7 -ARG PYTORCH_VERSION=1.6 +ARG PYTHON_VERSION=3.9 +ARG PYTORCH_VERSION=1.8 FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 55454e7cac0a2..530c40e49ed3e 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -33,6 +33,7 @@ local tputests = base.BaseTest { echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" coverage run --source=pytorch_lightning -m pytest -v --capture=no \ + tests/plugins/test_tpu_spawn.py \ tests/profiler/test_xla_profiler.py \ pytorch_lightning/utilities/xla_device.py \ tests/accelerators/test_tpu.py \ diff --git a/docs/source/advanced/fault_tolerant_training.rst b/docs/source/advanced/fault_tolerant_training.rst index e4a61b27e294d..63a3ce41ee8b3 100644 --- a/docs/source/advanced/fault_tolerant_training.rst +++ b/docs/source/advanced/fault_tolerant_training.rst @@ -134,7 +134,7 @@ Performance Impacts ------------------- Fault-tolerant Training was tested on common and worst-case scenarios in order to measure the impact of the internal state tracking on the total training time. -On tiny models like the `BoringModel and RandomDataset `_ +On tiny models like the `BoringModel and RandomDataset `_ which has virtually no data loading and processing overhead, we noticed up to 50% longer training time with fault tolerance enabled. In this worst-case scenario, fault-tolerant adds an overhead that is noticeable in comparison to the compute time for dataloading itself. However, for more realistic training workloads where data loading and preprocessing is more expensive, the constant overhead that fault tolerance adds becomes less noticeable or not noticeable at all. 
diff --git a/docs/source/advanced/multi_gpu.rst b/docs/source/advanced/multi_gpu.rst index 77784f8da0542..51d07f628620f 100644 --- a/docs/source/advanced/multi_gpu.rst +++ b/docs/source/advanced/multi_gpu.rst @@ -90,7 +90,7 @@ This is done by adding ``sync_dist=True`` to all ``self.log`` calls in the valid This ensures that each GPU worker has the same behaviour when tracking model checkpoints, which is important for later downstream tasks such as testing the best checkpoint across all workers. The ``sync_dist`` option can also be used in logging calls during the step methods, but be aware that this can lead to significant communication overhead and slow down your training. -Note if you use any built in metrics or custom metrics that use the :doc:`Metrics API <../extensions/metrics>`, these do not need to be updated and are automatically handled for you. +Note if you use any built in metrics or custom metrics that use `TorchMetrics `_, these do not need to be updated and are automatically handled for you. .. testcode:: diff --git a/docs/source/conf.py b/docs/source/conf.py index 845b3b946972a..8aaa06ccef8ec 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -46,7 +46,7 @@ # -- Project documents ------------------------------------------------------- if _SHOULD_COPY_NOTEBOOKS: - HelperCLI.copy_notebooks(PATH_RAW_NB, PATH_HERE, "notebooks") + HelperCLI.copy_notebooks(PATH_RAW_NB, PATH_HERE, "notebooks", patterns=[".", "course_UvA-DL", "lightning_examples"]) def _transform_changelog(path_in: str, path_out: str) -> None: diff --git a/docs/source/extensions/logging.rst b/docs/source/extensions/logging.rst index 1facdb93373eb..e652adbecc419 100644 --- a/docs/source/extensions/logging.rst +++ b/docs/source/extensions/logging.rst @@ -111,7 +111,7 @@ The :func:`~~pytorch_lightning.core.lightning.LightningModule.log` method has a .. note:: - Setting ``on_epoch=True`` will cache all your logged values during the full training epoch and perform a - reduction in ``on_train_epoch_end``. We recommend using the :doc:`metrics <../extensions/metrics>` API when working with custom reduction. + reduction in ``on_train_epoch_end``. We recommend using `TorchMetrics `_, when working with custom reduction. - Setting both ``on_step=True`` and ``on_epoch=True`` will create two keys per metric you log with suffix ``_step`` and ``_epoch``, respectively. You can refer to these keys e.g. in the `monitor` diff --git a/docs/source/extensions/metrics.rst b/docs/source/extensions/metrics.rst deleted file mode 100644 index 74a4a15deb2be..0000000000000 --- a/docs/source/extensions/metrics.rst +++ /dev/null @@ -1,9 +0,0 @@ -####### -Metrics -####### - -``pytorch_lightning.metrics`` has been moved to a separate package `TorchMetrics `_. -We will preserve compatibility for the next few releases, nevertheless, we encourage users to update to use this stand-alone package. - -.. warning:: - ``pytorch_lightning.metrics`` is deprecated from v1.3 and will be removed in v1.5. diff --git a/docs/source/guides/speed.rst b/docs/source/guides/speed.rst index 0b8edb43b7ec5..87b5b9d0ad139 100644 --- a/docs/source/guides/speed.rst +++ b/docs/source/guides/speed.rst @@ -151,9 +151,7 @@ For debugging purposes or for dataloaders that load very small datasets, it is d import warnings - warnings.filterwarnings( - "ignore", ".*does not have many workers. 
Consider increasing the value of the `num_workers` argument*" - ) + warnings.filterwarnings("ignore", ".*Consider increasing the value of the `num_workers` argument*") Spawn """"" diff --git a/docs/source/index.rst b/docs/source/index.rst index 72da9c3e354c4..c1b20b958591b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -84,7 +84,6 @@ PyTorch Lightning extensions/callbacks extensions/datamodules extensions/logging - extensions/metrics extensions/plugins extensions/loops diff --git a/environment.yml b/environment.yml index fb21d21c97730..7e3c84e913f9e 100644 --- a/environment.yml +++ b/environment.yml @@ -29,7 +29,7 @@ dependencies: - python>=3.6 - pip>20.1 - numpy>=1.17.2 - - pytorch>=1.6 + - pytorch>=1.7.* - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 @@ -41,13 +41,14 @@ dependencies: - scikit-learn>=0.20.0 - matplotlib>=3.1.1 - omegaconf>=2.0.5 + - torchtext>=0.8.* # Examples - - torchvision>=0.6 + - torchvision>=0.8.* - pip: - test-tube>=0.7.5 - mlflow>=1.0.0 - comet_ml>=3.1.12 - wandb>=0.8.21 - - neptune-client>=0.4.109 + - neptune-client>=0.10.0 diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py index 1d2371c702ce0..a8d33b287f380 100644 --- a/pl_examples/basic_examples/mnist_datamodule.py +++ b/pl_examples/basic_examples/mnist_datamodule.py @@ -11,13 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import logging import os import platform -from typing import Optional +import random +import time +import urllib +from typing import Optional, Tuple from urllib.error import HTTPError from warnings import warn -from torch.utils.data import DataLoader, random_split +import torch +from torch.utils.data import DataLoader, Dataset, random_split from pl_examples import _DATASETS_PATH from pytorch_lightning import LightningDataModule @@ -27,6 +32,97 @@ from torchvision import transforms as transform_lib +class _MNIST(Dataset): + """Carbon copy of ``tests.helpers.datasets.MNIST``. + + We cannot import the tests as they are not distributed with the package. + See https://github.com/PyTorchLightning/pytorch-lightning/pull/7614#discussion_r671183652 for more context. 
+ """ + + RESOURCES = ( + "https://pl-public-data.s3.amazonaws.com/MNIST/processed/training.pt", + "https://pl-public-data.s3.amazonaws.com/MNIST/processed/test.pt", + ) + + TRAIN_FILE_NAME = "training.pt" + TEST_FILE_NAME = "test.pt" + cache_folder_name = "complete" + + def __init__( + self, root: str, train: bool = True, normalize: tuple = (0.1307, 0.3081), download: bool = True, **kwargs + ): + super().__init__() + self.root = root + self.train = train # training set or test set + self.normalize = normalize + + self.prepare_data(download) + + data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME + self.data, self.targets = self._try_load(os.path.join(self.cached_folder_path, data_file)) + + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]: + img = self.data[idx].float().unsqueeze(0) + target = int(self.targets[idx]) + + if self.normalize is not None and len(self.normalize) == 2: + img = self.normalize_tensor(img, *self.normalize) + + return img, target + + def __len__(self) -> int: + return len(self.data) + + @property + def cached_folder_path(self) -> str: + return os.path.join(self.root, "MNIST", self.cache_folder_name) + + def _check_exists(self, data_folder: str) -> bool: + existing = True + for fname in (self.TRAIN_FILE_NAME, self.TEST_FILE_NAME): + existing = existing and os.path.isfile(os.path.join(data_folder, fname)) + return existing + + def prepare_data(self, download: bool = True): + if download and not self._check_exists(self.cached_folder_path): + self._download(self.cached_folder_path) + if not self._check_exists(self.cached_folder_path): + raise RuntimeError("Dataset not found.") + + def _download(self, data_folder: str) -> None: + os.makedirs(data_folder, exist_ok=True) + for url in self.RESOURCES: + logging.info(f"Downloading {url}") + fpath = os.path.join(data_folder, os.path.basename(url)) + urllib.request.urlretrieve(url, fpath) + + @staticmethod + def _try_load(path_data, trials: int = 30, delta: float = 1.0): + """Resolving loading from the same time from multiple concurrent processes.""" + res, exception = None, None + assert trials, "at least some trial has to be set" + assert os.path.isfile(path_data), f"missing file: {path_data}" + for _ in range(trials): + try: + res = torch.load(path_data) + # todo: specify the possible exception + except Exception as e: + exception = e + time.sleep(delta * random.random()) + else: + break + if exception is not None: + # raise the caught exception + raise exception + return res + + @staticmethod + def normalize_tensor(tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0) -> torch.Tensor: + mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device) + std = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device) + return tensor.sub(mean).div(std) + + def MNIST(*args, **kwargs): torchvision_mnist_available = not bool(os.getenv("PL_USE_MOCKED_MNIST", False)) if torchvision_mnist_available: @@ -39,7 +135,7 @@ def MNIST(*args, **kwargs): torchvision_mnist_available = False if not torchvision_mnist_available: print("`torchvision.datasets.MNIST` not available. 
Using our hosted version") - from tests.helpers.datasets import MNIST + MNIST = _MNIST return MNIST(*args, **kwargs) diff --git a/pl_examples/bug_report/bug_report_model.ipynb b/pl_examples/bug_report/bug_report_model.ipynb new file mode 100644 index 0000000000000..a6cb1933f113d --- /dev/null +++ b/pl_examples/bug_report/bug_report_model.ipynb @@ -0,0 +1,267 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "bug_report_model.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "rR4_BAUYs3Mb" + }, + "source": [ + "![image.png](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAQYAAABSCAYAAAC2XXppAAAaWElEQVR4Ae2dh3dU1RbG3x/i0mXvoighgKgI2BtW7KKiSAcFFVBQmgqKil2UJzYUQXl2ERSRpoiIIJhMOiEQkpCQRup+63eSHS73zkxmkmFC2Wetm5m55ZTv7P3tcs5M/iNWDAFDwBDwIfAf32f7aAgYAoaAGDGYEBgChkAAASOGACR2whAwBIwYTAYMAUMggIARQwASO2EIGAJGDCYDhoAhEEDAiCEAiZ0wBAwBIwaTAUPAEAggYMQQgMROGAKGgBGDyYAhYAgEEDBiCEBiJwwBQ8CIwWTAEDAEAggYMQQgsROGgCFgxGAyYAgYAgEEjBgCkNgJQ8AQMGIwGTAEDIEAAkYMAUjshCFgCBgxmAwYAoZAAAEjhgAkdsIQMASMGEwGDAFDIIBAhxJDXW2j7NldL6WFdVJb0xjonJ0wBAyBjkGgQ4ihoUGkvLhONi6vlLlPFMg7j++Qv5aVS0VZvdTXG0F0jChYq4bAPgSSSwyNIpV7GiTzrypZMKNIJvXLlhHdQzKye0ie6pcjC57bKRkbqqW6vF4ajR/2zZK9MwSSjEDSiGFvVaMUZNbI93NL5Lk7cmRUz5AMS0mX4Rxdm47RF2TIs3fmynfv7pYdmTWyt7ohyXBYc4aAIQACB5wYyCMUFdTJmsVlMntIvoy9OFOGdQ3JsNR0GdEttO9IDbUQxNjemfLqsHxZ9UWZ7C6slTrLP5i0GgJJReCAEQOhQNmuOvn7l0r574QCGX95lgyHELqmywg/KXgJovme4d1CMv6KbJk7YYdsWlHhkpQN9UnFxhozBI5YBBJPDJpH2FgtC2cVydM3ZMuIbhkyrEu6DI9GCF5y6NbkPUAio3pkyuQbc2TRrELJ3FQtVXuMHY5YabWBJw2BhBJDTVWjyw38MK9EZt6dJ6MvyGzyEuIghHDhBbmIhy/MlJn35MmS9y3/kDTpsIaOWAQSQgyN9SK7ttXK2q/L5PWR22Vsb2/Y4Mkj+LyC/Ugg2rXUphBkeGpIHuuT5dqgLfIP9bW2fHHESq8N/IAh0C5iII+wp7hetv5WKe9P2ikTroIQMppWG+L1ElD+1FZIpDn/MKpbSJ68Ols+eHqnbF5VKRWl9cLeCCuGgCGQGATaRQx7iurkqzdKZHr/HBnZo42EgKfQTApjLsqQUT0z9q1UhPMimlcvyD+M7pkh027JlR/nlUhZUV1iELFaDAFDoO3LlY0NIqEN1fJwrwjLj+GUOtw5FyaE5ImrsmXuuO0yplemEDK0GmZAECnpbpVjWv8c2fp7pU2nIWAIJAiBNnsMuO5b11bK0C4sP8agyFFIYcLVWfLd3BJZvqC0dULw15MSkonX5sjmXysSBIlVYwgYAu0jht8qY7PufmXmc2pIhqaky8TrcmTVwlIp3VUvC2YUykPn+DY+hXvWey61aTv15pWxEUNjY6Ps2bNH/v77b9mwYUPg4HxmZqaUlpbGLR01NTWSnZ0dqDNcO3ruzz//lFAoJBUVsfU/7k7F8ABj3bx5s6xfv162b98udXVtD8tyc3Pl999/d0dxcbGAd7RC27T722+/SX5+vjS0I1mUk5Pj6vnjjz86FE/w++uvv1xftm7dGm34B+21jiEGwocu6TKpX46sXFQqe6sbHTHMGrhNhpx3YImhvr5e1q1bJ7fddptce+217rj66qvlqquucsf1118vDz30kLz55pvyzz//SG1tbcyTh1LNmDFDrrzySqFOPWjnuuuuc8c111zTcp7rl19+uYwdO1Y2bdoUczuJvnHt2rUycOBAueKKK+T9999vEylqn+bMmeNwvOyyy2TFihWt4sdcgHmvXr1c21VVVVpV3K+vv/660C54o5CtkZK3AQhq586dcT3jfd77HsODfF188cXyyCOPeC8dMu+TTwyQQkq6TLo+R1YvLnOkQL4i79+9Mu7SrLg2Qbk8RJweA2yOwJ5//vnSqVMnOeecc+Tcc8+V8847z71y7owzzpAuXbrI4MGDBaWJ1Yrl5eXJ448/LmeffbZ07ty55aANDs7z6r125plnyr333uusZkdJzc8//+wU6rTTTpNXXnlFsPRtLS+88IIb31lnnSVLliwRvKhoZdWqVXLBBRfIKaecIm+88YZUVrY9VzR9+nShXeYSzy/WecO6z549W1577bVW+xttLHqtrKxMLrnkEjn11FPd3Or5Q+k1ucTQHD48dX2OrFpcJjXVTW4m34X44/ty912JuPMVbSQGhBEBuvHGG2XmzJnueO655+SJJ56Q/v37O+GGMEaMGCHbtm2LaU53794tixcvFgT02WeflWeeecYdDzzwgHTr1s21d8cdd8iUKVNark2dOlU++OADwQXvqAIx4OWcfvrp7SaGWbNmtRDsjz/+2KqiEUY9//zz8tRTT8ny5ctbvT8aRmAOsaekpDgPLFZieOyxx5wxGDRokLTHY9G+VVdXy0svveTGNG/
ePD19SL0mlRj4evXUm3Nk9f/2kQJoVZc3yOezimTQOWnxJx/bQAy//vqrs1IIEESA61deXu4OrCUexX333eesT58+feSzzz6LycXEdUUoqM97fPzxx9K3b1/p2rWrU7yioqL9riOMhDhaeI9nE6tg63PRXqmPI1yBGAilIAasZklJiRsv98fjjlM3xADh4h3FQgy0AVZY2WjeBdf27t0brvst5yAG2gXnLVu2uL7H8hyeId4Sr4kgBjqEPDGm9nhALQPzvEE2wCHeefFUEdPbpBLDY30yZfmnpS2egvawuqJBFs8ulqFdQjKUL1l5k4utvW8HMSBAWCqvUtInBHXhwoXO8mDpp02b5rwGLBqkkZ6erl1veWWy0tLSBCUj+VZYWNhy7fPPP3euJUREHBxO+OgDzxBzf/PNN7Jo0SL57rvvXCITRfUWhG3jxo3y/fffu7YyMjJk9erV8sMPP7i8COREQSlIpNIn+oA3Q/9JkHpzJ0oMhFAvv/yya3PZsmWuD/QFtzxWAY+XGCBJcKXv9NU/F+RtCDfAY/78+Y5sGC8YQzwrV65sGYsSQ2pqqrsOlswjzxHW8JwqFN7dTz/9JEuXLnX5AEK6W265Rb799lt3LwlRf19Q9F9++cU9w3NgDpb+ArbgzJhIhIYryBDXqYd+EM6ARaSCDDCeL774Qj788EP5+uuvnRySNKYeMKR/iSpJIwb2JjzeN1N++rg08EUovjW5La1G5jxa0LRaEQ85tIMYUNRwxIBAIHA9e/Z01oe8AZ9vvvlmF4vzDOThLZDChAkTXMJpzJgxLsuv11FKPAbawyL7lQxSYXWCxOUNN9wg3bt3dzkO2r/99tudl0EyTQWV0IYQ5MILL5Q777zThTuEAuRFCGNIotE/lHrUqFFy6aWXurZpn/f0DzLQfigxEJ+TeCX0IXHG/YRcd911l3z66aeCMrVW4iUGViNIzDLW9957r6VPtAP5TZ48WUjYMjbyM/Rn5MiRDhf6yPi1X15iwBPs16+fC2sICbl32LBhTrnwxFBEzukck/dhvOSeIBZCSCVYHXNWVpYLuS666CLhIHkMvii21xsDe2SFurkerpBPoR3q6d27tyMl5h9y9BdWW8jdgBN9BAeeffDBB4XwR/uCDCaqJI0Y1AsgwfjJs4Wya9v+SSnIobigVr55u9h912JISoyeQzuIQT0GP5iEEySjmAAEhfckFp988knnbpO9x1prQbHJEzBBPXr0cCsauJJaohEDVh3PAAFHgJl4svQDBgxwgodbzjkEAHJCqBEUyIp4GoHGq8Ha3X333U65yFeQRIQEcK3J+KPctIGwQgAQ1SeffOIIRImB8dIe93MvpMR4qIOxQQ7ecen4vK/xEoMSsD/5SJ9QLjCBFMjygwHEyTnGzfhRTi8xMAa9ritM1MN5xsHzeA4cKP/999/viJhrEAVtQIyMg7nBw0AeduzY4aw/JEV4CSkTeik2eCVasNxgT/KRkFQLcgJpUxc5CGRLQ0zqot+Mk7yLFpSdvBTXmDf6P2TIEDc3zBXj4mCeCJ8SVZJLDGyESg253Y0fTimUwtwa2W+Vm99wKGmQpfNKZGwfdkDGQA7tIAYUbvjw4W69GdcU6wX7M2lYYCYCq8PKBJOKO4rFglAmTZrUEhIwkSxLcT/Kyf3qsjJR0YgBT2D06NFOyBFMEpbkQLBouPN4IUw6gvH000+7kAbF19UPlMb7DHsB6CdLdrjHCNJHH33kVj1wa7HKjIkxEFOjILih5BiUMFhy5F4OFASLRgz+8MMPt5okjZcYCBNQMurHirKfA8XCuiPw9JM616xZ48Ik7h8/frw7r8SgoRYeA89wnpUeyIUlZ+aDsUImKCAEh1X/999/XciBEoMVc8f+EtxzyJc5JCxAHrD84I9BIIT56quvZNy4ca5OcIOEdDVHiYG2IB4t1Es9ENI777zj6iIMIDQgCU49hHO0R6F9wljOMSa8IMIXCICwAg9DPQgI/9AlBvIFXUNuU9TYXpmy6MWiwLcjWaHYsKxCxvaNfWs0vxcZ6wYnXD5NPiIoKOOtt97qmJpXlAaQmVSsJEqim49w8xBYJgoLjavL5H355ZdOsWBwVjhUQFQgIhED1p+4Hw+EvuCRoKgII2EDFguCQJAQmptuuskpsRID51Bo7uEZxsZzKAheBEqFJ0PsqtdYr2f8tIuQ01f1GBgzeyoYp95Pf/BeEEysWWvuanuJgfAGoYeMGB/EzXgVE17pH33BWvs9Bs7RVzDXpC7P4JXhfmPFmSOwBSu8DTBkTrHEeESc18QvBoG2uM5qFffzLOcJLSATSAXiRh4okYgBo8OY6AOrFdTDQT8hCPpHXXh3FOaGfRnMC3IJWSkOzA8hJaEfdR76xAA5NHsO744rEPYwaKmvFflnTaW8MDBPRup9Byj5iJXCujARJ598sjtwZ5k0FApXGqXC7dMCQeB+IwQ8/9Zbb7mYmLgexSYGRNlUqPS5SMRADEsdCATuKZYMofQWhIZ7UHSUhWQaioK1QlgRTLwELQg2ysTYcHuVvPQ6RKaCjXDz2UsMJB+9xEb7WDnGh1VrbSdfIogBvBgv48PDoQ/eAkbkeSIRA1aUcXvnAWuN646SYYUZOwXSxDtUYvC35SUGjIE3lKJ+rDveDsqJN0OJRgzImxKDu7n5D+0yX/QPnCmQMB4h55hv73i4zjzieXD9sCGGUT1C8u2cfRtpIIUtayrl+QF5TV+/jvX7F+0IJQCd+BNhfvHFF91BcpDlSbLSWFfvZKBE7FAk9kYoiUe5j89YKSYPVveXSMSAMNAegow7S8bZX7AMKAfhBB4MxOQlBnYsFhQUtDxGn3GbIQbCiFjcSy8xvPrqq265UivEghPqJJsYIEsEHsvqV1b6RijHGMN5DOAJEXjnjs/qCULkSgy7du1qlRiYXxQaj8FLDPTj3XffbSEG8iWUSMQAzhAIROLf30BfIR6uQQzIGiSMEVJicJV7/uA90P5hRQyP9s2UTSuadrk5UlhbKc/fm+d+KDamb1eqJ9EOYkCAiFexGnqgWFhyFJLJ8RfiWZQZYiBxxOYYBA5rzn4FWNxfIhEDbRBf4n2gCFhr2vcWvAGsI0qAZWOlQUMJrBwxrJcYEHjyD5Ae+RDyFP4+kUhFUSA5vCDNMSBgkYgBAY3XY6BehDda8ecYICLyPZAgSgTJeb026mKJECWNRAx4fIwtFmJgiRhcUXxCCT8JgWc0Ypg7d25CiAEviDEpMfAZWcOTpG8QIPPmLXh2zP/hE0p0C8nkG3OlKJ9fYBLZAinc1wZSaA432ppjgBhQunAE4J0A73smjCQlwoQVhRQgCWI91vvDlUjEwL0kk1TwsPCQC8IKaSAIfGeDHATCMXToUOcBRCMG6iRDzsoCAsMzKBpkB0EQv0McxM0ks/Bw/B6DN5RQjyFeYkBp8W5YrycW9x7kCGgDxQ1HDFhcPCG8MJZuSa6SCIUQwIv+E2pECiXiIQaIGKWjLhK2uO+QmZJKa8SQKI8BufJ6DNoHPCNCXWSNZWbGz/yTJCUkwqDQ90M/lEht+g
czr43Ml9rqxqbwoa2k0A5iUCsNMTAp8RQ23ZChRllRANx8Mup+N1PrVGJgEv37GEhm8Sx1oAgIKQLA+j1JR5JPeAaEGuQgsGgaStC+32OgTZJSrPXTN8jvnnvucQTIWEls4U0Q5xIf4ykpMVAfHoOfGFiNgBhIfsaSY6B+BJn7Wfpjvd170B/Ii7FADHgHeCvgABFB1Hg6+n0DyAHypC6Uh7rBSvHyL1eCcziPAQ8KLL2hBAQMVlhq+g2RkgSGTJELiIHlQkg2UihB3+mLP5RQj07lAJy5j/v9oYQSA9fwzNTTAm8I68QTT3RLyYyfEJY+MbfMmRqoWMJG7Utrr8ldrmxW5FHnZ8iXrxVL2rpKmXFvbvzhg4YRbSQGJggwmQRCgXiJAcuL4BJKIDCsZrDTLZLngUJj4WiPfIaucjA5PIPVJsdBOEICFCGlXl5RYLLmuKysgVOwnCQEjznmGGf5ISpvYTzkPkhCMk4sDgKEoCJgWBiUgXCCe9l9h4dBfXxvAbLQQl9x50866STnubD8F61g3ekz99NWuOOoo45yxMeSIeEGpHP00Ue7UEqxgTRY7UEBUCatj7FAlqwm+UMJyPr44493OLNSo1af/pKMBAvamThxYkuOgWtYYbw1sKf+4447zi3xopwQA0uRxx57rCNtP/lDZtTJdcZCwQuB6DmPZ6YFnLmP8yQNvYV5wChwDQ9RiYHzEI56UOCpWECoyMZB5zFsWV0pAzulybCUkPtXc7qJKeor+xguzpT5U3fKzAF5MjSlafky6jNeIvC+d1/fDsm4K7Jk44p9G4q8gPvfAzTMinCQLFywYMF+AuS/P9JnFASBQThZHvQqk/8ZQg88APIZrFogbN4COZAEQ3BQLCwjqyIoJISBVVWryHNYdPpN/xEwf16CexgnBEJowhIkqxdYap6BqHDnVfgYC+3iGbCs5xV++op1e/TRR93uO+8KiHcM+p7x4f6CL5Y33EEoQK6E8IZQA5eYcyznQbpKsFhzQhCSsuz8o4+47pAeG4dQCJRJsWFvAeERnhFek9ZD3+i3tsN91K0FrAgD3377bTdH4MC+Cc6DEfsFmAu+ZEefvYVlRiw4+QklTe7hGZ0fvR+y4j7uZ669BRJjfFzDo6FtCmPgGuQPtsgD44CQIAzagPAJJVrz5rzttfa+zR4DO5N2ba9r2cb84NlpMf9oCx7DuMuyZERqRszPBIgjNSSDzm760tX8aeykjJ7k8gLBxAE0wqJC5b3e2nueR7jwAlgCY9+61zr5n8c91vawkl6B1Xs5hxBCECgtm294JQHlFWLu5zNkQFJOY3Wtx/tKn2ibXAX1ET8zZr9wY52pi/uI771joV+0gaJxj5KJtx3ve57HsyEhyv2RDu6jbohHdwNCSHqO3AoKjgcDdlxjzNxPP3G38YIIMdTLoE5tz99PPmu/whEpY+Z5+s1YwY1Cf2gfggI7PnsLdTFPkDA46jPkiajLG5Zxnfu4P1wfaIdrtKOFkItNbhgWFJ86kFnmkHETGuJdkqTk+USVthODiLCNuTCvTlYuLpMZd+c5RR0U4y8wuZWHWJckvV5Ct5A81DldBndOk5kDcmXlojIpzmdjT3yQMMF6xPdkUwyPBYWpEVCEprWibfkFy/+c975o9+p9/ufDfdZ79TXaPfFei3a/thfu1fucXtdzKCnWE9efRCJYYy2xuIRw7DIlj4DbT2jmXXXx16V18hrtmt4X7h49x2u4otf918LdH+lentVr3uf48hghJZvnCCcwSIRFhD+ETuQZkEM8Gj8Z+vsTz+d2EYNrqFGkrlokd3O1fPlmsfsJ+QfOTJMh/BZk9zb+FqSPCNRb4NuXD3ZKk4nXZss3b5VIfjr/+BYFj2fI7bsX8PmGHclLYmNc5UROSPt6d/g8TdadDT+aEyGfw+4/fvEKwjjhhBPcZ7ajexXp8EGgaSR4HnhF5CbIUeEZgAO5BeQPfMi3QBSJLO0nhube8FN9lbsbJW1dlXwweaeM7pkpEITzDNpJEOQwCFXGXJQpH0/dKaE/qtz/kvDumkwkKNHqwtVnSy0JMSaH7yVYSTwChAysDBBDk5RFKUi64SUQwrFio3mAxLd+8NQI6RFCkHdglUYTyOCAt4AXQb4CvBJZEkYM2qmGOpGSgjr5c+keeXXYdhlybpNSx/3LTHgNzXkEfjT29VH58ufSctldyA+YaGvJfyW2YymMRBmCS1xq5cAgQAKOOJ1kI8lKkqZ8x0PX8o8UTw1ywHNgFYnkMDgQYvCdF3IWBwKHhBODikhdjUhBVq38/MlumdY/VwZ1SnO5gVjDi8HnprtfjJ5+e46sWFgqu/JqpY78YhLDBh2L/xWB1cN/zT4nFgGUgpwDFhFS5uDz4Rw+REKQcZN4VhyQwQOFwwEjBje4BpGqikbJ3bJX/vdKkVuJILzAA4hEEC5s6JQm4y/Pcv/lyuURKpKbR4g0MXbeEDhSEDiwxNCMIq5/eUm9/LuuSuY9uUNG9AjJA532X94cTh6hU5qMPj/D/U/K0PrmPMJB4CEcKcJg4zQEFIGkEIM2Rv6heHudrF+yR14elC+DO6e7JU7+yQz/T2L24G2yflm5lHZwHkH7a6+GwJGKQFKJQUGu3SuyI6dWfpq/W6bcnC1Tb8lxPxK7a3ut8EMtB0MeQftqr4bAkYhAhxCDA7pBpLqyUXZk18r2jBrZW2GEcCQKoI354ESg44ihGQ82JyVzg9LBOQ3WK0Pg4EKgw4nh4ILDemMIGAIgYMRgcmAIGAIBBIwYApDYCUPAEDBiMBkwBAyBAAJGDAFI7IQhYAgYMZgMGAKGQAABI4YAJHbCEDAEjBhMBgwBQyCAgBFDABI7YQgYAkYMJgOGgCEQQMCIIQCJnTAEDAEjBpMBQ8AQCCBgxBCAxE4YAoaAEYPJgCFgCAQQMGIIQGInDAFDwIjBZMAQMAQCCBgxBCCxE4aAIWDEYDJgCBgCAQSMGAKQ2AlDwBD4P9CuROTFaWXrAAAAAElFTkSuQmCC)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7XbLCXGkll9" + }, + "source": [ + "# The Boring Model\n", + "Replicate a bug you experience, using this model.\n", + "\n", + "[Remember! 
we're always available for support on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2LODD6w9ixlT" + }, + "source": [ + "---\n", + "## Setup env" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "zK7-Gg69kMnG" + }, + "source": [ + "%%capture\n", + "! pip install -qU pytorch-lightning" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WvuSN5jEbY8P" + }, + "source": [ + "---\n", + "## Deps" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "w4_TYnt_keJi" + }, + "source": [ + "import os\n", + "\n", + "import torch\n", + "from torch.utils.data import DataLoader, Dataset\n", + "\n", + "from pytorch_lightning import LightningModule, Trainer" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XrJDukwPtUnS" + }, + "source": [ + "---\n", + "## Data\n", + "Random data is best for debugging. If you needs special tensor shapes or batch compositions or dataloaders, modify as needed" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hvgTiaZpkvwS" + }, + "source": [ + "class RandomDataset(Dataset):\n", + " def __init__(self, size, num_samples):\n", + " self.len = num_samples\n", + " self.data = torch.randn(num_samples, size)\n", + "\n", + " def __getitem__(self, index):\n", + " return self.data[index]\n", + "\n", + " def __len__(self):\n", + " return self.len" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "sxVlWjGhl02D" + }, + "source": [ + "num_samples = 10000" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "V7ELesz1kVQo" + }, + "source": [ + "class BoringModel(LightningModule):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.layer = torch.nn.Linear(32, 2)\n", + "\n", + " def forward(self, x):\n", + " return self.layer(x)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " loss = self(batch).sum()\n", + " self.log(\"train_loss\", loss)\n", + " return {\"loss\": loss}\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " loss = self(batch).sum()\n", + " self.log(\"valid_loss\", loss)\n", + "\n", + " def test_step(self, batch, batch_idx):\n", + " loss = self(batch).sum()\n", + " self.log(\"test_loss\", loss)\n", + "\n", + " def configure_optimizers(self):\n", + " return torch.optim.SGD(self.layer.parameters(), lr=0.1)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ubvW3LGSupmt" + }, + "source": [ + "---\n", + "## Define the test" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4Dk6Ykv8lI7X" + }, + "source": [ + "def run():\n", + " train_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", + " val_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", + " test_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", + "\n", + " model = BoringModel()\n", + " trainer = Trainer(\n", + " default_root_dir=os.getcwd(),\n", + " limit_train_batches=1,\n", + " limit_val_batches=1,\n", + " limit_test_batches=1,\n", + " num_sanity_val_steps=0,\n", + " max_epochs=1,\n", + " enable_model_summary=False,\n", + " )\n", + " trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)\n", + " trainer.test(model, dataloaders=test_data)" + ], + "execution_count": null, + "outputs": [] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "4dPfTZVgmgxz" + }, + "source": [ + "---\n", + "## Run Test" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "AAtq1hwSmjKe" + }, + "source": [ + "run()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Flyi--SpvsJN" + }, + "source": [ + "---\n", + "## Environment\n", + "Run this to get the environment details" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0-yvGFRoaDSi" + }, + "source": [ + "%%capture\n", + "! wget https://raw.githubusercontent.com/PyTorchLightning/pytorch-lightning/master/requirements/collect_env_details.py" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "quj4LUDgmFvj" + }, + "source": [ + "! python collect_env_details.py" + ], + "execution_count": null, + "outputs": [] + } + ] +} diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report/bug_report_model.py similarity index 98% rename from pl_examples/bug_report_model.py rename to pl_examples/bug_report/bug_report_model.py index 270b0cd2abe8d..7739630237d32 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report/bug_report_model.py @@ -53,6 +53,7 @@ def run(): default_root_dir=os.getcwd(), limit_train_batches=1, limit_val_batches=1, + limit_test_batches=1, num_sanity_val_steps=0, max_epochs=1, enable_model_summary=False, diff --git a/pl_examples/run_examples.sh b/pl_examples/run_examples.sh index 4a15c3367d35f..a04a57631d9cb 100755 --- a/pl_examples/run_examples.sh +++ b/pl_examples/run_examples.sh @@ -1,6 +1,7 @@ #!/bin/bash set -ex +export PYTHONPATH="${PYTHONPATH}:$(pwd)" dir_path=$(dirname "${BASH_SOURCE[0]}") args=" --data.batch_size=32 diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index 19d09836ef34c..00ca558c53606 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -14,9 +14,10 @@ from unittest import mock import pytest +import torch from pl_examples import _DALI_AVAILABLE -from tests.helpers.runif import RunIf +from pytorch_lightning.utilities.imports import _IS_WINDOWS ARGS_DEFAULT = ( "--trainer.default_root_dir %(tmpdir)s " @@ -31,7 +32,8 @@ @pytest.mark.skipif(not _DALI_AVAILABLE, reason="Nvidia DALI required") -@RunIf(min_gpus=1, skip_windows=True) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +@pytest.mark.skipif(_IS_WINDOWS, reason="Not supported on Windows") @pytest.mark.parametrize("cli_args", [ARGS_GPU]) def test_examples_mnist_dali(tmpdir, cli_args): from pl_examples.integration_examples.dali_image_classifier import cli_main diff --git a/pyproject.toml b/pyproject.toml index 6546d96e3d5e5..c527ffaa856cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,6 @@ requires = [ [tool.isort] known_first_party = [ - "benchmarks", "docs", "pl_examples", "pytorch_lightning", @@ -24,7 +23,7 @@ line-length = 120 [tool.mypy] -files = ["pytorch_lightning", "pl_examples", "benchmarks"] +files = ["pytorch_lightning"] disallow_untyped_defs = "True" ignore_missing_imports = "True" show_error_codes = "True" @@ -44,7 +43,6 @@ module = [ "pytorch_lightning.core.*", "pytorch_lightning.loggers.*", "pytorch_lightning.loops.*", - "pytorch_lightning.metrics.*", "pytorch_lightning.overrides.*", "pytorch_lightning.plugins.environments.*", "pytorch_lightning.plugins.training_type.*", @@ -53,9 +51,6 @@ module = [ "pytorch_lightning.distributed.*", "pytorch_lightning.tuner.*", "pytorch_lightning.utilities.*", - 
"pl_examples.*", - "benchmarks.*", - "tests.helpers.*" ] ignore_errors = "True" diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index dc61686e5dbea..83335912ce29e 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.0" +__version__ = "1.5.10" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index b743ed3e1bbeb..016d2015a81e1 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -59,6 +59,7 @@ def on_train_batch_start( device_stats = trainer.accelerator.get_device_stats(pl_module.device) prefixed_device_stats = prefix_metrics_keys(device_stats, "on_train_batch_start") + assert trainer.logger is not None trainer.logger.log_metrics(prefixed_device_stats, step=trainer.global_step) def on_train_batch_end( @@ -75,6 +76,7 @@ def on_train_batch_end( device_stats = trainer.accelerator.get_device_stats(pl_module.device) prefixed_device_stats = prefix_metrics_keys(device_stats, "on_train_batch_end") + assert trainer.logger is not None trainer.logger.log_metrics(prefixed_device_stats, step=trainer.global_step) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index b5118846875db..096cb4849cc39 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -202,7 +202,7 @@ def _run_early_stopping_check(self, trainer: "pl.Trainer") -> None: ): # short circuit if metric not present return - current = logs.get(self.monitor) + current = logs[self.monitor].squeeze() should_stop, reason = self._evaluate_stopping_criteria(current) # stop every ddp process if any world process decides to stop diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py index 7ee6771056666..7bb0289050a1e 100644 --- a/pytorch_lightning/callbacks/gpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py @@ -161,6 +161,7 @@ def on_train_batch_start( # First log at beginning of second step logs["batch_time/inter_step (ms)"] = (time.time() - self._snap_inter_step_time) * 1000 + assert trainer.logger is not None trainer.logger.log_metrics(logs, step=trainer.global_step) @rank_zero_only @@ -185,6 +186,7 @@ def on_train_batch_end( if self._log_stats.intra_step_time and self._snap_intra_step_time: logs["batch_time/intra_step (ms)"] = (time.time() - self._snap_intra_step_time) * 1000 + assert trainer.logger is not None trainer.logger.log_metrics(logs, step=trainer.global_step) @staticmethod diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index c9875cae83e62..b881c464dba6d 100644 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -147,6 +147,7 @@ def _check_no_key(key: str) -> bool: self.last_momentum_values = {name + "-momentum": None for name in names_flatten} def on_train_batch_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: + assert trainer.logger is not None if not trainer.logger_connector.should_update_logs: return @@ -158,6 +159,7 @@ def on_train_batch_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) trainer.logger.log_metrics(latest_stat, 
step=trainer.global_step) def on_train_epoch_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: + assert trainer.logger is not None if self.logging_interval != "step": interval = "epoch" if self.logging_interval is None else "any" latest_stat = self._extract_stats(trainer, interval) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 6b70f6af171c5..c4efffdc6d643 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -114,8 +114,14 @@ class ModelCheckpoint(Callback): guaranteed to execute at the exact time specified, but should be close. This must be mutually exclusive with ``every_n_train_steps`` and ``every_n_epochs``. every_n_epochs: Number of epochs between checkpoints. - If ``every_n_epochs == None or every_n_epochs == 0``, we skip saving when the epoch ends. - To disable, set ``every_n_epochs = 0``. This value must be ``None`` or non-negative. + This value must be ``None`` or non-negative. + To disable saving after each epoch, set ``every_n_epochs = 0``. + If all of ``every_n_epochs``, ``every_n_train_steps`` and + ``train_time_interval`` are ``None``, we save a checkpoint at the end of every epoch + (equivalent to ``every_n_epochs = 1``). + If ``every_n_epochs == None`` and either ``every_n_train_steps != None`` or ``train_time_interval != None``, + saving at the end of each epoch is disabled + (equivalent to ``every_n_epochs = 0``). This must be mutually exclusive with ``every_n_train_steps`` and ``train_time_interval``. Setting both ``ModelCheckpoint(..., every_n_epochs=V, save_on_train_epoch_end=False)`` and ``Trainer(max_epochs=N, check_val_every_n_epoch=M)`` @@ -351,6 +357,10 @@ def on_save_checkpoint( "best_model_path": self.best_model_path, "current_score": self.current_score, "dirpath": self.dirpath, + "best_k_models": self.best_k_models, + "kth_best_model_path": self.kth_best_model_path, + "kth_value": self.kth_value, + "last_model_path": self.last_model_path, } def on_load_checkpoint( @@ -358,6 +368,10 @@ def on_load_checkpoint( ) -> None: self.best_model_score = callback_state["best_model_score"] self.best_model_path = callback_state["best_model_path"] + self.best_k_models = callback_state.get("best_k_models", self.best_k_models) + self.kth_best_model_path = callback_state.get("kth_best_model_path", self.kth_best_model_path) + self.kth_value = callback_state.get("kth_value", self.kth_value) + self.last_model_path = callback_state.get("last_model_path", self.last_model_path) def save_checkpoint(self, trainer: "pl.Trainer") -> None: """Performs the main logic around saving a checkpoint. 
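Note on the reworked ``every_n_epochs`` documentation in the hunk above: a minimal usage sketch of how the documented defaults interact (illustrative only, not part of this diff; the ``dirpath`` values are placeholders):

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

# No step- or time-based trigger is configured, so a checkpoint is saved at the end
# of every epoch (equivalent to every_n_epochs=1).
per_epoch = ModelCheckpoint(dirpath="checkpoints/per_epoch")

# A step-based trigger is set and every_n_epochs is left as None, so end-of-epoch
# saving is disabled (equivalent to every_n_epochs=0).
per_100_steps = ModelCheckpoint(dirpath="checkpoints/per_step", every_n_train_steps=100)

# Explicitly disable end-of-epoch saving.
disabled = ModelCheckpoint(dirpath="checkpoints/none", every_n_epochs=0)

trainer = Trainer(callbacks=[per_100_steps])
```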
diff --git a/pytorch_lightning/callbacks/progress/rich_progress.py b/pytorch_lightning/callbacks/progress/rich_progress.py index f6f862704f599..b9e975e018cfa 100644 --- a/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/pytorch_lightning/callbacks/progress/rich_progress.py @@ -17,7 +17,8 @@ from typing import Any, Optional, Union from pytorch_lightning.callbacks.progress.base import ProgressBarBase -from pytorch_lightning.utilities import _RICH_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _RICH_AVAILABLE Task, Style = None, None if _RICH_AVAILABLE: @@ -129,13 +130,20 @@ def render(self, task) -> RenderableType: class MetricsTextColumn(ProgressColumn): """A column containing text.""" - def __init__(self, trainer, pl_module): + def __init__(self, trainer, style): self._trainer = trainer - self._pl_module = pl_module self._tasks = {} self._current_task_id = 0 + self._metrics = {} + self._style = style super().__init__() + def update(self, metrics): + # Called when metrics are ready to be rendered. + # This is to prevent render from causing deadlock issues by requesting metrics + # in separate threads. + self._metrics = metrics + def render(self, task) -> Text: from pytorch_lightning.trainer.states import TrainerFn @@ -149,32 +157,37 @@ def render(self, task) -> Text: if self._trainer.training and task.id != self._current_task_id: return self._tasks[task.id] _text = "" - # TODO(@daniellepintz): make this code cleaner - progress_bar_callback = getattr(self._trainer, "progress_bar_callback", None) - if progress_bar_callback: - metrics = self._trainer.progress_bar_callback.get_metrics(self._trainer, self._pl_module) - else: - metrics = self._trainer.progress_bar_metrics - - for k, v in metrics.items(): + + for k, v in self._metrics.items(): _text += f"{k}: {round(v, 3) if isinstance(v, float) else v} " - return Text(_text, justify="left") + return Text(_text, justify="left", style=self._style) @dataclass class RichProgressBarTheme: """Styles to associate to different base components. + Args: + description: Style for the progress bar description. For eg., Epoch x, Testing, etc. + progress_bar: Style for the bar in progress. + progress_bar_finished: Style for the finished progress bar. + progress_bar_pulse: Style for the progress bar when `IterableDataset` is being processed. + batch_progress: Style for the progress tracker (i.e 10/50 batches completed). + time: Style for the processed time and estimate time remaining. + processing_speed: Style for the speed of the batches being processed. + metrics: Style for the metrics + https://rich.readthedocs.io/en/stable/style.html """ - text_color: str = "white" - progress_bar_complete: Union[str, Style] = "#6206E0" + description: Union[str, Style] = "white" + progress_bar: Union[str, Style] = "#6206E0" progress_bar_finished: Union[str, Style] = "#6206E0" progress_bar_pulse: Union[str, Style] = "#6206E0" - batch_process: str = "white" - time: str = "grey54" - processing_speed: str = "grey70" + batch_progress: Union[str, Style] = "white" + time: Union[str, Style] = "grey54" + processing_speed: Union[str, Style] = "grey70" + metrics: Union[str, Style] = "white" class RichProgressBar(ProgressBarBase): @@ -210,9 +223,10 @@ def __init__( theme: RichProgressBarTheme = RichProgressBarTheme(), ) -> None: if not _RICH_AVAILABLE: - raise ModuleNotFoundError( - "`RichProgressBar` requires `rich` to be installed. Install it by running `pip install -U rich`." 
+ raise MisconfigurationException( + "`RichProgressBar` requires `rich` >= 10.2.2. Install it by running `pip install -U rich`." ) + super().__init__() self._refresh_rate_per_second: int = refresh_rate_per_second self._leave: bool = leave @@ -220,9 +234,9 @@ def __init__( self.progress: Optional[Progress] = None self.val_sanity_progress_bar_id: Optional[int] = None self._reset_progress_bar_ids() + self._metric_component = None self._progress_stopped: bool = False self.theme = theme - self._console: Console = Console() @property def refresh_rate_per_second(self) -> float: @@ -263,12 +277,15 @@ def test_description(self) -> str: def predict_description(self) -> str: return "Predicting" - def _init_progress(self, trainer, pl_module): - if self.progress is None or self._progress_stopped: + def _init_progress(self, trainer): + if self.is_enabled and (self.progress is None or self._progress_stopped): self._reset_progress_bar_ids() + self._console: Console = Console() self._console.clear_live() + self._metric_component = MetricsTextColumn(trainer, self.theme.metrics) self.progress = CustomProgress( - *self.configure_columns(trainer, pl_module), + *self.configure_columns(trainer), + self._metric_component, refresh_per_second=self.refresh_rate_per_second, disable=self.is_disabled, console=self._console, @@ -279,19 +296,19 @@ def _init_progress(self, trainer, pl_module): def on_train_start(self, trainer, pl_module): super().on_train_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) def on_predict_start(self, trainer, pl_module): super().on_predict_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) def on_test_start(self, trainer, pl_module): super().on_test_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) def on_validation_start(self, trainer, pl_module): super().on_validation_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) def __getstate__(self): # can't pickle the rich progress objects @@ -302,17 +319,18 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__ = state - # reset console reference after loading progress - self._console = Console() + state["_console"] = Console() def on_sanity_check_start(self, trainer, pl_module): super().on_sanity_check_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) self.val_sanity_progress_bar_id = self._add_task(trainer.num_sanity_val_steps, self.sanity_check_description) def on_sanity_check_end(self, trainer, pl_module): super().on_sanity_check_end(trainer, pl_module) - self._update(self.val_sanity_progress_bar_id, visible=False) + if self.progress is not None: + self.progress.update(self.val_sanity_progress_bar_id, advance=0, visible=False) + self.progress.refresh() def on_train_epoch_start(self, trainer, pl_module): super().on_train_epoch_start(trainer, pl_module) @@ -328,10 +346,10 @@ def on_train_epoch_start(self, trainer, pl_module): train_description = self._get_train_description(trainer.current_epoch) if self.main_progress_bar_id is not None and self._leave: self._stop_progress() - self._init_progress(trainer, pl_module) + self._init_progress(trainer) if self.main_progress_bar_id is None: self.main_progress_bar_id = self._add_task(total_batches, train_description) - else: + elif self.progress is not None: self.progress.reset( self.main_progress_bar_id, total=total_batches, 
description=train_description, visible=True ) @@ -349,7 +367,7 @@ def on_validation_epoch_start(self, trainer, pl_module): def _add_task(self, total_batches: int, description: str, visible: bool = True) -> Optional[int]: if self.progress is not None: return self.progress.add_task( - f"[{self.theme.text_color}]{description}", total=total_batches, visible=visible + f"[{self.theme.description}]{description}", total=total_batches, visible=visible ) def _update(self, progress_bar_id: int, visible: bool = True) -> None: @@ -372,6 +390,7 @@ def on_predict_epoch_start(self, trainer, pl_module): def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx) self._update(self.main_progress_bar_id) + self._update_metrics(trainer, pl_module) def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_validation_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) @@ -414,6 +433,11 @@ def _reset_progress_bar_ids(self): self.test_progress_bar_id: Optional[int] = None self.predict_progress_bar_id: Optional[int] = None + def _update_metrics(self, trainer, pl_module) -> None: + metrics = self.get_metrics(trainer, pl_module) + if self._metric_component: + self._metric_component.update(metrics) + def teardown(self, trainer, pl_module, stage: Optional[str] = None) -> None: self._stop_progress() @@ -436,16 +460,15 @@ def main_progress_bar(self) -> Task: def test_progress_bar(self) -> Task: return self.progress.tasks[self.test_progress_bar_id] - def configure_columns(self, trainer, pl_module) -> list: + def configure_columns(self, trainer) -> list: return [ TextColumn("[progress.description]{task.description}"), CustomBarColumn( - complete_style=self.theme.progress_bar_complete, + complete_style=self.theme.progress_bar, finished_style=self.theme.progress_bar_finished, pulse_style=self.theme.progress_bar_pulse, ), - BatchesProcessedColumn(style=self.theme.batch_process), + BatchesProcessedColumn(style=self.theme.batch_progress), CustomTimeColumn(style=self.theme.time), ProcessingSpeedColumn(style=self.theme.processing_speed), - MetricsTextColumn(trainer, pl_module), ] diff --git a/pytorch_lightning/callbacks/progress/tqdm_progress.py b/pytorch_lightning/callbacks/progress/tqdm_progress.py index 672d9d893ad61..11103e4b0595d 100644 --- a/pytorch_lightning/callbacks/progress/tqdm_progress.py +++ b/pytorch_lightning/callbacks/progress/tqdm_progress.py @@ -25,6 +25,7 @@ else: from tqdm import tqdm as _tqdm +import pytorch_lightning as pl from pytorch_lightning.callbacks.progress.base import ProgressBarBase _PAD_SIZE = 5 @@ -206,12 +207,10 @@ def init_test_tqdm(self) -> Tqdm: return bar def on_sanity_check_start(self, trainer, pl_module): - super().on_sanity_check_start(trainer, pl_module) self.val_progress_bar = self.init_sanity_tqdm() self.main_progress_bar = Tqdm(disable=True) # dummy progress bar def on_sanity_check_end(self, trainer, pl_module): - super().on_sanity_check_end(trainer, pl_module) self.main_progress_bar.close() self.val_progress_bar.close() @@ -233,37 +232,44 @@ def on_train_epoch_start(self, trainer, pl_module): def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx) - total_batches = self.total_train_batches + self.total_val_batches - total_batches = convert_inf(total_batches) - if self._should_update(self.train_batch_idx, total_batches): + if 
self._should_update(self.train_batch_idx): self._update_bar(self.main_progress_bar) self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) + def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if self.is_enabled: + self._update_bar(self.main_progress_bar) + self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) + + def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + self.main_progress_bar.close() + def on_validation_start(self, trainer, pl_module): super().on_validation_start(trainer, pl_module) if trainer.sanity_checking: reset(self.val_progress_bar, total=sum(trainer.num_sanity_val_batches), current=self.val_batch_idx) else: - self._update_bar(self.main_progress_bar) # fill up remaining + if trainer.state.fn == pl.trainer.states.TrainerFn.FITTING: + self._update_bar(self.main_progress_bar) # fill up remaining self.val_progress_bar = self.init_validation_tqdm() reset(self.val_progress_bar, total=self.total_val_batches, current=self.val_batch_idx) def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_validation_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) - if self._should_update(self.val_batch_idx, convert_inf(self.total_val_batches)): + if self._should_update(self.val_batch_idx): + self._update_bar(self.val_progress_bar) + if trainer.state.fn == pl.trainer.states.TrainerFn.FITTING: + self._update_bar(self.main_progress_bar) + + def on_validation_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if self.is_enabled: self._update_bar(self.val_progress_bar) - self._update_bar(self.main_progress_bar) def on_validation_end(self, trainer, pl_module): - super().on_validation_end(trainer, pl_module) - if self.main_progress_bar is not None: + if self.main_progress_bar is not None and trainer.state.fn == pl.trainer.states.TrainerFn.FITTING: self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) self.val_progress_bar.close() - def on_train_end(self, trainer, pl_module): - super().on_train_end(trainer, pl_module) - self.main_progress_bar.close() - def on_test_start(self, trainer, pl_module): super().on_test_start(trainer, pl_module) self.test_progress_bar = self.init_test_tqdm() @@ -271,11 +277,14 @@ def on_test_start(self, trainer, pl_module): def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_test_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) - if self._should_update(self.test_batch_idx, self.total_test_batches): + if self._should_update(self.test_batch_idx): + self._update_bar(self.test_progress_bar) + + def on_test_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if self.is_enabled: self._update_bar(self.test_progress_bar) def on_test_end(self, trainer, pl_module): - super().on_test_end(trainer, pl_module) self.test_progress_bar.close() def on_predict_epoch_start(self, trainer, pl_module): @@ -285,7 +294,7 @@ def on_predict_epoch_start(self, trainer, pl_module): def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_predict_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) - if self._should_update(self.predict_batch_idx, self.total_predict_batches): + if self._should_update(self.predict_batch_idx): self._update_bar(self.predict_progress_bar) def on_predict_end(self, 
trainer, pl_module): @@ -309,8 +318,8 @@ def print( s = sep.join(map(str, args)) active_progress_bar.write(s, end=end, file=file, nolock=nolock) - def _should_update(self, current, total) -> bool: - return self.is_enabled and (current % self.refresh_rate == 0 or current == total) + def _should_update(self, idx: int) -> bool: + return self.is_enabled and (idx % self.refresh_rate == 0) def _update_bar(self, bar: Optional[Tqdm]) -> None: """Updates the bar by the refresh rate without overshooting.""" diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index c59193859b171..74485f7ddc89e 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -46,7 +46,7 @@ ) from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.cloud_io import get_filesystem -from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp +from pytorch_lightning.utilities.distributed import distributed_available, rank_zero_debug, sync_ddp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import get_model_size_mb from pytorch_lightning.utilities.model_summary import ModelSummary, summarize @@ -116,6 +116,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._param_requires_grad_state = {} self._metric_attributes: Optional[Dict[int, str]] = None self._should_prevent_trainer_and_dataloaders_deepcopy: bool = False + # TODO: remove after the 1.6 release + self._running_torchscript = False self._register_sharded_tensor_state_dict_hooks_if_available() @@ -408,10 +410,6 @@ def log( value, object, self.__check_allowed, name, value, wrong_dtype=(numbers.Number, Metric, Tensor, dict) ) - # set the default depending on the fx_name - on_step = self.__auto_choose_log_on_step(on_step) - on_epoch = self.__auto_choose_log_on_epoch(on_epoch) - if self.trainer is None: # not an error to support testing the `*_step` methods without a `Trainer` reference rank_zero_warn( @@ -430,7 +428,10 @@ def log( raise MisconfigurationException( "You are trying to `self.log()` but it is not managed by the `Trainer` control flow" ) - _FxValidator.check_logging(self._current_fx_name, on_step=on_step, on_epoch=on_epoch) + + on_step, on_epoch = _FxValidator.check_logging_and_get_default_levels( + self._current_fx_name, on_step=on_step, on_epoch=on_epoch + ) # make sure user doesn't introduce logic for multi-dataloaders if "/dataloader_idx_" in name: @@ -485,7 +486,8 @@ def log( on_epoch=on_epoch, reduce_fx=reduce_fx, enable_graph=enable_graph, - dataloader_idx=(self._current_dataloader_idx if add_dataloader_idx else None), + add_dataloader_idx=add_dataloader_idx, + dataloader_idx=self._current_dataloader_idx, batch_size=batch_size, sync_dist=sync_dist and distributed_available(), sync_dist_fn=self.trainer.training_type_plugin.reduce or sync_ddp, @@ -591,18 +593,6 @@ def log_grad_norm(self, grad_norm_dict): """ self.log_dict(grad_norm_dict, on_step=True, on_epoch=True, prog_bar=True, logger=True) - def __auto_choose_log_on_step(self, on_step: Optional[bool]) -> bool: - if on_step is None: - on_step = False - on_step |= self._current_fx_name in ("training_step", "training_step_end") - return on_step - - def __auto_choose_log_on_epoch(self, on_epoch: Optional[bool]) -> bool: - if on_epoch is None: - on_epoch = True - on_epoch &= self._current_fx_name not in ("training_step", "training_step_end") - return on_epoch - def all_gather( 
self, data: Union[torch.Tensor, Dict, List, Tuple], group: Optional[Any] = None, sync_grads: bool = False ): @@ -1802,7 +1792,7 @@ def get_progress_bar_dict(self) -> Dict[str, Union[int, str]]: r""" .. deprecated:: v1.5 This method was deprecated in v1.5 in favor of - `pytorch_lightning.callbacks.progress.base.get_standard_metrics` and will be removed in v1.7. + `pytorch_lightning.callbacks.progress.base.get_metrics` and will be removed in v1.7. Implement this to override the default items displayed in the progress bar. By default it includes the average loss value, split index of BPTT (if used) @@ -1899,7 +1889,7 @@ def to_onnx(self, file_path: Union[str, Path], input_sample: Optional[Any] = Non input_sample = self._apply_batch_transfer_handler(input_sample) - if "example_outputs" not in kwargs: + if not _TORCH_GREATER_EQUAL_1_10 and "example_outputs" not in kwargs: self.eval() if isinstance(input_sample, Tuple): kwargs["example_outputs"] = self(*input_sample) @@ -1962,6 +1952,8 @@ def to_torchscript( """ mode = self.training + self._running_torchscript = True + if method == "script": torchscript_module = torch.jit.script(self.eval(), **kwargs) elif method == "trace": @@ -1987,6 +1979,8 @@ def to_torchscript( with fs.open(file_path, "wb") as f: torch.jit.save(torchscript_module, f) + self._running_torchscript = False + return torchscript_module @property @@ -1996,11 +1990,12 @@ def model_size(self) -> float: Note: This property will not return correct value for Deepspeed (stage 3) and fully-sharded training. """ - rank_zero_deprecation( - "The `LightningModule.model_size` property was deprecated in v1.5 and will be removed in v1.7." - " Please use the `pytorch_lightning.utilities.memory.get_model_size_mb`.", - stacklevel=5, - ) + if not self._running_torchscript: # remove with the deprecation removal + rank_zero_deprecation( + "The `LightningModule.model_size` property was deprecated in v1.5 and will be removed in v1.7." + " Please use the `pytorch_lightning.utilities.memory.get_model_size_mb`.", + stacklevel=5, + ) return get_model_size_mb(self) def add_to_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: @@ -2052,7 +2047,8 @@ def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None: These hooks ensure that ShardedTensors are included when saving, and are loaded the LightningModule correctly. """ - if not _TORCH_GREATER_EQUAL_1_10 or _IS_WINDOWS: + if not _TORCH_GREATER_EQUAL_1_10 or _IS_WINDOWS or not torch.distributed.is_available(): + rank_zero_debug("Could not register sharded tensor state dict hooks") return from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook diff --git a/pytorch_lightning/core/mixins/device_dtype_mixin.py b/pytorch_lightning/core/mixins/device_dtype_mixin.py index e02790edddd1e..e8b122989cd9c 100644 --- a/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -17,6 +17,8 @@ import torch from torch.nn import Module +import pytorch_lightning as pl + class DeviceDtypeModuleMixin(Module): __jit_unused_properties__ = ["device", "dtype"] @@ -177,7 +179,9 @@ def __update_properties( self, device: Optional[torch.device] = None, dtype: Optional[Union[str, torch.dtype]] = None ) -> None: def apply_fn(module: Union["DeviceDtypeModuleMixin", Module]) -> None: - if not isinstance(module, DeviceDtypeModuleMixin): + # TODO: Find why `isinstance(module, DeviceDtypeModuleMixin)` doesn't + # work when using `init_meta_context`. 
+ if not isinstance(module, (DeviceDtypeModuleMixin, pl.LightningModule)): return if device is not None: module._device = device diff --git a/pytorch_lightning/core/mixins/hparams_mixin.py b/pytorch_lightning/core/mixins/hparams_mixin.py index 0e722f2bdb683..26a272dd3dd1d 100644 --- a/pytorch_lightning/core/mixins/hparams_mixin.py +++ b/pytorch_lightning/core/mixins/hparams_mixin.py @@ -28,7 +28,7 @@ class HyperparametersMixin: def __init__(self) -> None: super().__init__() - self._log_hyperparams = True + self._log_hyperparams = False def save_hyperparameters( self, diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index 2e6f10d356fe0..2e45f6c7d0e48 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -238,18 +238,15 @@ def _setup_dataloader( ) sampler = self._get_distributed_sampler(dataloader, **self._strategy.distributed_sampler_kwargs) - dataloader_kwargs = TrainerDataLoadingMixin._get_dataloader_init_kwargs(dataloader, sampler) - try: - dataloader = type(dataloader)(**dataloader_kwargs) - except TypeError: - dataloader_kwargs.pop("dataset") - dataloader = type(dataloader)(**dataloader_kwargs) + # the dataloader needs to be re-instantiated because we want to update the input arguments (e.g., sampler) + dataloader = TrainerDataLoadingMixin._update_dataloader(dataloader, sampler) + # add worker_init_fn for correct seeding in worker processes TrainerDataLoadingMixin._auto_add_worker_init_fn(dataloader, self.global_rank) - return _LiteDataLoader( - dataloader=self._strategy.process_dataloader(dataloader), - device=self.device if move_to_device and not isinstance(self._strategy, TPUSpawnPlugin) else None, - ) + + dataloader = self._strategy.process_dataloader(dataloader) + device = self.device if move_to_device and not isinstance(self._strategy, TPUSpawnPlugin) else None + return _LiteDataLoader(dataloader=dataloader, device=device) def backward(self, tensor: Tensor, *args: Any, model: Optional[_LiteModule] = None, **kwargs: Any) -> None: """Replaces ``loss.backward()`` in your training loop. Handles precision and automatically for you. 
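The ``_setup_dataloader`` rework above only changes how the loader is re-created internally (via ``TrainerDataLoadingMixin._update_dataloader``); the public entry point is unchanged. A minimal usage sketch of that entry point, assuming the ``LightningLite`` API shipped in this release (illustrative only):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from pytorch_lightning.lite import LightningLite


class Lite(LightningLite):
    def run(self):
        dataset = TensorDataset(torch.randn(64, 32))
        # setup_dataloaders re-instantiates the DataLoader (injecting a distributed
        # sampler when required) and wraps it so batches are moved to the right device.
        train_loader = self.setup_dataloaders(DataLoader(dataset, batch_size=4))
        for (batch,) in train_loader:
            pass  # training logic would go here


if __name__ == "__main__":
    Lite(accelerator="cpu").run()
```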
diff --git a/pytorch_lightning/lite/wrappers.py b/pytorch_lightning/lite/wrappers.py index ad01b44ef30f4..c13800cb842d6 100644 --- a/pytorch_lightning/lite/wrappers.py +++ b/pytorch_lightning/lite/wrappers.py @@ -14,6 +14,7 @@ import functools import inspect from contextlib import contextmanager +from itertools import chain from typing import Any, Callable, Dict, Generator, Iterable, Iterator, Optional, Set, Sized, Type, Union import torch @@ -23,6 +24,7 @@ from torch.utils.data import DataLoader from pytorch_lightning.accelerators import Accelerator +from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device @@ -44,7 +46,7 @@ def __init__(self, optimizer: Optimizer, accelerator: Accelerator) -> None: """ # `__del__` is skipped in case the optimizer has implemented custom destructor logic which we would # not want to call on destruction of the `_LiteOptimizer - self.__dict__ = {k: v for k, v in optimizer.__dict__.items() if k not in ("step", "__del__")} + self.__dict__ = {k: v for k, v in optimizer.__dict__.items() if k not in ("state_dict", "step", "__del__")} self.__class__ = type("Lite" + optimizer.__class__.__name__, (self.__class__, optimizer.__class__), {}) self._optimizer = optimizer self._accelerator = accelerator @@ -53,6 +55,9 @@ def __init__(self, optimizer: Optimizer, accelerator: Accelerator) -> None: def optimizer(self) -> Optimizer: return self._optimizer + def state_dict(self) -> Dict[str, Tensor]: + return self._accelerator.optimizer_state(self.optimizer) + def step(self, closure: Optional[Callable] = None) -> None: closure = closure or _do_nothing_closure self._accelerator.optimizer_step( @@ -63,7 +68,7 @@ def step(self, closure: Optional[Callable] = None) -> None: ) -class _LiteModule(nn.Module): +class _LiteModule(DeviceDtypeModuleMixin): def __init__(self, module: nn.Module, precision_plugin: PrecisionPlugin) -> None: """The LiteModule is a thin wrapper around the :class:`torch.nn.Module` and handles precision / autocast automatically for the forward pass. 
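The next hunk narrows the precision conversion in ``_LiteModule.forward`` so that only floating-point tensors are cast. A small standalone sketch of that behavior (the helper name mirrors the diff, but the snippet is illustrative only):

```python
import torch


def _convert_float_tensor(t: torch.Tensor, to_type: torch.dtype = torch.float16) -> torch.Tensor:
    # Only floating-point tensors are cast; integer tensors (e.g. token ids used for
    # embedding lookups) keep their original dtype.
    return t.to(to_type) if torch.is_floating_point(t) else t


batch = {"tokens": torch.randint(0, 100, (4, 16)), "features": torch.randn(4, 16)}
converted = {k: _convert_float_tensor(v) for k, v in batch.items()}
assert converted["tokens"].dtype == torch.int64
assert converted["features"].dtype == torch.float16
```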
@@ -94,12 +99,17 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: } # TODO (@awaelchli): let the precision plugin handle the conversion to_type = precision_to_type[precision] - args, kwargs = apply_to_collection([args, kwargs], function=lambda t: t.to(to_type), dtype=Tensor) + + def _convert_float_tensor(t: Tensor) -> Tensor: + return t.to(to_type) if torch.is_floating_point(t) else t + + args, kwargs = apply_to_collection([args, kwargs], function=_convert_float_tensor, dtype=Tensor) with self._precision_plugin.forward_context(): output = self.module(*args, **kwargs) - output = apply_to_collection(output, function=lambda t: t.to(torch.get_default_dtype()), dtype=Tensor) + to_type = torch.get_default_dtype() + output = apply_to_collection(output, function=_convert_float_tensor, dtype=Tensor) return output @@ -107,9 +117,9 @@ def _wrap_init(f: Callable) -> Callable: @functools.wraps(f) def wrapper(module: Any, *args: Any, **kwargs: Dict[str, Any]) -> None: params = dict(inspect.signature(module._old_init).parameters) - params.pop("args") - params.pop("kwargs") - for init_name, init_arg in zip(params, args): + params.pop("args", None) + params.pop("kwargs", None) + for init_name, init_arg in chain(zip(params, args), kwargs.items()): setattr(module, init_name, init_arg) f(module, *args, **kwargs) @@ -118,15 +128,15 @@ def wrapper(module: Any, *args: Any, **kwargs: Dict[str, Any]) -> None: # https://stackoverflow.com/a/63851681/9201239 def _get_all_subclasses(cls: Type[Any]) -> Set[Type[Any]]: - subclass_list = [] + subclasses = set() def recurse(cl: Type[Any]) -> None: for subclass in cl.__subclasses__(): - subclass_list.append(subclass) + subclasses.add(subclass) recurse(subclass) recurse(cls) - return set(subclass_list) + return subclasses def _enable_class(cls: Type[Any]) -> None: diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index e5ccae435d8c9..0698a409762b4 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -474,6 +474,9 @@ def __getitem__(self, idx) -> "DummyExperiment": # enables self.logger.experiment[0].add_image(...) return self + def __setitem__(self, *args, **kwargs) -> None: + pass + class DummyLogger(LightningLoggerBase): """Dummy logger for internal use. diff --git a/pytorch_lightning/loggers/csv_logs.py b/pytorch_lightning/loggers/csv_logs.py index 77adfe551f72d..cd513bb30012b 100644 --- a/pytorch_lightning/loggers/csv_logs.py +++ b/pytorch_lightning/loggers/csv_logs.py @@ -95,9 +95,11 @@ def save(self) -> None: metrics_keys = list(last_m.keys()) with open(self.metrics_file_path, "w", newline="") as f: - self.writer = csv.DictWriter(f, fieldnames=metrics_keys) - self.writer.writeheader() - self.writer.writerows(self.metrics) + # Don't assign the writer to self. 
+ # Keeps an open reference and prevents pickling otherwise + writer = csv.DictWriter(f, fieldnames=metrics_keys) + writer.writeheader() + writer.writerows(self.metrics) class CSVLogger(LightningLoggerBase): diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index f7c611ed787ce..2e7031ff46c4f 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -44,7 +44,7 @@ from neptune.new.types import File as NeptuneFile except ModuleNotFoundError: import neptune - from neptune.exceptions import NeptuneLegacyProjectException + from neptune.exceptions import NeptuneLegacyProjectException, NeptuneOfflineModeFetchException from neptune.run import Run from neptune.types import File as NeptuneFile else: @@ -266,51 +266,64 @@ def __init__( prefix: str = "training", **neptune_run_kwargs, ): - # verify if user passed proper init arguments self._verify_input_arguments(api_key, project, name, run, neptune_run_kwargs) + if neptune is None: + raise ModuleNotFoundError( + "You want to use the `Neptune` logger which is not installed yet, install it with" + " `pip install neptune-client`." + ) super().__init__() self._log_model_checkpoints = log_model_checkpoints self._prefix = prefix + self._run_name = name + self._project_name = project + self._api_key = api_key + self._run_instance = run + self._neptune_run_kwargs = neptune_run_kwargs + self._run_short_id = None + + if self._run_instance is not None: + self._retrieve_run_data() - self._run_instance = self._init_run_instance(api_key, project, name, run, neptune_run_kwargs) + # make sure that we've log integration version for outside `Run` instances + self._run_instance[_INTEGRATION_VERSION_KEY] = __version__ - self._run_short_id = self.run._short_id # skipcq: PYL-W0212 + def _retrieve_run_data(self): try: - self.run.wait() + self._run_instance.wait() + self._run_short_id = self.run._short_id # skipcq: PYL-W0212 self._run_name = self._run_instance["sys/name"].fetch() except NeptuneOfflineModeFetchException: self._run_name = "offline-name" - def _init_run_instance(self, api_key, project, name, run, neptune_run_kwargs) -> Run: - if run is not None: - run_instance = run - else: - try: - run_instance = neptune.init( - project=project, - api_token=api_key, - name=name, - **neptune_run_kwargs, - ) - except NeptuneLegacyProjectException as e: - raise TypeError( - f"""Project {project} has not been migrated to the new structure. 
- You can still integrate it with the Neptune logger using legacy Python API - available as part of neptune-contrib package: - - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n - """ - ) from e - - # make sure that we've log integration version for both newly created and outside `Run` instances - run_instance[_INTEGRATION_VERSION_KEY] = __version__ - - # keep api_key and project, they will be required when resuming Run for pickled logger - self._api_key = api_key - self._project_name = run_instance._project_name # skipcq: PYL-W0212 + @property + def _neptune_init_args(self): + args = {} + # Backward compatibility in case of previous version retrieval + try: + args = self._neptune_run_kwargs + except AttributeError: + pass + + if self._project_name is not None: + args["project"] = self._project_name + + if self._api_key is not None: + args["api_token"] = self._api_key + + if self._run_short_id is not None: + args["run"] = self._run_short_id + + # Backward compatibility in case of previous version retrieval + try: + if self._run_name is not None: + args["name"] = self._run_name + except AttributeError: + pass - return run_instance + return args def _construct_path_with_prefix(self, *keys) -> str: """Return sequence of keys joined by `LOGGER_JOIN_CHAR`, started with `_prefix` if defined.""" @@ -379,7 +392,7 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__ = state - self._run_instance = neptune.init(project=self._project_name, api_token=self._api_key, run=self._run_short_id) + self._run_instance = neptune.init(**self._neptune_init_args) @property @rank_zero_experiment @@ -412,8 +425,23 @@ def training_step(self, batch, batch_idx): return self.run @property + @rank_zero_experiment def run(self) -> Run: - return self._run_instance + try: + if not self._run_instance: + self._run_instance = neptune.init(**self._neptune_init_args) + self._retrieve_run_data() + # make sure that we've log integration version for newly created + self._run_instance[_INTEGRATION_VERSION_KEY] = __version__ + + return self._run_instance + except NeptuneLegacyProjectException as e: + raise TypeError( + f"Project {self._project_name} has not been migrated to the new structure." + " You can still integrate it with the Neptune logger using legacy Python API" + " available as part of neptune-contrib package:" + " https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" + ) from e @rank_zero_only def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: # skipcq: PYL-W0221 @@ -473,13 +501,13 @@ def log_metrics(self, metrics: Dict[str, Union[torch.Tensor, float]], step: Opti for key, val in metrics.items(): # `step` is ignored because Neptune expects strictly increasing step values which - # Lighting does not always guarantee. - self.experiment[key].log(val) + # Lightning does not always guarantee. 
+ self.run[key].log(val) @rank_zero_only def finalize(self, status: str) -> None: if status: - self.experiment[self._construct_path_with_prefix("status")] = status + self.run[self._construct_path_with_prefix("status")] = status super().finalize(status) @@ -493,12 +521,14 @@ def save_dir(self) -> Optional[str]: """ return os.path.join(os.getcwd(), ".neptune") + @rank_zero_only def log_model_summary(self, model, max_depth=-1): model_str = str(ModelSummary(model=model, max_depth=max_depth)) - self.experiment[self._construct_path_with_prefix("model/summary")] = neptune.types.File.from_content( + self.run[self._construct_path_with_prefix("model/summary")] = neptune.types.File.from_content( content=model_str, extension="txt" ) + @rank_zero_only def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[ModelCheckpoint]") -> None: """Automatically log checkpointed model. Called after model checkpoint callback saves a new checkpoint. @@ -515,39 +545,46 @@ def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[ModelCheckpo if checkpoint_callback.last_model_path: model_last_name = self._get_full_model_name(checkpoint_callback.last_model_path, checkpoint_callback) file_names.add(model_last_name) - self.experiment[f"{checkpoints_namespace}/{model_last_name}"].upload(checkpoint_callback.last_model_path) + self.run[f"{checkpoints_namespace}/{model_last_name}"].upload(checkpoint_callback.last_model_path) # save best k models for key in checkpoint_callback.best_k_models.keys(): model_name = self._get_full_model_name(key, checkpoint_callback) file_names.add(model_name) - self.experiment[f"{checkpoints_namespace}/{model_name}"].upload(key) + self.run[f"{checkpoints_namespace}/{model_name}"].upload(key) + + # log best model path and checkpoint + if checkpoint_callback.best_model_path: + self.run[self._construct_path_with_prefix("model/best_model_path")] = checkpoint_callback.best_model_path + + model_name = self._get_full_model_name(checkpoint_callback.best_model_path, checkpoint_callback) + file_names.add(model_name) + self.run[f"{checkpoints_namespace}/{model_name}"].upload(checkpoint_callback.best_model_path) # remove old models logged to experiment if they are not part of best k models at this point - if self.experiment.exists(checkpoints_namespace): - exp_structure = self.experiment.get_structure() + if self.run.exists(checkpoints_namespace): + exp_structure = self.run.get_structure() uploaded_model_names = self._get_full_model_names_from_exp_structure(exp_structure, checkpoints_namespace) for file_to_drop in list(uploaded_model_names - file_names): - del self.experiment[f"{checkpoints_namespace}/{file_to_drop}"] + del self.run[f"{checkpoints_namespace}/{file_to_drop}"] - # log best model path and best model score - if checkpoint_callback.best_model_path: - self.experiment[ - self._construct_path_with_prefix("model/best_model_path") - ] = checkpoint_callback.best_model_path + # log best model score if checkpoint_callback.best_model_score: - self.experiment[self._construct_path_with_prefix("model/best_model_score")] = ( + self.run[self._construct_path_with_prefix("model/best_model_score")] = ( checkpoint_callback.best_model_score.cpu().detach().numpy() ) @staticmethod def _get_full_model_name(model_path: str, checkpoint_callback: "ReferenceType[ModelCheckpoint]") -> str: """Returns model name which is string `modle_path` appended to `checkpoint_callback.dirpath`.""" - expected_model_path = f"{checkpoint_callback.dirpath}/" + expected_model_path = 
f"{checkpoint_callback.dirpath}{os.path.sep}" if not model_path.startswith(expected_model_path): raise ValueError(f"{model_path} was expected to start with {expected_model_path}.") - return model_path[len(expected_model_path) :] + # Remove extension from filepath + filepath, _ = os.path.splitext(model_path[len(expected_model_path) :]) + + return filepath @classmethod def _get_full_model_names_from_exp_structure(cls, exp_structure: dict, namespace: str) -> Set[str]: @@ -628,13 +665,11 @@ def log_artifact(self, artifact: str, destination: Optional[str] = None) -> None self._signal_deprecated_api_usage("log_artifact", f"logger.run['{key}].log('path_to_file')") self.run[key].log(destination) - @rank_zero_only def set_property(self, *args, **kwargs): self._signal_deprecated_api_usage( "log_artifact", f"logger.run['{self._prefix}/{self.PARAMETERS_KEY}/key'].log(value)", raise_exception=True ) - @rank_zero_only def append_tags(self, *args, **kwargs): self._signal_deprecated_api_usage( "append_tags", "logger.run['sys/tags'].add(['foo', 'bar'])", raise_exception=True diff --git a/pytorch_lightning/loggers/tensorboard.py b/pytorch_lightning/loggers/tensorboard.py index f26fc75ac58db..1ceadb8658a3d 100644 --- a/pytorch_lightning/loggers/tensorboard.py +++ b/pytorch_lightning/loggers/tensorboard.py @@ -240,7 +240,9 @@ def log_graph(self, model: "pl.LightningModule", input_array=None): if input_array is not None: input_array = model._apply_batch_transfer_handler(input_array) + model._running_torchscript = True self.experiment.add_graph(model, input_array) + model._running_torchscript = False else: rank_zero_warn( "Could not log computational graph since the" diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index 6140bd60d6a7f..f95196d360ca8 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, List, Optional, Sequence, Union +from typing import Any, List, Sequence from deprecate.utils import void from torch.utils.data.dataloader import DataLoader @@ -32,7 +32,8 @@ def __init__(self): self.epoch_loop = EvaluationEpochLoop() self._results = ResultCollection(training=False) - self._max_batches: Optional[Union[int, Sequence[int]]] = None + self._outputs: List[EPOCH_OUTPUT] = [] + self._max_batches: List[int] = [] self._has_run: bool = False @property @@ -147,7 +148,7 @@ def teardown(self) -> None: self._results.cpu() self.epoch_loop.teardown() - def _get_max_batches(self) -> List[Union[int, float]]: + def _get_max_batches(self) -> List[int]: """Returns the max number of batches for each dataloader.""" if self.trainer.testing: max_batches = self.trainer.num_test_batches @@ -165,7 +166,7 @@ def _reload_evaluation_dataloaders(self) -> None: """Reloads dataloaders if necessary.""" if self.trainer.testing: self.trainer.reset_test_dataloader() - elif self.trainer.val_dataloaders is None or self.trainer._should_reload_dl_epoch: + elif self.trainer.val_dataloaders is None or self.trainer._should_reload_val_dl: self.trainer.reset_val_dataloader() def _on_evaluation_start(self, *args: Any, **kwargs: Any) -> None: diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index cf40316312107..58fee7743c1e5 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -53,10 +53,7 @@ def num_dataloaders(self) -> int: @property def max_batches(self) -> List[int]: """The max number of batches this loop will run for each dataloader.""" - max_batches = self.trainer.num_predict_batches - if isinstance(max_batches, int): - max_batches = [max_batches] * len(self.dataloaders) - return max_batches + return self.trainer.num_predict_batches @property def dataloaders(self) -> Sequence[DataLoader]: diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index b4660c96a0989..102603f20302b 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -24,7 +24,6 @@ from pytorch_lightning.trainer.progress import BatchProgress from pytorch_lightning.utilities.auto_restart import MergedIteratorState, reload_dataloader_state_dict from pytorch_lightning.utilities.fetching import AbstractDataFetcher, DataFetcher -from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT @@ -134,10 +133,13 @@ def advance( self.trainer.logger_connector.update_eval_step_metrics() # track epoch level outputs - if self._should_track_batch_outputs_for_epoch_end(): - output = recursive_detach(output, to_cpu=self.trainer.move_metrics_to_cpu) - if output is not None: - self.outputs.append(output) + if self._should_track_batch_outputs_for_epoch_end() and output is not None: + self.outputs.append(output) + + if self.trainer.move_metrics_to_cpu: + # the evaluation step output is not moved as they are not considered "metrics" + assert self.trainer._results is not None + self.trainer._results.cpu() if not self.batch_progress.is_last_batch: # if fault tolerant is enabled and process has been notified, exit. 
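For reference, the behavior change above is gated by an existing Trainer flag; a minimal sketch of enabling it (illustrative only, assuming only the semantics described in the hunk):

```python
from pytorch_lightning import Trainer

# Logged metrics in the result collection are moved to CPU after each evaluation step;
# raw validation/test step outputs collected for the *_epoch_end hooks are left untouched.
trainer = Trainer(move_metrics_to_cpu=True, limit_val_batches=2)
```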
diff --git a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index 58e65233dfe81..7b31432cab8f6 100644 --- a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -26,7 +26,7 @@ def __init__(self) -> None: self._dl_max_batches: Optional[int] = None self._num_dataloaders: Optional[int] = None self._warning_cache = WarningCache() - self._all_batch_indices: List[int] = [] + self._seen_batch_indices: List[List[int]] = [] @property def done(self) -> bool: @@ -44,8 +44,8 @@ def connect(self, **kwargs: "Loop") -> None: def reset(self) -> None: """Resets the loops internal state.""" - self._all_batch_indices: List[int] = [] - self.predictions: List[Any] = [] + self._seen_batch_indices = [] + self.predictions = [] self.batch_progress.reset_on_run() def on_run_start( @@ -69,6 +69,8 @@ def on_run_start( self._dl_max_batches = dl_max_batches self._num_dataloaders = num_dataloaders self.return_predictions = return_predictions + # this call requires that `self.return_predictions` is set + self._seen_batch_indices = self._get_batch_indices(dataloader_idx) def advance( self, @@ -88,6 +90,10 @@ def advance( return_predictions: whether to return the obtained predictions """ batch_idx, batch = next(dataloader_iter) + self._seen_batch_indices = self._get_batch_indices(dataloader_idx) + # we need to truncate the list of batch indicies due to prefetching in the dataloader and Lightning + self._seen_batch_indices = self._seen_batch_indices[: (self.batch_progress.current.completed + 1)] + if batch is None: raise StopIteration @@ -99,13 +105,10 @@ def advance( with self.trainer.profiler.profile("predict_step"): self._predict_step(batch, batch_idx, dataloader_idx) - def on_run_end(self) -> Tuple[List[Any], List[int]]: + def on_run_end(self) -> Tuple[List[Any], List[List[int]]]: """Returns the predictions and the corresponding batch indices.""" - predictions = self.predictions - all_batch_indices = self._all_batch_indices - # free memory - self.predictions = [] - self._all_batch_indices = [] + predictions, all_batch_indices = self.predictions, self._seen_batch_indices + self.predictions, self._seen_batch_indices = [], [] # free memory return predictions, all_batch_indices def _predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: @@ -121,7 +124,7 @@ def _predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None step_kwargs = self._build_kwargs(batch, batch_idx, dataloader_idx) # extract batch_indices and store them - self._store_batch_indices(dataloader_idx) + self.current_batch_indices = self._seen_batch_indices[batch_idx] if self._seen_batch_indices else [] model_ref = self.trainer.lightning_module @@ -160,12 +163,13 @@ def _build_kwargs(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Dict step_kwargs["dataloader_idx"] = dataloader_idx return step_kwargs - def _store_batch_indices(self, dataloader_idx: int) -> None: - """Stores the batch indices if the predictions should be stored.""" - batch_sampler = self.trainer.predict_dataloaders[dataloader_idx].batch_sampler - if isinstance(batch_sampler, IndexBatchSamplerWrapper): - self.current_batch_indices = batch_sampler.batch_indices - if self.should_store_predictions: - self._all_batch_indices.append(batch_sampler.batch_indices) - else: - warning_cache.warn("Lightning couldn't infer the indices fetched for your dataloader.") + def _get_batch_indices(self, dataloader_idx: int) -> 
List[List[int]]: + """Returns a reference to the seen batch indices if the dataloader has a batch sampler wrapped by our + :class:`~pytorch_lightning.overrides.distributed.IndexBatchSamplerWrapper`.""" + # the batch_sampler is not be defined in case of CombinedDataLoaders + batch_sampler = getattr(self.trainer.predict_dataloaders[dataloader_idx], "batch_sampler", None) + if isinstance(batch_sampler, IndexBatchSamplerWrapper) and self.should_store_predictions: + return batch_sampler.seen_batch_indices + + warning_cache.warn("Lightning couldn't infer the indices fetched for your dataloader.") + return [] diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 21d89a8be8b52..8ddca3ad505e8 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -161,9 +161,7 @@ def advance(self, *args: Any, **kwargs: Any) -> None: self.batch_progress.increment_ready() - # cache the batch size value to avoid extracting it again after the batch loop runs as the value will be - # different if tbptt is enabled - batch_size = self.trainer.logger_connector.on_batch_start(batch_idx, batch) + self.trainer.logger_connector.on_batch_start(batch_idx, batch) if batch is None: self._warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...") @@ -194,8 +192,6 @@ def advance(self, *args: Any, **kwargs: Any) -> None: with self.trainer.profiler.profile("run_training_batch"): batch_output = self.batch_loop.run(batch, batch_idx) - self.trainer._results.batch_size = batch_size - self.batch_progress.increment_processed() # update non-plateau LR schedulers diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index df6634c963851..fcd75ef274914 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -205,7 +205,7 @@ def on_advance_start(self) -> None: model = self.trainer.lightning_module # reset train dataloader - if not self._is_fresh_start_epoch and self.trainer._should_reload_dl_epoch: + if not self._is_fresh_start_epoch and self.trainer._should_reload_train_dl: self.trainer.reset_train_dataloader(model) self._is_fresh_start_epoch = False diff --git a/pytorch_lightning/overrides/distributed.py b/pytorch_lightning/overrides/distributed.py index 0cf392dd44775..835d7f87040c1 100644 --- a/pytorch_lightning/overrides/distributed.py +++ b/pytorch_lightning/overrides/distributed.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import itertools -from typing import Any, Iterator, List, Optional +from typing import Any, Iterator, List import torch from torch.nn.parallel import DistributedDataParallel @@ -20,6 +20,7 @@ import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase +from pytorch_lightning.utilities import rank_zero_deprecation class LightningDistributedModule(_LightningModuleWrapperBase): @@ -119,12 +120,31 @@ class IndexBatchSamplerWrapper: """This class is used to wrap a :class:`torch.utils.data.BatchSampler` and capture its indices.""" def __init__(self, sampler: BatchSampler) -> None: + self.seen_batch_indices: List[List[int]] = [] self._sampler = sampler - self.batch_indices: Optional[List[int]] = None + self._batch_indices: List[int] = [] + + @property + def batch_indices(self) -> List[int]: + rank_zero_deprecation( + "The attribute `IndexBatchSamplerWrapper.batch_indices` was deprecated in v1.5 and will be removed in" + " v1.7. Access the full list `seen_batch_indices` instead." + ) + return self._batch_indices + + @batch_indices.setter + def batch_indices(self, indices: List[int]) -> None: + rank_zero_deprecation( + "The attribute `IndexBatchSamplerWrapper.batch_indices` was deprecated in v1.5 and will be removed in" + " v1.7. Access the full list `seen_batch_indices` instead." + ) + self._batch_indices = indices def __iter__(self) -> Iterator[List[int]]: + self.seen_batch_indices = [] for batch in self._sampler: - self.batch_indices = batch + self._batch_indices = batch + self.seen_batch_indices.append(batch) yield batch def __len__(self) -> int: diff --git a/pytorch_lightning/plugins/environments/lsf_environment.py b/pytorch_lightning/plugins/environments/lsf_environment.py index 06563c7f017bb..4d2ead915ed0c 100644 --- a/pytorch_lightning/plugins/environments/lsf_environment.py +++ b/pytorch_lightning/plugins/environments/lsf_environment.py @@ -14,9 +14,12 @@ import os import socket +from typing import Dict, List from pytorch_lightning import _logger as log from pytorch_lightning.plugins.environments import ClusterEnvironment +from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.cloud_io import get_filesystem class LSFEnvironment(ClusterEnvironment): @@ -25,128 +28,161 @@ class LSFEnvironment(ClusterEnvironment): It is expected that any execution using this ClusterEnvironment was executed using the Job Step Manager i.e. ``jsrun``. - This plugin expects the following environment variables. + This plugin expects the following environment variables: - LSB_JOBID: - The LSF assigned job ID + ``LSB_JOBID`` + The LSF assigned job ID - LSB_HOSTS: - The hosts used in the job. This string is expected to have the format "batch ...." + ``LSB_DJOB_RANKFILE`` + The OpenMPI compatible rank file for the LSF job - JSM_NAMESPACE_LOCAL_RANK: - The node local rank for the task. This environment variable is set by jsrun + ``JSM_NAMESPACE_LOCAL_RANK`` + The node local rank for the task. This environment variable is set by ``jsrun`` - JSM_NAMESPACE_SIZE: - The world size for the task. This environment variable is set by jsrun - """ + ``JSM_NAMESPACE_SIZE`` + The world size for the task. This environment variable is set by ``jsrun`` - def __init__(self): - self._master_address = self._get_master_address() - self._master_port = self._get_master_port() - log.debug(f"MASTER_ADDR: {self._master_address}") - log.debug(f"MASTER_PORT: {self._master_port}") + ``JSM_NAMESPACE_RANK`` + The global rank for the task.
This environment variable is set by ``jsrun`` + """ - @staticmethod - def is_using_lsf() -> bool: - """Returns ``True`` if the current process was launched using the jsrun command.""" - required_env_vars = ("LSB_JOBID", "LSB_HOSTS", "JSM_NAMESPACE_LOCAL_RANK", "JSM_NAMESPACE_SIZE") - return all(v in os.environ for v in required_env_vars) + def __init__(self) -> None: + super().__init__() + # TODO: remove in 1.7 + if hasattr(self, "is_using_lsf") and callable(self.is_using_lsf): + rank_zero_deprecation( + f"`{self.__class__.__name__}.is_using_lsf` has been deprecated in v1.6 and will be removed in v1.7." + " Implement the static method `detect()` instead (do not forget to add the `@staticmethod` decorator)." + ) + self._main_address = self._get_main_address() + self._main_port = self._get_main_port() + self._node_rank = self._get_node_rank() + self._set_init_progress_group_env_vars() + + def _set_init_progress_group_env_vars(self) -> None: + # set environment variables needed for initializing torch distributed process group + os.environ["MASTER_ADDR"] = str(self._main_address) + log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") + os.environ["MASTER_PORT"] = str(self._main_port) + log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}") @property def creates_processes_externally(self) -> bool: + """LSF creates subprocesses, i.e., PyTorch Lightning does not need to spawn them.""" return True - def master_address(self): - """The master address is read from a list of hosts contained in the environment variable `LSB_HOSTS`.""" - return self._master_address + def master_address(self) -> str: + """The main address is read from an OpenMPI host rank file in the environment variable + ``LSB_DJOB_RANKFILE``.""" + return self._main_address + + def master_port(self) -> int: + """The main port is calculated from the LSF job ID.""" + return self._main_port - def master_port(self): - """THe master port gets calculated from the LSF job ID.""" - return self._master_port + @staticmethod + def is_using_lsf() -> bool: + """Returns ``True`` if the current process was launched using the ``jsrun`` command.""" + required_env_vars = {"LSB_JOBID", "LSB_DJOB_RANKFILE", "JSM_NAMESPACE_LOCAL_RANK", "JSM_NAMESPACE_SIZE"} + return required_env_vars.issubset(os.environ.keys()) - def world_size(self): - """The world size is read from the environment variable `JSM_NAMESPACE_SIZE`.""" - var = "JSM_NAMESPACE_SIZE" - world_size = os.environ.get(var) + def world_size(self) -> int: + """The world size is read from the environment variable ``JSM_NAMESPACE_SIZE``.""" + world_size = os.environ.get("JSM_NAMESPACE_SIZE") if world_size is None: raise ValueError( - f"Cannot determine world size from environment variable {var}." - " Make sure you run your executable with `jsrun`" + "Cannot determine world size. Environment variable `JSM_NAMESPACE_SIZE` not found." + " Make sure you run your executable with `jsrun`." ) return int(world_size) def set_world_size(self, size: int) -> None: log.debug("LSFEnvironment.set_world_size was called, but setting world size is not allowed. Ignored.") - def global_rank(self): - """The world size is read from the environment variable `JSM_NAMESPACE_RANK`.""" - var = "JSM_NAMESPACE_RANK" - global_rank = os.environ.get(var) + def global_rank(self) -> int: + """The global rank is read from the environment variable ``JSM_NAMESPACE_RANK``.""" + global_rank = os.environ.get("JSM_NAMESPACE_RANK") if global_rank is None: raise ValueError( - f"Cannot determine global rank from environment variable {var}."
- " Make sure you run your executable with `jsrun`" + "Cannot determine global rank. Environment variable `JSM_NAMESPACE_RANK` not found." + "Make sure you run your executable with `jsrun`." ) return int(global_rank) def set_global_rank(self, rank: int) -> None: log.debug("LSFEnvironment.set_global_rank was called, but setting global rank is not allowed. Ignored.") - def local_rank(self): + def local_rank(self) -> int: """The local rank is read from the environment variable `JSM_NAMESPACE_LOCAL_RANK`.""" - var = "JSM_NAMESPACE_LOCAL_RANK" - local_rank = os.environ.get(var) + local_rank = os.environ.get("JSM_NAMESPACE_LOCAL_RANK") if local_rank is None: raise ValueError( - f"Cannot determine local rank from environment variable {var}." - " Make sure you run your executable with `jsrun`" + "Cannot determine local rank. Environment variable `JSM_NAMESPACE_LOCAL_RANK` not found." + "Make sure you run your executable with `jsrun`." ) return int(local_rank) - def node_rank(self): - """The node rank is determined by the position of the current hostname in the list of hosts stored in the - environment variable `LSB_HOSTS`.""" + def node_rank(self) -> int: + """The node rank is determined by the position of the current hostname in the OpenMPI host rank file stored + in ``LSB_DJOB_RANKFILE``.""" + return self._node_rank + + def _get_node_rank(self) -> int: + """A helper method for getting the node rank. + + The node rank is determined by the position of the current node in the list of hosts used in the job. This is + calculated by reading all hosts from ``LSB_DJOB_RANKFILE`` and finding this node's hostname in the list. + """ hosts = self._read_hosts() - count = {} + count: Dict[str, int] = {} for host in hosts: - if "batch" in host or "login" in host: - continue if host not in count: count[host] = len(count) return count[socket.gethostname()] @staticmethod - def _read_hosts(): - hosts = os.environ.get("LSB_HOSTS") - if not hosts: - raise ValueError("Could not find hosts in environment variable LSB_HOSTS") - hosts = hosts.split() - if len(hosts) < 2: - raise ValueError( - 'Cannot parse hosts from LSB_HOSTS environment variable. Expected format: "batch ..."' - ) - return hosts + def _read_hosts() -> List[str]: + """Read compute hosts that are a part of the compute job. - def _get_master_address(self): + LSF uses the Job Step Manager (JSM) to manage job steps. Job steps are executed by the JSM from "launch" nodes. + Each job is assigned a launch node. This launch node will be the first node in the list contained in + ``LSB_DJOB_RANKFILE``. + """ + var = "LSB_DJOB_RANKFILE" + rankfile = os.environ.get(var) + if rankfile is None: + raise ValueError("Did not find the environment variable `LSB_DJOB_RANKFILE`") + if not rankfile: + raise ValueError("The environment variable `LSB_DJOB_RANKFILE` is empty") + + fs = get_filesystem(rankfile) + with fs.open(rankfile, "r") as f: + ret = [line.strip() for line in f] + # remove the launch node (i.e. the first node in LSB_DJOB_RANKFILE) from the list + return ret[1:] + + def _get_main_address(self) -> str: + """A helper for getting the main address. + + The main address is assigned to the first node in the list of nodes used for the job. + """ hosts = self._read_hosts() - return hosts[1] + return hosts[0] @staticmethod - def _get_master_port(): - """A helper function for accessing the master port. + def _get_main_port() -> int: + """A helper function for accessing the main port. - Uses the LSF job ID so all ranks can compute the master port. 
+ Uses the LSF job ID so all ranks can compute the main port. """ - # check for user-specified master port - port = os.environ.get("MASTER_PORT") - if not port: - jobid = os.environ.get("LSB_JOBID") - if not jobid: - raise ValueError("Could not find job id in environment variable LSB_JOBID") - port = int(jobid) + # check for user-specified main port + if "MASTER_PORT" in os.environ: + log.debug(f"Using externally specified main port: {os.environ['MASTER_PORT']}") + return int(os.environ["MASTER_PORT"]) + if "LSB_JOBID" in os.environ: + port = int(os.environ["LSB_JOBID"]) # all ports should be in the 10k+ range - port = int(port) % 1000 + 10000 - log.debug(f"calculated LSF master port: {port}") - else: - log.debug(f"using externally specified master port: {port}") - return int(port) + port = port % 1000 + 10000 + log.debug(f"calculated LSF main port: {port}") + return port + raise ValueError("Could not find job id in environment variable LSB_JOBID") diff --git a/pytorch_lightning/plugins/precision/precision_plugin.py b/pytorch_lightning/plugins/precision/precision_plugin.py index f1ebbf58d8326..012922ea2b60a 100644 --- a/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/plugins/precision/precision_plugin.py @@ -165,7 +165,8 @@ def optimizer_step( def _track_grad_norm(self, trainer: "pl.Trainer") -> None: if trainer.track_grad_norm == -1: return - grad_norm_dict = grad_norm(trainer.lightning_module, trainer.track_grad_norm, trainer.logger.group_separator) + kwargs = {"group_separator": trainer.logger.group_separator} if trainer.logger is not None else {} + grad_norm_dict = grad_norm(trainer.lightning_module, trainer.track_grad_norm, **kwargs) if grad_norm_dict: prev_fx = trainer.lightning_module._current_fx_name trainer.lightning_module._current_fx_name = "on_before_optimizer_step" diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ea4820f61ec7c..9c4a7e36c66ba 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -50,7 +50,7 @@ rank_zero_deprecation, rank_zero_warn, ) -from pytorch_lightning.utilities.distributed import distributed_available +from pytorch_lightning.utilities.distributed import _revert_sync_batchnorm, distributed_available from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.distributed import ( init_dist_connection, @@ -63,11 +63,6 @@ from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import STEP_OUTPUT -if _TORCH_GREATER_EQUAL_1_10: - if not _IS_WINDOWS: - from torch.distributed.optim import DistributedOptimizer - from torch.distributed.optim import PostLocalSGDOptimizer, ZeroRedundancyOptimizer - if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS if _HYDRA_AVAILABLE: @@ -75,9 +70,7 @@ from hydra.utils import get_original_cwd, to_absolute_path if _TORCH_GREATER_EQUAL_1_8: from pytorch_lightning.utilities.distributed import register_ddp_comm_hook -if _TORCH_GREATER_EQUAL_1_10: - import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD - import torch.distributed.algorithms.model_averaging.averagers as averagers + log = logging.getLogger(__name__) @@ -324,12 +317,11 @@ def _register_ddp_hooks(self) -> None: ddp_comm_wrapper=self._ddp_comm_wrapper, ) - if ( - _TORCH_GREATER_EQUAL_1_10 - and isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState) - and self.lightning_module.trainer.state.fn 
== TrainerFn.FITTING - ): - self._reinit_optimizers_with_post_localSGD(self._ddp_comm_state.start_localSGD_iter) + if _TORCH_GREATER_EQUAL_1_10 and self.lightning_module.trainer.state.fn == TrainerFn.FITTING: + import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD + + if isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState): + self._reinit_optimizers_with_post_localSGD(self._ddp_comm_state.start_localSGD_iter) def _reinit_optimizers_with_post_localSGD(self, warmup_steps: int): optimizers = self.lightning_module.trainer.optimizers @@ -337,6 +329,12 @@ def _reinit_optimizers_with_post_localSGD(self, warmup_steps: int): raise ValueError( "Post-localSGD algorithm is used, but model averaging period is not provided to DDP plugin." ) + if _TORCH_GREATER_EQUAL_1_10: + if not _IS_WINDOWS: + from torch.distributed.optim import DistributedOptimizer + import torch.distributed.algorithms.model_averaging.averagers as averagers + from torch.distributed.optim import PostLocalSGDOptimizer, ZeroRedundancyOptimizer + averager = averagers.PeriodicModelAverager(period=self._model_averaging_period, warmup_steps=warmup_steps) for x, optimizer in enumerate(optimizers): if isinstance(optimizer, LightningOptimizer): @@ -538,6 +536,9 @@ def teardown(self) -> None: if isinstance(self.model, DistributedDataParallel): self.model = self.lightning_module + if self.sync_batchnorm: + self.model = _revert_sync_batchnorm(self.model) + if self.on_gpu: # GPU teardown self.lightning_module.cpu() diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index ff5159f739cdc..d71df296e8544 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -41,7 +41,7 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load -from pytorch_lightning.utilities.distributed import distributed_available +from pytorch_lightning.utilities.distributed import _revert_sync_batchnorm, distributed_available from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.distributed import ( init_dist_connection, @@ -333,6 +333,7 @@ def __recover_child_process_weights(self, best_path, last_path): if last_path is not None and self.lightning_module.trainer.state.fn == TrainerFn.FITTING: ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) self.lightning_module.load_state_dict(ckpt) + self.checkpoint_io.remove_checkpoint(last_path) def barrier(self, *args, **kwargs) -> None: if not distributed_available(): @@ -435,6 +436,9 @@ def teardown(self) -> None: if isinstance(self.model, DistributedDataParallel): self.model = self.lightning_module + if self.sync_batchnorm: + self.model = _revert_sync_batchnorm(self.model) + if self.on_gpu: # GPU teardown self.lightning_module.cpu() diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 8dc42c2f36b88..3359e7776d6e5 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -129,7 +129,6 @@ def __init__( contiguous_memory_optimization: bool = False, synchronize_checkpoint_boundary: bool = False, load_full_weights: bool = False, - partition_module: bool = True, ) -> None: """Provides 
capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. `For more information: https://pytorch- @@ -259,12 +258,6 @@ def __init__( load_full_weights: True when loading a single checkpoint file containing the model state dict when using ZeRO Stage 3. This differs from the DeepSpeed checkpoint which contains shards per worker. - - partition_module: When True, partitions the ``LightningModule`` across devices when using ZeRO Stage 3. - This is the default behaviour to ensure that the entire module is appropriately initialized - for DeepSpeed. When False we do not explicitly convert the model, which is fine if NO layers - or ALL layers are defined in ``configure_sharded_model``. This is useful for layers such as - ``torch.nn.RNN`` which do internal logic when moving to device. """ if not _DEEPSPEED_AVAILABLE: raise MisconfigurationException( @@ -317,7 +310,6 @@ def __init__( self.remote_device = remote_device self.load_full_weights = load_full_weights - self.partition_module = partition_module # default FP16 parameters. self.loss_scale = loss_scale @@ -463,13 +455,6 @@ def init_deepspeed(self): precision = self.lightning_module.trainer.accelerator.precision model = LightningDeepSpeedModule(pl_module=self.model, precision=precision) - if self.zero_stage_3 and self.partition_module: - # Ensure the entire model has been moved to the appropriate device - dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 - deepspeed.zero.Init( - module=model, remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype - ) - if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) else: @@ -524,7 +509,7 @@ def model_sharded_context(self) -> Generator[None, None, None]: assert self._config_initialized dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 model_parallel_context = deepspeed.zero.Init( - remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype + remote_device=self.remote_device, pin_memory=True, config_dict_or_path=self.config, dtype=dtype ) else: model_parallel_context = super().model_sharded_context() @@ -538,8 +523,8 @@ def _set_deepspeed_activation_checkpointing(self): deepspeed.checkpointing.configure( mpu_=None, partition_activations=checkpoint_config.get("partition_activations"), - contiguous_checkpointing=checkpoint_config.get("contiguous_checkpointing"), - checkpoint_in_cpu=checkpoint_config.get("checkpoint_in_cpu"), + contiguous_checkpointing=checkpoint_config.get("contiguous_memory_optimization"), + checkpoint_in_cpu=checkpoint_config.get("cpu_checkpointing"), profile=checkpoint_config.get("profile"), ) @@ -554,7 +539,7 @@ def _initialize_deepspeed_inference(self, model): optimizer, lr_scheduler, _ = self._init_optimizers() scheduler = lr_scheduler["scheduler"] inference_config = { - # todo: this is required for DeepSpeed throughput timers, or throughput timers will be incorrect + # todo: this is required for DeepSpeed throughput timers "train_micro_batch_size_per_gpu": 1 } if "fp16" in self.config: @@ -622,11 +607,6 @@ def _format_batch_size_and_grad_accum_config(self): ) self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches if "train_micro_batch_size_per_gpu" not in self.config: - rank_zero_warn( - "Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. 
" - "If you require skipping this, please pass " - "`Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`" - ) batch_size = self._auto_select_batch_size() self.config["train_micro_batch_size_per_gpu"] = batch_size if "gradient_clipping" not in self.config: @@ -638,9 +618,19 @@ def _auto_select_batch_size(self): batch_size = 1 train_dl_source = self.lightning_module.trainer._data_connector._train_dataloader_source if train_dl_source.is_defined(): - train_dataloader = train_dl_source.dataloader() - if hasattr(train_dataloader, "batch_sampler"): - batch_size = train_dataloader.batch_sampler.batch_size + try: + train_dataloader = train_dl_source.dataloader() + if hasattr(train_dataloader, "batch_sampler"): + batch_size = train_dataloader.batch_sampler.batch_size + # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` + # to have been called before + except Exception: + if self.global_rank == 0: + deepspeed.utils.logging.logger.warning( + "Tried to infer the batch size for internal deepspeed logging from the `train_dataloader()`. " + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." + ) return batch_size def _format_precision_config(self): diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 4d9f937c58467..898e62791d6ee 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -237,21 +237,25 @@ def to_tensor(x): args = apply_to_collection(args, dtype=(int, float), function=to_tensor) return args - def training_step(self, *args, **kwargs): + def _step(self, stage: RunningStage, *args: Any, **kwargs: Any): args = self._prepare_input(args) - return self.poptorch_models[RunningStage.TRAINING](*args, **kwargs) + poptorch_model = self.poptorch_models[stage] + self.lightning_module._running_torchscript = True + out = poptorch_model(*args, **kwargs) + self.lightning_module._running_torchscript = False + return out + + def training_step(self, *args, **kwargs): + return self._step(RunningStage.TRAINING, *args, **kwargs) def validation_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models[RunningStage.VALIDATING](*args, **kwargs) + return self._step(RunningStage.VALIDATING, *args, **kwargs) def test_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models[RunningStage.TESTING](*args, **kwargs) + return self._step(RunningStage.TESTING, *args, **kwargs) def predict_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models[RunningStage.PREDICTING](*args, **kwargs) + return self._step(RunningStage.PREDICTING, *args, **kwargs) def teardown(self) -> None: # undo dataloader patching diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 2509122bd99e2..92bd0f06735d8 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -24,6 +24,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from 
pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO @@ -254,10 +255,6 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ return output - def _close_logger(self, trainer) -> None: - if trainer.logger is not None: - trainer.logger.finalize("success") - def get_mp_spawn_kwargs(self, trainer: Optional["pl.Trainer"] = None) -> Dict[str, Any]: return { "nprocs": len(self.parallel_devices), @@ -293,13 +290,17 @@ def start_training(self, trainer: "pl.Trainer") -> None: # todo: precision pluging is call in accelerator setup and should be moved if "XLA_USE_BF16" in os.environ: del os.environ["XLA_USE_BF16"] - self._close_logger(trainer) + self._clean_logger(trainer) return super().start_training(trainer) def start_evaluating(self, trainer: "pl.Trainer") -> None: - self._close_logger(trainer) + self._clean_logger(trainer) return super().start_evaluating(trainer) + def start_predicting(self, trainer: "pl.Trainer") -> None: + self._clean_logger(trainer) + return super().start_predicting(trainer) + def training_step(self, *args, **kwargs): return self.model(*args, **kwargs) @@ -375,3 +376,13 @@ def checkpoint_io(self) -> CheckpointIO: @checkpoint_io.setter def checkpoint_io(self, plugin: CheckpointIO) -> None: raise MisconfigurationException("TPU Spawn Plugin currently does not support custom checkpoint plugins.") + + @staticmethod + def _clean_logger(trainer: "pl.Trainer") -> None: + loggers = trainer.logger._logger_iterable if isinstance(trainer.logger, LoggerCollection) else [trainer.logger] + for logger in loggers: + if isinstance(logger, TensorBoardLogger) and logger._experiment is not None: + # the experiment class of `TensorBoard` holds a multiprocessing queue which can make ours hang. + # we want to make sure these are closed before we spawn our own threads. + # assuming nothing else references the experiment object, python should instantly `__del__` it. + logger._experiment = None diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py index 58f4a18895498..f5c5968c0f1d2 100644 --- a/pytorch_lightning/profiler/pytorch.py +++ b/pytorch_lightning/profiler/pytorch.py @@ -335,9 +335,24 @@ def _init_kineto(self, profiler_kwargs: Any) -> None: with_stack = profiler_kwargs.get("with_stack", False) or self._export_to_flame_graph self._profiler_kwargs["with_stack"] = with_stack + @property + def _total_steps(self) -> int: + trainer = self._lightning_module.trainer + if self._schedule.is_training: + return trainer.num_training_batches + if self._schedule._current_action == "validation_step": + return sum(trainer.num_val_batches) + sum(trainer.num_sanity_val_batches) + if self._schedule._current_action == "test_step": + return sum(trainer.num_test_batches) + if self._schedule._current_action == "predict_step": + return sum(trainer.num_predict_batches) + def _should_override_schedule(self) -> bool: - return (self._lightning_module is not None and self._lightning_module.trainer.limit_train_batches < 5) and ( - self._schedule is not None and self._schedule._schedule == self._default_schedule() + return ( + self._lightning_module is not None + and self._schedule is not None + and self._total_steps < 5 + and self._schedule._schedule == self._default_schedule() ) @staticmethod @@ -362,10 +377,8 @@ def start(self, action_name: str) -> None: # close profiler if it is already opened. 
might happen if 2 profilers # are created and the first one did not call `describe` - try: + if torch.autograd._profiler_enabled(): torch.autograd._disable_profiler() - except (AttributeError, RuntimeError): - pass if self._schedule is not None: self._schedule.setup(action_name) @@ -410,6 +423,9 @@ def stop(self, action_name: str) -> None: action_name in self.STEP_FUNCTIONS or action_name.startswith(self.STEP_FUNCTION_PREFIX) ): + if self._schedule is not None: + self._schedule.pre_step(action_name) + # the default schedule requires a minimum of 5 steps to properly work: `wait=1, warmup=1, active=3`. # otherwise, this will raise a `segmentation fault`. if self._should_override_schedule(): @@ -420,9 +436,6 @@ def stop(self, action_name: str) -> None: self._schedule = None self.profiler.schedule = torch.profiler.profiler._default_schedule_fn - if self._schedule is not None: - self._schedule.pre_step(action_name) - def on_trace_ready(profiler): if self.dirpath is not None: if self._export_to_chrome: diff --git a/pytorch_lightning/profiler/xla.py b/pytorch_lightning/profiler/xla.py index e30f06f84e952..c89685bcad0be 100644 --- a/pytorch_lightning/profiler/xla.py +++ b/pytorch_lightning/profiler/xla.py @@ -42,9 +42,10 @@ from typing import Dict from pytorch_lightning.profiler.base import BaseProfiler -from pytorch_lightning.utilities import _TPU_AVAILABLE +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException -if _TPU_AVAILABLE: +if _TPU_AVAILABLE and _TORCH_GREATER_EQUAL_1_8: import torch_xla.debug.profiler as xp log = logging.getLogger(__name__) @@ -65,6 +66,10 @@ class XLAProfiler(BaseProfiler): def __init__(self, port: int = 9012) -> None: """This Profiler will help you debug and optimize training workload performance for your models using Cloud TPU performance tools.""" + if not _TPU_AVAILABLE: + raise MisconfigurationException("`XLAProfiler` is only supported on TPUs") + if not _TORCH_GREATER_EQUAL_1_8: + raise MisconfigurationException("`XLAProfiler` is only supported with `torch-xla >= 1.8`") super().__init__(dirpath=None, filename=None) self.port = port self._recording_map: Dict = {} diff --git a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py index c44529c539b92..ed247ac94feac 100644 --- a/pytorch_lightning/trainer/configuration_validator.py +++ b/pytorch_lightning/trainer/configuration_validator.py @@ -19,7 +19,7 @@ from pytorch_lightning.utilities.warnings import rank_zero_deprecation, rank_zero_warn -def verify_loop_configurations(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: +def verify_loop_configurations(trainer: "pl.Trainer") -> None: r""" Checks that the model is configured correctly before the run is started. @@ -28,6 +28,10 @@ def verify_loop_configurations(trainer: "pl.Trainer", model: "pl.LightningModule model: The model to check the configuration. 
""" + model = trainer.lightning_module + + if trainer.state.fn is None: + raise ValueError("Unexpected: Trainer state fn must be set before validating loop configuration.") if trainer.state.fn in (TrainerFn.FITTING, TrainerFn.TUNING): __verify_train_val_loop_configuration(trainer, model) __verify_manual_optimization_support(trainer, model) diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py index 4d41734ed90e6..c683a294d0440 100644 --- a/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/pytorch_lightning/trainer/connectors/callback_connector.py @@ -249,10 +249,11 @@ def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, Dic def _trainer_has_checkpoint_callbacks(self): return len(self.trainer.checkpoint_callbacks) > 0 - def attach_model_logging_functions(self, model): + def _attach_model_logging_functions(self): + lightning_module = self.trainer.lightning_module for callback in self.trainer.callbacks: - callback.log = model.log - callback.log_dict = model.log_dict + callback.log = lightning_module.log + callback.log_dict = lightning_module.log_dict def _attach_model_callbacks(self) -> None: """Attaches the callbacks defined in the model. diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index 8f286964940d2..73d19a2e28a1e 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -120,7 +120,7 @@ def _select_data_fetcher(self) -> AbstractDataFetcher: def get_profiled_dataloader(self, dataloader: Iterable, dataloader_idx: int = 0) -> Iterable: stage: str = self.trainer.state.stage.value - data_fetcher = setattr(self, f"{stage}_data_fetcher", None) or self._select_data_fetcher() + data_fetcher = getattr(self, f"{stage}_data_fetcher", None) or self._select_data_fetcher() data_fetcher.setup( dataloader, stage=stage, diff --git a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index a928122a2053a..ad3dce3c12964 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Tuple, Union +from typing import Optional, Tuple, Union from typing_extensions import TypedDict @@ -20,50 +20,98 @@ class _FxValidator: class _LogOptions(TypedDict): - on_step: Union[Tuple[bool], Tuple[bool, bool]] - on_epoch: Union[Tuple[bool], Tuple[bool, bool]] + allowed_on_step: Union[Tuple[bool], Tuple[bool, bool]] + allowed_on_epoch: Union[Tuple[bool], Tuple[bool, bool]] + default_on_step: bool + default_on_epoch: bool functions = { "on_before_accelerator_backend_setup": None, "on_configure_sharded_model": None, - "on_before_backward": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_after_backward": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_before_optimizer_step": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_before_zero_grad": _LogOptions(on_step=(False, True), on_epoch=(False, True)), + "on_before_backward": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_after_backward": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_before_optimizer_step": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_before_zero_grad": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), "on_init_start": None, "on_init_end": None, "on_fit_start": None, "on_fit_end": None, "on_sanity_check_start": None, "on_sanity_check_end": None, - "on_train_start": _LogOptions(on_step=(False,), on_epoch=(True,)), + "on_train_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "on_train_end": None, - "on_validation_start": _LogOptions(on_step=(False,), on_epoch=(True,)), + "on_validation_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "on_validation_end": None, - "on_test_start": _LogOptions(on_step=(False,), on_epoch=(True,)), + "on_test_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "on_test_end": None, "on_predict_start": None, "on_predict_end": None, "on_pretrain_routine_start": None, "on_pretrain_routine_end": None, - "on_train_epoch_start": _LogOptions(on_step=(False, True), on_epoch=(True,)), - "on_train_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_validation_epoch_start": _LogOptions(on_step=(False, True), on_epoch=(True,)), - "on_validation_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_test_epoch_start": _LogOptions(on_step=(False, True), on_epoch=(True,)), - "on_test_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), + "on_train_epoch_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_train_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_validation_epoch_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_validation_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_test_epoch_start": _LogOptions( + 
allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_test_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "on_predict_epoch_start": None, "on_predict_epoch_end": None, - "on_epoch_start": _LogOptions(on_step=(False, True), on_epoch=(True,)), - "on_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_batch_start": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_batch_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_train_batch_start": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_train_batch_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_validation_batch_start": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_validation_batch_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_test_batch_start": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_test_batch_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), + "on_epoch_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_batch_start": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_batch_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_train_batch_start": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_train_batch_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_validation_batch_start": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "on_validation_batch_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "on_test_batch_start": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "on_test_batch_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), "on_predict_batch_start": None, "on_predict_batch_end": None, "on_keyboard_interrupt": None, @@ -73,16 +121,34 @@ class _LogOptions(TypedDict): "setup": None, "teardown": None, "configure_sharded_model": None, - "training_step": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "validation_step": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "test_step": _LogOptions(on_step=(False, True), on_epoch=(False, True)), + "training_step": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "validation_step": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "test_step": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), "predict_step": None, - "training_step_end": 
_LogOptions(on_step=(False, True), on_epoch=(False, True)), - "validation_step_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "test_step_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "training_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "validation_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "test_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), + "training_step_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "validation_step_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "test_step_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "training_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "validation_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "test_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "configure_optimizers": None, "on_train_dataloader": None, "train_dataloader": None, @@ -97,22 +163,48 @@ class _LogOptions(TypedDict): } @classmethod - def check_logging(cls, fx_name: str, on_step: bool, on_epoch: bool) -> None: - """Check if the given function name is allowed to log.""" + def check_logging(cls, fx_name: str) -> None: + """Check if the given hook is allowed to log.""" if fx_name not in cls.functions: raise RuntimeError( f"Logging inside `{fx_name}` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." ) - allowed = cls.functions[fx_name] - if allowed is None: - raise MisconfigurationException(f"You can't `self.log()` inside `{fx_name}`") - m = "You can't `self.log({}={})` inside `{}`, must be one of {}" - if on_step not in allowed["on_step"]: - msg = m.format("on_step", on_step, fx_name, allowed["on_step"]) + if cls.functions[fx_name] is None: + raise MisconfigurationException(f"You can't `self.log()` inside `{fx_name}`.") + + @classmethod + def get_default_logging_levels( + cls, fx_name: str, on_step: Optional[bool], on_epoch: Optional[bool] + ) -> Tuple[bool, bool]: + """Return default logging levels for given hook.""" + fx_config = cls.functions[fx_name] + assert fx_config is not None + on_step = fx_config["default_on_step"] if on_step is None else on_step + on_epoch = fx_config["default_on_epoch"] if on_epoch is None else on_epoch + return on_step, on_epoch + + @classmethod + def check_logging_levels(cls, fx_name: str, on_step: bool, on_epoch: bool) -> None: + """Check if the logging levels are allowed in the given hook.""" + fx_config = cls.functions[fx_name] + assert fx_config is not None + m = "You can't `self.log({}={})` inside `{}`, must be one of {}." 
+ if on_step not in fx_config["allowed_on_step"]: + msg = m.format("on_step", on_step, fx_name, fx_config["allowed_on_step"]) raise MisconfigurationException(msg) - if on_epoch not in allowed["on_epoch"]: - msg = m.format("on_epoch", on_epoch, fx_name, allowed["on_epoch"]) + if on_epoch not in fx_config["allowed_on_epoch"]: + msg = m.format("on_epoch", on_epoch, fx_name, fx_config["allowed_on_epoch"]) raise MisconfigurationException(msg) + + @classmethod + def check_logging_and_get_default_levels( + cls, fx_name: str, on_step: Optional[bool], on_epoch: Optional[bool] + ) -> Tuple[bool, bool]: + """Check if the given hook name is allowed to log and return logging levels.""" + cls.check_logging(fx_name) + on_step, on_epoch = cls.get_default_logging_levels(fx_name, on_step, on_epoch) + cls.check_logging_levels(fx_name, on_step, on_epoch) + return on_step, on_epoch diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 37fcb06a1dc24..f574eebecd5f7 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -154,17 +154,20 @@ def update_eval_step_metrics(self) -> None: # increment the step even if nothing was logged self._increment_eval_log_step() - def _prepare_eval_loop_results(self, metrics: _OUT_DICT) -> None: + def _prepare_eval_loop_results(self) -> None: if self.trainer.sanity_checking: return + on_step = not self._epoch_end_reached num_dataloaders = self.trainer._evaluation_loop.num_dataloaders has_been_initialized = len(self.eval_loop_results) == num_dataloaders - for dl_idx in range(self.trainer._evaluation_loop.num_dataloaders): - # remove callback metrics that don't belong to this dataloader - callback_metrics = { - k: v for k, v in metrics.items() if "dataloader_idx" not in k or f"dataloader_idx_{dl_idx}" in k - } + assert self.trainer._evaluation_loop._results is not None + for dl_idx in range(num_dataloaders): + metrics = self.trainer._evaluation_loop._results.metrics( + on_step, dataloader_idx=dl_idx if num_dataloaders > 1 else None + ) + callback_metrics = metrics["callback"] + if has_been_initialized: self.eval_loop_results[dl_idx].update(callback_metrics) else: @@ -178,7 +181,7 @@ def update_eval_epoch_metrics(self) -> List[_OUT_DICT]: # log all the metrics as a single dict self.log_metrics(metrics["log"]) - self._prepare_eval_loop_results(metrics["callback"]) + self._prepare_eval_loop_results() # log results of evaluation if ( @@ -210,7 +213,6 @@ def update_eval_epoch_metrics(self) -> List[_OUT_DICT]: def on_train_split_start(self, split_idx: int, split_batch: Any) -> None: self._split_idx = split_idx - self.on_new_batch(split_batch) def update_train_step_metrics(self) -> None: if self.trainer.fit_loop._should_accumulate() and self.trainer.lightning_module.automatic_optimization: @@ -253,28 +255,23 @@ def _log_gpus_metrics(self) -> None: Utilities and properties """ - def on_new_batch(self, batch: Any) -> int: - # when the user requests `dataloader_iter`, we can't track the batch_size - # and this is left to user responsibility. 
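# A simplified sketch of how the reworked `_LogOptions` entries above are resolved (table and
# function names here are illustrative): `self.log()` arguments left as None fall back to the
# hook's defaults, and explicit values are validated against the allowed tuples.
from typing import Optional, Tuple

_options = {
    "training_step": dict(
        allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False
    ),
    "training_epoch_end": dict(
        allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True
    ),
}

def _resolve(fx: str, on_step: Optional[bool], on_epoch: Optional[bool]) -> Tuple[bool, bool]:
    cfg = _options[fx]
    on_step = cfg["default_on_step"] if on_step is None else on_step
    on_epoch = cfg["default_on_epoch"] if on_epoch is None else on_epoch
    if on_step not in cfg["allowed_on_step"] or on_epoch not in cfg["allowed_on_epoch"]:
        raise ValueError(f"self.log(on_step={on_step}, on_epoch={on_epoch}) is not allowed in {fx}")
    return on_step, on_epoch

assert _resolve("training_step", None, None) == (True, False)
assert _resolve("training_epoch_end", None, None) == (False, True)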
- if not isinstance(batch, pl.utilities.fetching.StepFuncDataLoaderIter): - assert self.trainer._results is not None - return self.trainer._results.extract_batch_size(batch) - return 1 - def on_epoch_start(self) -> None: self._epoch_end_reached = False - def on_batch_start(self, batch_idx: int, batch: Any) -> int: + def on_batch_start(self, batch_idx: int, batch: Any) -> None: self._batch_idx = batch_idx self._epoch_end_reached = False - return self.on_new_batch(batch) + + assert self.trainer._results is not None + # attach reference to the new batch and remove the cached batch_size + self.trainer._results.batch = batch + self.trainer._results.batch_size = None def epoch_end_reached(self) -> None: self._epoch_end_reached = True self._batch_idx = None self._split_idx = None assert self.trainer._results is not None - self.trainer._results.batch_size = 1 def on_epoch_end(self) -> None: assert self._epoch_end_reached @@ -291,6 +288,11 @@ def on_batch_end(self) -> None: self._callback_metrics.update(metrics["callback"]) self._logged_metrics.update(metrics["log"]) + assert self.trainer._results is not None + # drop the reference to current batch and batch_size + self.trainer._results.batch = None + self.trainer._results.batch_size = None + def should_reset_tensors(self, fx: str) -> bool: is_different_fx = self._current_fx != fx if self._split_idx is None: diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index f798cf3ee2b82..dc461bd621288 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -51,8 +51,8 @@ class _Sync: fn: Optional[Callable] = None _should: bool = False rank_zero_only: bool = False - op: Optional[str] = None - group: Optional[Any] = None + _op: Optional[str] = None + _group: Optional[Any] = None def __post_init__(self) -> None: self._generate_sync_fn() @@ -67,6 +67,26 @@ def should(self, should: bool) -> None: # `self._fn` needs to be re-generated. self._generate_sync_fn() + @property + def op(self) -> Optional[str]: + return self._op + + @op.setter + def op(self, op: Optional[str]) -> None: + self._op = op + # `self._fn` needs to be re-generated. + self._generate_sync_fn() + + @property + def group(self) -> Optional[Any]: + return self._group + + @group.setter + def group(self, group: Optional[Any]) -> None: + self._group = group + # `self._fn` needs to be re-generated. 
+ self._generate_sync_fn() + def _generate_sync_fn(self) -> None: """Used to compute the syncing function and cache it.""" fn = self.no_op if self.fn is None or not self.should or self.rank_zero_only else self.fn @@ -93,6 +113,7 @@ class _Metadata: on_epoch: bool = True reduce_fx: Callable = torch.mean enable_graph: bool = False + add_dataloader_idx: bool = True dataloader_idx: Optional[int] = None metric_attribute: Optional[str] = None _sync: Optional[_Sync] = None @@ -187,13 +208,29 @@ def __init__(self, metadata: _Metadata, is_tensor: bool) -> None: self.meta = metadata self.has_reset = False if is_tensor: - self.add_state("value", torch.tensor(0, dtype=torch.float), dist_reduce_fx=torch.sum) + if metadata.is_max_reduction: + default = float("-inf") + elif metadata.is_min_reduction: + default = float("inf") + else: + default = 0.0 + # do not set a dtype in case the default dtype was changed + self.add_state("value", torch.tensor(default), dist_reduce_fx=torch.sum) if self.meta.is_mean_reduction: - self.add_state("cumulated_batch_size", torch.tensor(0, dtype=torch.float), dist_reduce_fx=torch.sum) + self.cumulated_batch_size: torch.Tensor + self.add_state("cumulated_batch_size", torch.tensor(0), dist_reduce_fx=torch.sum) - def update(self, value: _IN_METRIC, batch_size: torch.Tensor) -> None: + def update(self, value: _IN_METRIC, batch_size: int) -> None: if self.is_tensor: - value = value.float() + if not torch.is_floating_point(value): + dtype = torch.get_default_dtype() + warning_cache.warn( + # do not include the value to avoid cache misses + f"You called `self.log({self.meta.name!r}, ...)` in your `{self.meta.fx}` but the value needs to" + f" be floating point. Converting it to {dtype}." + ) + value = value.to(dtype) + if self.meta.on_step: self._forward_cache = self.meta.sync(value.clone()) # `clone` because `sync` is in-place @@ -204,12 +241,13 @@ def update(self, value: _IN_METRIC, batch_size: torch.Tensor) -> None: # perform accumulation with reduction if self.meta.is_mean_reduction: - self.value += value.mean() * batch_size - self.cumulated_batch_size += batch_size + # do not use `+=` as it doesn't do type promotion + self.value = self.value + value.mean() * batch_size + self.cumulated_batch_size = self.cumulated_batch_size + batch_size elif self.meta.is_max_reduction or self.meta.is_min_reduction: self.value = self.meta.reduce_fx(self.value, value.mean()) elif self.meta.is_sum_reduction: - self.value += value.mean() + self.value = self.value + value.mean() else: self.value = value self._forward_cache = value._forward_cache @@ -230,7 +268,7 @@ def reset(self) -> None: self.value.reset() self.has_reset = True - def forward(self, value: _IN_METRIC, batch_size: torch.Tensor) -> None: + def forward(self, value: _IN_METRIC, batch_size: int) -> None: if self.meta.enable_graph: with torch.no_grad(): self.update(value, batch_size) @@ -306,12 +344,13 @@ class ResultMetricCollection(dict): with the same metadata. 
""" - def __init__(self, *args: Any) -> None: - super().__init__(*args) - @property def meta(self) -> _Metadata: - return list(self.values())[0].meta + return next(iter(self.values())).meta + + @property + def has_tensor(self) -> bool: + return any(v.is_tensor for v in self.values()) def __getstate__(self, drop_value: bool = False) -> dict: def getstate(item: ResultMetric) -> dict: @@ -356,8 +395,9 @@ class ResultCollection(dict): def __init__(self, training: bool, device: Optional[Union[str, torch.device]] = None) -> None: super().__init__() self.training = training - self._batch_size = torch.tensor(1, device=device) self.device: Optional[Union[str, torch.device]] = device + self.batch: Optional[Any] = None + self.batch_size: Optional[int] = None @property def result_metrics(self) -> List[ResultMetric]: @@ -370,14 +410,24 @@ def append_fn(v: ResultMetric) -> None: apply_to_collection(list(self.values()), ResultMetric, append_fn) return o - @property - def batch_size(self) -> torch.Tensor: - # performance: cache the `batch_size` tensor instead of re-creating it - return self._batch_size + def _extract_batch_size(self, value: _METRIC_COLLECTION, batch_size: Optional[int], meta: _Metadata) -> int: + # check if we have extracted the batch size already + if batch_size is None: + batch_size = self.batch_size - @batch_size.setter - def batch_size(self, value: int) -> None: - self._batch_size = torch.tensor(value, device=self.device) + if batch_size is not None: + return batch_size + + batch_size = 1 + is_tensor = value.is_tensor if isinstance(value, ResultMetric) else value.has_tensor + if self.batch is not None and is_tensor and meta.on_epoch and meta.is_mean_reduction: + try: + batch_size = extract_batch_size(self.batch) + self.batch_size = batch_size + except RecursionError: + pass + + return batch_size def log( self, @@ -393,6 +443,7 @@ def log( sync_dist: bool = False, sync_dist_fn: Callable = _Sync.no_op, sync_dist_group: Optional[Any] = None, + add_dataloader_idx: bool = True, dataloader_idx: Optional[int] = None, batch_size: Optional[int] = None, metric_attribute: Optional[str] = None, @@ -410,7 +461,7 @@ def log( # storage key key = f"{fx}.{name}" # add dataloader_suffix to both key and fx - if dataloader_idx is not None: + if add_dataloader_idx and dataloader_idx is not None: key += f".{dataloader_idx}" fx += f".{dataloader_idx}" @@ -423,10 +474,11 @@ def log( on_epoch=on_epoch, reduce_fx=reduce_fx, enable_graph=enable_graph, + add_dataloader_idx=add_dataloader_idx, dataloader_idx=dataloader_idx, metric_attribute=metric_attribute, ) - meta.sync = _Sync(_should=sync_dist, fn=sync_dist_fn, group=sync_dist_group, rank_zero_only=rank_zero_only) + meta.sync = _Sync(_should=sync_dist, fn=sync_dist_fn, _group=sync_dist_group, rank_zero_only=rank_zero_only) # register logged value if it doesn't exist if key not in self: @@ -438,10 +490,8 @@ def log( f"You called `self.log({name}, ...)` twice in `{fx}` with different arguments. This is not allowed" ) - if batch_size is not None: - self.batch_size = batch_size - - self.update_metrics(key, value) + batch_size = self._extract_batch_size(self[key], batch_size, meta) + self.update_metrics(key, value, batch_size) def register_key(self, key: str, meta: _Metadata, value: _METRIC_COLLECTION) -> None: """Create one ResultMetric object per value. 
@@ -458,10 +508,10 @@ def fn(v: _IN_METRIC) -> ResultMetric: value = ResultMetricCollection(value) self[key] = value - def update_metrics(self, key: str, value: _METRIC_COLLECTION) -> None: - def fn(result_metric: ResultMetric, v: ResultMetric) -> None: + def update_metrics(self, key: str, value: _METRIC_COLLECTION, batch_size: int) -> None: + def fn(result_metric: ResultMetric, v: torch.Tensor) -> None: # performance: avoid calling `__call__` to avoid the checks in `torch.nn.Module._call_impl` - result_metric.forward(v.to(self.device), self.batch_size) + result_metric.forward(v.to(self.device), batch_size) result_metric.has_reset = False apply_to_collections(self[key], value, ResultMetric, fn) @@ -483,24 +533,29 @@ def _get_cache(result_metric: ResultMetric, on_step: bool) -> Optional[torch.Ten return cache.detach() return cache - def valid_items(self) -> Generator: + def valid_items(self, dataloader_idx: Optional[int] = None) -> Generator: """This function is used to iterate over current valid metrics.""" - return ((k, v) for k, v in self.items() if not (isinstance(v, ResultMetric) and v.has_reset)) + return ( + (k, v) + for k, v in self.items() + if not (isinstance(v, ResultMetric) and v.has_reset) and (dataloader_idx in (None, v.meta.dataloader_idx)) + ) def _forked_name(self, result_metric: ResultMetric, on_step: bool) -> Tuple[str, str]: name = result_metric.meta.name forked_name = result_metric.meta.forked_name(on_step) + add_dataloader_idx = result_metric.meta.add_dataloader_idx dl_idx = result_metric.meta.dataloader_idx - if dl_idx is not None: + if add_dataloader_idx and dl_idx is not None: dataloader_suffix = self.DATALOADER_SUFFIX.format(dl_idx) name += dataloader_suffix forked_name += dataloader_suffix return name, forked_name - def metrics(self, on_step: bool) -> _METRICS: + def metrics(self, on_step: bool, dataloader_idx: Optional[int] = None) -> _METRICS: metrics = _METRICS(callback={}, log={}, pbar={}) - for _, result_metric in self.valid_items(): + for _, result_metric in self.valid_items(dataloader_idx): # extract forward_cache or computed from the ResultMetric. 
ignore when the output is None value = apply_to_collection(result_metric, ResultMetric, self._get_cache, on_step, include_none=False) @@ -555,19 +610,10 @@ def fn(item: ResultMetric) -> None: apply_to_collection(self, ResultMetric, fn) - def extract_batch_size(self, batch: Any) -> int: - try: - batch_size = extract_batch_size(batch) - except RecursionError: - batch_size = 1 - self.batch_size = batch_size # the setter converts it to `Tensor` - return batch_size - def to(self, *args: Any, **kwargs: Any) -> "ResultCollection": """Move all data to the given device.""" self.update(apply_to_collection(dict(self), (torch.Tensor, Metric), move_data_to_device, *args, **kwargs)) - self._batch_size = self._batch_size.to(*args, **kwargs) if "device" in kwargs: self.device = kwargs["device"] return self diff --git a/pytorch_lightning/trainer/connectors/signal_connector.py b/pytorch_lightning/trainer/connectors/signal_connector.py index dc33d1244441f..8145a692ceeb4 100644 --- a/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/pytorch_lightning/trainer/connectors/signal_connector.py @@ -2,36 +2,46 @@ import os import signal import sys -from signal import Signals +import threading from subprocess import call -from types import FrameType, FunctionType -from typing import Callable, List, Union +from types import FrameType +from typing import Any, Callable, Dict, List, Set, Union import pytorch_lightning as pl -from pytorch_lightning.utilities.imports import _fault_tolerant_training +from pytorch_lightning.utilities.imports import _fault_tolerant_training, _IS_WINDOWS + +# copied from signal.pyi +_SIGNUM = Union[int, signal.Signals] +_HANDLER = Union[Callable[[_SIGNUM, FrameType], Any], int, signal.Handlers, None] log = logging.getLogger(__name__) class HandlersCompose: - def __init__(self, signal_handlers: Union[List[Callable], Callable]): + def __init__(self, signal_handlers: Union[List[_HANDLER], _HANDLER]) -> None: if not isinstance(signal_handlers, list): signal_handlers = [signal_handlers] self.signal_handlers = signal_handlers - def __call__(self, signum: Signals, frame: FrameType) -> None: + def __call__(self, signum: _SIGNUM, frame: FrameType) -> None: for signal_handler in self.signal_handlers: - signal_handler(signum, frame) + if isinstance(signal_handler, int): + signal_handler = signal.getsignal(signal_handler) + if callable(signal_handler): + signal_handler(signum, frame) class SignalConnector: - def __init__(self, trainer: "pl.Trainer"): + def __init__(self, trainer: "pl.Trainer") -> None: self.trainer = trainer self.trainer._terminate_gracefully = False + self._original_handlers: Dict[_SIGNUM, _HANDLER] = {} def register_signal_handlers(self) -> None: - sigusr1_handlers: List[Callable] = [] - sigterm_handlers: List[Callable] = [] + self._original_handlers = self._get_current_signal_handlers() + + sigusr1_handlers: List[_HANDLER] = [] + sigterm_handlers: List[_HANDLER] = [] if _fault_tolerant_training(): sigusr1_handlers.append(self.fault_tolerant_sigusr1_handler_fn) @@ -42,14 +52,14 @@ def register_signal_handlers(self) -> None: sigterm_handlers.append(self.sigterm_handler_fn) # signal.SIGUSR1 doesn't seem available on windows - if not self._is_on_windows(): - if not self._has_already_handler(signal.SIGUSR1): - signal.signal(signal.SIGUSR1, HandlersCompose(sigusr1_handlers)) + if not _IS_WINDOWS: + if sigusr1_handlers and not self._has_already_handler(signal.SIGUSR1): + self._register_signal(signal.SIGUSR1, HandlersCompose(sigusr1_handlers)) - if not 
self._has_already_handler(signal.SIGTERM): - signal.signal(signal.SIGTERM, HandlersCompose(sigterm_handlers)) + if sigterm_handlers and not self._has_already_handler(signal.SIGTERM): + self._register_signal(signal.SIGTERM, HandlersCompose(sigterm_handlers)) - def slurm_sigusr1_handler_fn(self, signum: Signals, frame: FrameType) -> None: + def slurm_sigusr1_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None: if self.trainer.is_global_zero: # save weights log.info("handling SIGUSR1") @@ -80,12 +90,49 @@ def slurm_sigusr1_handler_fn(self, signum: Signals, frame: FrameType) -> None: if self.trainer.logger: self.trainer.logger.finalize("finished") - def fault_tolerant_sigusr1_handler_fn(self, signum: Signals, frame: FrameType) -> None: + def fault_tolerant_sigusr1_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None: self.trainer._terminate_gracefully = True - def sigterm_handler_fn(self, signum: Signals, frame: FrameType) -> None: + def sigterm_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None: log.info("bypassing sigterm") + def teardown(self) -> None: + """Restores the signals that were previously configured before :class:`SignalConnector` replaced them.""" + for signum, handler in self._original_handlers.items(): + if handler is not None: + self._register_signal(signum, handler) + self._original_handlers = {} + + @staticmethod + def _get_current_signal_handlers() -> Dict[_SIGNUM, _HANDLER]: + """Collects the currently assigned signal handlers.""" + valid_signals = SignalConnector._valid_signals() + if not _IS_WINDOWS: + # SIGKILL and SIGSTOP are not allowed to be modified by the user + valid_signals -= {signal.SIGKILL, signal.SIGSTOP} + return {signum: signal.getsignal(signum) for signum in valid_signals} + + @staticmethod + def _valid_signals() -> Set[signal.Signals]: + """Returns all valid signals supported on the current platform. + + Behaves identically to :func:`signal.valid_signals` in Python 3.8+ and implements the equivalent behavior for + older Python versions.
+ """ + if sys.version_info >= (3, 8): + return signal.valid_signals() + elif _IS_WINDOWS: + # supported signals on Windows: https://docs.python.org/3/library/signal.html#signal.signal + return { + signal.SIGABRT, + signal.SIGFPE, + signal.SIGILL, + signal.SIGINT, + signal.SIGSEGV, + signal.SIGTERM, + } + return set(signal.Signals) + def _is_on_slurm(self) -> bool: # see if we're using slurm (not interactive) on_slurm = False @@ -99,11 +146,16 @@ def _is_on_slurm(self) -> bool: return on_slurm - def _is_on_windows(self) -> bool: - return sys.platform == "win32" + @staticmethod + def _has_already_handler(signum: _SIGNUM) -> bool: + return signal.getsignal(signum) not in (None, signal.SIG_DFL) - def _has_already_handler(self, signum: Signals) -> bool: - try: - return isinstance(signal.getsignal(signum), FunctionType) - except AttributeError: - return False + @staticmethod + def _register_signal(signum: _SIGNUM, handlers: _HANDLER) -> None: + if threading.current_thread() is threading.main_thread(): + signal.signal(signum, handlers) # type: ignore[arg-type] + + def __getstate__(self) -> Dict: + state = self.__dict__.copy() + state["_original_handlers"] = {} + return state diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index e149aef9a7997..fdeddcbca1e50 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -28,7 +28,7 @@ from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector from pytorch_lightning.trainer.states import RunningStage -from pytorch_lightning.trainer.supporters import CombinedLoader +from pytorch_lightning.trainer.supporters import CombinedLoader, CycleIterator from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.auto_restart import ( @@ -37,7 +37,7 @@ CaptureMapDataset, FastForwardSampler, ) -from pytorch_lightning.utilities.data import has_iterable_dataset, has_len_all_ranks +from pytorch_lightning.utilities.data import get_len, has_iterable_dataset, has_len_all_ranks from pytorch_lightning.utilities.enums import DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training @@ -50,21 +50,42 @@ class TrainerDataLoadingMixin(ABC): # this is just a summary on variables used in this abstract class, # the proper values/initialisation should be done in child class val_check_interval: float + reload_dataloaders_every_n_epochs: int tpu_local_core_rank: int train_dataloader: DataLoader - num_training_batches: Union[int, float] - val_check_batch: float - val_dataloaders: Optional[List[DataLoader]] - num_val_batches: List[Union[int, float]] - test_dataloaders: Optional[List[DataLoader]] - num_test_batches: List[Union[int, float]] limit_train_batches: Union[int, float] + num_training_batches: int + val_check_batch: float + val_dataloaders: List[DataLoader] + limit_val_batches: Union[int, float] + num_val_batches: List[int] + test_dataloaders: List[DataLoader] + limit_test_batches: Union[int, float] + num_test_batches: List[int] + predict_dataloaders: List[DataLoader] + limit_predict_batches: Union[int, float] + num_predict_batches: List[int] log_every_n_steps: int overfit_batches: Union[int, float] distributed_sampler_kwargs: dict 
accelerator: Accelerator accelerator_connector: AcceleratorConnector call_hook: Callable + current_epoch: int + _last_train_dl_reload_epoch: int + _last_val_dl_reload_epoch: int + + @property + def _should_reload_train_dl(self) -> bool: + """Check if train dataloader should be reloaded.""" + n_epochs = self.reload_dataloaders_every_n_epochs + return n_epochs and (self.current_epoch - self._last_train_dl_reload_epoch >= n_epochs) + + @property + def _should_reload_val_dl(self) -> bool: + """Check if validation dataloader should be reloaded.""" + n_epochs = self.reload_dataloaders_every_n_epochs + return n_epochs and (self.current_epoch - self._last_val_dl_reload_epoch >= n_epochs) def _worker_check(self, dataloader: DataLoader, name: str) -> None: if not isinstance(dataloader, DataLoader): @@ -107,6 +128,7 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None: ) elif dataloader.num_workers <= 2 < num_cpus and not using_spawn: + # if changed, update the `filterwarnings` snippet in 'speed.html#num-workers' rank_zero_warn( f"The dataloader, {name}, does not have many workers which may be a bottleneck." " Consider increasing the value of the `num_workers` argument`" @@ -136,14 +158,22 @@ def prepare_dataloader(self, dataloader: Any, shuffle: bool, mode: Optional[Runn if isinstance(dataloader, CombinedLoader): # apply `prepare_dataloader` on all the collection of loaders dataloader.loaders = apply_to_collection( - dataloader.loaders, DataLoader, self.prepare_dataloader, shuffle, mode=mode + dataloader.loaders, (DataLoader, CycleIterator), self.prepare_dataloader, shuffle, mode=mode ) + # the length need to recomputed across all dataloaders in case of special behavior. + dataloader._apply_cycle_iterator_length() return dataloader # don't do anything if it's not a dataloader - if not isinstance(dataloader, DataLoader): + if not isinstance(dataloader, (DataLoader, CycleIterator)): return dataloader + cycle_iterator: Optional[CycleIterator] = None + + if isinstance(dataloader, CycleIterator): + cycle_iterator = dataloader + dataloader = dataloader.loader + if ( _fault_tolerant_training() # injects components to track the state or self._requires_distributed_sampler(dataloader) # sets the distributed sampler @@ -153,13 +183,17 @@ def prepare_dataloader(self, dataloader: Any, shuffle: bool, mode: Optional[Runn sampler = self._resolve_sampler(dataloader, shuffle=shuffle, mode=mode) dataloader = self._update_dataloader(dataloader, sampler, mode=mode) + if cycle_iterator is not None: + cycle_iterator.loader = dataloader + return cycle_iterator + return dataloader def _resolve_sampler(self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None) -> Sampler: if self._requires_distributed_sampler(dataloader): if not isinstance(dataloader.sampler, (SequentialSampler, RandomSampler)): raise MisconfigurationException( - "You seem to have configured a sampler in your DataLoader. This will be replaced " + "You seem to have configured a sampler in your DataLoader. This will be replaced" " by `DistributedSampler` since `replace_sampler_ddp` is True and you are using" " distributed training. Either remove the sampler from your DataLoader or set" " `replace_sampler_ddp=False` if you want to use your custom sampler." @@ -184,7 +218,7 @@ def _dataloader_init_kwargs_resolve_sampler( batch_sampler = getattr(dataloader, "batch_sampler") is_predicting = mode == RunningStage.PREDICTING # checking the batch sampler type is different than PyTorch default. 
- if (batch_sampler is not None and type(batch_sampler) is not BatchSampler) or is_predicting: + if batch_sampler is not None and (type(batch_sampler) is not BatchSampler or is_predicting): batch_sampler = type(batch_sampler)( sampler, batch_size=batch_sampler.batch_size, @@ -238,9 +272,13 @@ def _get_dataloader_init_kwargs( # kwargs to re-construct the dataloader dl_kwargs = {k: v for k, v in attrs.items() if k in non_defaults} - dl_kwargs.update( - TrainerDataLoadingMixin._dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode=mode) - ) + if isinstance(dl_kwargs["dataset"], IterableDataset): + dl_kwargs["batch_sampler"] = None + dl_kwargs["sampler"] = None + else: + dl_kwargs.update( + TrainerDataLoadingMixin._dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode=mode) + ) required_args = { p.name @@ -282,10 +320,11 @@ def _get_dataloader_init_kwargs( dl_kwargs["sampler"] = None if _fault_tolerant_training(): - if isinstance(dl_kwargs["dataset"], IterableDataset): + dataset = dl_kwargs["dataset"] + if isinstance(dataset, IterableDataset): # wrap the `IterableDataset` into a `CaptureIterableDataset` to record sampler states. dl_kwargs["dataset"] = CaptureIterableDataset(dataset=dl_kwargs["dataset"]) - elif len(dl_kwargs["dataset"]): + elif get_len(dataset) != float("inf"): dl_kwargs["dataset"] = CaptureMapDataset(dataset=dl_kwargs["dataset"]) else: raise MisconfigurationException( @@ -396,6 +435,9 @@ def reset_train_dataloader(self, model: Optional["pl.LightningModule"] = None) - " you want to see logs for the training epoch." ) + # store epoch of dataloader reset for reload_dataloaders_every_n_epochs + self._last_train_dl_reload_epoch = self.current_epoch + def _reset_eval_dataloader( self, mode: RunningStage, model: Optional["pl.LightningModule"] = None ) -> Tuple[List[Union[int, float]], List[DataLoader]]: @@ -425,8 +467,7 @@ def _reset_eval_dataloader( for loader_i in range(len(dataloaders)): loader = dataloaders[loader_i] - if hasattr(loader, "sampler") and isinstance(loader.sampler, RandomSampler): - + if hasattr(loader, "sampler") and not isinstance(loader.sampler, SequentialSampler): # when overfitting, the dataloader should not have sampler if self.overfit_batches > 0 and mode.evaluating: rank_zero_warn( @@ -437,9 +478,11 @@ def _reset_eval_dataloader( loader, SequentialSampler(loader.dataset), mode=mode ) else: - rank_zero_warn( - f"Your `{mode.dataloader_prefix}_dataloader` has `shuffle=True`," - "it is strongly recommended that you turn this off for val/test/predict dataloaders." + apply_to_collection( + loader.loaders if isinstance(loader, CombinedLoader) else loader, + DataLoader, + self._check_eval_shuffling, + mode=mode, ) if any(dl is None for dl in dataloaders): @@ -460,7 +503,7 @@ def _reset_eval_dataloader( module = model or self.lightning_module or self.datamodule if len(dataloaders) != 0: for i, dataloader in enumerate(dataloaders): - num_batches = ( + orig_num_batches = num_batches = ( len(dataloader) if has_len_all_ranks(dataloader, self.training_type_plugin, module) else float("inf") @@ -486,7 +529,7 @@ def _reset_eval_dataloader( min_pct = 1.0 / len(dataloader) raise MisconfigurationException( f"you requested to check {limit_eval_batches} of the `{mode.dataloader_prefix}_dataloader` but" - f" {limit_eval_batches}*{num_batches} < 1. Please increase the" + f" {limit_eval_batches} * {orig_num_batches} < 1. Please increase the" f" `limit_{mode.dataloader_prefix}_batches` flag. 
Try at least" f" `limit_{mode.dataloader_prefix}_batches={min_pct}`" ) @@ -509,6 +552,9 @@ def reset_val_dataloader(self, model: Optional["pl.LightningModule"] = None) -> RunningStage.VALIDATING, model=pl_module ) + # store epoch of dataloader reset for reload_dataloaders_every_n_epochs + self._last_val_dl_reload_epoch = self.current_epoch + def reset_test_dataloader(self, model: Optional["pl.LightningModule"] = None) -> None: """Resets the test dataloader and determines the number of batches. @@ -578,16 +624,17 @@ def _add_sampler_metadata_collate(dataloader: DataLoader) -> None: @staticmethod def _resolve_overfit_batches(dataloader: Collection[DataLoader]) -> Collection[DataLoader]: - has_random_sampler = False + all_have_sequential_sampler = True - def resolve_had_random_sampler(dataloader: DataLoader): - nonlocal has_random_sampler - if not has_random_sampler: - has_random_sampler = isinstance(dataloader.sampler, RandomSampler) + def resolve_has_no_sequential_sampler(dataloader: DataLoader): + nonlocal all_have_sequential_sampler + all_have_sequential_sampler = all_have_sequential_sampler & isinstance( + dataloader.sampler, SequentialSampler + ) - apply_to_collection(dataloader, DataLoader, resolve_had_random_sampler) + apply_to_collection(dataloader, DataLoader, resolve_has_no_sequential_sampler) - if has_random_sampler: + if not all_have_sequential_sampler: rank_zero_warn( "You requested to overfit but enabled training dataloader shuffling." " We are turning off the training dataloader shuffling for you." @@ -601,3 +648,16 @@ def replace_sampler(dataloader: DataLoader) -> DataLoader: dataloader = apply_to_collection(dataloader, DataLoader, replace_sampler) return dataloader + + @staticmethod + def _check_eval_shuffling(dataloader, mode): + if ( + hasattr(dataloader, "sampler") + and not isinstance(dataloader.sampler, SequentialSampler) + and not isinstance(dataloader.dataset, IterableDataset) + ): + rank_zero_warn( + f"Your `{mode.dataloader_prefix}_dataloader` has `shuffle=True`," + " it is strongly recommended that you turn this off for val/test/predict dataloaders.", + category=UserWarning, + ) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 816f4da38f5b9..2c91fb0d245d1 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -304,10 +304,10 @@ def __len__(self) -> int: class CombinedLoader: - """Combines different dataloaders and allows sampling in parallel. Supported modes are 'min_size', which raises - StopIteration after the shortest loader (the one with the lowest number of batches) is done, and - 'max_size_cycle` which raises StopIteration after the longest loader (the one with most batches) is done, while - cycling through the shorter loaders. + """Combines different dataloaders and allows sampling in parallel. Supported modes are ``"min_size"``, which + raises StopIteration after the shortest loader (the one with the lowest number of batches) is done, and + ``"max_size_cycle"`` which raises StopIteration after the longest loader (the one with most batches) is done, + while cycling through the shorter loaders. 
Examples: >>> loaders = {'a': torch.utils.data.DataLoader(range(6), batch_size=4), @@ -457,6 +457,19 @@ def _wrap_loaders_max_size_cycle(self) -> Any: ) state.reset() + def _apply_cycle_iterator_length(self) -> None: + """When the mode is `max_size_cycle`, compute the length across all ``CycleIterator`` and re-assign it to + all dataloaders.""" + if self.mode != "max_size_cycle": + return + + def set_len(cycle_iterator: CycleIterator, length: int) -> None: + cycle_iterator.length = length + + all_lengths = apply_to_collection(self.loaders, CycleIterator, lambda c: get_len(c.loader)) + max_length = _nested_calc_num_data(all_lengths, max) + apply_to_collection(self.loaders, CycleIterator, set_len, length=max_length) + def __iter__(self) -> Any: """Create and return an iterator, `CombinedLoaderIterator`, for the combined loader.""" @@ -473,11 +486,12 @@ def __getstate__patch__(*_): return iterator @staticmethod - def _calc_num_batches(loaders: Any) -> Union[int, float]: + def _calc_num_batches(loaders: Any, mode="min_size") -> Union[int, float]: """Compute the length (aka the number of batches) of `CombinedLoader`. Args: loaders: a collections of loaders. + mode: Mode used by the CombinedLoader Returns: length: the minimum length of loaders @@ -486,10 +500,10 @@ def _calc_num_batches(loaders: Any) -> Union[int, float]: if isinstance(all_lengths, (int, float)): return all_lengths - return _nested_calc_num_data(all_lengths, min) + return _nested_calc_num_data(all_lengths, max if mode == "max_size_cycle" else min) def __len__(self) -> int: - return self._calc_num_batches(self.loaders) + return self._calc_num_batches(self.loaders, mode=self.mode) @staticmethod def _shutdown_workers_and_reset_iterator(dataloader) -> None: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7e5d21e18dc26..22df6859fb256 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -86,7 +86,7 @@ from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training -from pytorch_lightning.utilities.meta import materialize_module +from pytorch_lightning.utilities.meta import is_on_meta_device, materialize_module from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import ( @@ -569,6 +569,7 @@ def __init__( self.__init_profiler(profiler) # init logger flags + self.logger: Optional[LightningLoggerBase] self.logger_connector.on_trainer_init(logger, flush_logs_every_n_steps, log_every_n_steps, move_metrics_to_cpu) # init debugging flags @@ -662,6 +663,8 @@ def _setup_on_init(self, num_sanity_val_steps: int) -> None: self.num_val_batches = [] self.test_dataloaders = None self.val_dataloaders = None + self._last_train_dl_reload_epoch = float("-inf") + self._last_val_dl_reload_epoch = float("-inf") # when true, print evaluation results in .validate() and .test() self.verbose_evaluate = True @@ -697,6 +700,8 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs: # reset bookkeeping self.state.stage = None self.on_exception(exception) + # shutdown workers + self._data_connector.teardown() raise def fit( @@ -749,6 +754,8 @@ def _fit_impl( self.state.fn = TrainerFn.FITTING self.state.status = TrainerStatus.RUNNING self.training = True +
self._last_train_dl_reload_epoch = float("-inf") + self._last_val_dl_reload_epoch = float("-inf") # if a datamodule comes in as the second arg, then fix it for the user if isinstance(train_dataloaders, LightningDataModule): @@ -1112,17 +1119,16 @@ def _run( if hasattr(model, "hparams"): parsing.clean_namespace(model.hparams) - verify_loop_configurations(self, model) - - # attach model log function to callback - self._callback_connector.attach_model_logging_functions(model) - # attach model to the training type plugin self.training_type_plugin.connect(model) + self._callback_connector._attach_model_callbacks() + self._callback_connector._attach_model_logging_functions() + + verify_loop_configurations(self) + # hook self._data_connector.prepare_data() - self._callback_connector._attach_model_callbacks() # ---------------------------- # SET UP TRAINING @@ -1262,6 +1268,7 @@ def _post_dispatch(self): self._data_connector.teardown() self._active_loop.teardown() self.logger_connector.teardown() + self.signal_connector.teardown() def _dispatch(self): if self.evaluating: @@ -1393,7 +1400,7 @@ def __set_ckpt_path(self, ckpt_path: Optional[str], model_provided: bool, model_ " The best model of the previous `fit` call will be used." f" You can pass `{fn}(ckpt_path='best')` to use and best model" " checkpoint and avoid this warning or" - " `ckpt_path=trainer.model_checkpoint.last_model_path` to use the last model." + " `ckpt_path=trainer.checkpoint_callback.last_model_path` to use the last model." ) ckpt_path = "best" @@ -1435,10 +1442,21 @@ def _call_setup_hook(self) -> None: def _call_configure_sharded_model(self) -> None: with self.accelerator.model_sharded_context(): - materialize_module(self.lightning_module) + self._handle_meta_model() self.call_hook("configure_sharded_model") self.call_hook("on_configure_sharded_model") + def _handle_meta_model(self) -> None: + if not is_on_meta_device(self.lightning_module): + return + + if isinstance(self.training_type_plugin, DDPSpawnPlugin): + raise MisconfigurationException("LightningModule on meta device isn't supported with spawn.") + + materialize_module(self.lightning_module) + # the trainer reference is lost during materialization + self.lightning_module.trainer = proxy(self) + def _call_teardown_hook(self) -> None: fn = self.state.fn._setup_fn @@ -1812,12 +1830,6 @@ def progress_bar_dict(self) -> dict: return self.progress_bar_callback.get_metrics(self, ref_model) return self.progress_bar_metrics - @property - def _should_reload_dl_epoch(self) -> bool: - """Check if dataloader should be reloaded in the current epoch.""" - n_epochs = self.reload_dataloaders_every_n_epochs - return n_epochs and (not self.current_epoch % n_epochs) - @property def disable_validation(self) -> bool: """Check if validation is disabled during training.""" diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 3bd920c2e304b..1ce23050e0113 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -14,9 +14,9 @@ import dataclasses import operator from abc import ABC -from collections import OrderedDict +from collections import defaultdict, OrderedDict from collections.abc import Mapping, Sequence -from copy import copy +from copy import copy, deepcopy from functools import partial from typing import Any, Callable, List, Optional, Tuple, Union @@ -34,6 +34,9 @@ Batch = type(None) +_CPU_DEVICES = ("cpu", torch.device("cpu")) + + def to_dtype_tensor( value: Union[int, float, 
List[Union[int, float]]], dtype: torch.dtype, device: Union[str, torch.device] ) -> torch.Tensor: @@ -102,6 +105,8 @@ def apply_to_collection( ) if include_none or v is not None: out.append((k, v)) + if isinstance(data, defaultdict): + return elem_type(data.default_factory, OrderedDict(out)) return elem_type(OrderedDict(out)) is_namedtuple = _is_namedtuple(data) @@ -117,11 +122,21 @@ def apply_to_collection( return elem_type(*out) if is_namedtuple else elem_type(out) if _is_dataclass_instance(data): - out_dict = {} + # make a deepcopy of the data, + # but do not deepcopy mapped fields since the computation would + # be wasted on values that likely get immediately overwritten + fields = {} + memo = {} for field in dataclasses.fields(data): - if field.init: + field_value = getattr(data, field.name) + fields[field.name] = (field_value, field.init) + memo[id(field_value)] = field_value + result = deepcopy(data, memo=memo) + # apply function to each field + for field_name, (field_value, field_init) in fields.items(): + if field_init: v = apply_to_collection( - getattr(data, field.name), + field_value, dtype, function, *args, @@ -129,9 +144,10 @@ def apply_to_collection( include_none=include_none, **kwargs, ) - if include_none or v is not None: - out_dict[field.name] = v - return elem_type(**out_dict) + if not field_init or (not include_none and v is None): # retain old value + v = getattr(data, field_name) + setattr(result, field_name, v) + return result # data is neither of dtype, nor a collection return data @@ -255,7 +271,10 @@ def batch_to(data: Any) -> Any: setattr(device_data, field, device_field) return device_data - kwargs = dict(non_blocking=True) if isinstance(data, torch.Tensor) else {} + kwargs = {} + # Don't issue non-blocking transfers to CPU + if isinstance(data, torch.Tensor) and device not in _CPU_DEVICES: + kwargs["non_blocking"] = True data_output = data.to(device, **kwargs) if data_output is not None: return data_output diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py index 9d8cca7db1c69..cd63dd4d3b777 100644 --- a/pytorch_lightning/utilities/cli.py +++ b/pytorch_lightning/utilities/cli.py @@ -265,9 +265,27 @@ def _convert_argv_issue_84(classes: Tuple[Type, ...], nested_key: str, argv: Lis else: clean_argv.append(arg) i += 1 + + # the user requested a help message + help_key = argv_key + ".help" + if help_key in passed_args: + argv_class = passed_args[help_key] + if "." 
in argv_class: + # user passed the class path directly + class_path = argv_class + else: + # convert shorthand format to the classpath + for cls in classes: + if cls.__name__ == argv_class: + class_path = _class_path_from_class(cls) + break + else: + raise ValueError(f"Could not generate the class_path for {repr(argv_class)}") + return clean_argv + [help_key, class_path] + # generate the associated config file - argv_class = passed_args.pop(argv_key, None) - if argv_class is None: + argv_class = passed_args.pop(argv_key, "") + if not argv_class: # the user passed a config as a str class_path = passed_args[f"{argv_key}.class_path"] init_args_key = f"{argv_key}.init_args" @@ -377,21 +395,30 @@ def __init__( def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: # save the config in `setup` because (1) we want it to save regardless of the trainer function run # and we want to save before processes are spawned - log_dir = trainer.log_dir + log_dir = trainer.log_dir # this broadcasts the directory assert log_dir is not None config_path = os.path.join(log_dir, self.config_filename) - if not self.overwrite and os.path.isfile(config_path): - raise RuntimeError( - f"{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting" - " results of a previous run. You can delete the previous config file," - " set `LightningCLI(save_config_callback=None)` to disable config saving," - " or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file." - ) + fs = get_filesystem(log_dir) + + if not self.overwrite: + # check if the file exists on rank 0 + file_exists = fs.isfile(config_path) if trainer.is_global_zero else False + # broadcast whether to fail to all ranks + file_exists = trainer.accelerator.broadcast(file_exists) + if file_exists: + raise RuntimeError( + f"{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting" + " results of a previous run. You can delete the previous config file," + " set `LightningCLI(save_config_callback=None)` to disable config saving," + " or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file." + ) + + # save the file on rank 0 if trainer.is_global_zero: # save only on rank zero to avoid race conditions on DDP.
# the `log_dir` needs to be created as we rely on the logger to do it usually # but it hasn't logged anything at this point - get_filesystem(log_dir).makedirs(log_dir, exist_ok=True) + fs.makedirs(log_dir, exist_ok=True) self.parser.save( self.config, config_path, skip_none=False, overwrite=self.overwrite, multifile=self.multifile ) @@ -658,8 +685,8 @@ def _instantiate_trainer(self, config: Dict[str, Any], callbacks: List[Callback] config["callbacks"].append(self.trainer_defaults["callbacks"]) if self.save_config_callback and not config["fast_dev_run"]: config_callback = self.save_config_callback( - self.parser, - self.config, + self._parser(self.subcommand), + self.config.get(str(self.subcommand), self.config), self.save_config_filename, overwrite=self.save_config_overwrite, multifile=self.save_config_multifile, @@ -742,9 +769,7 @@ def configure_optimizers( def _get(self, config: Dict[str, Any], key: str, default: Optional[Any] = None) -> Any: """Utility to get a config value which might be inside a subcommand.""" - if self.subcommand is not None: - return config[self.subcommand].get(key, default) - return config.get(key, default) + return config.get(str(self.subcommand), config).get(key, default) def _run_subcommand(self, subcommand: str) -> None: """Run the chosen subcommand.""" @@ -772,8 +797,16 @@ def _prepare_subcommand_kwargs(self, subcommand: str) -> Dict[str, Any]: return fn_kwargs -def _global_add_class_path(class_type: Type, init_args: Dict[str, Any] = None) -> Dict[str, Any]: - return {"class_path": class_type.__module__ + "." + class_type.__name__, "init_args": init_args or {}} +def _class_path_from_class(class_type: Type) -> str: + return class_type.__module__ + "." + class_type.__name__ + + +def _global_add_class_path( + class_type: Type, init_args: Optional[Union[Namespace, Dict[str, Any]]] = None +) -> Dict[str, Any]: + if isinstance(init_args, Namespace): + init_args = init_args.as_dict() + return {"class_path": _class_path_from_class(class_type), "init_args": init_args or {}} def _add_class_path_generator(class_type: Type) -> Callable[[Dict[str, Any]], Dict[str, Any]]: diff --git a/pytorch_lightning/utilities/data.py b/pytorch_lightning/utilities/data.py index a75afa775848b..e6cfdcd953e61 100644 --- a/pytorch_lightning/utilities/data.py +++ b/pytorch_lightning/utilities/data.py @@ -29,7 +29,10 @@ def _extract_batch_size(batch: BType) -> Generator[int, None, None]: if isinstance(batch, torch.Tensor): - yield batch.size(0) + if batch.ndim == 0: + yield 1 + else: + yield batch.size(0) elif isinstance(batch, str): yield len(batch) elif isinstance(batch, (Iterable, Mapping)): diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 47fa7b791eae0..d4d488d973ebf 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -19,6 +19,7 @@ from typing import Any, Callable, List, Optional, Tuple, Union import torch +from torch.nn import Module from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl @@ -394,3 +395,37 @@ def init_dist_connection( f"All distributed processes registered. Starting with {world_size} processes\n" f"{'-' * 100}\n" ) + + +class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): + def _check_input_dim(self, input: torch.Tensor) -> None: + # The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc + # is this method that is overwritten by the subclass. 
+ # Here, we are bypassing some tensor sanity checks and trusting that the user + # provides the right input dimensions at inference. + return + + +def _revert_sync_batchnorm(module: Module) -> Module: + # Code adapted from https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547 + # Original author: Kapil Yedidi (@kapily) + converted_module = module + if isinstance(module, torch.nn.modules.batchnorm.SyncBatchNorm): + # Unfortunately, SyncBatchNorm does not store the original class - if it did + # we could return the one that was originally created. + converted_module = _BatchNormXd( + module.num_features, module.eps, module.momentum, module.affine, module.track_running_stats + ) + if module.affine: + with torch.no_grad(): + converted_module.weight = module.weight + converted_module.bias = module.bias + converted_module.running_mean = module.running_mean + converted_module.running_var = module.running_var + converted_module.num_batches_tracked = module.num_batches_tracked + if hasattr(module, "qconfig"): + converted_module.qconfig = module.qconfig + for name, child in module.named_children(): + converted_module.add_module(name, _revert_sync_batchnorm(child)) + del module + return converted_module diff --git a/pytorch_lightning/utilities/fetching.py b/pytorch_lightning/utilities/fetching.py index fd9baf3e9c4f1..9b80d2f9874c7 100644 --- a/pytorch_lightning/utilities/fetching.py +++ b/pytorch_lightning/utilities/fetching.py @@ -206,15 +206,15 @@ def reset(self) -> None: self.batches: List = [] self.fetched: int = 0 self.done: bool = False + + def teardown(self) -> None: + self.reset() if isinstance(self.dataloader, CombinedLoader): self.dataloader.reset() if isinstance(self.dataloader, DataLoader): CombinedLoader._shutdown_workers_and_reset_iterator(self.dataloader) self.dataloader_iter = None - def teardown(self) -> None: - self.reset() - class DataFetcher(AbstractDataFetcher): diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index edf5f75aee6a9..247dfcc71b7c1 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -59,7 +59,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: pkg_version = Version(pkg.__version__) else: # try pkg_resources to infer version - pkg_version = Version(pkg_resources.get_distribution(pkg).version) + pkg_version = Version(pkg_resources.get_distribution(package).version) except TypeError: # this is mocked by Sphinx, so it should return True to generate all summaries return True diff --git a/pytorch_lightning/utilities/meta.py b/pytorch_lightning/utilities/meta.py index 60e6cc791b7ae..6d3c1d6b5f11b 100644 --- a/pytorch_lightning/utilities/meta.py +++ b/pytorch_lightning/utilities/meta.py @@ -18,13 +18,14 @@ from functools import partial from itertools import chain from types import ModuleType -from typing import Callable, Dict, Generator, Iterator, List, Optional, Set, Type +from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Type import torch from torch import nn, Tensor from torch.nn import Module from torch.nn.modules.container import ModuleDict, ModuleList, Sequential +import pytorch_lightning as pl from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 @@ -191,7 +192,6 @@ def materialize_module(root_module: nn.Module) -> nn.Module: # cache subclasses 
to optimize the search when resetting the meta device later on. __STORAGE_META__ = {} - __CREATED_MODULES__ = set() @@ -237,45 +237,52 @@ def _set_meta_device() -> None: for subclass in get_all_subclasses(torch.nn.modules.module.Module): - if isinstance(subclass, (Sequential, ModuleList, ModuleDict)): + if subclass in (Sequential, ModuleList, ModuleDict, pl.LightningModule): continue # if a subclass has already been stored, we should use the cache if str(subclass) in __STORAGE_META__: - # reset the class import package to its rightfull state. + # reset the class import package to its rightful state. mods, subclass, meta_class = __STORAGE_META__[subclass] for mod in mods: setattr(mod, subclass.__name__, meta_class) continue + class _IsinstanceMetaclass(type(subclass)): + def __instancecheck__(self, instance: Any) -> bool: + """Overrides the ``isinstance`` check on ``_MaterializerModule`` objects.""" + return isinstance(instance, self.__bases__[0]) + # Create a class subclassing current `subclass` overriding its new method. # this will enable use to use `torch.distributed.nn.utils.init_meta` to create a `meta` # version of the current subclass module - class _MetaClass(subclass): + class _MaterializerModule(subclass, metaclass=_IsinstanceMetaclass): @classmethod @contextmanager - def instantiation_context(cls, materialize: bool): + def instantiation_context(cls): _unset_meta_device(from_created=True) yield _set_meta_device_populated(from_created=True) @classmethod def materialize(cls, materialize_fn: Callable): - with cls.instantiation_context(materialize=True): + with cls.instantiation_context(): obj = materialize_fn() return obj @staticmethod def add_subclasses(subclass): - """This is used to unrol the instantion tree while creating the modules.""" - __CREATED_MODULES__.add(subclass) + """This is used to unroll the instantiation tree while creating the modules.""" + # Don't store the LightningModule as skipped from the Meta process. + if subclass != pl.LightningModule: + __CREATED_MODULES__.add(subclass) if subclass.__bases__[0] != torch.nn.modules.module.Module: - _MetaClass.add_subclasses(subclass.__bases__[0]) + _MaterializerModule.add_subclasses(subclass.__bases__[0]) def __new__(cls, *args, **kwargs): subclass = cls.__bases__[0] cls.add_subclasses(subclass) - with cls.instantiation_context(materialize=False): + with cls.instantiation_context(): obj = init_meta(subclass, *args, **kwargs) obj.materialize = partial(cls.materialize, materialize_fn=obj.materialize) @@ -294,9 +301,8 @@ def search(mod: ModuleType) -> List[ModuleType]: # nn.Module class can be imported at different level and they all need to be mocked. 
# Example: torch.nn.Linear is actually torch.nn.modules.linear.Linear # Therefore, torch.nn.Linear, torch.nn.modules.Linear, torch.nn.modules.linear.Linear - # needs to be replaced by the torch.nn.linear.modules.Linear _MetaClass - out = [] - out.append(search(mod)) + # needs to be replaced by the torch.nn.linear.modules.Linear _MaterializerModule + out = [search(mod)] for name in submodules[1:]: mod = getattr(mod, name) out.append(search(mod)) @@ -305,11 +311,11 @@ def search(mod: ModuleType) -> List[ModuleType]: mods = [mod for mod in chain(*out) if mod] # store the modules search so it doesn't have to be performed again for this class - __STORAGE_META__[subclass] = (mods, subclass, _MetaClass) + __STORAGE_META__[subclass] = (mods, subclass, _MaterializerModule) # replace all subclass by its meta form for mod in mods: - setattr(mod, subclass.__name__, _MetaClass) + setattr(mod, subclass.__name__, _MaterializerModule) @contextmanager @@ -321,3 +327,11 @@ def init_meta_context() -> Generator: _set_meta_device() yield _unset_meta_device() + + +def is_on_meta_device(module: nn.Module) -> bool: + try: + param = next(module.parameters()) + return param.device.type == "meta" + except StopIteration: + return False diff --git a/pytorch_lightning/utilities/model_helpers.py b/pytorch_lightning/utilities/model_helpers.py index 3146b33fe153d..90707279500e0 100644 --- a/pytorch_lightning/utilities/model_helpers.py +++ b/pytorch_lightning/utilities/model_helpers.py @@ -47,6 +47,8 @@ def is_overridden( raise ValueError("Expected a parent") instance_attr = getattr(instance, method_name, None) + if instance_attr is None: + return False # `functools.wraps()` support if hasattr(instance_attr, "__wrapped__"): instance_attr = instance_attr.__wrapped__ diff --git a/pytorch_lightning/utilities/model_summary.py b/pytorch_lightning/utilities/model_summary.py index 9c2690202df90..83f9861b0f550 100644 --- a/pytorch_lightning/utilities/model_summary.py +++ b/pytorch_lightning/utilities/model_summary.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib import logging +import sys from collections import OrderedDict from typing import Any, Dict, List, Optional, Tuple, Union @@ -23,7 +25,7 @@ from torch.utils.hooks import RemovableHandle import pytorch_lightning as pl -from pytorch_lightning.utilities import AMPType, DeviceType, ModelSummaryMode, rank_zero_deprecation +from pytorch_lightning.utilities import ModelSummaryMode, rank_zero_deprecation from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 from pytorch_lightning.utilities.warnings import WarningCache @@ -282,12 +284,17 @@ def _forward_example_input(self) -> None: input_ = model.example_input_array input_ = model._apply_batch_transfer_handler(input_) - if trainer is not None and trainer.amp_backend == AMPType.NATIVE and trainer._device_type != DeviceType.TPU: - model.forward = torch.cuda.amp.autocast()(model.forward) - mode = model.training model.eval() - with torch.no_grad(): + + if trainer is not None: + forward_context = trainer.precision_plugin.forward_context() + elif sys.version_info >= (3, 7): + forward_context = contextlib.nullcontext() + else: + forward_context = contextlib.suppress() + + with torch.no_grad(), forward_context: # let the model hooks collect the input- and output shapes if isinstance(input_, (list, tuple)): model(*input_) diff --git a/requirements.txt b/requirements.txt index 69074cbfb249c..dd34e9273c31e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # the default package dependencies numpy>=1.17.2 -torch>=1.6 +torch>=1.7.* future>=0.17.1 # required for builtins in setup.py tqdm>=4.41.0 PyYAML>=5.1 @@ -11,3 +11,4 @@ torchmetrics>=0.4.1 pyDeprecate==0.3.1 packaging>=17.0 typing-extensions +setuptools==59.5.0 # required for https://github.com/pytorch/pytorch/pull/69904 diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index 3ebb3c28835b3..bdff1af6a7771 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -6,7 +6,8 @@ # IMPORTANT: this list needs to be sorted in reverse VERSIONS = [ dict(torch="1.11.0", torchvision="0.11.*", torchtext=""), # nightly - dict(torch="1.10.0", torchvision="0.11.1", torchtext="0.11.0"), # stable + dict(torch="1.10.1", torchvision="0.11.2", torchtext="0.11.1"), # stable + dict(torch="1.10.0", torchvision="0.11.1", torchtext="0.11.0"), dict(torch="1.9.1", torchvision="0.10.1", torchtext="0.10.1"), dict(torch="1.9.0", torchvision="0.10.0", torchtext="0.10.0"), # dict(torch="1.8.2", torchvision="0.9.1", torchtext="0.9.1"), # LTS # Not on PyPI, commented so 1.8.1 is used @@ -14,7 +15,6 @@ dict(torch="1.8.0", torchvision="0.9.0", torchtext="0.9.0"), dict(torch="1.7.1", torchvision="0.8.2", torchtext="0.8.1"), dict(torch="1.7.0", torchvision="0.8.1", torchtext="0.8.0"), - dict(torch="1.6.0", torchvision="0.7.0", torchtext="0.7"), ] @@ -33,28 +33,59 @@ def find_latest(ver: str) -> Dict[str, str]: raise ValueError(f"Missing {ver} in {VERSIONS}") -def main(path_req: str, torch_version: Optional[str] = None) -> None: +def main(req: str, torch_version: Optional[str] = None) -> str: if not torch_version: import torch torch_version = torch.__version__ assert torch_version, f"invalid torch: {torch_version}" - with open(path_req) as fp: - req = fp.read() - # remove comments - req = re.sub(rf"\s*#.*{os.linesep}", os.linesep, req) + # remove comments and strip whitespace + req = re.sub(rf"\s*#.*{os.linesep}", os.linesep, req).strip() latest = 
find_latest(torch_version) for lib, version in latest.items(): - replace = f"{lib}=={version}" if version else lib - replace += os.linesep - req = re.sub(rf"{lib}[>=]*[\d\.]*{os.linesep}", replace, req) + replace = f"{lib}=={version}" if version else "" + req = re.sub(rf"\b{lib}(?!\w).*", replace, req) - print(req) # on purpose - to debug - with open(path_req, "w") as fp: - fp.write(req) + return req + + +def test(): + requirements = """ + torch>=1.2.* + torch==1.2.3 + torch==1.4 + torch + future>=0.17.1 + pytorch==1.5.6+123dev0 + torchvision + torchmetrics>=0.4.1 + """ + expected = """ + torch==1.9.1 + torch==1.9.1 + torch==1.9.1 + torch==1.9.1 + future>=0.17.1 + pytorch==1.5.6+123dev0 + torchvision==0.10.1 + torchmetrics>=0.4.1 + """.strip() + actual = main(requirements, "1.9") + assert actual == expected, (actual, expected) if __name__ == "__main__": - main(*sys.argv[1:]) + test() # sanity check + + if len(sys.argv) == 3: + requirements_path, torch_version = sys.argv[1:] + else: + requirements_path, torch_version = sys.argv[1], None + + with open(requirements_path, "r+") as fp: + requirements = fp.read() + requirements = main(requirements, torch_version) + print(requirements) # on purpose - to debug + fp.write(requirements) diff --git a/requirements/docs.txt b/requirements/docs.txt index 40b7b5919f90d..e9fea736f8b68 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -7,7 +7,7 @@ docutils>=0.16,<0.18 # Sphinx not yet compatible with docutils >= 0.18 sphinxcontrib-fulltoc>=1.0 sphinxcontrib-mockautodoc https://github.com/PyTorchLightning/lightning_sphinx_theme/archive/master.zip#egg=pt-lightning-sphinx-theme -sphinx-autodoc-typehints>=1.0 +sphinx-autodoc-typehints>=1.0,<1.15 # v1.15 failing on master (#11405) sphinx-paramlinks>=0.5.1 sphinx-togglebutton>=0.2 sphinx-copybutton>=0.3 diff --git a/requirements/examples.txt b/requirements/examples.txt index e38f1f92bcb83..8591f9bd509c2 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,3 +1,3 @@ -torchvision>=0.7 +torchvision>=0.8.* gym>=0.17.0 ipython[all] diff --git a/requirements/extra.txt b/requirements/extra.txt index e3763fcae487b..74743185e880c 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -2,9 +2,9 @@ matplotlib>3.1 horovod>=0.21.2 # no need to install with [pytorch] as pytorch is already installed -torchtext>=0.7 +torchtext>=0.8.* omegaconf>=2.0.5 hydra-core>=1.0.5 -jsonargparse[signatures]>=3.19.3 +jsonargparse[signatures]>=4.0.4,<5.0.0 gcsfs>=2021.5.0 rich>=10.2.2 diff --git a/requirements/test.txt b/requirements/test.txt index de749e2339f10..941b53dc8c102 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -2,9 +2,8 @@ coverage>5.2.0 codecov>=2.1 pytest>=6.0 pytest-rerunfailures>=10.2 -check-manifest twine==3.2 -mypy>=0.900 +mypy>=0.920 flake8>=3.9.2 pre-commit>=1.0 diff --git a/setup.cfg b/setup.cfg index 9d63c0e556341..20f00ef8ae102 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,14 +66,6 @@ ignore = W503 # Ignore "Line break occurred before a binary operator" E203 # Ignore "whitespace before ':'" -# setup.cfg or tox.ini -[check-manifest] -ignore = - *.yml - .github - .github/* - .circleci - [metadata] license_file = LICENSE diff --git a/setup.py b/setup.py index ddbae8b974a03..9d54a0d5641f5 100755 --- a/setup.py +++ b/setup.py @@ -74,10 +74,10 @@ def _load_py_module(fname, pkg="pytorch_lightning"): url=about.__homepage__, download_url="https://github.com/PyTorchLightning/pytorch-lightning", license=about.__license__, - 
packages=find_packages(exclude=["tests", "tests/*", "benchmarks", "legacy", "legacy/*"]), + packages=find_packages(exclude=["tests*", "pl_examples*", "legacy*"]), + include_package_data=True, long_description=long_description, long_description_content_type="text/markdown", - include_package_data=True, zip_safe=False, keywords=["deep learning", "pytorch", "AI"], python_requires=">=3.6", diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 810f96bfdd08d..3cb9ed382f5be 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -24,7 +24,6 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.callbacks import Callback from pytorch_lightning.plugins import ( DataParallelPlugin, DDP2Plugin, @@ -45,7 +44,6 @@ ) from pytorch_lightning.utilities import DeviceType, DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf @@ -99,25 +97,16 @@ def test_accelerator_choice_ddp_spawn(cuda_available_mock, device_count_mock): }, ) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_slurm(setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_slurm(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=2) @mock.patch.dict( os.environ, { @@ -131,25 +120,16 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp2_slurm(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDP2Plugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = 
Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp2_slurm(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -163,24 +143,15 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_te(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -194,21 +165,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp2_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDP2Plugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp2_te(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 @mock.patch.dict( @@ -216,24 +179,15 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", 
return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_cpu_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_cpu_te(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -247,21 +201,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_kubeflow(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=1, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_kubeflow(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=1) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 + assert trainer.training_type_plugin.local_rank == 0 @mock.patch.dict( @@ -276,21 +222,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_cpu_kubeflow(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=1, callbacks=[CB()]) - - with 
pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_cpu_kubeflow(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=1) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 + assert trainer.training_type_plugin.local_rank == 0 @mock.patch.dict( @@ -306,25 +244,17 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_cpu_slurm(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.local_rank == 0 -@RunIf(special=True) -def test_accelerator_choice_ddp_cpu_and_plugin(tmpdir): +@RunIf(skip_windows=True, standalone=True) +def test_accelerator_choice_ddp_cpu_and_strategy(tmpdir): """Test that accelerator="ddp_cpu" can work together with an instance of DDPPlugin.""" _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class=DDPPlugin) @@ -336,8 +266,6 @@ def test_accelerator_choice_ddp_cpu_and_plugin_spawn(tmpdir): def _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class): - - model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, plugins=[ddp_plugin_class(find_unused_parameters=True)], @@ -349,7 +277,6 @@ def _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class): assert isinstance(trainer.accelerator, CPUAccelerator) assert trainer.training_type_plugin.num_processes == 2 assert trainer.training_type_plugin.parallel_devices == [torch.device("cpu")] * 2 - trainer.fit(model) @mock.patch.dict( @@ -432,19 +359,11 @@ class DistributedPlugin(DDPPlugin): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_dist_backend_accelerator_mapping(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - 
trainer.fit(model) +def test_dist_backend_accelerator_mapping(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert trainer.training_type_plugin.local_rank == 0 @mock.patch("pytorch_lightning.utilities._IS_INTERACTIVE", return_value=True) @@ -775,24 +694,15 @@ def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock): @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp", DDPPlugin()]) def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=2) @mock.patch.dict( os.environ, { @@ -808,24 +718,15 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp2", DDP2Plugin()]) def test_strategy_choice_ddp2_slurm(device_count_mock, setup_distributed_mock, strategy): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDP2Plugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -839,24 +740,15 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", 
autospec=True) -def test_strategy_choice_ddp_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp_te(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -870,21 +762,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp2_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDP2Plugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp2_te(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 @mock.patch.dict( @@ -892,24 +776,15 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp_cpu_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp_cpu_te(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2) + 
assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -923,21 +798,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp_kubeflow(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp_kubeflow(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 + assert trainer.training_type_plugin.local_rank == 0 @mock.patch.dict( @@ -952,21 +819,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp_cpu_kubeflow(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp_cpu_kubeflow(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 + assert trainer.training_type_plugin.local_rank == 0 @mock.patch.dict( @@ -984,20 +843,11 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp", DDPPlugin()]) def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock, strategy): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert 
trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy=strategy, num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, strategy=strategy, num_processes=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.local_rank == 0 def test_unsupported_tpu_choice(monkeypatch): diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 6b28640e92ab4..db2f388971c12 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -108,17 +108,9 @@ def setup(self, stage: Optional[str] = None) -> None: trainer.fit(model) -@RunIf(min_gpus=2, min_torch="1.8.1", special=True) -def test_ddp_wrapper_16(tmpdir): - _test_ddp_wrapper(tmpdir, precision=16) - - -@RunIf(min_gpus=2, min_torch="1.8.1", special=True) -def test_ddp_wrapper_32(tmpdir): - _test_ddp_wrapper(tmpdir, precision=32) - - -def _test_ddp_wrapper(tmpdir, precision): +@RunIf(min_gpus=2, min_torch="1.8.1", standalone=True) +@pytest.mark.parametrize("precision", (16, 32)) +def test_ddp_wrapper(tmpdir, precision): """Test parameters to ignore are carried over for DDP.""" class WeirdModule(torch.nn.Module): diff --git a/tests/accelerators/test_gpu.py b/tests/accelerators/test_gpu.py index 85ce0cd9f0f18..764630f30b0b1 100644 --- a/tests/accelerators/test_gpu.py +++ b/tests/accelerators/test_gpu.py @@ -1,8 +1,25 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from unittest import mock + import torch +from pytorch_lightning import Trainer from pytorch_lightning.accelerators import GPUAccelerator from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin +from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -34,3 +51,20 @@ def test_get_nvidia_gpu_stats(tmpdir): for f in fields: assert any(f in h for h in gpu_stats.keys()) + + +@RunIf(min_gpus=1) +@mock.patch("torch.cuda.set_device") +def test_set_cuda_device(set_device_mock, tmpdir): + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="gpu", + devices=1, + enable_checkpointing=False, + enable_model_summary=False, + enable_progress_bar=False, + ) + trainer.fit(model) + set_device_mock.assert_called_once() diff --git a/tests/accelerators/test_multi_nodes_gpu.py b/tests/accelerators/test_multi_nodes_gpu.py index 0df49a41b0fd0..09f632746b1dd 100644 --- a/tests/accelerators/test_multi_nodes_gpu.py +++ b/tests/accelerators/test_multi_nodes_gpu.py @@ -31,7 +31,7 @@ # TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml) # use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)` @pytest.mark.skip("Multi-node testing is currently disabled") -@RunIf(special=True) +@RunIf(standalone=True) def test_logging_sync_dist_true_ddp(tmpdir): """Tests to ensure that the sync_dist flag works with CPU (should just return the original value)""" fake_result = 1 @@ -68,7 +68,7 @@ def validation_step(self, batch, batch_idx): # TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml) # use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)` @pytest.mark.skip("Multi-node testing is currently disabled") -@RunIf(special=True) +@RunIf(standalone=True) def test__validation_step__log(tmpdir): """Tests that validation_step can log.""" diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/benchmarks/generate_comparison.py b/tests/benchmarks/generate_comparison.py similarity index 97% rename from benchmarks/generate_comparison.py rename to tests/benchmarks/generate_comparison.py index 5a9cde0d80ed3..bc95b5d9cf591 100644 --- a/benchmarks/generate_comparison.py +++ b/tests/benchmarks/generate_comparison.py @@ -16,7 +16,7 @@ import matplotlib.pylab as plt import pandas as pd -from benchmarks.test_basic_parity import measure_loops +from tests.benchmarks.test_basic_parity import measure_loops from tests.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN NUM_EPOCHS = 20 diff --git a/benchmarks/test_basic_parity.py b/tests/benchmarks/test_basic_parity.py similarity index 100% rename from benchmarks/test_basic_parity.py rename to tests/benchmarks/test_basic_parity.py diff --git a/benchmarks/test_sharded_parity.py b/tests/benchmarks/test_sharded_parity.py similarity index 100% rename from benchmarks/test_sharded_parity.py rename to tests/benchmarks/test_sharded_parity.py diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 2b4fe9f05eb87..1540cbeba5189 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -381,7 +381,7 @@ def on_train_end(self) -> None: _ES_CHECK = dict(check_on_train_epoch_end=True) _ES_CHECK_P3 = dict(patience=3, 
check_on_train_epoch_end=True) -_NO_WIN = dict(marks=RunIf(skip_windows=True)) +_SPAWN_MARK = dict(marks=RunIf(skip_windows=True, skip_49370=True)) @pytest.mark.parametrize( @@ -389,8 +389,8 @@ def on_train_end(self) -> None: [ ([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, None, 1), ([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, None, 1), - pytest.param([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, "ddp_spawn", 2, **_NO_WIN), - pytest.param([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, "ddp_spawn", 2, **_NO_WIN), + pytest.param([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, "ddp_spawn", 2, **_SPAWN_MARK), + pytest.param([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, "ddp_spawn", 2, **_SPAWN_MARK), ([EarlyStopping("abc", **_ES_CHECK), EarlyStopping("cba", **_ES_CHECK_P3)], 3, True, None, 1), ([EarlyStopping("cba", **_ES_CHECK_P3), EarlyStopping("abc", **_ES_CHECK)], 3, True, None, 1), pytest.param( @@ -399,7 +399,7 @@ def on_train_end(self) -> None: True, "ddp_spawn", 2, - **_NO_WIN, + **_SPAWN_MARK, ), pytest.param( [EarlyStopping("cba", **_ES_CHECK_P3), EarlyStopping("abc", **_ES_CHECK)], @@ -407,7 +407,7 @@ def on_train_end(self) -> None: True, "ddp_spawn", 2, - **_NO_WIN, + **_SPAWN_MARK, ), ], ) @@ -469,3 +469,16 @@ def validation_step(self, batch, batch_idx): assert trainer.global_step == len(side_effect) * int(trainer.limit_train_batches * trainer.val_check_interval) else: assert trainer.current_epoch == len(side_effect) * trainer.check_val_every_n_epoch - 1 + + +def test_early_stopping_squeezes(): + early_stopping = EarlyStopping(monitor="foo") + trainer = Trainer() + trainer.callback_metrics["foo"] = torch.tensor([[[0]]]) + + with mock.patch( + "pytorch_lightning.callbacks.EarlyStopping._evaluate_stopping_criteria", return_value=(False, "") + ) as es_mock: + early_stopping._run_early_stopping_check(trainer) + + es_mock.assert_called_once_with(torch.tensor(0)) diff --git a/tests/callbacks/test_gpu_stats_monitor.py b/tests/callbacks/test_gpu_stats_monitor.py index 5ed3f533b5588..ca9197c6a078c 100644 --- a/tests/callbacks/test_gpu_stats_monitor.py +++ b/tests/callbacks/test_gpu_stats_monitor.py @@ -83,7 +83,7 @@ def test_gpu_stats_monitor_no_queries(tmpdir): with mock.patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics") as log_metrics_mock: trainer.fit(model) - assert log_metrics_mock.mock_calls[2:] == [ + assert log_metrics_mock.mock_calls[1:] == [ mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=0), mock.call({"batch_time/inter_step (ms)": mock.ANY}, step=1), mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=1), diff --git a/tests/callbacks/test_prediction_writer.py b/tests/callbacks/test_prediction_writer.py index 75e0dbd31ec79..f086316052995 100644 --- a/tests/callbacks/test_prediction_writer.py +++ b/tests/callbacks/test_prediction_writer.py @@ -11,54 +11,132 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
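# Illustrative sketch with hypothetical parameter values: `RunIf(...)` evaluates to a
# `pytest.mark.skipif` marker, so bundling it as `dict(marks=...)` (the `_SPAWN_MARK`
# helper above) lets a single `pytest.param` entry carry the skip-on-Windows /
# skip-on-pytorch-issue-49370 condition while the other entries run everywhere.
import pytest

from tests.helpers.runif import RunIf

_SPAWN_MARK = dict(marks=RunIf(skip_windows=True, skip_49370=True))


@pytest.mark.parametrize(
    "strategy, num_processes",
    [(None, 1), pytest.param("ddp_spawn", 2, **_SPAWN_MARK)],
)
def test_hypothetical_spawn_variant(strategy, num_processes):
    ...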
+from unittest.mock import ANY, call, Mock import pytest +from torch.utils.data import DataLoader +import pytorch_lightning as pl from pytorch_lightning import Trainer from pytorch_lightning.callbacks import BasePredictionWriter +from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers import BoringModel +from tests.helpers import BoringModel, RandomDataset +from tests.helpers.runif import RunIf -def test_prediction_writer(tmpdir): - class CustomPredictionWriter(BasePredictionWriter): - def __init__(self, writer_interval: str): - super().__init__(writer_interval) +class DummyPredictionWriter(BasePredictionWriter): + def write_on_batch_end(self, *args, **kwargs): + pass - self.write_on_batch_end_called = False - self.write_on_epoch_end_called = False + def write_on_epoch_end(self, *args, **kwargs): + pass - def write_on_batch_end(self, *args, **kwargs): - self.write_on_batch_end_called = True - - def write_on_epoch_end(self, *args, **kwargs): - self.write_on_epoch_end_called = True +def test_prediction_writer_invalid_write_interval(): + """Test that configuring an unknown interval name raises an error.""" with pytest.raises(MisconfigurationException, match=r"`write_interval` should be one of \['batch"): - CustomPredictionWriter("something") + DummyPredictionWriter("something") + + +def test_prediction_writer_hook_call_intervals(tmpdir): + """Test that the `write_on_batch_end` and `write_on_epoch_end` hooks get invoked based on the defined + interval.""" + DummyPredictionWriter.write_on_batch_end = Mock() + DummyPredictionWriter.write_on_epoch_end = Mock() + + dataloader = DataLoader(RandomDataset(32, 64)) model = BoringModel() - cb = CustomPredictionWriter("batch_and_epoch") + cb = DummyPredictionWriter("batch_and_epoch") trainer = Trainer(limit_predict_batches=4, callbacks=cb) - results = trainer.predict(model, dataloaders=model.train_dataloader()) + results = trainer.predict(model, dataloaders=dataloader) assert len(results) == 4 - assert cb.write_on_batch_end_called - assert cb.write_on_epoch_end_called + assert cb.write_on_batch_end.call_count == 4 + assert cb.write_on_epoch_end.call_count == 1 + + DummyPredictionWriter.write_on_batch_end.reset_mock() + DummyPredictionWriter.write_on_epoch_end.reset_mock() - cb = CustomPredictionWriter("batch_and_epoch") + cb = DummyPredictionWriter("batch_and_epoch") trainer = Trainer(limit_predict_batches=4, callbacks=cb) - trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=False) - assert cb.write_on_batch_end_called - assert cb.write_on_epoch_end_called + trainer.predict(model, dataloaders=dataloader, return_predictions=False) + assert cb.write_on_batch_end.call_count == 4 + assert cb.write_on_epoch_end.call_count == 1 + + DummyPredictionWriter.write_on_batch_end.reset_mock() + DummyPredictionWriter.write_on_epoch_end.reset_mock() - cb = CustomPredictionWriter("batch") + cb = DummyPredictionWriter("batch") trainer = Trainer(limit_predict_batches=4, callbacks=cb) - trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=False) - assert cb.write_on_batch_end_called - assert not cb.write_on_epoch_end_called + trainer.predict(model, dataloaders=dataloader, return_predictions=False) + assert cb.write_on_batch_end.call_count == 4 + assert cb.write_on_epoch_end.call_count == 0 - cb = CustomPredictionWriter("epoch") + DummyPredictionWriter.write_on_batch_end.reset_mock() + 
DummyPredictionWriter.write_on_epoch_end.reset_mock() + + cb = DummyPredictionWriter("epoch") trainer = Trainer(limit_predict_batches=4, callbacks=cb) - trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=False) - assert not cb.write_on_batch_end_called - assert cb.write_on_epoch_end_called + trainer.predict(model, dataloaders=dataloader, return_predictions=False) + assert cb.write_on_batch_end.call_count == 0 + assert cb.write_on_epoch_end.call_count == 1 + + +@pytest.mark.parametrize("num_workers", [0, pytest.param(2, marks=RunIf(slow=True))]) +def test_prediction_writer_batch_indices(tmpdir, num_workers): + DummyPredictionWriter.write_on_batch_end = Mock() + DummyPredictionWriter.write_on_epoch_end = Mock() + + dataloader = DataLoader(RandomDataset(32, 64), batch_size=4, num_workers=num_workers) + model = BoringModel() + writer = DummyPredictionWriter("batch_and_epoch") + trainer = Trainer(limit_predict_batches=4, callbacks=writer) + trainer.predict(model, dataloaders=dataloader) + + writer.write_on_batch_end.assert_has_calls( + [ + call(trainer, model, ANY, [0, 1, 2, 3], ANY, 0, 0), + call(trainer, model, ANY, [4, 5, 6, 7], ANY, 1, 0), + call(trainer, model, ANY, [8, 9, 10, 11], ANY, 2, 0), + call(trainer, model, ANY, [12, 13, 14, 15], ANY, 3, 0), + ] + ) + + writer.write_on_epoch_end.assert_has_calls( + [ + call(trainer, model, ANY, [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]]), + ] + ) + + +def test_prediction_writer_partial_support_for_combined_loader(tmpdir): + """Test partial support for CombinedLoader: prediction works but sample indices don't get tracked.""" + pl.loops.epoch.prediction_epoch_loop.warning_cache.clear() + + class PredictionModel(BoringModel): + def predict_dataloader(self): + return CombinedLoader( + { + "a": DataLoader(RandomDataset(32, 8), batch_size=2), + "b": DataLoader(RandomDataset(32, 8), batch_size=4), + } + ) + + def predict_step(self, batch, *args, **kwargs): + return self(batch["a"]) + + DummyPredictionWriter.write_on_batch_end = Mock() + DummyPredictionWriter.write_on_epoch_end = Mock() + + model = PredictionModel() + writer = DummyPredictionWriter("batch_and_epoch") + trainer = Trainer(callbacks=writer) + with pytest.warns(UserWarning, match="Lightning couldn't infer the indices fetched for your dataloader."): + trainer.predict(model) + + writer.write_on_batch_end.assert_has_calls( + [call(trainer, model, ANY, [], ANY, 0, 0), call(trainer, model, ANY, [], ANY, 1, 0)] + ) + + writer.write_on_epoch_end.assert_has_calls([call(trainer, model, ANY, [[]])]) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 1c1f84b5b95a0..f63892df94310 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -160,34 +160,25 @@ def test_pruning_callback( ) -@RunIf(special=True, min_gpus=2) -def test_pruning_callback_ddp_0(tmpdir): +@RunIf(standalone=True, min_gpus=2) +@pytest.mark.parametrize("parameters_to_prune", (False, True)) +@pytest.mark.parametrize("use_global_unstructured", (False, True)) +def test_pruning_callback_ddp(tmpdir, parameters_to_prune, use_global_unstructured): train_with_pruning_callback( - tmpdir, parameters_to_prune=False, use_global_unstructured=False, strategy="ddp", gpus=2 + tmpdir, + parameters_to_prune=parameters_to_prune, + use_global_unstructured=use_global_unstructured, + strategy="ddp", + gpus=2, ) -@RunIf(special=True, min_gpus=2) -def test_pruning_callback_ddp_1(tmpdir): - train_with_pruning_callback(tmpdir, 
parameters_to_prune=False, use_global_unstructured=True, strategy="ddp", gpus=2) - - -@RunIf(special=True, min_gpus=2) -def test_pruning_callback_ddp_2(tmpdir): - train_with_pruning_callback(tmpdir, parameters_to_prune=True, use_global_unstructured=False, strategy="ddp", gpus=2) - - -@RunIf(special=True, min_gpus=2) -def test_pruning_callback_ddp_3(tmpdir): - train_with_pruning_callback(tmpdir, parameters_to_prune=True, use_global_unstructured=True, strategy="ddp", gpus=2) - - @RunIf(min_gpus=2, skip_windows=True) def test_pruning_callback_ddp_spawn(tmpdir): train_with_pruning_callback(tmpdir, use_global_unstructured=True, strategy="ddp_spawn", gpus=2) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_pruning_callback_ddp_cpu(tmpdir): train_with_pruning_callback(tmpdir, parameters_to_prune=True, strategy="ddp_spawn", num_processes=2) diff --git a/tests/callbacks/test_rich_progress_bar.py b/tests/callbacks/test_rich_progress_bar.py index 6c0a201c794c3..ea38287d1789f 100644 --- a/tests/callbacks/test_rich_progress_bar.py +++ b/tests/callbacks/test_rich_progress_bar.py @@ -85,7 +85,7 @@ def predict_dataloader(self): def test_rich_progress_bar_import_error(): if not _RICH_AVAILABLE: - with pytest.raises(ImportError, match="`RichProgressBar` requires `rich` to be installed."): + with pytest.raises(ImportError, match="`RichProgressBar` requires `rich` >= 10.2.2."): Trainer(callbacks=RichProgressBar()) @@ -106,11 +106,11 @@ def test_rich_progress_bar_custom_theme(tmpdir): assert progress_bar.theme == theme args, kwargs = mocks["CustomBarColumn"].call_args - assert kwargs["complete_style"] == theme.progress_bar_complete + assert kwargs["complete_style"] == theme.progress_bar assert kwargs["finished_style"] == theme.progress_bar_finished args, kwargs = mocks["BatchesProcessedColumn"].call_args - assert kwargs["style"] == theme.batch_process + assert kwargs["style"] == theme.batch_progress args, kwargs = mocks["CustomTimeColumn"].call_args assert kwargs["style"] == theme.time @@ -150,15 +150,15 @@ def test_rich_progress_bar_configure_columns(): custom_column = TextColumn("[progress.description]Testing Rich!") class CustomRichProgressBar(RichProgressBar): - def configure_columns(self, trainer, pl_module): + def configure_columns(self, trainer): return [custom_column] progress_bar = CustomRichProgressBar() - progress_bar._init_progress(Mock(), Mock()) + progress_bar._init_progress(Mock()) assert progress_bar.progress.columns[0] == custom_column - assert len(progress_bar.progress.columns) == 1 + assert len(progress_bar.progress.columns) == 2 @RunIf(rich=True) @@ -180,3 +180,24 @@ def test_rich_progress_bar_leave(tmpdir, leave, reset_call_count): ) trainer.fit(model) assert mock_progress_reset.call_count == reset_call_count + + +@RunIf(rich=True) +@pytest.mark.parametrize("limit_val_batches", (1, 5)) +def test_rich_progress_bar_num_sanity_val_steps(tmpdir, limit_val_batches: int): + model = BoringModel() + + progress_bar = RichProgressBar() + num_sanity_val_steps = 3 + + trainer = Trainer( + default_root_dir=tmpdir, + num_sanity_val_steps=num_sanity_val_steps, + limit_train_batches=1, + limit_val_batches=limit_val_batches, + max_epochs=1, + callbacks=progress_bar, + ) + + trainer.fit(model) + assert progress_bar.progress.tasks[0].completed == min(num_sanity_val_steps, limit_val_batches) diff --git a/tests/callbacks/test_stochastic_weight_avg.py b/tests/callbacks/test_stochastic_weight_avg.py index e10f99d33d564..910d6443d4def 100644 --- 
a/tests/callbacks/test_stochastic_weight_avg.py +++ b/tests/callbacks/test_stochastic_weight_avg.py @@ -138,7 +138,7 @@ def train_with_swa( assert trainer.lightning_module == model -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_swa_callback_ddp(tmpdir): train_with_swa(tmpdir, strategy="ddp", gpus=2) @@ -148,7 +148,7 @@ def test_swa_callback_ddp_spawn(tmpdir): train_with_swa(tmpdir, strategy="ddp_spawn", gpus=2) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_swa_callback_ddp_cpu(tmpdir): train_with_swa(tmpdir, strategy="ddp_spawn", num_processes=2) diff --git a/tests/callbacks/test_tqdm_progress_bar.py b/tests/callbacks/test_tqdm_progress_bar.py index b92fb18d54ccd..8f4c1b4e540b7 100644 --- a/tests/callbacks/test_tqdm_progress_bar.py +++ b/tests/callbacks/test_tqdm_progress_bar.py @@ -14,6 +14,7 @@ import os import pickle import sys +from collections import defaultdict from typing import Optional, Union from unittest import mock from unittest.mock import ANY, call, Mock @@ -521,21 +522,12 @@ def test_tqdm_progress_bar_can_be_pickled(): pickle.dumps(bar) -@RunIf(min_gpus=2, special=True) -def test_tqdm_progress_bar_max_val_check_interval_0(tmpdir): - _test_progress_bar_max_val_check_interval( - tmpdir, total_train_samples=8, train_batch_size=4, total_val_samples=2, val_batch_size=1, val_check_interval=0.2 - ) - - -@RunIf(min_gpus=2, special=True) -def test_tqdm_progress_bar_max_val_check_interval_1(tmpdir): - _test_progress_bar_max_val_check_interval( - tmpdir, total_train_samples=8, train_batch_size=4, total_val_samples=2, val_batch_size=1, val_check_interval=0.5 - ) - - -def _test_progress_bar_max_val_check_interval( +@RunIf(min_gpus=2, standalone=True) +@pytest.mark.parametrize( + ["total_train_samples", "train_batch_size", "total_val_samples", "val_batch_size", "val_check_interval"], + [(8, 4, 2, 1, 0.2), (8, 4, 2, 1, 0.5)], +) +def test_progress_bar_max_val_check_interval( tmpdir, total_train_samples, train_batch_size, total_val_samples, val_batch_size, val_check_interval ): world_size = 2 @@ -616,3 +608,65 @@ def test_tqdm_progress_bar_main_bar_resume(): # restarting mid validation epoch is not currently supported assert bar.val_progress_bar.n == 0 assert bar.val_progress_bar.total == 3 + + +def test_tqdm_progress_bar_correct_value_epoch_end(tmpdir): + class MockedProgressBar(TQDMProgressBar): + calls = defaultdict(list) + + def get_metrics(self, trainer, pl_module): + items = super().get_metrics(trainer, model) + del items["v_num"] + del items["loss"] + # this is equivalent to mocking `set_postfix` as this method gets called every time + self.calls[trainer.state.fn].append( + (trainer.state.stage, trainer.current_epoch, trainer.global_step, items) + ) + return items + + class MyModel(BoringModel): + def training_step(self, batch, batch_idx): + self.log("a", self.global_step, prog_bar=True, on_step=False, on_epoch=True, reduce_fx=max) + return super().training_step(batch, batch_idx) + + def validation_step(self, batch, batch_idx): + self.log("b", self.global_step, prog_bar=True, on_step=False, on_epoch=True, reduce_fx=max) + return super().validation_step(batch, batch_idx) + + def test_step(self, batch, batch_idx): + self.log("c", self.global_step, prog_bar=True, on_step=False, on_epoch=True, reduce_fx=max) + return super().test_step(batch, batch_idx) + + model = MyModel() + pbar = MockedProgressBar() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, + 
max_epochs=2, + enable_model_summary=False, + enable_checkpointing=False, + log_every_n_steps=1, + callbacks=pbar, + ) + + trainer.fit(model) + assert pbar.calls["fit"] == [ + ("sanity_check", 0, 0, {"b": 0}), + ("train", 0, 0, {}), + ("train", 0, 1, {}), + ("validate", 0, 1, {"b": 1}), # validation end + # epoch end over, `on_epoch=True` metrics are computed + ("train", 0, 2, {"a": 1, "b": 1}), # training epoch end + ("train", 1, 2, {"a": 1, "b": 1}), + ("train", 1, 3, {"a": 1, "b": 1}), + ("validate", 1, 3, {"a": 1, "b": 3}), # validation end + ("train", 1, 4, {"a": 3, "b": 3}), # training epoch end + ] + + trainer.validate(model, verbose=False) + assert pbar.calls["validate"] == [] + + trainer.test(model, verbose=False) + assert pbar.calls["test"] == [] diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index c75d7332e2e42..2c14c7de29b9c 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -87,18 +87,9 @@ def training_step(self, batch, batch_idx): @mock.patch("torch.save") -@RunIf(special=True, min_gpus=2) -def test_top_k_ddp_0(save_mock, tmpdir): - _top_k_ddp(save_mock, tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) - - -@mock.patch("torch.save") -@RunIf(special=True, min_gpus=2) -def test_top_k_ddp_1(save_mock, tmpdir): - _top_k_ddp(save_mock, tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=4) - - -def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): +@RunIf(standalone=True, min_gpus=2) +@pytest.mark.parametrize(["k", "epochs", "val_check_interval", "expected"], [(1, 1, 1.0, 1), (2, 2, 0.3, 4)]) +def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): def training_step(self, batch, batch_idx): local_rank = int(os.getenv("LOCAL_RANK")) diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 518d67cf251f5..733ea9348b2bd 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -385,7 +385,7 @@ def on_train_end(self, trainer, pl_module): assert torch.save.call_count == 0 -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_model_checkpoint_no_extraneous_invocations(tmpdir): """Test to ensure that the model callback saves the checkpoints only once in distributed mode.""" model = LogInTwoMethods() @@ -1206,3 +1206,37 @@ def test_check_val_every_n_epochs_top_k_integration(tmpdir): ) trainer.fit(model) assert set(os.listdir(tmpdir)) == {"epoch=1.ckpt", "epoch=3.ckpt"} + + +def test_model_checkpoint_saveload_ckpt(tmpdir): + ckpt = { + "monitor": "random_value", + "best_model_path": "epoch=10-step=1436.ckpt", + "best_model_score": torch.tensor(2.246), + "current_score": torch.tensor(1.5), + "dirpath": tmpdir, + "best_k_models": {"epoch=10-step=1436.ckpt": torch.tensor(2.246)}, + "kth_best_model_path": "epoch=10-step=1436.ckpt", + "kth_value": torch.tensor(2.246), + "last_model_path": "last2245.ckpt", + } + + # test on_save_checkpoint + cb_write = ModelCheckpoint(dirpath=tmpdir, monitor="random_value", save_top_k=-1, save_last=True) + for key, val in ckpt.items(): + setattr(cb_write, key, val) + written_ckpt = cb_write.on_save_checkpoint("", "", "") + for state in ckpt: + assert ckpt[state] == written_ckpt[state] + + # test on_load_checkpoint + # Note: "current_score", "dirpath" and "monitor" are 
currently not restored by on_load_checkpoint. + # We therefore set "dirpath" and "monitor" to something different than for ckpt/cb_write so we can assert them. + # "current_score" is left as initialized, i.e. None, and can therefore also be asserted + cb_restore = ModelCheckpoint(dirpath=tmpdir + "restore", monitor=None, save_top_k=-1, save_last=True) + cb_restore.on_load_checkpoint("", "", written_ckpt) + for key, val in written_ckpt.items(): + if key not in ("current_score", "dirpath", "monitor"): + assert getattr(cb_restore, key) == val + else: + assert getattr(cb_restore, key) != val diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py index 8b0f0e457bff9..f9634a9dadb2a 100644 --- a/tests/checkpointing/test_torch_saving.py +++ b/tests/checkpointing/test_torch_saving.py @@ -34,7 +34,7 @@ def test_model_torch_save(tmpdir): trainer = torch.load(temp_path) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_model_torch_save_ddp_cpu(tmpdir): """Test to ensure torch save does not fail for model and trainer using cpu ddp.""" model = BoringModel() diff --git a/tests/conftest.py b/tests/conftest.py index 860f9357e4636..772061d8bbd3d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +import signal import sys import threading from functools import partial @@ -22,7 +23,8 @@ import torch.distributed from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8 +from pytorch_lightning.trainer.connectors.signal_connector import SignalConnector +from pytorch_lightning.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8 from tests import _PATH_DATASETS @@ -81,6 +83,23 @@ def restore_env_variables(): assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" +@pytest.fixture(scope="function", autouse=True) +def restore_signal_handlers(): + """Ensures that signal handlers get restored before the next test runs. + + This is a safety net for tests that don't run Trainer's teardown. 
+ """ + valid_signals = SignalConnector._valid_signals() + if not _IS_WINDOWS: + # SIGKILL and SIGSTOP are not allowed to be modified by the user + valid_signals -= {signal.SIGKILL, signal.SIGSTOP} + handlers = {signum: signal.getsignal(signum) for signum in valid_signals} + yield + for signum, handler in handlers.items(): + if handler is not None: + signal.signal(signum, handler) + + @pytest.fixture(scope="function", autouse=True) def teardown_process_group(): """Ensures that the distributed process group gets closed before the next test runs.""" @@ -156,3 +175,16 @@ def single_process_pg(): torch.distributed.destroy_process_group() os.environ.clear() os.environ.update(orig_environ) + + +def pytest_collection_modifyitems(items): + if os.getenv("PL_RUN_STANDALONE_TESTS", "0") != "1": + return + # filter out non-standalone tests + items[:] = [ + item + for item in items + for marker in item.own_markers + # has `@RunIf(standalone=True)` + if marker.name == "skipif" and marker.kwargs.get("standalone") + ] diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index 12fe7f2fb4652..85149a78211f6 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ -482,7 +482,7 @@ def test_result_collection_reload_1_gpu_ddp(tmpdir): result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=1) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7") def test_result_collection_reload_2_gpus(tmpdir): @@ -552,12 +552,71 @@ def on_train_epoch_end(self) -> None: def test_metric_result_computed_check(): """Unittest ``_get_cache`` with multielement tensors.""" - sync = _Sync() metadata = _Metadata("foo", "bar", on_epoch=True, enable_graph=True) - metadata.sync = sync + metadata.sync = _Sync() rm = ResultMetric(metadata, is_tensor=True) computed_value = torch.tensor([1, 2, 3]) rm._computed = computed_value cache = ResultCollection._get_cache(rm, on_step=False) # `enable_graph=True` so no detach, identity works assert cache is computed_value + + +@pytest.mark.parametrize("floating_dtype", (torch.float, torch.double)) +def test_metric_result_respects_dtype(floating_dtype): + torch.set_default_dtype(floating_dtype) + fixed_dtype = torch.long # default by PyTorch + + metadata = _Metadata("foo", "bar") + metadata.sync = _Sync() + rm = ResultMetric(metadata, is_tensor=True) + + assert rm.value.dtype == floating_dtype + assert rm.cumulated_batch_size.dtype == fixed_dtype + + # two fixed point numbers - should be converted + value, batch_size = torch.tensor(2), 3 + assert value.dtype == fixed_dtype + with pytest.warns( + UserWarning, match=rf"`self.log\('bar', ...\)` in your `foo` .* Converting it to {floating_dtype}" + ): + rm.update(value, batch_size) + # floating and fixed + rm.update(torch.tensor(4.0), 5) + + total = rm.compute() + + assert total == (2 * 3 + 4 * 5) / (5 + 3) + assert total.dtype == floating_dtype + + # restore to avoid impacting other tests + torch.set_default_dtype(torch.float) + + +@pytest.mark.parametrize("reduce_fx", ("mean", sum)) +def test_metric_result_dtype_promotion(reduce_fx): + metadata = _Metadata("foo", "bar", reduce_fx=reduce_fx) + metadata.sync = _Sync() + rm = ResultMetric(metadata, is_tensor=True) + assert rm.value.dtype == torch.float + + # log a double + rm.update(torch.tensor(0, dtype=torch.double), 
1) + # `rm.value.dtype` is promoted + assert rm.value.dtype == torch.double + # log a float + rm.update(torch.tensor(0, dtype=torch.float), 1) + # the previous dtype stays + assert rm.value.dtype == torch.double + + total = rm.compute() + assert total.dtype == torch.double + + +@pytest.mark.parametrize(["reduce_fx", "expected"], [(max, -2), (min, 2)]) +def test_result_metric_max_min(reduce_fx, expected): + metadata = _Metadata("foo", "bar", reduce_fx=reduce_fx) + metadata.sync = _Sync() + rm = ResultMetric(metadata, is_tensor=True) + rm.update(torch.tensor(expected), 1) + assert rm.compute() == expected diff --git a/tests/core/test_results.py b/tests/core/test_results.py index 0e62441b1d40e..a39ce51788ff9 100644 --- a/tests/core/test_results.py +++ b/tests/core/test_results.py @@ -33,7 +33,7 @@ def _setup_ddp(rank, worldsize): def _ddp_test_fn(rank, worldsize): _setup_ddp(rank, worldsize) tensor = torch.tensor([1.0]) - sync = _Sync(sync_ddp_if_available, _should=True, op="SUM") + sync = _Sync(sync_ddp_if_available, _should=True, _op="SUM") actual = sync(tensor) assert actual.item() == dist.get_world_size(), "Result-Log does not work properly with DDP and Tensors" diff --git a/tests/deprecated_api/__init__.py b/tests/deprecated_api/__init__.py index 1026981f75307..91c7ef1c1f880 100644 --- a/tests/deprecated_api/__init__.py +++ b/tests/deprecated_api/__init__.py @@ -14,7 +14,7 @@ """Test deprecated functionality which will be removed in vX.Y.Z.""" import sys from contextlib import contextmanager -from typing import Optional +from typing import Optional, Type import pytest @@ -26,14 +26,28 @@ def _soft_unimport_module(str_module): @contextmanager -def no_deprecated_call(match: Optional[str] = None): +def no_warning_call(expected_warning: Type[Warning] = UserWarning, match: Optional[str] = None): with pytest.warns(None) as record: yield + + if match is None: try: - w = record.pop(DeprecationWarning) - if match is not None and match not in str(w.message): - return + w = record.pop(expected_warning) except AssertionError: - # no DeprecationWarning raised + # no warning raised + return + else: + for w in record.list: + if w.category is expected_warning and match in w.message.args[0]: + break + else: return - raise AssertionError(f"`DeprecationWarning` was raised: {w}") + + msg = "A warning" if expected_warning is None else f"`{expected_warning.__name__}`" + raise AssertionError(f"{msg} was raised: {w}") + + +@contextmanager +def no_deprecated_call(match: Optional[str] = None): + with no_warning_call(expected_warning=DeprecationWarning, match=match): + yield diff --git a/tests/deprecated_api/test_remove_1-6.py b/tests/deprecated_api/test_remove_1-6.py index 62791b482c186..144f84551105c 100644 --- a/tests/deprecated_api/test_remove_1-6.py +++ b/tests/deprecated_api/test_remove_1-6.py @@ -118,13 +118,12 @@ def test_v1_6_0_reload_dataloaders_every_epoch(tmpdir): limit_val_batches=0.3, reload_dataloaders_every_epoch=True, max_epochs=3, + num_sanity_val_steps=0, ) trainer.fit(model) trainer.test() - expected_sequence = ( - [call.val_dataloader()] + [call.train_dataloader(), call.val_dataloader()] * 3 + [call.test_dataloader()] - ) + expected_sequence = [call.train_dataloader(), call.val_dataloader()] * 3 + [call.test_dataloader()] assert tracker.mock_calls == expected_sequence diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index 16c511b6effd9..62ec4d8d5490a 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py 
@@ -13,9 +13,9 @@ # limitations under the License. """Test deprecated functionality which will be removed in v1.7.0.""" from unittest import mock +from unittest.mock import Mock import pytest -import torch from pytorch_lightning import Callback, LightningDataModule, Trainer from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor @@ -23,6 +23,7 @@ from pytorch_lightning.callbacks.progress import ProgressBar from pytorch_lightning.callbacks.xla_stats_monitor import XLAStatsMonitor from pytorch_lightning.loggers import LoggerCollection, TestTubeLogger +from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from tests.callbacks.test_callbacks import OldStatefulCallback from tests.deprecated_api import _soft_unimport_module from tests.helpers import BoringModel @@ -230,22 +231,16 @@ def test_v1_7_0_flush_logs_every_n_steps_trainer_constructor(tmpdir): class BoringCallbackDDPSpawnModel(BoringModel): - def __init__(self): - super().__init__() + def add_to_queue(self, queue): + ... - def add_to_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: - queue.put("test_val") - return super().add_to_queue(queue) + def get_from_queue(self, queue): + ... - def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: - self.test_val = queue.get() - return super().get_from_queue(queue) - -@RunIf(skip_windows=True) def test_v1_7_0_deprecate_add_get_queue(tmpdir): model = BoringCallbackDDPSpawnModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, num_processes=2, strategy="ddp_spawn") + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) with pytest.deprecated_call(match=r"`LightningModule.add_to_queue` method was deprecated in v1.5"): trainer.fit(model) @@ -455,3 +450,12 @@ def test_v1_7_0_deprecate_lr_sch_names(tmpdir): with pytest.deprecated_call(match="`LearningRateMonitor.lr_sch_names` has been deprecated in v1.5"): assert lr_monitor.lr_sch_names == ["lr-SGD"] + + +def test_v1_7_0_index_batch_sampler_wrapper_batch_indices(): + sampler = IndexBatchSamplerWrapper(Mock()) + with pytest.deprecated_call(match="was deprecated in v1.5 and will be removed in v1.7"): + _ = sampler.batch_indices + + with pytest.deprecated_call(match="was deprecated in v1.5 and will be removed in v1.7"): + sampler.batch_indices = [] diff --git a/tests/helpers/datasets.py b/tests/helpers/datasets.py index 561642ae8cfbe..33bf1d9b8e13f 100644 --- a/tests/helpers/datasets.py +++ b/tests/helpers/datasets.py @@ -19,7 +19,6 @@ from typing import Optional, Sequence, Tuple import torch -from torch import Tensor from torch.utils.data import Dataset @@ -70,7 +69,7 @@ def __init__( data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME self.data, self.targets = self._try_load(os.path.join(self.cached_folder_path, data_file)) - def __getitem__(self, idx: int) -> Tuple[Tensor, int]: + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]: img = self.data[idx].float().unsqueeze(0) target = int(self.targets[idx]) @@ -126,7 +125,7 @@ def _try_load(path_data, trials: int = 30, delta: float = 1.0): return res @staticmethod - def normalize_tensor(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> Tensor: + def normalize_tensor(tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0) -> torch.Tensor: mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device) std = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device) return tensor.sub(mean).div(std) diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py 
index 490e023662f79..07bd6438da125 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -65,11 +65,13 @@ def __new__( horovod: bool = False, horovod_nccl: bool = False, skip_windows: bool = False, - special: bool = False, + standalone: bool = False, fairscale: bool = False, fairscale_fully_sharded: bool = False, deepspeed: bool = False, rich: bool = False, + skip_49370: bool = False, + skip_hanging_spawn: bool = False, **kwargs, ): """ @@ -86,11 +88,13 @@ def __new__( horovod: if Horovod is installed horovod_nccl: if Horovod is installed with NCCL support skip_windows: skip test for Windows platform (typically for some limited torch functionality) - special: running in special mode, outside pytest suit + standalone: Mark the test as standalone, our CI will run it in a separate process. fairscale: if `fairscale` module is required to run the test fairscale_fully_sharded: if `fairscale` fully sharded module is required to run the test deepspeed: if `deepspeed` module is required to run the test rich: if `rich` module is required to run the test + skip_49370: Skip the test as it's impacted by https://github.com/pytorch/pytorch/issues/49370. + skip_hanging_spawn: Skip the test as it's impacted by hanging loggers on spawn. kwargs: native pytest.mark.skipif keyword arguments """ conditions = [] @@ -144,10 +148,12 @@ def __new__( conditions.append(not _HOROVOD_NCCL_AVAILABLE) reasons.append("Horovod with NCCL") - if special: - env_flag = os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") + if standalone: + env_flag = os.getenv("PL_RUN_STANDALONE_TESTS", "0") conditions.append(env_flag != "1") - reasons.append("Special execution") + reasons.append("Standalone execution") + # used in tests/conftest.py::pytest_collection_modifyitems + kwargs["standalone"] = True if fairscale: conditions.append(not _FAIRSCALE_AVAILABLE) @@ -165,6 +171,24 @@ def __new__( conditions.append(not _RICH_AVAILABLE) reasons.append("Rich") + if skip_49370: + # strategy=ddp_spawn, accelerator=cpu, python>=3.9, torch<1.8 does not work + py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ge_3_9 = Version(py_version) >= Version("3.9") + torch_version = get_distribution("torch").version + old_torch = Version(torch_version) < Version("1.8") + conditions.append(ge_3_9 and old_torch) + reasons.append("Impacted by https://github.com/pytorch/pytorch/issues/49370") + + if skip_hanging_spawn: + # strategy=ddp_spawn, accelerator=cpu, python>=3.8, torch<1.9 does not work + py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ge_3_8 = Version(py_version) >= Version("3.8") + torch_version = get_distribution("torch").version + old_torch = Version(torch_version) < Version("1.9") + conditions.append(ge_3_8 and old_torch) + reasons.append("Impacted by hanging DDP spawn") + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs diff --git a/tests/lite/test_lite.py b/tests/lite/test_lite.py index 8eac30f9cf823..97046c71bedbd 100644 --- a/tests/lite/test_lite.py +++ b/tests/lite/test_lite.py @@ -24,7 +24,12 @@ from torch.utils.data import DataLoader, DistributedSampler, Sampler from pytorch_lightning.lite import LightningLite -from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer +from pytorch_lightning.lite.wrappers import ( + _LiteDataLoader, + _LiteModule, + _LiteOptimizer, + 
_replace_dataloader_init_method, +) from pytorch_lightning.plugins import DeepSpeedPlugin, PrecisionPlugin, TrainingTypePlugin from pytorch_lightning.utilities import DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -164,56 +169,33 @@ def test_setup_dataloaders_return_type(): assert lite_dataloader1.dataset is dataset1 -def test_setup_custom_dataloaders(): - """Test that the setup_dataloaders method returns the dataloaders wrapped as LiteDataLoader.""" - lite = EmptyLite() - - class CustomDataLoader(DataLoader): - def __init__(self, value: int = 2, *args, **kwargs): - self.value = value - super().__init__(range(value), *args, **kwargs) - - dataloader = CustomDataLoader(2, batch_size=2) +def test_setup_dataloaders_with_custom_type(): + """Test that Lite intercepts arguments passed to custom subclasses of torch.utils.DataLoader and sets them as + attributes.""" - # single dataloader - lite_dataloader = lite.setup_dataloaders(dataloader) - assert lite_dataloader._dataloader - assert lite_dataloader.value == 2 - batch0 = next(iter(lite_dataloader)) - assert torch.equal(batch0, torch.tensor([0, 1])) - - class CustomDataLoader2(DataLoader): - def __init__(self, range, *args, **kwargs): - self.range = range - super().__init__(range, *args, **kwargs) - - dataloader = CustomDataLoader2(range(2), batch_size=2) - - # single dataloader - lite_dataloader = lite.setup_dataloaders(dataloader) - assert lite_dataloader._dataloader - batch0 = next(iter(lite_dataloader)) - assert torch.equal(batch0, torch.tensor([0, 1])) + class DataLoaderSubclass1(DataLoader): + def __init__(self, attribute1, *args, **kwargs): + # intentionally not setting this attribute, calling super with different args + # self.attribute1 = attribute1 + super().__init__(*args, **kwargs) - class CustomDataLoader(DataLoader): - def __init__(self, value: int, *args, **kwargs): - super().__init__(range(value), *args, **kwargs) + class DataLoaderSubclass2(DataLoaderSubclass1): + def __init__(self, attribute1, attribute2, *args, **kwargs): + # intentionally not setting this attribute, calling super with different args + # self.attribute2 = attribute2 + super().__init__(attribute1, *args, **kwargs) class LiteWithCustomDataLoader(LightningLite): def run(self): - # This doesn't fail as the context manager would save all the arguments provided - # to the dataloaders. 
- dataloader = CustomDataLoader(2, batch_size=2) - self.setup_dataloaders(dataloader) + dataloader = DataLoaderSubclass2("attribute1", "attribute2", dataset=range(4), batch_size=2) + assert dataloader.attribute1 == "attribute1" + assert dataloader.attribute2 == "attribute2" + lite_dataloader = self.setup_dataloaders(dataloader) + assert lite_dataloader.attribute1 == "attribute1" + assert lite_dataloader.attribute2 == "attribute2" LiteWithCustomDataLoader().run() - with pytest.raises( - MisconfigurationException, match="Trying to inject `DistributedSampler` into the `CustomDataLoader` instance" - ): - dataloader = CustomDataLoader(2, batch_size=2) - lite_dataloader = lite.setup_dataloaders(dataloader) - def test_setup_dataloaders_twice_fails(): """Test that calling setup_dataloaders with a dataloader that is already wrapped fails.""" @@ -398,7 +380,7 @@ def test_autocast(): lite._precision_plugin.forward_context().__exit__.assert_called() -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multiple_models(): class Lite(LightningLite): def run(self): @@ -462,3 +444,25 @@ def run(self): assert self.is_global_zero == (self.local_rank == 0) Lite(strategy=DeepSpeedPlugin(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() + + +def test_replace_dataloader_init_method(): + """Test that the context manager enables to save the parameters passed to the DataLoader __init__ method.""" + + class CustomDataLoader(DataLoader): + def __init__(self, extra_argument: int, *args, **kwargs): + super().__init__(*args, **kwargs) + + dataloader = CustomDataLoader(extra_argument=1, dataset=range(1)) + lite = EmptyLite() + with pytest.raises(MisconfigurationException, match="extra_argument"): + dataloader = lite.setup_dataloaders(dataloader) + + with _replace_dataloader_init_method(): + dataloader = CustomDataLoader(extra_argument=1, dataset=range(1)) + assert dataloader.extra_argument == 1 + dataloader = lite.setup_dataloaders(dataloader) + + dataloader = CustomDataLoader(1, range(1)) + assert dataloader.extra_argument == 1 + dataloader = lite.setup_dataloaders(dataloader) diff --git a/tests/lite/test_parity.py b/tests/lite/test_parity.py index bec9339ec8e2f..d4d0ca6e5e9c7 100644 --- a/tests/lite/test_parity.py +++ b/tests/lite/test_parity.py @@ -190,7 +190,7 @@ def test_boring_lite_model_ddp_spawn(precision, strategy, devices, accelerator, assert torch.equal(w_pure.cpu(), w_lite.cpu()) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @pytest.mark.parametrize( "precision, strategy, devices, accelerator", [ diff --git a/tests/lite/test_wrappers.py b/tests/lite/test_wrappers.py index 6741bf59b4dca..a732390e1d00a 100644 --- a/tests/lite/test_wrappers.py +++ b/tests/lite/test_wrappers.py @@ -17,6 +17,7 @@ import torch from torch.utils.data.dataloader import DataLoader +from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin from pytorch_lightning.lite import LightningLite from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer from tests.helpers.runif import RunIf @@ -40,8 +41,13 @@ def test_lite_module_wraps(): (32, torch.float16, torch.float32), (32, torch.float32, torch.float32), (32, torch.float64, torch.float32), + (32, torch.int, torch.int), (16, torch.float32, torch.float16), (16, torch.float64, torch.float16), + (16, torch.long, torch.long), + pytest.param("bf16", torch.float32, torch.bfloat16, marks=RunIf(min_torch="1.10")), + pytest.param("bf16", 
torch.float64, torch.bfloat16, marks=RunIf(min_torch="1.10")), + pytest.param("bf16", torch.bool, torch.bool, marks=RunIf(min_torch="1.10")), ], ) def test_lite_module_forward_conversion(precision, input_type, expected_type): @@ -53,11 +59,32 @@ def check_autocast(forward_input): assert precision != 16 or torch.is_autocast_enabled() return forward_input - module = Mock(wraps=torch.nn.Linear(1, 1), side_effect=check_autocast) + module = Mock(wraps=torch.nn.Identity(), side_effect=check_autocast) lite_module = _LiteModule(module, lite._precision_plugin).to(device) - out = lite_module(torch.rand(1, dtype=input_type, device=device)) + out = lite_module(torch.tensor([1, 2, 3], dtype=input_type, device=device)) assert module.call_args[0][0].dtype == expected_type - assert out.dtype == torch.get_default_dtype() + assert out.dtype == input_type or out.dtype == torch.get_default_dtype() + + +@pytest.mark.parametrize( + "device", [torch.device("cpu"), pytest.param(torch.device("cuda", 0), marks=RunIf(min_gpus=1))] +) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_lite_module_device_dtype_propagation(device, dtype): + """Test that the LiteModule propagates device and dtype properties to its submodules (e.g. torchmetrics).""" + + class DeviceModule(DeviceDtypeModuleMixin): + pass + + device_module = DeviceModule() + lite_module = _LiteModule(device_module, Mock()) + lite_module.to(device) + assert device_module.device == device + assert lite_module.device == device + + lite_module.to(dtype) + assert device_module.dtype == dtype + assert lite_module.dtype == dtype def test_lite_dataloader_iterator(): @@ -115,6 +142,15 @@ def test_lite_optimizer_wraps(): assert isinstance(lite_optimizer, optimizer_cls) +def test_lite_optimizer_state_dict(): + """Test that the LiteOptimizer calls into the accelerator/strategy to collect the state.""" + optimizer = Mock() + accelerator = Mock() + lite_optimizer = _LiteOptimizer(optimizer=optimizer, accelerator=accelerator) + lite_optimizer.state_dict() + accelerator.optimizer_state.assert_called_with(optimizer) + + def test_lite_optimizer_steps(): """Test that the LiteOptimizer forwards the step() and zero_grad() calls to the wrapped optimizer.""" optimizer = Mock() diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 67838e219fcfb..b12f7c8286c62 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -47,6 +47,8 @@ def _get_logger_args(logger_class, save_dir): logger_args.update(offline_mode=True) if "offline" in inspect.getfullargspec(logger_class).args: logger_args.update(offline=True) + if issubclass(logger_class, NeptuneLogger): + logger_args.update(mode="offline") return logger_args @@ -144,10 +146,8 @@ def log_metrics(self, metrics, step): log_metric_names = [(s, sorted(m.keys())) for s, m in logger.history] if logger_class == TensorBoardLogger: expected = [ - (0, ["hp_metric"]), (0, ["epoch", "train_some_val"]), (0, ["early_stop_on", "epoch", "val_loss"]), - (0, ["hp_metric"]), (1, ["epoch", "test_loss"]), ] assert log_metric_names == expected @@ -263,6 +263,10 @@ def _test_loggers_pickle(tmpdir, monkeypatch, logger_class): # the logger needs to remove it from the state before pickle _ = logger.experiment + # logger also has to avoid adding un-picklable attributes to self in .save + logger.log_metrics({"a": 1}) + logger.save() + # test pickling loggers pickle.dumps(logger) @@ -317,8 +321,10 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): assert 
pl_module.logger.experiment.something(foo="bar") is None -@pytest.mark.parametrize("logger_class", [CometLogger, CSVLogger, MLFlowLogger, TensorBoardLogger, TestTubeLogger]) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True, skip_hanging_spawn=True) +@pytest.mark.parametrize( + "logger_class", [CometLogger, CSVLogger, MLFlowLogger, NeptuneLogger, TensorBoardLogger, TestTubeLogger] +) def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class): """Test that loggers get replaced by dummy loggers on global rank > 0.""" _patch_comet_atexit(monkeypatch) diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index d6b753c0439ee..224271709f5f7 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -111,7 +111,6 @@ def training_step(self, batch, batch_idx): trainer = Trainer(max_steps=2, log_every_n_steps=1, logger=logger, default_root_dir=tmpdir) trainer.fit(model) assert trainer.state.finished, f"Training failed with {trainer.state}" - assert logger.hparams_logged == model.hparams assert logger.metrics_logged != {} assert logger.after_save_checkpoint_called assert logger.finalized_status == "success" @@ -133,11 +132,11 @@ def training_step(self, batch, batch_idx): trainer.fit(model) assert trainer.state.finished, f"Training failed with {trainer.state}" - assert logger1.hparams_logged == model.hparams + assert logger1.hparams_logged is None assert logger1.metrics_logged != {} assert logger1.finalized_status == "success" - assert logger2.hparams_logged == model.hparams + assert logger2.hparams_logged is None assert logger2.metrics_logged != {} assert logger2.finalized_status == "success" @@ -241,6 +240,13 @@ def test_dummylogger_noop_method_calls(): logger.log_metrics("1", 2, three="three") +def test_dummyexperiment_support_item_assignment(): + """Test that the DummyExperiment supports item assignment.""" + experiment = DummyExperiment() + experiment["variable"] = "value" + assert experiment["variable"] != "value" # this is only a stateless mock experiment + + def test_np_sanitization(): class CustomParamsLogger(CustomLogger): def __init__(self): diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 6eec5fffcf5b5..ddea7b2419608 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -77,7 +77,7 @@ def tmpdir_unittest_fixture(request, tmpdir): class TestNeptuneLogger(unittest.TestCase): def test_neptune_online(self, neptune): logger = NeptuneLogger(api_key="test", project="project") - created_run_mock = logger._run_instance + created_run_mock = logger.run self.assertEqual(logger._run_instance, created_run_mock) self.assertEqual(logger.name, "Run test name") @@ -109,7 +109,7 @@ def test_neptune_pickling(self, neptune): pickled_logger = pickle.dumps(logger) unpickled = pickle.loads(pickled_logger) - neptune.init.assert_called_once_with(project="test-project", api_token=None, run="TEST-42") + neptune.init.assert_called_once_with(name="Test name", run=unpickleable_run._short_id) self.assertIsNotNone(unpickled.experiment) @patch("pytorch_lightning.loggers.neptune.Run", Run) @@ -276,14 +276,15 @@ def test_after_save_checkpoint(self, neptune): logger, run_instance_mock, run_attr_mock = self._get_logger_with_mocks( api_key="test", project="project", **prefix ) + models_root_dir = os.path.join("path", "to", "models") cb_mock = MagicMock( - dirpath="path/to/models", - last_model_path="path/to/models/last", + dirpath=models_root_dir, + last_model_path=os.path.join(models_root_dir, 
"last"), best_k_models={ - "path/to/models/model1": None, - "path/to/models/model2/with/slashes": None, + f"{os.path.join(models_root_dir, 'model1')}": None, + f"{os.path.join(models_root_dir, 'model2/with/slashes')}": None, }, - best_model_path="path/to/models/best_model", + best_model_path=os.path.join(models_root_dir, "best_model"), best_model_score=None, ) @@ -292,19 +293,21 @@ def test_after_save_checkpoint(self, neptune): # then: self.assertEqual(run_instance_mock.__setitem__.call_count, 1) - self.assertEqual(run_instance_mock.__getitem__.call_count, 3) - self.assertEqual(run_attr_mock.upload.call_count, 3) + self.assertEqual(run_instance_mock.__getitem__.call_count, 4) + self.assertEqual(run_attr_mock.upload.call_count, 4) run_instance_mock.__setitem__.assert_called_once_with( - f"{model_key_prefix}/best_model_path", "path/to/models/best_model" + f"{model_key_prefix}/best_model_path", os.path.join(models_root_dir, "best_model") ) run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/last") run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/model1") run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/model2/with/slashes") + run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/best_model") run_attr_mock.upload.assert_has_calls( [ - call("path/to/models/last"), - call("path/to/models/model1"), - call("path/to/models/model2/with/slashes"), + call(os.path.join(models_root_dir, "last")), + call(os.path.join(models_root_dir, "model1")), + call(os.path.join(models_root_dir, "model2/with/slashes")), + call(os.path.join(models_root_dir, "best_model")), ] ) @@ -357,7 +360,7 @@ def test_legacy_functions(self, neptune, neptune_file_mock, warnings_mock): logger = NeptuneLogger(api_key="test", project="project") # test deprecated functions which will be shut down in pytorch-lightning 1.7.0 - attr_mock = logger._run_instance.__getitem__ + attr_mock = logger.run.__getitem__ attr_mock.reset_mock() fake_image = {} @@ -394,8 +397,12 @@ def test__get_full_model_name(self): # given: SimpleCheckpoint = namedtuple("SimpleCheckpoint", ["dirpath"]) test_input_data = [ - ("key.ext", "foo/bar/key.ext", SimpleCheckpoint(dirpath="foo/bar")), - ("key/in/parts.ext", "foo/bar/key/in/parts.ext", SimpleCheckpoint(dirpath="foo/bar")), + ("key", os.path.join("foo", "bar", "key.ext"), SimpleCheckpoint(dirpath=os.path.join("foo", "bar"))), + ( + "key/in/parts", + os.path.join("foo", "bar", "key/in/parts.ext"), + SimpleCheckpoint(dirpath=os.path.join("foo", "bar")), + ), ] # expect: diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py index 717d625f6c44e..72eeb197e9e57 100644 --- a/tests/loops/test_loop_state_dict.py +++ b/tests/loops/test_loop_state_dict.py @@ -14,7 +14,6 @@ from unittest.mock import Mock import pytest -import torch from pytorch_lightning.loops import FitLoop from pytorch_lightning.trainer.trainer import Trainer @@ -80,14 +79,16 @@ def test_loops_state_dict_structure(): "is_last_batch": False, }, "epoch_loop.val_loop._results": { + "batch": None, + "batch_size": None, "training": False, - "_batch_size": torch.tensor(1), "device": None, "items": {}, }, "epoch_loop._results": { + "batch": None, + "batch_size": None, "training": True, - "_batch_size": torch.tensor(1), "device": None, "items": {}, }, @@ -106,8 +107,9 @@ def test_loops_state_dict_structure(): "is_last_batch": False, }, "_results": { + "batch": None, + "batch_size": None, "training": False, - 
"_batch_size": torch.tensor(1), "device": None, "items": {}, }, @@ -122,8 +124,9 @@ def test_loops_state_dict_structure(): "is_last_batch": False, }, "_results": { + "batch": None, + "batch_size": None, "training": False, - "_batch_size": torch.tensor(1), "device": None, "items": {}, }, diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index dd390ab4939d5..0d8b80c44af36 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -22,12 +22,11 @@ import torch from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter, DataLoader -from pl_examples.bug_report_model import RandomDataset from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.loops import Loop, TrainingBatchLoop from pytorch_lightning.trainer.progress import BaseProgress -from tests.helpers import BoringModel +from tests.helpers import BoringModel, RandomDataset from tests.helpers.runif import RunIf @@ -912,21 +911,27 @@ def val_dataloader(self): @RunIf(min_torch="1.8.0") -@pytest.mark.parametrize("persistent_workers", (True, False)) -def test_workers_are_shutdown(tmpdir, persistent_workers): +@pytest.mark.parametrize("should_fail", [False, True]) +# False is de-activated due to slowness +@pytest.mark.parametrize("persistent_workers", [True]) +def test_workers_are_shutdown(tmpdir, should_fail, persistent_workers): # `num_workers == 1` uses `_MultiProcessingDataLoaderIter` # `persistent_workers` makes sure `self._iterator` gets set on the `DataLoader` instance class _TestMultiProcessingDataLoaderIter(_MultiProcessingDataLoaderIter): - def __init__(self, *args, dataloader: DataLoader, **kwargs): + def __init__(self, *args, dataloader, **kwargs): super().__init__(*args, **kwargs) self.dataloader = dataloader def _shutdown_workers(self): - setattr(self.dataloader, "has_shutdown_workers", True) + self.dataloader.count_shutdown_workers += 1 super()._shutdown_workers() class TestDataLoader(DataLoader): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.count_shutdown_workers = 0 + def _get_iterator(self): if self.num_workers == 0: return super()._get_iterator() @@ -937,10 +942,30 @@ def _get_iterator(self): train_dataloader = TestDataLoader(RandomDataset(32, 64), num_workers=1, persistent_workers=persistent_workers) val_dataloader = TestDataLoader(RandomDataset(32, 64), num_workers=1, persistent_workers=persistent_workers) + class TestCallback(Callback): + def on_train_epoch_end(self, trainer, *_): + if trainer.current_epoch == 1: + raise CustomException + + max_epochs = 3 + model = BoringModel() - trainer = Trainer(default_root_dir=tmpdir, limit_train_batches=2, limit_val_batches=2, max_epochs=2) - trainer.fit(model, train_dataloader, val_dataloader) - assert train_dataloader.has_shutdown_workers - assert val_dataloader.has_shutdown_workers + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=max_epochs, + callbacks=TestCallback() if should_fail else None, + ) + + if should_fail: + with pytest.raises(CustomException): + trainer.fit(model, train_dataloader, val_dataloader) + else: + trainer.fit(model, train_dataloader, val_dataloader) + + assert train_dataloader.count_shutdown_workers == 2 if should_fail else (2 if persistent_workers else max_epochs) + # on sanity checking end, the workers are being deleted too. 
+ assert val_dataloader.count_shutdown_workers == 2 if persistent_workers else (3 if should_fail else max_epochs + 1) assert train_dataloader._iterator is None assert val_dataloader._iterator is None diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 2fb537b1d2861..c110f3a83d815 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -122,7 +122,7 @@ def validation_step(self, *args, **kwargs): model.unfreeze() -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_multi_cpu_model_ddp(tmpdir): """Make sure DDP works.""" tutils.set_random_main_port() diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 6b34553ff313b..452051e58ff65 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -167,7 +167,7 @@ def transfer_batch_to_device(self, batch, device, dataloader_idx): assert torch.allclose(batch_gpu.targets.cpu(), torch.ones(5, 1, dtype=torch.long) * 2) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_transfer_batch_hook_ddp(tmpdir): """Test custom data are properly moved to the right device using ddp.""" @@ -422,17 +422,11 @@ def _predict_batch(trainer, model, batches): return out -@RunIf(deepspeed=True, min_gpus=1, special=True) -def test_trainer_model_hook_system_fit_deepspeed_automatic_optimization(tmpdir): - _run_trainer_model_hook_system_fit( - dict(gpus=1, precision=16, strategy="deepspeed"), tmpdir, automatic_optimization=True - ) - - -@RunIf(deepspeed=True, min_gpus=1, special=True) -def test_trainer_model_hook_system_fit_deepspeed_manual_optimization(tmpdir): +@RunIf(deepspeed=True, min_gpus=1, standalone=True) +@pytest.mark.parametrize("automatic_optimization", (True, False)) +def test_trainer_model_hook_system_fit_deepspeed(tmpdir, automatic_optimization): _run_trainer_model_hook_system_fit( - dict(gpus=1, precision=16, strategy="deepspeed"), tmpdir, automatic_optimization=False + dict(gpus=1, precision=16, strategy="deepspeed"), tmpdir, automatic_optimization=automatic_optimization ) @@ -505,8 +499,8 @@ def training_step(self, batch, batch_idx): expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), - dict(name="prepare_data"), dict(name="configure_callbacks"), + dict(name="prepare_data"), dict(name="Callback.on_before_accelerator_backend_setup", args=(trainer, model)), # DeepSpeed needs the batch size to figure out throughput logging *([dict(name="train_dataloader")] if kwargs.get("strategy") == "deepspeed" else []), @@ -624,8 +618,8 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir): expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), - dict(name="prepare_data"), dict(name="configure_callbacks"), + dict(name="prepare_data"), dict(name="Callback.on_before_accelerator_backend_setup", args=(trainer, model)), dict(name="Callback.setup", args=(trainer, model), kwargs=dict(stage="fit")), dict(name="setup", kwargs=dict(stage="fit")), @@ -722,8 +716,8 @@ def test_trainer_model_hook_system_eval(tmpdir, batches, verb, noun, dataloader, expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), - dict(name="prepare_data"), dict(name="configure_callbacks"), + dict(name="prepare_data"), dict(name="Callback.on_before_accelerator_backend_setup", args=(trainer, model)), dict(name="Callback.setup", args=(trainer, model), kwargs=dict(stage=verb)), 
dict(name="setup", kwargs=dict(stage=verb)), @@ -754,8 +748,8 @@ def test_trainer_model_hook_system_predict(tmpdir): expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), - dict(name="prepare_data"), dict(name="configure_callbacks"), + dict(name="prepare_data"), dict(name="Callback.on_before_accelerator_backend_setup", args=(trainer, model)), dict(name="Callback.setup", args=(trainer, model), kwargs=dict(stage="predict")), dict(name="setup", kwargs=dict(stage="predict")), @@ -884,7 +878,6 @@ def call(hook, fn, *args, **kwargs): *batch_transfer * batches, dict(name="train_dataloader"), *batch_transfer * batches, - dict(name="val_dataloader"), *batch_transfer * batches, dict( name="on_save_checkpoint", diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index abf5a34757424..59a22cf1656d1 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -66,7 +66,7 @@ def _run_horovod(trainer_options, on_gpu=False): assert exit_code == 0 -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, horovod=True, skip_49370=True) def test_horovod_cpu(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( @@ -82,7 +82,7 @@ def test_horovod_cpu(tmpdir): _run_horovod(trainer_options) -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, horovod=True, skip_49370=True) def test_horovod_cpu_clip_grad_by_value(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( @@ -99,7 +99,7 @@ def test_horovod_cpu_clip_grad_by_value(tmpdir): _run_horovod(trainer_options) -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, horovod=True, skip_49370=True) def test_horovod_cpu_implicit(tmpdir): """Test Horovod without specifying a backend, inferring from env set by `horovodrun`.""" trainer_options = dict( diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index dbd51d33bf0ed..d2ea07a12ea49 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -776,7 +776,10 @@ def test_adding_datamodule_hparams(tmpdir, model, data): # Merged hparams were logged merged_hparams = copy.deepcopy(org_model_hparams) merged_hparams.update(org_data_hparams) - mock_logger.log_hyperparams.assert_called_with(merged_hparams) + if merged_hparams: + mock_logger.log_hyperparams.assert_called_with(merged_hparams) + else: + mock_logger.log_hyperparams.assert_not_called() def test_no_datamodule_for_hparams(tmpdir): diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py index 7ab425dd12ea6..d111b266fb115 100644 --- a/tests/models/test_onnx.py +++ b/tests/models/test_onnx.py @@ -53,6 +53,7 @@ def test_model_saves_on_gpu(tmpdir): assert os.path.getsize(file_path) > 4e2 +@RunIf(max_torch="1.10") def test_model_saves_with_example_output(tmpdir): """Test that ONNX model saves when provided with example output.""" model = BoringModel() diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 6d241222526ab..adbfa769e1eac 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -266,8 +266,15 @@ def get_trainer_args(): for before, after in zip(callbacks_before_resume, callback_capture.callbacks): if isinstance(before, ModelCheckpoint): - assert before.best_model_path == after.best_model_path - assert before.best_model_score == after.best_model_score + for attribute in ( + "best_model_path", + "best_model_score", + "best_k_models", + "kth_best_model_path", + "kth_value", 
+ "last_model_path", + ): + assert getattr(before, attribute) == getattr(after, attribute) def test_callbacks_references_fit_ckpt_path(tmpdir): diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py index 67880bec4e474..5035e71f928fc 100644 --- a/tests/models/test_sync_batchnorm.py +++ b/tests/models/test_sync_batchnorm.py @@ -37,6 +37,9 @@ def __init__(self, gpu_count=1, **kwargs): self.linear = nn.Linear(28 * 28, 10) self.bn_layer = nn.BatchNorm1d(28 * 28) + def on_train_start(self) -> None: + assert isinstance(self.bn_layer, torch.nn.modules.batchnorm.SyncBatchNorm) + def forward(self, x, batch_idx): with torch.no_grad(): out_bn = self.bn_layer(x.view(x.size(0), -1)) @@ -67,7 +70,7 @@ def configure_optimizers(self): # TODO: Fatal Python error: Bus error @pytest.mark.skip(reason="Fatal Python error: Bus error") -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_sync_batchnorm_ddp(tmpdir): seed_everything(234) set_random_main_port() @@ -123,4 +126,6 @@ def test_sync_batchnorm_ddp(tmpdir): ) trainer.fit(model, dm) - assert trainer.state.finished, "Sync batchnorm failing with DDP" + # the strategy is responsible for tearing down the batchnorm wrappers + assert not isinstance(model.bn_layer, torch.nn.modules.batchnorm.SyncBatchNorm) + assert isinstance(model.bn_layer, torch.nn.modules.batchnorm._BatchNorm) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index d8ceb4106fd07..31ebd3968ff3e 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -407,7 +407,7 @@ def test_tpu_sync_dist(): """Test tpu spawn sync dist operation.""" def test_sync_dist(_): - sync = _Sync(TPUSpawnPlugin().reduce, should=True, op=torch.distributed.ReduceOp.SUM) + sync = _Sync(TPUSpawnPlugin().reduce, should=True, _op=torch.distributed.ReduceOp.SUM) value = torch.tensor([1.0]) value = (sync(value),) assert value.item() == 8 diff --git a/tests/overrides/test_distributed.py b/tests/overrides/test_distributed.py index c8d982bd733fe..e425859fe34df 100644 --- a/tests/overrides/test_distributed.py +++ b/tests/overrides/test_distributed.py @@ -54,9 +54,7 @@ def test_index_batch_sampler(tmpdir): assert batch_sampler.batch_size == index_batch_sampler.batch_size assert batch_sampler.drop_last == index_batch_sampler.drop_last assert batch_sampler.sampler is sampler - - for batch in index_batch_sampler: - assert index_batch_sampler.batch_indices == batch + assert list(index_batch_sampler) == index_batch_sampler.seen_batch_indices def test_index_batch_sampler_methods(): diff --git a/tests/plugins/environments/test_lsf_environment.py b/tests/plugins/environments/test_lsf_environment.py index e3a5a67ba4be2..35cdcb4580e8d 100644 --- a/tests/plugins/environments/test_lsf_environment.py +++ b/tests/plugins/environments/test_lsf_environment.py @@ -19,57 +19,98 @@ from pytorch_lightning.plugins.environments import LSFEnvironment -@mock.patch.dict(os.environ, {"LSB_HOSTS": "batch 10.10.10.0 10.10.10.1", "LSB_JOBID": "1234"}) -def test_missing_lsb_hosts(): - """Test an error when the lsb hosts list cannot be found.""" - del os.environ["LSB_HOSTS"] - with pytest.raises(ValueError, match="Could not find hosts in environment variable LSB_HOSTS"): +def _make_rankfile(tmp_path): + hosts = "batch\n10.10.10.0\n10.10.10.1\n10.10.10.2\n10.10.10.3" + p = tmp_path / "lsb_djob_rankfile" + p.write_text(hosts) + return str(p) + + +@mock.patch.dict(os.environ, {"LSB_JOBID": "1234"}) +def test_missing_lsb_djob_rankfile(): + """Test an error when the 
LSB_DJOB_RANKFILE cannot be found.""" + with pytest.raises(ValueError, match="Did not find the environment variable `LSB_DJOB_RANKFILE`"): + LSFEnvironment() + + +@mock.patch.dict(os.environ, {"LSB_DJOB_RANKFILE": "", "LSB_JOBID": "1234"}) +def test_empty_lsb_djob_rankfile(): + """Test an error when the LSB_DJOB_RANKFILE is not populated.""" + with pytest.raises(ValueError, match="The environment variable `LSB_DJOB_RANKFILE` is empty"): LSFEnvironment() -@mock.patch.dict(os.environ, {"LSB_HOSTS": "batch 10.10.10.0 10.10.10.1", "LSB_JOBID": "1234"}) -def test_missing_lsb_job_id(): +def test_missing_lsb_job_id(tmp_path): """Test an error when the job id cannot be found.""" - del os.environ["LSB_JOBID"] - with pytest.raises(ValueError, match="Could not find job id in environment variable LSB_JOBID"): + with mock.patch.dict(os.environ, {"LSB_DJOB_RANKFILE": _make_rankfile(tmp_path)}), pytest.raises( + ValueError, match="Could not find job id in environment variable LSB_JOBID" + ): LSFEnvironment() -@mock.patch.dict(os.environ, {"MASTER_PORT": "4321", "LSB_JOBID": "1234", "LSB_HOSTS": "batch 10.10.10.0 10.10.10.1"}) -def test_manual_master_port_and_address(): +def test_manual_main_port_and_address(tmp_path): """Test a user can set the port manually through the MASTER_PORT env variable.""" - env = LSFEnvironment() - assert env.master_port() == 4321 + environ = { + "LSB_DJOB_RANKFILE": _make_rankfile(tmp_path), + "LSB_JOBID": "1234", + "JSM_NAMESPACE_SIZE": "4", + "JSM_NAMESPACE_RANK": "3", + "JSM_NAMESPACE_LOCAL_RANK": "1", + } + with mock.patch.dict(os.environ, environ), mock.patch("socket.gethostname", return_value="10.10.10.2"): + env = LSFEnvironment() + assert env.master_port() == 10234 + +def test_attributes_from_environment_variables(tmp_path): + """Test that the LSF environment takes the attributes from the environment variables.""" + environ = { + "LSB_DJOB_RANKFILE": _make_rankfile(tmp_path), + "LSB_JOBID": "1234", + "JSM_NAMESPACE_SIZE": "4", + "JSM_NAMESPACE_RANK": "3", + "JSM_NAMESPACE_LOCAL_RANK": "1", + } + with mock.patch.dict(os.environ, environ), mock.patch("socket.gethostname", return_value="10.10.10.2"): + env = LSFEnvironment() + assert env.creates_processes_externally + assert env.master_address() == "10.10.10.0" + assert env.master_port() == 10234 + assert env.world_size() == 4 + assert env.global_rank() == 3 + assert env.local_rank() == 1 + env.set_global_rank(100) + assert env.global_rank() == 3 + env.set_world_size(100) + assert env.world_size() == 4 + assert LSFEnvironment.is_using_lsf() -@mock.patch.dict( - os.environ, - { - "LSB_HOSTS": "batch 10.10.10.0 10.10.10.1 10.10.10.2 10.10.10.3", + +def test_node_rank(tmp_path): + environ = { + "LSB_DJOB_RANKFILE": _make_rankfile(tmp_path), "LSB_JOBID": "1234", "JSM_NAMESPACE_SIZE": "4", "JSM_NAMESPACE_RANK": "3", "JSM_NAMESPACE_LOCAL_RANK": "1", - }, -) -def test_attributes_from_environment_variables(): - """Test that the LSF environment takes the attributes from the environment variables.""" - env = LSFEnvironment() - assert env.creates_processes_externally - assert env.master_address() == "10.10.10.0" - assert env.master_port() == 10234 - assert env.world_size() == 4 - assert env.global_rank() == 3 - assert env.local_rank() == 1 - env.set_global_rank(100) - assert env.global_rank() == 3 - env.set_world_size(100) - assert env.world_size() == 4 - assert LSFEnvironment.is_using_lsf() - - -@mock.patch("socket.gethostname", return_value="host2") -@mock.patch.dict(os.environ, {"LSB_HOSTS": "batch host0 host1 host2 host3", 
"LSB_JOBID": "1234"}) -def test_node_rank(_): - env = LSFEnvironment() - assert env.node_rank() == 2 + } + with mock.patch.dict(os.environ, environ), mock.patch("socket.gethostname", return_value="10.10.10.2"): + env = LSFEnvironment() + assert env.node_rank() == 2 + + +def test_detect(): + """Test the detection of a LSF environment configuration.""" + with mock.patch.dict(os.environ, {}): + assert not LSFEnvironment.is_using_lsf() + + with mock.patch.dict( + os.environ, + { + "LSB_DJOB_RANKFILE": "", + "LSB_JOBID": "", + "JSM_NAMESPACE_SIZE": "", + "JSM_NAMESPACE_LOCAL_RANK": "", + }, + ): + assert LSFEnvironment.is_using_lsf() diff --git a/tests/plugins/environments/torch_elastic_deadlock.py b/tests/plugins/environments/torch_elastic_deadlock.py index ead433200c304..f8a64ba632991 100644 --- a/tests/plugins/environments/torch_elastic_deadlock.py +++ b/tests/plugins/environments/torch_elastic_deadlock.py @@ -7,7 +7,7 @@ from pytorch_lightning.utilities.exceptions import DeadlockDetectedException from tests.helpers.boring_model import BoringModel -if os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") == "1" and os.getenv("PL_RECONCILE_PROCESS", "0") == "1": +if os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1" and os.getenv("PL_RECONCILE_PROCESS", "0") == "1": class CustomException(Exception): pass diff --git a/tests/plugins/test_amp_plugins.py b/tests/plugins/test_amp_plugins.py index c482e8a83d7b6..3c10bf8495aeb 100644 --- a/tests/plugins/test_amp_plugins.py +++ b/tests/plugins/test_amp_plugins.py @@ -190,7 +190,7 @@ def configure_optimizers(self): trainer.fit(model) -@RunIf(min_gpus=2, amp_apex=True, special=True) +@RunIf(min_gpus=2, amp_apex=True, standalone=True) @pytest.mark.parametrize("amp_level", ["O2"]) def test_amp_apex_ddp_fit(amp_level, tmpdir): class CustomBoringModel(BoringModel): diff --git a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py index 1468c7f4a4137..4b68667bbed6a 100644 --- a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py @@ -89,7 +89,7 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.layer.module[2].reshard_after_forward is True -@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, special=True) +@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, standalone=True) def test_fully_sharded_plugin_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" @@ -98,7 +98,7 @@ def test_fully_sharded_plugin_checkpoint(tmpdir): _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_gpus=2, skip_windows=True, fairscale_fully_sharded=True, special=True) +@RunIf(min_gpus=2, skip_windows=True, fairscale_fully_sharded=True, standalone=True) def test_fully_sharded_plugin_checkpoint_multi_gpus(tmpdir): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" @@ -136,7 +136,7 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): trainer.test(ckpt_path=model_path) -@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, special=True) +@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, standalone=True) def test_fsdp_gradient_clipping_raises(tmpdir): """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" model = BoringModel() diff --git 
a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index 78ae931330307..1aaf89d052686 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -33,7 +33,7 @@ def on_train_start(self) -> None: self.start_cuda_memory = torch.cuda.memory_allocated() -@RunIf(skip_windows=True, min_gpus=2, special=True) +@RunIf(skip_windows=True, min_gpus=2, standalone=True) def test_ddp_with_2_gpus(): """Tests if device is set correctely when training and after teardown for DDPPlugin.""" trainer = Trainer(gpus=2, strategy="ddp", fast_dev_run=True) @@ -64,7 +64,7 @@ def on_train_start(self): self.trainer.training_type_plugin.barrier("barrier after model is wrapped") -@RunIf(min_gpus=4, special=True) +@RunIf(min_gpus=4, standalone=True) @mock.patch("torch.distributed.barrier") def test_ddp_barrier_non_consecutive_device_ids(barrier_mock, tmpdir): """Test correct usage of barriers when device ids do not start at 0 or are not consecutive.""" diff --git a/tests/plugins/test_ddp_plugin_with_comm_hook.py b/tests/plugins/test_ddp_plugin_with_comm_hook.py index 6497b39ffa516..49a637098f279 100644 --- a/tests/plugins/test_ddp_plugin_with_comm_hook.py +++ b/tests/plugins/test_ddp_plugin_with_comm_hook.py @@ -26,7 +26,7 @@ import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_fp16_compress_comm_hook(tmpdir): """Test for DDP FP16 compress hook.""" model = BoringModel() @@ -46,7 +46,7 @@ def test_ddp_fp16_compress_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_sgd_comm_hook(tmpdir): """Test for DDP FP16 compress hook.""" model = BoringModel() @@ -70,7 +70,7 @@ def test_ddp_sgd_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir): """Test for DDP FP16 compress wrapper for SGD hook.""" model = BoringModel() @@ -95,7 +95,7 @@ def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_spawn_fp16_compress_comm_hook(tmpdir): """Test for DDP Spawn FP16 compress hook.""" model = BoringModel() @@ -112,7 +112,7 @@ def test_ddp_spawn_fp16_compress_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.10.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.10.0", min_gpus=2, standalone=True) def test_ddp_post_local_sgd_comm_hook(tmpdir): """Test for DDP post-localSGD hook.""" model = BoringModel() diff --git a/tests/plugins/test_ddp_spawn_plugin.py b/tests/plugins/test_ddp_spawn_plugin.py index c389cf9290c78..f8a96f5ef496e 100644 --- a/tests/plugins/test_ddp_spawn_plugin.py +++ b/tests/plugins/test_ddp_spawn_plugin.py @@ -46,7 +46,7 @@ def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: return super().get_from_queue(queue) 
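# --- editor's note (illustrative sketch, not part of the diff) ---------------------
# The `special=True` -> `standalone=True` renames throughout these plugin tests map
# onto the `RunIf` change shown earlier in tests/helpers/runif.py: the marker reduces
# to a `pytest.mark.skipif` gated on the `PL_RUN_STANDALONE_TESTS` environment
# variable, so these tests only run when CI launches them in a separate process.
# A roughly equivalent standalone-only gate, as a sketch:
import os

import pytest

_standalone_only = pytest.mark.skipif(
    os.getenv("PL_RUN_STANDALONE_TESTS", "0") != "1",
    reason="Requires: [Standalone execution]",
)


@_standalone_only
def test_runs_only_in_standalone_mode_sketch():
    ...
# ------------------------------------------------------------------------------------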
-@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_ddp_cpu(): """Tests if device is set correctly when training for DDPSpawnPlugin.""" trainer = Trainer(num_processes=2, fast_dev_run=True) @@ -91,7 +91,7 @@ def get_from_queue(self, trainer: Trainer, queue: torch.multiprocessing.SimpleQu return super().get_from_queue(trainer, queue) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_ddp_spawn_add_get_queue(tmpdir): """Tests add_to_queue/get_from_queue with DDPSpawnPlugin.""" @@ -128,7 +128,7 @@ def on_predict_start(self) -> None: assert isinstance(self.trainer.model, LightningModule) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True, skip_hanging_spawn=True) def test_ddp_spawn_configure_ddp(tmpdir): """Tests with ddp spawn plugin.""" trainer = Trainer(default_root_dir=tmpdir, num_processes=2, strategy="ddp_spawn", fast_dev_run=True) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 981d0d5db8cf6..d2205e59773d4 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,5 +1,6 @@ import contextlib import json +import logging import os from typing import Any, Dict, Optional from unittest import mock @@ -202,7 +203,7 @@ def test_deepspeed_defaults(tmpdir): assert isinstance(plugin.config["zero_optimization"], dict) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_warn_deepspeed_ignored(tmpdir): class TestModel(BoringModel): def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: @@ -258,7 +259,7 @@ def setup(self, trainer, pl_module, stage: Optional[str] = None) -> None: trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_run_configure_optimizers(tmpdir): """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using configure_optimizers for optimizers and schedulers.""" @@ -295,7 +296,7 @@ def configure_optimizers(self): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_config(tmpdir, deepspeed_zero_config): """Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers and saves the model weights to load correctly.""" @@ -323,7 +324,7 @@ def on_train_start(self, trainer, pl_module) -> None: trainer.test(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_custom_precision_params(tmpdir): """Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes.""" @@ -360,6 +361,36 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir): assert checkpoint_config["synchronize_checkpoint_boundary"] +@RunIf(min_gpus=1, deepspeed=True, standalone=True) +def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir): + """Ensure if we modify the activation checkpointing parameters, we pass these to + deepspeed.checkpointing.configure correctly.""" + ds = DeepSpeedPlugin( + partition_activations=True, + cpu_checkpointing=True, + contiguous_memory_optimization=True, + synchronize_checkpoint_boundary=True, + ) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, 
+ enable_progress_bar=False, + fast_dev_run=1, + strategy=ds, + precision=16, + gpus=1, + ) + with mock.patch( + "deepspeed.checkpointing.configure", wraps=deepspeed.checkpointing.configure + ) as deepspeed_checkpointing_configure: + trainer.fit(model) + + deepspeed_checkpointing_configure.assert_called_with( + mpu_=None, partition_activations=True, contiguous_checkpointing=True, checkpoint_in_cpu=True, profile=None + ) + + @RunIf(min_gpus=1, deepspeed=True) def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config): """Ensure if we use a config and turn off offload_optimizer, that this is set to False within the config.""" @@ -385,7 +416,7 @@ def on_before_accelerator_backend_setup(self, trainer, pl_module) -> None: trainer.fit(model) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu(tmpdir): """Test to ensure that DeepSpeed with multiple GPUs works and deepspeed distributed is initialized correctly.""" @@ -401,14 +432,14 @@ def test_deepspeed_multigpu(tmpdir): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_fp32_works(tmpdir): model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, gpus=1, strategy="deepspeed_stage_3", fast_dev_run=True) trainer.fit(model) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_stage_3_save_warning(tmpdir): """Test to ensure that DeepSpeed Stage 3 gives a warning when saving on rank zero.""" model = BoringModel() @@ -428,7 +459,7 @@ def test_deepspeed_stage_3_save_warning(tmpdir): trainer.save_checkpoint(checkpoint_path) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_single_file(tmpdir): """Test to ensure that DeepSpeed loads from a single file checkpoint.""" model = BoringModel() @@ -537,7 +568,7 @@ def training_step(self, batch, batch_idx): opt.step() -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModel() @@ -550,7 +581,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModelManualOptim() @@ -564,7 +595,9 @@ def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config _assert_save_model_is_equal(model, tmpdir, trainer) -def run_checkpoint_test(tmpdir: str, automatic_optimization: bool = True, accumulate_grad_batches: int = 2): +@pytest.mark.parametrize(("accumulate_grad_batches", "automatic_optimization"), [(1, False), (2, True)]) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) +def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches): seed_everything(1) if automatic_optimization: model = ModelParallelClassificationModel() @@ -599,14 +632,7 @@ def run_checkpoint_test(tmpdir: str, automatic_optimization: bool = True, accumu assert 
results[0]["test_acc"] > 0.7 -@RunIf(min_gpus=2, deepspeed=True, special=True) -def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir): - """Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, and - see convergence.""" - run_checkpoint_test(tmpdir) - - -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can resume from training, throwing a warning that the optimizer state and scheduler states cannot be restored.""" @@ -633,7 +659,7 @@ def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir): trainer.fit(model, datamodule=dm, ckpt_path=checkpoint_path) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_resume_training(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can resume training.""" initial_model = ModelParallelClassificationModel() @@ -687,24 +713,9 @@ def on_train_batch_start( trainer.fit(model, datamodule=dm, ckpt_path=ck.best_model_path) -@RunIf(min_gpus=2, deepspeed=True, special=True) -def test_deepspeed_multigpu_stage_3_checkpointing_full_weights_manual(tmpdir): - """Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, - where we save the full weights to one file.""" - run_checkpoint_test(tmpdir, automatic_optimization=False, accumulate_grad_batches=1) - - -@RunIf(min_gpus=2, deepspeed=True, special=True) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): - _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) - - -@RunIf(min_gpus=2, deepspeed=True, special=True) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): - _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) - - -def _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): +@pytest.mark.parametrize("offload_optimizer", [False, True]) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works.""" seed_everything(42) @@ -740,7 +751,7 @@ def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, assert verification_callback.on_train_batch_start_called -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_test(tmpdir): """Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3.""" model = ModelParallelBoringModel() @@ -750,7 +761,9 @@ def test_deepspeed_multigpu_test(tmpdir): trainer.test(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +# TODO(Sean): Once partial parameter partitioning is supported this test should be re-enabled +@pytest.mark.skip("Partial parameter partitioning for DeepSpeed is currently broken.") +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_partial_partition_parameters(tmpdir): """Test to ensure that a module that defines a layer inside the ``__init__`` and ``configure_sharded_model`` correctly converts all parameters to float16 when ``precision=16`` and runs successfully.""" @@ -777,7 +790,7 @@ def on_train_epoch_start(self) -> None: trainer.fit(model) 
-@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_test_rnn(tmpdir): """Test to ensure that turning off explicit partitioning of the entire module for ZeRO Stage 3 works when training with certain layers which will crash with explicit partitioning.""" @@ -793,7 +806,7 @@ def on_train_epoch_start(self) -> None: model = TestModel() trainer = Trainer( default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(stage=3, partition_module=False), + strategy=DeepSpeedPlugin(stage=3), gpus=1, fast_dev_run=True, precision=16, @@ -848,7 +861,7 @@ def _assert_save_model_is_equal(model, tmpdir, trainer): assert torch.equal(orig_param, saved_model_param) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_no_schedulers(tmpdir): """Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers.""" model = ModelParallelBoringModelNoSchedulers() @@ -860,7 +873,7 @@ def test_deepspeed_multigpu_no_schedulers(tmpdir): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_skip_backward_raises(tmpdir): class TestModel(BoringModel): def training_step(self, batch, batch_idx): @@ -872,24 +885,9 @@ def training_step(self, batch, batch_idx): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) -def test_deepspeed_warn_train_dataloader_called(tmpdir): - """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch - size.""" - model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(), - gpus=1, - fast_dev_run=True, - ) - with pytest.warns(UserWarning, match="Inferring the batch size for internal deepspeed logging"): - trainer.fit(model) - - -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_setup_train_dataloader(tmpdir): - """Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.""" + """Test DeepSpeed works when setup is required to call in the DataModule.""" class TestSetupIsCalledDataModule(LightningDataModule): def __init__(self): @@ -914,32 +912,25 @@ def test_dataloader(self): model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=32), + strategy=DeepSpeedPlugin(logging_level=logging.INFO), gpus=1, fast_dev_run=True, ) dm = TestSetupIsCalledDataModule() - trainer.fit(model, datamodule=dm) - trainer.test(model, datamodule=dm) + with mock.patch("deepspeed.utils.logging.logger.warning", autospec=True) as mock_object: + trainer.fit(model, datamodule=dm) + assert any("Tried to infer the batch size" in str(arg) for arg in mock_object.call_args_list) @mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) -@RunIf(min_gpus=1, deepspeed=True, special=True) -def test_deepspeed_scheduler_step_count(mock_step): +@pytest.mark.parametrize("interval", ["step", "epoch"]) +@pytest.mark.parametrize("max_epoch", [2]) +@pytest.mark.parametrize("limit_train_batches", [2]) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) +def test_scheduler_step_count(mock_step, max_epoch, limit_train_batches, interval): """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is - set to step.""" - _run_scheduler_test(mock_step, 
max_epoch=2, limit_train_batches=2, interval="step") - - -@mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) -@RunIf(min_gpus=1, deepspeed=True, special=True) -def test_deepspeed_scheduler_step_count_epoch(mock_step): - """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is - set to epoch.""" - _run_scheduler_test(mock_step, max_epoch=2, limit_train_batches=2, interval="epoch") - + set to step or epoch.""" -def _run_scheduler_test(mock_step, max_epoch, limit_train_batches, interval): class TestModel(BoringModel): def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) @@ -967,7 +958,7 @@ def configure_optimizers(self): assert mock_step.call_count == 1 + (max_epoch * limit_train_batches) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_configure_gradient_clipping(tmpdir): """Test to ensure that a warning is raised when `LightningModule.configure_gradient_clipping` is overridden in case of deepspeed.""" @@ -988,7 +979,7 @@ def configure_gradient_clipping(self, optimizer, optimizer_idx, gradient_clip_va trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_gradient_clip_by_value(tmpdir): """Test to ensure that an exception is raised when using `gradient_clip_algorithm='value'`.""" model = BoringModel() @@ -1002,7 +993,7 @@ def test_deepspeed_gradient_clip_by_value(tmpdir): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_different_accumulate_grad_batches_fails(tmpdir): model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, accumulate_grad_batches={1: 2}, gpus=1, strategy="deepspeed") @@ -1012,7 +1003,7 @@ def test_different_accumulate_grad_batches_fails(tmpdir): trainer.fit(model) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_specific_gpu_device_id(tmpdir): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: @@ -1049,7 +1040,7 @@ def on_test_batch_start( trainer.test(model) -@RunIf(min_gpus=2, deepspeed=True, special=True, min_torch="1.10.0") +@RunIf(min_gpus=2, deepspeed=True, standalone=True, min_torch="1.10.0") def test_deepspeed_with_meta_device(tmpdir): with init_meta_context(): model = BoringModel() diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index e80b5d9f7621e..588ee8096250b 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -5,7 +5,6 @@ import torch from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.callbacks import Callback from pytorch_lightning.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE @@ -31,43 +30,23 @@ def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_v @RunIf(fairscale=True) -@pytest.mark.parametrize(["strategy"], [("ddp_sharded",), ("ddp_sharded_spawn",)]) -def test_sharded_ddp_choice(tmpdir, strategy): +@pytest.mark.parametrize( + "strategy,expected", [("ddp_sharded", DDPShardedPlugin), ("ddp_sharded_spawn", DDPSpawnShardedPlugin)] +) +def test_sharded_ddp_choice(tmpdir, strategy, expected): """Test to ensure that plugin is 
correctly chosen.""" - - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - if strategy == "ddp_sharded": - assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin) - elif strategy == "ddp_sharded_spawn": - assert isinstance(trainer.accelerator.training_type_plugin, DDPSpawnShardedPlugin) - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy=strategy, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, strategy=strategy) + assert isinstance(trainer.accelerator.training_type_plugin, expected) @RunIf(min_gpus=1, fairscale=True) -@pytest.mark.parametrize(["strategy"], [("ddp_sharded",), ("ddp_sharded_spawn",)]) -def test_ddp_choice_sharded_amp(tmpdir, strategy): +@pytest.mark.parametrize( + "strategy,expected", [("ddp_sharded", DDPShardedPlugin), ("ddp_sharded_spawn", DDPSpawnShardedPlugin)] +) +def test_ddp_choice_sharded_amp(tmpdir, strategy, expected): """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - if strategy == "ddp_sharded": - assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin) - elif strategy == "ddp_sharded_spawn": - assert isinstance(trainer.accelerator.training_type_plugin, DDPSpawnShardedPlugin) - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, gpus=1, precision=16, strategy=strategy, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, gpus=1, precision=16, strategy=strategy) + assert isinstance(trainer.accelerator.training_type_plugin, expected) @RunIf(skip_windows=True, fairscale=True) @@ -175,7 +154,7 @@ def test_ddp_sharded_plugin_fit_ckpt_path_gpu_to_cpu(tmpdir): trainer.fit(model, ckpt_path=checkpoint_path) -@RunIf(skip_windows=True, special=True, fairscale=True) +@RunIf(skip_windows=True, standalone=True, fairscale=True) @pytest.mark.parametrize("trainer_kwargs", (dict(num_processes=2), pytest.param(dict(gpus=2), marks=RunIf(min_gpus=2)))) def test_ddp_sharded_plugin_test_multigpu(tmpdir, trainer_kwargs): """Test to ensure we can use validate and test without fit.""" @@ -201,7 +180,7 @@ def training_step(self, batch, batch_idx): return {"loss": loss} -@RunIf(skip_windows=True, special=True, fairscale=True, min_gpus=2) +@RunIf(skip_windows=True, standalone=True, fairscale=True, min_gpus=2) def test_ddp_sharded_plugin_manual_optimization_spawn(tmpdir): # todo (sean): this test has been split out as running both tests using parametrize causes "Address in use" model = ManualBoringModel() @@ -209,7 +188,7 @@ def test_ddp_sharded_plugin_manual_optimization_spawn(tmpdir): trainer.fit(model) -@RunIf(skip_windows=True, special=True, fairscale=True, min_gpus=2) +@RunIf(skip_windows=True, standalone=True, fairscale=True, min_gpus=2) def test_ddp_sharded_plugin_manual_optimization(tmpdir): model = ManualBoringModel() trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=2, gpus=2) diff --git a/tests/plugins/test_tpu_spawn.py b/tests/plugins/test_tpu_spawn.py index 3f4ff354e39bb..ba5dc0e9d5f0d 100644 --- a/tests/plugins/test_tpu_spawn.py +++ b/tests/plugins/test_tpu_spawn.py @@ -20,6 +20,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import Trainer +from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger from pytorch_lightning.plugins.training_type 
import TPUSpawnPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel, RandomDataset @@ -85,7 +86,7 @@ def test_error_process_iterable_dataloader(_): class BoringModelTPU(BoringModel): def on_train_start(self) -> None: - assert self.device == torch.device("xla") + assert self.device == torch.device("xla", index=1) assert os.environ.get("PT_XLA_DEBUG") == "1" @@ -93,12 +94,27 @@ def on_train_start(self) -> None: @pl_multi_process_test def test_model_tpu_one_core(): """Tests if device/debug flag is set correctely when training and after teardown for TPUSpawnPlugin.""" - trainer = Trainer(tpu_cores=1, fast_dev_run=True, plugin=TPUSpawnPlugin(debug=True)) + trainer = Trainer(tpu_cores=1, fast_dev_run=True, strategy=TPUSpawnPlugin(debug=True)) # assert training type plugin attributes for device setting assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin) assert not trainer.training_type_plugin.on_gpu assert trainer.training_type_plugin.on_tpu - assert trainer.training_type_plugin.root_device == torch.device("xla") + assert trainer.training_type_plugin.root_device == torch.device("xla", index=1) model = BoringModelTPU() trainer.fit(model) assert "PT_XLA_DEBUG" not in os.environ + + +@RunIf(tpu=True) +@pytest.mark.parametrize("use_list", [False, True]) +def test_tensorboard_ddp_spawn_cleanup(use_list, tmpdir): + tensorboard_logger = TensorBoardLogger(save_dir=tmpdir) + assert tensorboard_logger._experiment is None + tensorboard_logger.experiment # this property access will create the experiment + assert tensorboard_logger._experiment is not None + logger = [tensorboard_logger] if use_list else tensorboard_logger + trainer = Trainer(strategy="ddp_spawn", accelerator="tpu", devices="auto", logger=logger) + trainer.training_type_plugin._clean_logger(trainer) + if use_list: + assert isinstance(trainer.logger, LoggerCollection) + assert tensorboard_logger._experiment is None diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py index 7369ab9a4a140..a8cff9368546e 100644 --- a/tests/profiler/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -25,7 +25,7 @@ from pytorch_lightning.loggers.base import LoggerCollection from pytorch_lightning.loggers.tensorboard import TensorBoardLogger from pytorch_lightning.profiler import AdvancedProfiler, PassThroughProfiler, PyTorchProfiler, SimpleProfiler -from pytorch_lightning.profiler.pytorch import RegisterRecordFunction +from pytorch_lightning.profiler.pytorch import RegisterRecordFunction, warning_cache from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7 from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE @@ -162,7 +162,7 @@ def test_simple_profiler_with_nonexisting_dirpath(tmpdir): assert nonexisting_tmpdir.join("fit-profiler.txt").exists() -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_simple_profiler_distributed_files(tmpdir): """Ensure the proper files are saved in distributed.""" profiler = SimpleProfiler(dirpath=tmpdir, filename="profiler") @@ -227,6 +227,7 @@ def test_advanced_profiler_iterable_durations(advanced_profiler, action: str, ex np.testing.assert_allclose(recored_total_duration, expected_total_duration, rtol=0.2) +@pytest.mark.flaky(reruns=3) def test_advanced_profiler_overhead(advanced_profiler, n_iter=5): """ensure that the profiler doesn't introduce too much overhead during training.""" 
for _ in range(n_iter): @@ -292,7 +293,7 @@ def test_advanced_profiler_cprofile_deepcopy(tmpdir): trainer.fit(model) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler): """Ensure that the profiler can be given to the training and default step are properly recorded.""" model = BoringModel() @@ -330,7 +331,6 @@ def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler): assert any(f"{local_rank}-validation_step" in f for f in files) -@RunIf(special=True) @pytest.mark.parametrize("fast_dev_run", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("boring_model_cls", [ManualOptimBoringModel, BoringModel]) def test_pytorch_profiler_trainer_fit(fast_dev_run, boring_model_cls, tmpdir): @@ -426,7 +426,7 @@ def look_for_trace(trace_dir): assert look_for_trace(tmpdir) -@RunIf(min_gpus=1, special=True) +@RunIf(min_gpus=1, standalone=True) def test_pytorch_profiler_nested_emit_nvtx(tmpdir): """This test check emit_nvtx is correctly supported.""" profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True) @@ -522,3 +522,31 @@ def test_trainer_profiler_incorrect_str_arg(): match=r"When passing string value for the `profiler` parameter of `Trainer`, it can only be one of.*", ): Trainer(profiler="unknown_profiler") + + +@pytest.mark.skipif(not _KINETO_AVAILABLE, reason="Requires PyTorch Profiler Kineto") +@pytest.mark.parametrize( + ["trainer_config", "trainer_fn"], + [ + ({"limit_train_batches": 4, "limit_val_batches": 7}, "fit"), + ({"limit_train_batches": 7, "limit_val_batches": 4, "num_sanity_val_steps": 0}, "fit"), + ( + { + "limit_train_batches": 7, + "limit_val_batches": 2, + }, + "fit", + ), + ({"limit_val_batches": 4}, "validate"), + ({"limit_test_batches": 4}, "test"), + ({"limit_predict_batches": 4}, "predict"), + ], +) +def test_pytorch_profiler_raises_warning_for_limited_steps(tmpdir, trainer_config, trainer_fn): + model = BoringModel() + trainer = Trainer(default_root_dir=tmpdir, profiler="pytorch", max_epochs=1, **trainer_config) + warning_cache.clear() + with pytest.warns(UserWarning, match="not enough steps to properly record traces"): + getattr(trainer, trainer_fn)(model) + assert trainer.profiler._schedule is None + warning_cache.clear() diff --git a/tests/profiler/test_xla_profiler.py b/tests/profiler/test_xla_profiler.py index 2afbf69a6d0b0..7f460ea11d322 100644 --- a/tests/profiler/test_xla_profiler.py +++ b/tests/profiler/test_xla_profiler.py @@ -18,14 +18,16 @@ from pytorch_lightning import Trainer from pytorch_lightning.profiler import XLAProfiler -from pytorch_lightning.utilities import _TPU_AVAILABLE +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE from tests.helpers import BoringModel from tests.helpers.runif import RunIf if _TPU_AVAILABLE: - import torch_xla.debug.profiler as xp import torch_xla.utils.utils as xu + if _TORCH_GREATER_EQUAL_1_8: + import torch_xla.debug.profiler as xp + @RunIf(tpu=True) def test_xla_profiler_instance(tmpdir): diff --git a/tests/special_tests.sh b/tests/standalone_tests.sh similarity index 53% rename from tests/special_tests.sh rename to tests/standalone_tests.sh index f4b760dd75291..7b7dd361ab0b1 100755 --- a/tests/special_tests.sh +++ b/tests/standalone_tests.sh @@ -15,57 +15,43 @@ set -e # this environment variable allows special tests to run -export PL_RUNNING_SPECIAL_TESTS=1 +export PL_RUN_STANDALONE_TESTS=1 # python arguments -defaults='-m coverage run --source pytorch_lightning --append -m pytest --durations=0 --capture=no 
--disable-warnings' +defaults='-m coverage run --source pytorch_lightning --append -m pytest --capture=no' -# find tests marked as `@RunIf(special=True)` -grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True') -# file paths -files=$(echo "$grep_output" | cut -f1 -d:) -files_arr=($files) -# line numbers -linenos=$(echo "$grep_output" | cut -f2 -d:) -linenos_arr=($linenos) +# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster +grep_output=$(grep --recursive --word-regexp 'tests' --regexp 'standalone=True' --include '*.py' --exclude 'tests/conftest.py') -# tests to skip - space separated -blocklist='test_pytorch_profiler_nested_emit_nvtx' -report='' +# file paths, remove duplicates +files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) -for i in "${!files_arr[@]}"; do - file=${files_arr[$i]} - lineno=${linenos_arr[$i]} +# get the list of parametrizations. we need to call them separately. the last two lines are removed. +# note: if there's a syntax error, this will fail with some garbled output +if [[ "$OSTYPE" == "darwin"* ]]; then + parametrizations=$(pytest $files --collect-only --quiet "$@" | tail -r | sed -e '1,3d' | tail -r) +else + parametrizations=$(pytest $files --collect-only --quiet "$@" | head -n -2) +fi +parametrizations_arr=($parametrizations) - # get code from `@RunIf(special=True)` line to EOF - test_code=$(tail -n +"$lineno" "$file") +# tests to skip - space separated +blocklist='tests/profiler/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx' +report='' - # read line by line - while read -r line; do - # if it's a test - if [[ $line == def\ test_* ]]; then - # get the name - test_name=$(echo $line | cut -c 5- | cut -f1 -d\() +for i in "${!parametrizations_arr[@]}"; do + parametrization=${parametrizations_arr[$i]} - # check blocklist - if echo $blocklist | grep --word-regexp "$test_name" > /dev/null; then - report+="Skipped\t$file:$lineno::$test_name\n" - break - fi + # check blocklist + if echo $blocklist | grep -F "${parametrization}"; then + report+="Skipped\t$parametrization\n" + continue + fi - # SPECIAL_PATTERN allows filtering the tests to run when debugging. - # use as `SPECIAL_PATTERN="foo_bar" ./special_tests.sh` to run only those - # test with `foo_bar` in their name - if [[ $line != *$SPECIAL_PATTERN* ]]; then - report+="Skipped\t$file:$lineno::$test_name\n" - break - fi + # run the test + echo "Running ${parametrization}" + python ${defaults} "${parametrization}" - # run the test - report+="Ran\t$file:$lineno::$test_name\n" - python ${defaults} "${file}::${test_name}" - break - fi - done < <(echo "$test_code") + report+="Ran\t$parametrization\n" done if nvcc --version; then @@ -80,13 +66,14 @@ fi # TODO: enable when CI uses torch>=1.9 # test deadlock is properly handled with TorchElastic. 
-# LOGS=$(PL_RUNNING_SPECIAL_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a tests/plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") +# LOGS=$(PL_RUN_STANDALONE_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a tests/plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") # if [ -z "$LOGS" ]; then # exit 1 # fi # report+="Ran\ttests/plugins/environments/torch_elastic_deadlock.py\n" # test that a user can manually launch individual processes +export PYTHONPATH="${PYTHONPATH}:$(pwd)" args="--trainer.gpus 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py ${args} & MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py ${args} diff --git a/tests/trainer/connectors/test_signal_connector.py b/tests/trainer/connectors/test_signal_connector.py index 3da8c100fe40c..c5990be94e2cb 100644 --- a/tests/trainer/connectors/test_signal_connector.py +++ b/tests/trainer/connectors/test_signal_connector.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import concurrent.futures import os import signal from time import sleep @@ -19,19 +20,33 @@ import pytest from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import SLURMEnvironment +from pytorch_lightning.trainer.connectors.signal_connector import SignalConnector from pytorch_lightning.utilities.exceptions import ExitGracefullyException from tests.helpers import BoringModel from tests.helpers.runif import RunIf +@RunIf(skip_windows=True) +@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +def test_signal_handlers_restored_in_teardown(): + """Test that the SignalConnector restores the previously configured handler on teardown.""" + assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL + + trainer = Trainer(plugins=SLURMEnvironment()) + connector = SignalConnector(trainer) + connector.register_signal_handlers() + + assert signal.getsignal(signal.SIGUSR1) is not signal.SIG_DFL + connector.teardown() + assert signal.getsignal(signal.SIGUSR1) is signal.SIG_DFL + + @pytest.mark.parametrize("register_handler", [False, True]) @pytest.mark.parametrize("terminate_gracefully", [False, True]) -@RunIf(min_torch="1.7.0", skip_windows=True) +@RunIf(skip_windows=True) def test_fault_tolerant_sig_handler(register_handler, terminate_gracefully, tmpdir): - # hack to reset the signal - signal.signal(signal.SIGUSR1, 0) - if register_handler: def handler(*_): @@ -57,3 +72,44 @@ def training_step(self, batch, batch_idx): else: trainer.fit(model) assert trainer._terminate_gracefully == (False if register_handler else terminate_gracefully) + + # reset the signal to system defaults + signal.signal(signal.SIGUSR1, signal.SIG_DFL) + + +def _registering_signals(): + trainer = Trainer() + trainer.signal_connector.register_signal_handlers() + + +@RunIf(skip_windows=True) +@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +def 
test_signal_connector_in_thread(): + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + for future in concurrent.futures.as_completed([executor.submit(_registering_signals)]): + assert future.exception() is None + + +def signal_handler(): + pass + + +class SignalHandlers: + def signal_handler(self): + pass + + +@pytest.mark.parametrize( + ["handler", "expected_return"], + [ + (None, False), + (signal.Handlers.SIG_IGN, True), + (signal.Handlers.SIG_DFL, False), + (signal_handler, True), + (SignalHandlers().signal_handler, True), + ], +) +def test_has_already_handler(handler, expected_return): + """Test that the SignalConnector detects whether a signal handler is already attached.""" + with mock.patch("pytorch_lightning.trainer.connectors.signal_connector.signal.getsignal", return_value=handler): + assert SignalConnector._has_already_handler(signal.SIGTERM) is expected_return diff --git a/tests/trainer/flags/test_overfit_batches.py b/tests/trainer/flags/test_overfit_batches.py index 76c8b37405b47..3860d85ec9836 100644 --- a/tests/trainer/flags/test_overfit_batches.py +++ b/tests/trainer/flags/test_overfit_batches.py @@ -13,13 +13,16 @@ # limitations under the License. import pytest import torch +from torch.utils.data.sampler import Sampler, SequentialSampler from pytorch_lightning import Trainer from tests.helpers.boring_model import BoringModel, RandomDataset def test_overfit_multiple_val_loaders(tmpdir): - """Tests that only training_step can be used.""" + """Tests that overfit batches works with multiple val dataloaders.""" + val_dl_count = 2 + overfit_batches = 3 class TestModel(BoringModel): def validation_step(self, batch, batch_idx, dataloader_idx): @@ -31,25 +34,65 @@ def validation_epoch_end(self, outputs) -> None: pass def val_dataloader(self): - dl1 = torch.utils.data.DataLoader(RandomDataset(32, 64)) - dl2 = torch.utils.data.DataLoader(RandomDataset(32, 64)) - return [dl1, dl2] + dls = [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(val_dl_count)] + return dls model = TestModel() trainer = Trainer( - default_root_dir=tmpdir, max_epochs=2, overfit_batches=1, log_every_n_steps=1, enable_model_summary=False + default_root_dir=tmpdir, + max_epochs=2, + overfit_batches=overfit_batches, + log_every_n_steps=1, + enable_model_summary=False, ) trainer.fit(model) + assert trainer.num_training_batches == overfit_batches + assert len(trainer.num_val_batches) == val_dl_count + assert all(nbatches == overfit_batches for nbatches in trainer.num_val_batches) -@pytest.mark.parametrize("overfit", [1, 2, 0.1, 0.25, 1.0]) -def test_overfit_basic(tmpdir, overfit): - """Tests that only training_step can be used.""" +@pytest.mark.parametrize("overfit_batches", [1, 2, 0.1, 0.25, 1.0]) +def test_overfit_basic(tmpdir, overfit_batches): + """Tests that only training_step can be used when overfitting.""" model = BoringModel() + model.validation_step = None + total_train_samples = len(BoringModel().train_dataloader()) - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, overfit_batches=overfit, enable_model_summary=False) - + trainer = Trainer( + default_root_dir=tmpdir, max_epochs=1, overfit_batches=overfit_batches, enable_model_summary=False + ) trainer.fit(model) + + assert trainer.num_val_batches == [] + assert trainer.num_training_batches == int( + overfit_batches * (1 if isinstance(overfit_batches, int) else total_train_samples) + ) + + +def test_overfit_batches_raises_warning_in_case_of_sequential_sampler(tmpdir): + class NonSequentialSampler(Sampler): + 
def __init__(self, data_source): + self.data_source = data_source + + def __iter__(self): + return iter(range(len(self.data_source))) + + def __len__(self): + return len(self.data_source) + + class TestModel(BoringModel): + def train_dataloader(self): + dataset = RandomDataset(32, 64) + sampler = NonSequentialSampler(dataset) + return torch.utils.data.DataLoader(dataset, sampler=sampler) + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, overfit_batches=2) + + with pytest.warns(UserWarning, match="requested to overfit but enabled training dataloader shuffling"): + trainer.fit(model) + + assert isinstance(trainer.train_dataloader.loaders.sampler, SequentialSampler) diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py index 487b7f38e4e19..36c266343b849 100644 --- a/tests/trainer/logging_/test_distributed_logging.py +++ b/tests/trainer/logging_/test_distributed_logging.py @@ -59,7 +59,7 @@ def on_train_end(self): assert self.log_name.format(rank=self.local_rank) in self.logger.logs, "Expected rank to be logged" -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_all_rank_logging_ddp_cpu(tmpdir): """Check that all ranks can be logged from.""" model = TestModel() @@ -112,7 +112,6 @@ def on_fit_start(self, trainer, pl_module): def on_train_start(self, trainer, pl_module): assert trainer.logger.method_call - trainer.logger.log_hyperparams.assert_called_once() trainer.logger.log_graph.assert_called_once() logger = Mock() diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index b1b7217c892e5..d47cb1ef7d3bf 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py +++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -25,6 +25,7 @@ from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel, RandomDataset +from tests.helpers.runif import RunIf def test__validation_step__log(tmpdir): @@ -423,6 +424,12 @@ def make_logging(self, pl_module, func_name, on_steps, on_epochs, prob_bars): def on_test_start(self, _, pl_module): self.make_logging(pl_module, "on_test_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices) + def on_epoch_start(self, trainer, pl_module): + if trainer.testing: + self.make_logging( + pl_module, "on_epoch_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices + ) + def on_test_epoch_start(self, _, pl_module): self.make_logging( pl_module, "on_test_epoch_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices @@ -503,6 +510,10 @@ class ExtendedModel(BoringModel): val_losses = [] + def __init__(self, some_val=7): + super().__init__() + self.save_hyperparameters() + def training_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) @@ -666,3 +677,68 @@ def val_dataloader(self): enable_model_summary=False, ) trainer.fit(model) + + +@RunIf(min_gpus=1) +def test_evaluation_move_metrics_to_cpu_and_outputs(tmpdir): + class TestModel(BoringModel): + def validation_step(self, *args): + x = torch.tensor(2.0, requires_grad=True, device=self.device) + y = x * 2 + assert x.requires_grad is True + assert y.grad_fn is None # disabled by validation + + self.log("foo", y) + return y + + def validation_epoch_end(self, outputs): + # the step outputs were not moved + assert all(o.device == self.device for o in outputs), outputs + # but 
the logging results were + assert self.trainer.callback_metrics["foo"].device.type == "cpu" + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, limit_val_batches=2, move_metrics_to_cpu=True, gpus=1) + trainer.validate(model, verbose=False) + + +def test_logging_results_with_no_dataloader_idx(tmpdir): + num_dataloaders = 2 + log_common_same_val = {"test_log_common": 789} + log_common_diff_val = "test_log_common_diff_value" + log_key_no_dl_idx = "test_log_no_dl_idx_{}" + log_key_dl0 = {"test_log_a_class": 123} + log_key_dl1 = {"test_log_b_class": 456} + + class CustomBoringModel(BoringModel): + def test_step(self, batch, batch_idx, dataloader_idx): + self.log_dict(log_common_same_val) + self.log(log_common_diff_val, dataloader_idx + 1) + self.log( + log_key_no_dl_idx.format(dataloader_idx), + 321 * (dataloader_idx + 1), + add_dataloader_idx=False, + ) + self.log_dict(log_key_dl0 if dataloader_idx == 0 else log_key_dl1, add_dataloader_idx=False) + + def test_dataloader(self): + return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)] + + model = CustomBoringModel() + model.test_epoch_end = None + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + results = trainer.test(model) + + assert len(results) == num_dataloaders + assert results[0] == { + "test_log_common/dataloader_idx_0": 789.0, + "test_log_common_diff_value/dataloader_idx_0": 1.0, + "test_log_no_dl_idx_0": 321, + "test_log_a_class": 123.0, + } + assert results[1] == { + "test_log_common/dataloader_idx_1": 789.0, + "test_log_common_diff_value/dataloader_idx_1": 2.0, + "test_log_no_dl_idx_1": 321 * 2, + "test_log_b_class": 456.0, + } diff --git a/tests/trainer/logging_/test_logger_connector.py b/tests/trainer/logging_/test_logger_connector.py index d26245a377897..ef36211eb3b70 100644 --- a/tests/trainer/logging_/test_logger_connector.py +++ b/tests/trainer/logging_/test_logger_connector.py @@ -17,7 +17,7 @@ import pytest import torch from torch.utils.data import DataLoader -from torchmetrics import Accuracy, AveragePrecision +from torchmetrics import Accuracy, AveragePrecision, MeanAbsoluteError, MeanSquaredError from pytorch_lightning import LightningModule from pytorch_lightning.callbacks.base import Callback @@ -141,17 +141,17 @@ def test_fx_validator(tmpdir): and func_name not in ["on_train_end", "on_test_end", "on_validation_end"] ) if allowed: - validator.check_logging(fx_name=func_name, on_step=on_step, on_epoch=on_epoch) + validator.check_logging_levels(fx_name=func_name, on_step=on_step, on_epoch=on_epoch) if not is_start and is_stage: with pytest.raises(MisconfigurationException, match="must be one of"): - validator.check_logging(fx_name=func_name, on_step=True, on_epoch=on_epoch) + validator.check_logging_levels(fx_name=func_name, on_step=True, on_epoch=on_epoch) else: assert func_name in not_supported with pytest.raises(MisconfigurationException, match="You can't"): - validator.check_logging(fx_name=func_name, on_step=on_step, on_epoch=on_epoch) + validator.check_logging(fx_name=func_name) with pytest.raises(RuntimeError, match="Logging inside `foo` is not implemented"): - validator.check_logging("foo", False, False) + validator.check_logging("foo") class HookedCallback(Callback): @@ -527,9 +527,9 @@ def _assert_called(model, fn, stage): def test_result_collection_on_tensor_with_mean_reduction(): - result_collection = ResultCollection(True, torch.device("cpu")) + result_collection = ResultCollection(True) product = [(True, True), (False, True), (True, False), 
(False, False)] - values = torch.arange(1, 10).float() # need to convert to float() due to precision issues using torch 1.4 + values = torch.arange(1, 10) batches = values * values for i, v in enumerate(values): @@ -637,3 +637,51 @@ def training_step(self, batch, batch_idx): # should not get overridden if logged manually assert trainer.logged_metrics == {"epoch": -1} + + +def test_result_collection_batch_size_extraction(): + fx_name = "training_step" + log_val = torch.tensor(7.0) + + results = ResultCollection(training=True, device="cpu") + results.batch = torch.randn(1, 4) + train_mse = MeanSquaredError() + train_mse(torch.randn(4, 5), torch.randn(4, 5)) + results.log(fx_name, "train_logs", {"mse": train_mse, "log_val": log_val}, on_step=False, on_epoch=True) + assert results.batch_size == 1 + assert isinstance(results["training_step.train_logs"]["mse"].value, MeanSquaredError) + assert results["training_step.train_logs"]["log_val"].value == log_val + + results = ResultCollection(training=True, device="cpu") + results.batch = torch.randn(1, 4) + results.log(fx_name, "train_log", log_val, on_step=False, on_epoch=True) + assert results.batch_size == 1 + assert results["training_step.train_log"].value == log_val + assert results["training_step.train_log"].cumulated_batch_size == 1 + + +def test_result_collection_no_batch_size_extraction(): + results = ResultCollection(training=True, device="cpu") + results.batch = torch.randn(1, 4) + fx_name = "training_step" + batch_size = 10 + log_val = torch.tensor(7.0) + + train_mae = MeanAbsoluteError() + train_mae(torch.randn(4, 5), torch.randn(4, 5)) + train_mse = MeanSquaredError() + train_mse(torch.randn(4, 5), torch.randn(4, 5)) + results.log(fx_name, "step_log_val", log_val, on_step=True, on_epoch=False) + results.log(fx_name, "epoch_log_val", log_val, on_step=False, on_epoch=True, batch_size=batch_size) + results.log(fx_name, "epoch_sum_log_val", log_val, on_step=True, on_epoch=True, reduce_fx="sum") + results.log(fx_name, "train_mae", train_mae, on_step=True, on_epoch=False) + results.log(fx_name, "train_mse", {"mse": train_mse}, on_step=True, on_epoch=False) + + assert results.batch_size is None + assert isinstance(results["training_step.train_mse"]["mse"].value, MeanSquaredError) + assert isinstance(results["training_step.train_mae"].value, MeanAbsoluteError) + assert results["training_step.step_log_val"].value == log_val + assert results["training_step.step_log_val"].cumulated_batch_size == 0 + assert results["training_step.epoch_log_val"].value == log_val * batch_size + assert results["training_step.epoch_log_val"].cumulated_batch_size == batch_size + assert results["training_step.epoch_sum_log_val"].value == log_val diff --git a/tests/trainer/logging_/test_loop_logging.py b/tests/trainer/logging_/test_loop_logging.py new file mode 100644 index 0000000000000..2c2f2253c42a3 --- /dev/null +++ b/tests/trainer/logging_/test_loop_logging.py @@ -0,0 +1,108 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Test logging in the training loop.""" +import inspect +from unittest import mock +from unittest.mock import ANY + +import torch + +from pytorch_lightning import Trainer +from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator +from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection +from pytorch_lightning.trainer.states import RunningStage, TrainerFn +from tests.helpers.boring_model import BoringModel + + +def test_default_level_for_hooks_that_support_logging(): + def _make_assertion(model, hooks, result_mock, on_step, on_epoch, extra_kwargs): + for hook in hooks: + model._current_fx_name = hook + model.log(hook, 1) + result_mock.assert_called_with( + hook, hook, torch.tensor(1), on_step=on_step, on_epoch=on_epoch, **extra_kwargs + ) + + trainer = Trainer() + model = BoringModel() + model.trainer = trainer + extra_kwargs = { + k: ANY + for k in inspect.signature(ResultCollection.log).parameters + if k not in ["self", "fx", "name", "value", "on_step", "on_epoch"] + } + all_logging_hooks = {k for k in _FxValidator.functions if _FxValidator.functions[k]} + + with mock.patch( + "pytorch_lightning.trainer.connectors.logger_connector.result.ResultCollection.log", return_value=None + ) as result_mock: + trainer.state.stage = RunningStage.TRAINING + hooks = [ + "on_before_backward", + "on_after_backward", + "on_before_optimizer_step", + "on_before_zero_grad", + "training_step", + "training_step_end", + "on_batch_start", + "on_batch_end", + "on_train_batch_start", + "on_train_batch_end", + ] + all_logging_hooks = all_logging_hooks - set(hooks) + _make_assertion(model, hooks, result_mock, on_step=True, on_epoch=False, extra_kwargs=extra_kwargs) + + hooks = [ + "on_train_start", + "on_train_epoch_start", + "on_train_epoch_end", + "on_epoch_start", + "on_epoch_end", + "training_epoch_end", + ] + all_logging_hooks = all_logging_hooks - set(hooks) + _make_assertion(model, hooks, result_mock, on_step=False, on_epoch=True, extra_kwargs=extra_kwargs) + + trainer.state.stage = RunningStage.VALIDATING + trainer.state.fn = TrainerFn.VALIDATING + hooks = [ + "on_validation_start", + "on_validation_epoch_start", + "on_validation_epoch_end", + "on_validation_batch_start", + "on_validation_batch_end", + "validation_step", + "validation_step_end", + "validation_epoch_end", + ] + all_logging_hooks = all_logging_hooks - set(hooks) + _make_assertion(model, hooks, result_mock, on_step=False, on_epoch=True, extra_kwargs=extra_kwargs) + + trainer.state.stage = RunningStage.TESTING + trainer.state.fn = TrainerFn.TESTING + hooks = [ + "on_test_start", + "on_test_epoch_start", + "on_test_epoch_end", + "on_test_batch_start", + "on_test_batch_end", + "test_step", + "test_step_end", + "test_epoch_end", + ] + all_logging_hooks = all_logging_hooks - set(hooks) + _make_assertion(model, hooks, result_mock, on_step=False, on_epoch=True, extra_kwargs=extra_kwargs) + + # just to ensure we checked all possible logging hooks here + assert len(all_logging_hooks) == 0 diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 6cad94017177e..2ad2585f0fe02 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -272,13 +272,11 @@ def on_train_start(self, _, pl_module): self.make_logging(pl_module, "on_train_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices) def on_epoch_start(self, _, pl_module): - self.make_logging( - 
pl_module, "on_epoch_start", on_steps=self.choices, on_epochs=[True], prob_bars=self.choices - ) + self.make_logging(pl_module, "on_epoch_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices) def on_train_epoch_start(self, _, pl_module): self.make_logging( - pl_module, "on_train_epoch_start", on_steps=self.choices, on_epochs=[True], prob_bars=self.choices + pl_module, "on_train_epoch_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices ) def on_batch_start(self, _, pl_module, *__): @@ -397,7 +395,7 @@ def validation_step(self, batch, batch_idx): return super().validation_step(batch, batch_idx) -@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(skip_windows=True))]) +@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(skip_windows=True, skip_49370=True))]) def test_logging_sync_dist_true(tmpdir, devices): """Tests to ensure that the sync_dist flag works (should just return the original value)""" fake_result = 1 @@ -435,7 +433,7 @@ def test_logging_sync_dist_true(tmpdir, devices): assert metrics["bar_3"] == 2 + int(use_multiple_devices) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_logging_sync_dist_true_ddp(tmpdir): """Tests to ensure that the sync_dist flag works with ddp.""" @@ -717,19 +715,15 @@ def on_validation_epoch_end(self): assert all(v == 3 for v in self.trainer.callback_metrics.values()) def on_train_batch_start(self, batch, batch_idx): - assert self.trainer._results.batch_size == 2 - self.log("on_train_batch_start", 1.0, reduce_fx="sum") + self.log("on_train_batch_start", 1.0, on_step=False, on_epoch=True, reduce_fx="sum") def on_train_batch_end(self, outputs, batch, batch_idx): - assert self.trainer._results.batch_size == 2 - self.log("on_train_batch_end", 1.0, reduce_fx="sum") + self.log("on_train_batch_end", 1.0, on_step=False, on_epoch=True, reduce_fx="sum") def on_validation_batch_start(self, batch, batch_idx, dataloader_idx): - assert self.trainer._results.batch_size == 2 self.log("on_validation_batch_start", 1.0, reduce_fx="sum") def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx): - assert self.trainer._results.batch_size == 2 self.log("on_validation_batch_end", 1.0, reduce_fx="sum") def training_epoch_end(self, *_) -> None: diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 5c86fd6343002..82acfb8b08f8c 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -840,7 +840,7 @@ def train_manual_optimization(tmpdir, strategy, model_cls=TesManualOptimizationD assert not torch.equal(param.cpu().data, param_copy.data) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_step_with_optimizer_closure_with_different_frequencies_ddp(tmpdir): """Tests that `step` works with optimizer_closure and different accumulated_gradient frequency.""" @@ -910,7 +910,7 @@ def dis_closure(): opt_dis.zero_grad() -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model(tmpdir): train_manual_optimization(tmpdir, "ddp", model_cls=TestManualOptimizationDDPModelToggleModel) diff --git a/tests/trainer/optimization/test_optimizers.py b/tests/trainer/optimization/test_optimizers.py index b2d88becb1ec7..4a99b3318f06f 100644 --- a/tests/trainer/optimization/test_optimizers.py +++ 
b/tests/trainer/optimization/test_optimizers.py @@ -537,7 +537,7 @@ def configure_optimizers(self): trainer.fit(model) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_optimizer_state_on_device(tmpdir): """Test that optimizers that create state initially at instantiation still end up with the state on the GPU.""" diff --git a/tests/trainer/properties/test_get_model.py b/tests/trainer/properties/test_get_model.py index 6e405739e83fe..ed81b90a2d142 100644 --- a/tests/trainer/properties/test_get_model.py +++ b/tests/trainer/properties/test_get_model.py @@ -37,7 +37,7 @@ def test_get_model(tmpdir): trainer.fit(model) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_get_model_ddp_cpu(tmpdir): """Tests that `trainer.lightning_module` extracts the model correctly when using ddp on cpu.""" diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 723cff55c6860..8f745db4b8400 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -20,9 +20,12 @@ from torch.utils.data.sampler import BatchSampler, Sampler, SequentialSampler from pytorch_lightning import Trainer +from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin +from pytorch_lightning.trainer.states import RunningStage +from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities.enums import DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers import BoringModel, RandomDataset +from tests.helpers.boring_model import BoringModel, RandomDataset, RandomIterableDataset from tests.helpers.runif import RunIf @@ -133,7 +136,7 @@ def _get_warning_msg(): assert warn_str in msg -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) @pytest.mark.parametrize("num_workers", [0, 1]) def test_dataloader_warnings(tmpdir, num_workers): trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_spawn", num_processes=2, fast_dev_run=4) @@ -279,29 +282,30 @@ class CustomSampler(Sampler): # Should raise an error if existing sampler is being replaced dataloader = CustomDataLoader(dataset, sampler=CustomSampler(dataset)) - with pytest.raises(MisconfigurationException, match="will be replaced by `DistributedSampler`"): + with pytest.raises(MisconfigurationException, match="will be replaced by `DistributedSampler`"): trainer.prepare_dataloader(dataloader, shuffle=True) -def test_loader_detaching(): - """Checks that the loader has been resetted after the entrypoint.""" +class LoaderTestModel(BoringModel): + def training_step(self, batch, batch_idx): + assert len(self.trainer.train_dataloader.loaders) == 10 + return super().training_step(batch, batch_idx) - class LoaderTestModel(BoringModel): - def training_step(self, batch, batch_idx): - assert len(self.trainer.train_dataloader.loaders) == 10 - return super().training_step(batch, batch_idx) + def validation_step(self, batch, batch_idx): + assert len(self.trainer.val_dataloaders[0]) == 10 + return super().validation_step(batch, batch_idx) - def validation_step(self, batch, batch_idx): - assert len(self.trainer.val_dataloaders[0]) == 10 - return super().validation_step(batch, batch_idx) + def test_step(self, batch, batch_idx): + assert len(self.trainer.test_dataloaders[0]) == 10 + return super().test_step(batch, batch_idx) - def test_step(self, batch, batch_idx): - assert len(self.trainer.test_dataloaders[0]) == 10 - return super().test_step(batch, batch_idx) + def 
predict_step(self, batch, batch_idx, dataloader_idx=0): + assert len(self.trainer.predict_dataloaders[0]) == 10 + return super().predict_step(batch, batch_idx, dataloader_idx=dataloader_idx) - def predict_step(self, batch, batch_idx, dataloader_idx=None): - assert len(self.trainer.predict_dataloaders[0]) == 10 - return super().predict_step(batch, batch_idx, dataloader_idx=dataloader_idx) + +def test_loader_detaching(): + """Checks that the loader has been resetted after the entrypoint.""" loader = DataLoader(RandomDataset(32, 10), batch_size=1) @@ -340,3 +344,62 @@ def predict_step(self, batch, batch_idx, dataloader_idx=None): assert len(model.val_dataloader()) == 64 assert len(model.predict_dataloader()) == 64 assert len(model.test_dataloader()) == 64 + + +def test_pre_made_batches(): + """Check that loader works with pre-made batches.""" + loader = DataLoader(RandomDataset(32, 10), batch_size=None) + trainer = Trainer(fast_dev_run=1) + trainer.predict(LoaderTestModel(), loader) + + +def test_error_raised_with_float_limited_eval_batches(): + """Test that an error is raised if there are not enough batches when passed with float value of + limit_eval_batches.""" + model = BoringModel() + dl_size = len(model.val_dataloader()) + limit_val_batches = 1 / (dl_size + 2) + trainer = Trainer(limit_val_batches=limit_val_batches) + trainer._data_connector.attach_data(model) + with pytest.raises( + MisconfigurationException, + match=fr"{limit_val_batches} \* {dl_size} < 1. Please increase the `limit_val_batches`", + ): + trainer._reset_eval_dataloader(RunningStage.VALIDATING, model) + + +@pytest.mark.parametrize( + "val_dl", + [ + DataLoader(dataset=RandomDataset(32, 64), shuffle=True), + CombinedLoader(DataLoader(dataset=RandomDataset(32, 64), shuffle=True)), + CombinedLoader( + [DataLoader(dataset=RandomDataset(32, 64)), DataLoader(dataset=RandomDataset(32, 64), shuffle=True)] + ), + CombinedLoader( + { + "dl1": DataLoader(dataset=RandomDataset(32, 64)), + "dl2": DataLoader(dataset=RandomDataset(32, 64), shuffle=True), + } + ), + ], +) +def test_non_sequential_sampler_warning_is_raised_for_eval_dataloader(val_dl): + trainer = Trainer() + model = BoringModel() + trainer._data_connector.attach_data(model, val_dataloaders=val_dl) + with pytest.warns(UserWarning, match="recommended .* turn this off for val/test/predict"): + trainer._reset_eval_dataloader(RunningStage.VALIDATING, model) + + +@pytest.mark.parametrize("mode", [RunningStage.TRAINING, RunningStage.PREDICTING, RunningStage.TESTING]) +def test_dataloader_kwargs_replacement_with_iterable_dataset(mode): + """Test that DataLoader kwargs are not replaced when using Iterable Dataset.""" + dataset = RandomIterableDataset(7, 100) + dataloader = DataLoader(dataset, batch_size=32) + dl_kwargs = TrainerDataLoadingMixin._get_dataloader_init_kwargs(dataloader, dataloader.sampler, mode=mode) + assert dl_kwargs["sampler"] is None + assert dl_kwargs["batch_sampler"] is None + assert dl_kwargs["batch_size"] is dataloader.batch_size + assert dl_kwargs["dataset"] is dataloader.dataset + assert dl_kwargs["collate_fn"] is dataloader.collate_fn diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 533eceb8018db..55956c954e497 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -1122,17 +1122,12 @@ def test_dataloaders_load_only_once(tmpdir): assert tracker.mock_calls == [call.val_dataloader(), call.train_dataloader()] -def test_dataloaders_load_only_once_val_interval(tmpdir): +def 
test_dataloaders_load_only_once_no_sanity_check(tmpdir): model = BoringModel() # logger file to get meta trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=10, - limit_val_batches=10, - val_check_interval=0.3, - reload_dataloaders_every_n_epochs=True, - max_epochs=3, + default_root_dir=tmpdir, limit_train_batches=0.3, limit_val_batches=0.3, num_sanity_val_steps=0, max_epochs=3 ) tracker = Mock() @@ -1145,34 +1140,33 @@ def test_dataloaders_load_only_once_val_interval(tmpdir): tracker.attach_mock(model.test_dataloader, "test_dataloader") trainer.fit(model) - trainer.test(model) # verify the sequence - expected_sequence = [ - call.val_dataloader(), - call.train_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.train_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.train_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.test_dataloader(), - ] + expected_sequence = [call.train_dataloader(), call.val_dataloader()] assert tracker.mock_calls == expected_sequence -def test_dataloaders_load_only_once_no_sanity_check(tmpdir): - model = BoringModel() +@pytest.mark.parametrize("n", [1, 2]) +def test_dataloaders_load_every_n_epochs(tmpdir, n): + train_reload_epochs, val_reload_epochs = [], [] + + class TestModel(BoringModel): + def train_dataloader(self): + train_reload_epochs.append(self.current_epoch) + return super().train_dataloader() + + def val_dataloader(self): + val_reload_epochs.append(self.current_epoch) + return super().val_dataloader() + + model = TestModel() - # logger file to get meta trainer = Trainer( - default_root_dir=tmpdir, limit_train_batches=0.3, limit_val_batches=0.3, num_sanity_val_steps=0, max_epochs=3 + default_root_dir=tmpdir, + limit_train_batches=0.3, + limit_val_batches=0.3, + reload_dataloaders_every_n_epochs=n, + max_epochs=5, ) tracker = Mock() @@ -1185,44 +1179,113 @@ def test_dataloaders_load_only_once_no_sanity_check(tmpdir): tracker.attach_mock(model.test_dataloader, "test_dataloader") trainer.fit(model) + trainer.test(model) + + # Verify the sequence + expected_sequence = [call.val_dataloader(), call.train_dataloader()] # Sanity check first + if n == 1: + expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 4 + elif n == 2: + expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 2 + expected_sequence += [call.test_dataloader()] - # verify the sequence - expected_sequence = [call.train_dataloader(), call.val_dataloader()] assert tracker.mock_calls == expected_sequence + # Verify epoch of reloads + if n == 1: + assert train_reload_epochs == [0, 1, 2, 3, 4] + assert val_reload_epochs == [0, 1, 2, 3, 4] + elif n == 2: + assert train_reload_epochs == [0, 2, 4] + assert val_reload_epochs == [0, 2, 4] -@pytest.mark.parametrize("n", [1, 2]) -def test_dataloaders_load_every_n_epochs(tmpdir, n): - model = BoringModel() + +@pytest.mark.parametrize( + "n, train_reload_epochs_expect, val_reload_epochs_expect", + [ + # Sanity check at epoch 0 creates a validation dataloader, but validation is + # checked (and in this case reloaded) every n epochs starting from epoch n-1 + (3, [0, 2, 4, 6, 8], [0, 2, 5, 8]), + (5, [0, 2, 4, 6, 8], [0, 4, 9]), + ], +) +def test_dataloaders_load_every_n_epochs_infrequent_val( + tmpdir, n, train_reload_epochs_expect, val_reload_epochs_expect +): + """Test dataloader reload behavior when infrequently checking validation set (via check_val_every_n_epoch)""" + 
train_reload_epochs, val_reload_epochs = [], [] + + class TestModel(BoringModel): + def train_dataloader(self): + train_reload_epochs.append(self.current_epoch) + return super().train_dataloader() + + def val_dataloader(self): + val_reload_epochs.append(self.current_epoch) + return super().val_dataloader() + + model = TestModel() trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=0.3, limit_val_batches=0.3, - reload_dataloaders_every_n_epochs=n, + check_val_every_n_epoch=n, + reload_dataloaders_every_n_epochs=2, + max_epochs=10, + ) + model.test_dataloader = Mock(wraps=model.test_dataloader) + + trainer.fit(model) + trainer.test(model) + + # Verify epoch of reloads + assert train_reload_epochs == train_reload_epochs_expect + assert val_reload_epochs == val_reload_epochs_expect + + model.test_dataloader.assert_called_once() + + +def test_dataloaders_load_every_n_epochs_frequent_val(tmpdir): + """Test dataloader reload behavior when frequently checking validation set (via val_check_interval)""" + train_reload_epochs, val_reload_epochs, val_check_epochs = [], [], [] + + class TestModel(BoringModel): + def train_dataloader(self): + train_reload_epochs.append(self.current_epoch) + return super().train_dataloader() + + def val_dataloader(self): + val_reload_epochs.append(self.current_epoch) + return super().val_dataloader() + + def validation_epoch_end(self, outputs): + val_check_epochs.append(self.current_epoch) + return super().validation_epoch_end(outputs) + + model = TestModel() + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=0.3, + limit_val_batches=0.3, + val_check_interval=0.3, + reload_dataloaders_every_n_epochs=1, max_epochs=3, ) - tracker = Mock() - model.train_dataloader = Mock(wraps=model.train_dataloader) - model.val_dataloader = Mock(wraps=model.val_dataloader) model.test_dataloader = Mock(wraps=model.test_dataloader) - tracker.attach_mock(model.train_dataloader, "train_dataloader") - tracker.attach_mock(model.val_dataloader, "val_dataloader") - tracker.attach_mock(model.test_dataloader, "test_dataloader") - trainer.fit(model) trainer.test(model) - # verify the sequence - expected_sequence = [call.val_dataloader()] - if n == 1: - expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 3 - elif n == 2: - expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 2 - expected_sequence += [call.test_dataloader()] - assert tracker.mock_calls == expected_sequence + # Verify epoch of reloads + assert train_reload_epochs == [0, 1, 2] + assert val_reload_epochs == [0, 1, 2] + model.test_dataloader.assert_called_once() + + # Verify validation happens 3 times per epoch + 1 for sanity check + assert val_check_epochs == [0, 0, 0, 0, 1, 1, 1, 2, 2, 2] @pytest.mark.parametrize("n", ["test", -1]) @@ -1269,15 +1332,6 @@ def validation_step(self, batch, batch_idx): expected_calls = [ call.train_dataloader(), call.val_dataloader(), - # This has subsequent calls to val_dataloader - # because the training loop runs the evaluation loop, - # which reloads the val dataloader again. - # We cannot yet rely on trainer.current_epoch=0 to skip reloading - # the val dataloader on the first epoch because this only tracks the training epoch - # meaning multiple passes through the validation data within a single training epoch - # would not have the dataloader reloaded. 
- # This breaks the assumption behind reload_dataloaders_every_epoch=True - call.val_dataloader(), call.train_dataloader(), call.val_dataloader(), call.train_dataloader(), diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py index 204f3079f544b..e4598550c24fb 100644 --- a/tests/trainer/test_supporters.py +++ b/tests/trainer/test_supporters.py @@ -33,8 +33,10 @@ ) from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.auto_restart import CaptureMapDataset, FastForwardSampler +from pytorch_lightning.utilities.data import get_len from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7 +from tests.helpers.boring_model import RandomDataset def test_tensor_running_accum_reset(): @@ -379,3 +381,56 @@ def _assert_dataset(loader): assert isinstance(d, CustomDataset) apply_to_collection(dataloader.loaders, DataLoader, _assert_dataset) + + +@pytest.mark.parametrize("replace_sampler_ddp", [False, True]) +def test_combined_data_loader_with_max_size_cycle_and_ddp(replace_sampler_ddp, tmpdir): + """This test makes sure distributed sampler has been properly injected in dataloaders when using CombinedLoader + with ddp and `max_size_cycle` mode.""" + trainer = Trainer(strategy="ddp", accelerator="auto", devices=2, replace_sampler_ddp=replace_sampler_ddp) + + dataloader = CombinedLoader( + {"a": DataLoader(RandomDataset(32, 8), batch_size=1), "b": DataLoader(RandomDataset(32, 8), batch_size=1)}, + ) + dataloader = trainer.prepare_dataloader(dataloader, shuffle=False) + assert len(dataloader) == 4 if replace_sampler_ddp else 8 + + for a_length in [6, 8, 10]: + dataloader = CombinedLoader( + { + "a": DataLoader(range(a_length), batch_size=1), + "b": DataLoader(range(8), batch_size=1), + }, + mode="max_size_cycle", + ) + + length = max(a_length, 8) + assert len(dataloader) == length + dataloader = trainer.prepare_dataloader(dataloader, shuffle=False) + assert len(dataloader) == length // 2 if replace_sampler_ddp else length + if replace_sampler_ddp: + last_batch = list(dataloader)[-1] + if a_length == 6: + assert last_batch == {"a": torch.tensor([0]), "b": torch.tensor([6])} + elif a_length == 8: + assert last_batch == {"a": torch.tensor([6]), "b": torch.tensor([6])} + elif a_length == 10: + assert last_batch == {"a": torch.tensor([8]), "b": torch.tensor([0])} + + class InfiniteDataset(IterableDataset): + def __iter__(self): + while True: + yield 1 + + dataloader = CombinedLoader( + { + "a": DataLoader(InfiniteDataset(), batch_size=1), + "b": DataLoader(range(8), batch_size=1), + }, + mode="max_size_cycle", + ) + assert get_len(dataloader) == float("inf") + assert len(dataloader.loaders["b"].loader) == 8 + dataloader = trainer.prepare_dataloader(dataloader, shuffle=False) + assert len(dataloader.loaders["b"].loader) == 4 if replace_sampler_ddp else 8 + assert get_len(dataloader) == float("inf") diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 0a3eacb23863f..65006a98b30e8 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1452,30 +1452,27 @@ def test_trainer_predict_cpu(tmpdir, datamodule, enable_progress_bar): predict(tmpdir, datamodule=datamodule, enable_progress_bar=enable_progress_bar) -@RunIf(min_gpus=2, special=True) -@pytest.mark.parametrize("num_gpus", [1, 2]) -def test_trainer_predict_dp(tmpdir, num_gpus): - predict(tmpdir, strategy="dp", accelerator="gpu", 
devices=num_gpus) - - -@RunIf(min_gpus=2, special=True, fairscale=True) -def test_trainer_predict_ddp(tmpdir): - predict(tmpdir, strategy="ddp", accelerator="gpu", devices=2) - - -@RunIf(min_gpus=2, skip_windows=True, special=True) -def test_trainer_predict_ddp_spawn(tmpdir): - predict(tmpdir, strategy="dp", accelerator="gpu", devices=2) +@RunIf(min_gpus=2, standalone=True) +@pytest.mark.parametrize( + "kwargs", + [ + {"strategy": "dp", "devices": 1}, + {"strategy": "dp", "devices": 2}, + {"strategy": "ddp", "devices": 2}, + ], +) +def test_trainer_predict_standalone(tmpdir, kwargs): + predict(tmpdir, accelerator="gpu", **kwargs) -@RunIf(min_gpus=1, special=True) +@RunIf(min_gpus=1) def test_trainer_predict_1_gpu(tmpdir): predict(tmpdir, accelerator="gpu", devices=1) @RunIf(skip_windows=True) -def test_trainer_predict_ddp_cpu(tmpdir): - predict(tmpdir, strategy="ddp_spawn", accelerator="cpu", devices=2) +def test_trainer_predict_ddp_spawn(tmpdir): + predict(tmpdir, strategy="ddp_spawn", accelerator="auto", devices=2) @pytest.mark.parametrize("dataset_cls", [RandomDataset, RandomIterableDatasetWithLen, RandomIterableDataset]) @@ -1505,14 +1502,10 @@ def write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *arg def test_spawn_predict_return_predictions(_, __, accelerator): """Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins.""" model = BoringModel() - - def run(expected_plugin, **trainer_kwargs): - trainer = Trainer(**trainer_kwargs, fast_dev_run=True) - assert isinstance(trainer.training_type_plugin, expected_plugin) - with pytest.raises(MisconfigurationException, match="`return_predictions` should be set to `False`"): - trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True) - - run(DDPSpawnPlugin, accelerator=accelerator, strategy="ddp_spawn", devices=2) + trainer = Trainer(accelerator=accelerator, strategy="ddp_spawn", devices=2, fast_dev_run=True) + assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) + with pytest.raises(MisconfigurationException, match="`return_predictions` should be set to `False`"): + trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True) @pytest.mark.parametrize("return_predictions", [None, False, True]) @@ -1809,7 +1802,7 @@ def on_predict_start(self) -> None: @pytest.mark.parametrize( - "strategy,num_processes", [(None, 1), pytest.param("ddp_spawn", 2, marks=RunIf(skip_windows=True))] + "strategy,num_processes", [(None, 1), pytest.param("ddp_spawn", 2, marks=RunIf(skip_windows=True, skip_49370=True))] ) def test_model_in_correct_mode_during_stages(tmpdir, strategy, num_processes): model = TrainerStagesModel() @@ -1830,7 +1823,7 @@ def validation_epoch_end(self, outputs) -> None: pass -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_fit_test_synchronization(tmpdir): """Test that the trainer synchronizes processes before returning control back to the caller.""" tutils.set_random_main_port() @@ -1892,7 +1885,7 @@ class CustomException(Exception): pass -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_ddp_terminate_when_deadlock_is_detected(tmpdir): """Test that DDP kills the remaining processes when only one rank is throwing an exception.""" diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index 073468fc4cb28..01ffd12a0ca62 100644 --- a/tests/utilities/test_all_gather_grad.py +++ 
b/tests/utilities/test_all_gather_grad.py @@ -41,13 +41,13 @@ def _test_all_gather_ddp(rank, world_size): assert torch.allclose(grad2, tensor2.grad) -@RunIf(skip_windows=True) -def test_all_gather_ddp(): +@RunIf(skip_windows=True, skip_49370=True, skip_hanging_spawn=True) +def test_all_gather_ddp_spawn(): world_size = 3 torch.multiprocessing.spawn(_test_all_gather_ddp, args=(world_size,), nprocs=world_size) -@RunIf(min_gpus=2, skip_windows=True, special=True) +@RunIf(min_gpus=2, skip_windows=True, standalone=True) def test_all_gather_collection(tmpdir): class TestModel(BoringModel): @@ -98,7 +98,7 @@ def training_epoch_end(self, outputs) -> None: assert model.training_epoch_end_called -@RunIf(min_gpus=2, skip_windows=True, special=True) +@RunIf(min_gpus=2, skip_windows=True, standalone=True) def test_all_gather_sync_grads(tmpdir): class TestModel(BoringModel): diff --git a/tests/utilities/test_apply_func.py b/tests/utilities/test_apply_func.py index 2c131f96ecc6f..9b0fcbd643744 100644 --- a/tests/utilities/test_apply_func.py +++ b/tests/utilities/test_apply_func.py @@ -13,8 +13,9 @@ # limitations under the License. import dataclasses import numbers -from collections import namedtuple, OrderedDict -from typing import List +from collections import defaultdict, namedtuple, OrderedDict +from dataclasses import InitVar +from typing import Any, ClassVar, List, Optional import numpy as np import pytest @@ -31,6 +32,12 @@ class Feature: input_ids: torch.Tensor segment_ids: np.ndarray + def __eq__(self, o: object) -> bool: + if not isinstance(o, Feature): + return NotImplemented + else: + return torch.equal(self.input_ids, o.input_ids) and np.equal(self.segment_ids, o.segment_ids).all() + @dataclasses.dataclass class ModelExample: example_ids: List[str] @@ -41,6 +48,71 @@ class ModelExample: def __post_init__(self): self.some_constant = 7 + def __eq__(self, o: object) -> bool: + if not isinstance(o, ModelExample): + return NotImplemented + else: + return ( + self.example_ids == o.example_ids + and self.feature == o.feature + and torch.equal(self.label, o.label) + and self.some_constant == o.some_constant + ) + + @dataclasses.dataclass + class WithClassVar: + class_var: ClassVar[int] = 0 + dummy: Any + + def __eq__(self, o: object) -> bool: + if not isinstance(o, WithClassVar): + return NotImplemented + elif isinstance(self.dummy, torch.Tensor): + return torch.equal(self.dummy, o.dummy) + else: + return self.dummy == o.dummy + + @dataclasses.dataclass + class WithInitVar: + dummy: Any + override: InitVar[Optional[Any]] = None + + def __post_init__(self, override: Optional[Any]): + if override is not None: + self.dummy = override + + def __eq__(self, o: object) -> bool: + if not isinstance(o, WithInitVar): + return NotImplemented + elif isinstance(self.dummy, torch.Tensor): + return torch.equal(self.dummy, o.dummy) + else: + return self.dummy == o.dummy + + @dataclasses.dataclass + class WithClassAndInitVar: + class_var: ClassVar[torch.Tensor] = torch.tensor(0) + dummy: Any + override: InitVar[Optional[Any]] = torch.tensor(1) + + def __post_init__(self, override: Optional[Any]): + if override is not None: + self.dummy = override + + def __eq__(self, o: object) -> bool: + if not isinstance(o, WithClassAndInitVar): + return NotImplemented + elif isinstance(self.dummy, torch.Tensor): + return torch.equal(self.dummy, o.dummy) + else: + return self.dummy == o.dummy + + model_example = ModelExample( + example_ids=["i-1", "i-2", "i-3"], + feature=Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), 
segment_ids=np.array([4.0, 5.0, 6.0])), + label=torch.tensor([7.0, 8.0, 9.0]), + ) + to_reduce = { "a": torch.tensor([1.0]), # Tensor "b": [torch.tensor([2.0])], # list @@ -50,13 +122,18 @@ def __post_init__(self): "f": "this_is_a_dummy_str", # string "g": 12.0, # number "h": Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])), # dataclass - "i": ModelExample( - example_ids=["i-1", "i-2", "i-3"], - feature=Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])), - label=torch.tensor([7.0, 8.0, 9.0]), - ), # nested dataclass + "i": model_example, # nested dataclass + "j": WithClassVar(torch.arange(3)), # dataclass with class variable + "k": WithInitVar("this_gets_overridden", torch.tensor([2.0])), # dataclass with init-only variable + "l": WithClassAndInitVar(model_example, None), # nested dataclass with class and init-only variables } + model_example_result = ModelExample( + example_ids=["i-1", "i-2", "i-3"], + feature=Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), + label=torch.tensor([14.0, 16.0, 18.0]), + ) + expected_result = { "a": torch.tensor([2.0]), "b": [torch.tensor([4.0])], @@ -66,32 +143,31 @@ def __post_init__(self): "f": "this_is_a_dummy_str", "g": 24.0, "h": Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), - "i": ModelExample( - example_ids=["i-1", "i-2", "i-3"], - feature=Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), - label=torch.tensor([14.0, 16.0, 18.0]), - ), + "i": model_example_result, + "j": WithClassVar(torch.arange(0, 6, 2)), + "k": WithInitVar(torch.tensor([4.0])), + "l": WithClassAndInitVar(model_example_result, None), } reduced = apply_to_collection(to_reduce, (torch.Tensor, numbers.Number, np.ndarray), lambda x: x * 2) - assert isinstance(reduced, dict), " Type Consistency of dict not preserved" + assert isinstance(reduced, dict), "Type Consistency of dict not preserved" assert all(x in reduced for x in to_reduce), "Not all entries of the dict were preserved" assert all( isinstance(reduced[k], type(expected_result[k])) for k in to_reduce ), "At least one type was not correctly preserved" assert isinstance(reduced["a"], torch.Tensor), "Reduction Result of a Tensor should be a Tensor" - assert torch.allclose(expected_result["a"], reduced["a"]), "Reduction of a tensor does not yield the expected value" + assert torch.equal(expected_result["a"], reduced["a"]), "Reduction of a tensor does not yield the expected value" assert isinstance(reduced["b"], list), "Reduction Result of a list should be a list" assert all( - torch.allclose(x, y) for x, y in zip(reduced["b"], expected_result["b"]) + torch.equal(x, y) for x, y in zip(reduced["b"], expected_result["b"]) ), "At least one value of list reduction did not come out as expected" assert isinstance(reduced["c"], tuple), "Reduction Result of a tuple should be a tuple" assert all( - torch.allclose(x, y) for x, y in zip(reduced["c"], expected_result["c"]) + torch.equal(x, y) for x, y in zip(reduced["c"], expected_result["c"]) ), "At least one value of tuple reduction did not come out as expected" assert isinstance(reduced["d"], ntc), "Type Consistency for named tuple not given" @@ -109,34 +185,30 @@ def __post_init__(self): assert isinstance(reduced["g"], numbers.Number), "Reduction of a number should result in a number" assert reduced["g"] == expected_result["g"], "Reduction of a number did not yield the desired result" - 
assert dataclasses.is_dataclass(reduced["h"]) and not isinstance( - reduced["h"], type - ), "Reduction of a dataclass should result in a dataclass" - assert torch.allclose( - reduced["h"].input_ids, expected_result["h"].input_ids - ), "Reduction of a dataclass did not yield the desired result" - assert np.allclose( - reduced["h"].segment_ids, expected_result["h"].segment_ids - ), "Reduction of a dataclass did not yield the desired result" - - assert dataclasses.is_dataclass(reduced["i"]) and not isinstance( - reduced["i"], type - ), "Reduction of a dataclass should result in a dataclass" - assert dataclasses.is_dataclass(reduced["i"].feature) and not isinstance( - reduced["i"].feature, type - ), "Reduction of a nested dataclass should result in a nested dataclass" - assert ( - reduced["i"].example_ids == expected_result["i"].example_ids - ), "Reduction of a nested dataclass did not yield the desired result" - assert torch.allclose( - reduced["i"].label, expected_result["i"].label - ), "Reduction of a nested dataclass did not yield the desired result" - assert torch.allclose( - reduced["i"].feature.input_ids, expected_result["i"].feature.input_ids - ), "Reduction of a nested dataclass did not yield the desired result" - assert np.allclose( - reduced["i"].feature.segment_ids, expected_result["i"].feature.segment_ids - ), "Reduction of a nested dataclass did not yield the desired result" + def _assert_dataclass_reduction(actual, expected, dataclass_type: str = ""): + assert dataclasses.is_dataclass(actual) and not isinstance( + actual, type + ), f"Reduction of a {dataclass_type} dataclass should result in a dataclass" + for field in dataclasses.fields(actual): + if dataclasses.is_dataclass(field.type): + _assert_dataclass_reduction(getattr(actual, field.name), getattr(expected, field.name), "nested") + assert actual == expected, f"Reduction of a {dataclass_type} dataclass did not yield the desired result" + + _assert_dataclass_reduction(reduced["h"], expected_result["h"]) + + _assert_dataclass_reduction(reduced["i"], expected_result["i"]) + + dataclass_type = "ClassVar-containing" + _assert_dataclass_reduction(reduced["j"], expected_result["j"], dataclass_type) + assert WithClassVar.class_var == 0, f"Reduction of a {dataclass_type} dataclass should not change the class var" + + _assert_dataclass_reduction(reduced["k"], expected_result["k"], "InitVar-containing") + + dataclass_type = "Class-and-InitVar-containing" + _assert_dataclass_reduction(reduced["l"], expected_result["l"], dataclass_type) + assert torch.equal( + WithClassAndInitVar.class_var, torch.tensor(0) + ), f"Reduction of a {dataclass_type} dataclass should not change the class var" # mapping support reduced = apply_to_collection({"a": 1, "b": 2}, int, lambda x: str(x)) @@ -153,6 +225,11 @@ def __init__(self, initial_dict): reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) assert reduced == _CustomCollection({"a": "1", "b": "2", "c": "3"}) + # defaultdict + to_reduce = defaultdict(int, {"a": 1, "b": 2, "c": 3}) + reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) + assert reduced == defaultdict(int, {"a": "1", "b": "2", "c": "3"}) + def test_apply_to_collection_include_none(): to_reduce = [1, 2, 3.4, 5.6, 7, (8, 9.1, {10: 10})] diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 7a86150454777..dd2132d40c63d 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -57,7 +57,7 @@ @mock.patch("argparse.ArgumentParser.parse_args") -def 
test_default_args(mock_argparse, tmpdir): +def test_default_args(mock_argparse): """Tests default argument parser for Trainer.""" mock_argparse.return_value = Namespace(**Trainer.default_attributes()) @@ -347,9 +347,7 @@ def test_lightning_cli_args(tmpdir): with open(config_path) as f: loaded_config = yaml.safe_load(f.read()) - loaded_config = loaded_config["fit"] cli_config = cli.config["fit"] - assert cli_config["seed_everything"] == 1234 assert "model" not in loaded_config and "model" not in cli_config # no arguments to include assert loaded_config["data"] == cli_config["data"] @@ -403,9 +401,7 @@ def test_lightning_cli_config_and_subclass_mode(tmpdir): with open(config_path) as f: loaded_config = yaml.safe_load(f.read()) - loaded_config = loaded_config["fit"] cli_config = cli.config["fit"] - assert loaded_config["model"] == cli_config["model"] assert loaded_config["data"] == cli_config["data"] assert loaded_config["trainer"] == cli_config["trainer"] @@ -868,7 +864,7 @@ class CustomCallback(Callback): pass -def test_registries(tmpdir): +def test_registries(): assert "SGD" in OPTIMIZER_REGISTRY.names assert "RMSprop" in OPTIMIZER_REGISTRY.names assert "CustomAdam" in OPTIMIZER_REGISTRY.names @@ -1251,6 +1247,10 @@ def test_lightning_cli_config_before_subcommand(): test_mock.assert_called_once_with(cli.trainer, model=cli.model, verbose=True, ckpt_path="foobar") assert cli.trainer.limit_test_batches == 1 + save_config_callback = cli.trainer.callbacks[0] + assert save_config_callback.config["trainer"]["limit_test_batches"] == 1 + assert save_config_callback.parser.subcommand == "test" + with mock.patch("sys.argv", ["any.py", f"--config={config}", "validate"]), mock.patch( "pytorch_lightning.Trainer.validate", autospec=True ) as validate_mock: @@ -1358,9 +1358,27 @@ class TestCallback(Callback): assert cli.config_init["trainer"]["max_epochs"] is None -def test_cli_configure_optimizers_warning(tmpdir): +def test_cli_configure_optimizers_warning(): match = "configure_optimizers` will be overridden by `LightningCLI" with mock.patch("sys.argv", ["any.py"]), no_warning_call(UserWarning, match=match): LightningCLI(BoringModel, run=False) with mock.patch("sys.argv", ["any.py", "--optimizer=Adam"]), pytest.warns(UserWarning, match=match): LightningCLI(BoringModel, run=False) + + +def test_cli_help_message(): + # full class path + cli_args = ["any.py", "--optimizer.help=torch.optim.Adam"] + classpath_help = StringIO() + with mock.patch("sys.argv", cli_args), redirect_stdout(classpath_help), pytest.raises(SystemExit): + LightningCLI(BoringModel, run=False) + + cli_args = ["any.py", "--optimizer.help=Adam"] + shorthand_help = StringIO() + with mock.patch("sys.argv", cli_args), redirect_stdout(shorthand_help), pytest.raises(SystemExit): + LightningCLI(BoringModel, run=False) + + # the help messages should match + assert shorthand_help.getvalue() == classpath_help.getvalue() + # make sure it's not empty + assert "Implements Adam" in shorthand_help.getvalue() diff --git a/tests/utilities/test_data.py b/tests/utilities/test_data.py index acbe645515f55..f4c61cda64f5d 100644 --- a/tests/utilities/test_data.py +++ b/tests/utilities/test_data.py @@ -12,6 +12,7 @@ warning_cache, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.deprecated_api import no_warning_call from tests.helpers.boring_model import BoringModel, RandomDataset, RandomIterableDataset @@ -19,9 +20,8 @@ def test_extract_batch_size(): """Tests the behavior of extracting the batch size.""" def 
_check_warning_not_raised(data, expected): - with pytest.warns(None) as record: + with no_warning_call(match="Trying to infer the `batch_size`"): assert extract_batch_size(data) == expected - assert len(record) == 0 def _check_warning_raised(data, expected): with pytest.warns(UserWarning, match=f"Trying to infer the `batch_size` .* we found is {expected}."): @@ -43,6 +43,9 @@ def _check_warning_raised(data, expected): batch = {"test": [{"test": [torch.zeros(11, 10)]}]} _check_warning_not_raised(batch, 11) + batch = {"a": [torch.tensor(1), torch.tensor(2)], "b": torch.tensor([1, 2, 3, 4])} + _check_warning_raised(batch, 1) + batch = {"test": [{"test": [torch.zeros(11, 10), torch.zeros(10, 10)]}]} _check_warning_raised(batch, 11) diff --git a/tests/utilities/test_deepspeed_collate_checkpoint.py b/tests/utilities/test_deepspeed_collate_checkpoint.py index e85557b4e6056..0f36ada39227d 100644 --- a/tests/utilities/test_deepspeed_collate_checkpoint.py +++ b/tests/utilities/test_deepspeed_collate_checkpoint.py @@ -22,7 +22,7 @@ from tests.helpers.runif import RunIf -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_collate_checkpoint(tmpdir): """Test to ensure that with DeepSpeed Stage 3 we can collate the sharded checkpoints into a single file.""" model = BoringModel() diff --git a/tests/utilities/test_meta.py b/tests/utilities/test_meta.py index 8e36a86c3beef..1f386ac1ce0fe 100644 --- a/tests/utilities/test_meta.py +++ b/tests/utilities/test_meta.py @@ -14,7 +14,7 @@ from torch import nn from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities.meta import init_meta_context, materialize_module +from pytorch_lightning.utilities.meta import init_meta_context, is_on_meta_device, materialize_module from tests.helpers.runif import RunIf @@ -31,18 +31,23 @@ def __init__(self, num_layers: int): self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(self.hparams.num_layers)]) -@RunIf(min_torch="1.10.0") +@RunIf(standalone=True, min_torch="1.10.0") def test_init_meta_context(): with init_meta_context(): m = nn.Linear(in_features=1, out_features=1) + assert isinstance(m, nn.Linear) assert m.weight.device.type == "meta" + assert is_on_meta_device(m) mlp = MLP(4) assert mlp.layer[0].weight.device.type == "meta" mlp = materialize_module(mlp) assert mlp.layer[0].weight.device.type == "cpu" + assert not is_on_meta_device(mlp) + assert not is_on_meta_device(nn.Module()) + model = BoringModel(4) assert model.layer[0].weight.device.type == "meta" materialize_module(model) diff --git a/tests/utilities/test_warnings.py b/tests/utilities/test_warnings.py index d1222672b7595..6189562d9e190 100644 --- a/tests/utilities/test_warnings.py +++ b/tests/utilities/test_warnings.py @@ -21,8 +21,8 @@ from pytorch_lightning.utilities.warnings import _warn, rank_zero_deprecation, rank_zero_warn, WarningCache -running_special = os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") == "1" -if running_special: +standalone = os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1" +if standalone: stderr = StringIO() # recording