Lightning-AI · lexierule · Mar 30, 2021 · Mar 9, 2021 · Mar 25, 2021 · Mar 23, 2021
diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml
@@ -68,7 +68,7 @@ jobs:
     - name: Test Package [only]
       run: |
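-        # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003
+        # NOTE: run coverage on tests does not propagate failure status for Win, https://github.com/nedbat/coveragepy/issues/1003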
-        python -m pytest pytorch_lightning -v --cov=pytorch_lightning --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml
+        coverage run --source pytorch_lightning -m pytest pytorch_lightning -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml
 
     - name: Upload pytest test results
       uses: actions/upload-artifact@v2

diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml
@@ -44,7 +44,7 @@ jobs:
     - name: Tests
       run: |
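-        # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003
+        # NOTE: run coverage on tests does not propagate failure status for Win, https://github.com/nedbat/coveragepy/issues/1003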
-        python -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml
+        coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml
       shell: bash -l {0}
 
     - name: Upload pytest test results

diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml
@@ -134,7 +134,7 @@ jobs:
     - name: Tests
       run: |
         # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003
-        coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml
+        coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}.xml
 
     - name: Examples
       run: |

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,23 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
+## [1.2.6] - 2021-03-30
+
+### Changed
+
+- Changed the behavior of `on_epoch_start` to run at the beginning of each validation and test epoch ([#6498](https://github.com/PyTorchLightning/pytorch-lightning/pull/6498))
+
+### Removed
+
+- Removed legacy code to include `step` dictionary returns in `callback_metrics`. Use `self.log_dict` instead. ([#6682](https://github.com/PyTorchLightning/pytorch-lightning/pull/6682))
+
+### Fixed
+
+- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398))
+- Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654))
+- Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654))
+- Fixed a bug where gradients were disabled after calling `Trainer.predict` ([#6657](https://github.com/PyTorchLightning/pytorch-lightning/pull/6657))
+- Fixed a bug where no TPUs were detected in a TPU pod env ([#6719](https://github.com/PyTorchLightning/pytorch-lightning/pull/6719))
 
 
 ## [1.2.5] - 2021-03-23
@@ -13,7 +30,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/PyTorchLightning/pytorch-lightning/pull/6576))
 - Refactored setup for typing friendly ([#6590](https://github.com/PyTorchLightning/pytorch-lightning/pull/6590))
 
-
 ### Fixed
 
 - Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587))
@@ -36,7 +52,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed broadcast to use PyTorch `broadcast_object_list` and add `reduce_decision` ([#6410](https://github.com/PyTorchLightning/pytorch-lightning/pull/6410))
 - Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/PyTorchLightning/pytorch-lightning/pull/6380))
 - Fixed DeepSpeed additional memory use on rank 0 when default device not set early enough ([#6460](https://github.com/PyTorchLightning/pytorch-lightning/pull/6460))
-- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398))
 - Fixed an issue with `Tuner.scale_batch_size` not finding the batch size attribute in the datamodule ([#5968](https://github.com/PyTorchLightning/pytorch-lightning/pull/5968))
 - Fixed an exception in the layer summary when the model contains torch.jit scripted submodules ([#6511](https://github.com/PyTorchLightning/pytorch-lightning/pull/6511))
 - Fixed when Train loop config was run during `Trainer.predict` ([#6541](https://github.com/PyTorchLightning/pytorch-lightning/pull/6541))

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -78,7 +78,7 @@ jobs:
       displayName: 'Get legacy checkpoints'
 
     - bash: |
-        python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50
+        python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
       displayName: 'Testing: standard'
 
     - bash: |
@@ -98,9 +98,12 @@ jobs:
     - script: |
         set -e
         python -m pytest pl_examples -v --maxfail=2 --durations=0
-        python setup.py install --user --quiet
-        bash pl_examples/run_ddp-example.sh
+        pip install . --user --quiet
+        bash pl_examples/run_examples-args.sh --gpus 1 --max_epochs 1 --batch_size 64 --limit_train_batches 5 --limit_val_batches 3
+        bash pl_examples/run_ddp-examples.sh --max_epochs 1 --batch_size 32 --limit_train_batches 2 --limit_val_batches 2
         # cd pl_examples/basic_examples
         # bash submit_ddp_job.sh
         # bash submit_ddp2_job.sh
+      env:
+        PL_USE_MOCKED_MNIST: "1"
       displayName: 'Examples'
diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile
@@ -12,26 +12,69 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM nvcr.io/nvidia/pytorch:20.12-py3
+FROM nvcr.io/nvidia/cuda:11.1.1-runtime-ubuntu20.04
 
 MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>
 
 ARG LIGHTNING_VERSION=""
 
-COPY ./ ./pytorch-lightning/
+SHELL ["/bin/bash", "-c"]
+# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
+ENV \
+    DEBIAN_FRONTEND=noninteractive \
+    TZ=Europe/Prague \
+    PATH="$PATH:/root/.local/bin" \
+    CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \
+    MKL_THREADING_LAYER=GNU
+
+RUN apt-get update -qq && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        python3 \
+        python3-distutils \
+        python3-dev \
+        pkg-config \
+        cmake \
+        git \
+        wget \
+        unzip \
+        ca-certificates \
+    && \
+# Cleaning: drop unneeded packages and the apt / user caches within this
+# same RUN step (comment lines are used here instead of blank continuation lines)
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /root/.cache && \
+    rm -rf /var/lib/apt/lists/* && \
+# Setup PIP: register python3 as "python", then bootstrap pip via get-pip.py.
+# TLS verification is left enabled; ca-certificates is installed above.
+    update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
+    wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll && \
+    python get-pip.py && \
+    rm get-pip.py && \
+    pip --version
+
+COPY ./ /home/pytorch-lightning/
 
-# install dependencies
 RUN \
-    # Disable cache
-    #conda install "pip>20.1" && \
-    #pip config set global.cache-dir false && \
-    if [ -z $LIGHTNING_VERSION ] ; then \
-        pip install ./pytorch-lightning --no-cache-dir ; \
-        rm -rf pytorch-lightning ; \
-    else \
+    cd /home  && \
+    mv pytorch-lightning/notebooks . && \
+    mv pytorch-lightning/pl_examples . && \
+    # replace by specific version if asked
+    if [ ! -z "$LIGHTNING_VERSION" ] ; then \
         rm -rf pytorch-lightning ; \
-        pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --no-cache-dir ; \
-    fi
+        wget https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --progress=bar:force:noscroll ; \
+        unzip ${LIGHTNING_VERSION}.zip ; \
+        mv pytorch-lightning-*/ pytorch-lightning ; \
+        rm *.zip ; \
+    fi && \
+
+# Installations
+    python -c "fname = './pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
+    pip install -r ./pytorch-lightning/requirements/extra.txt -U --no-cache-dir && \
+    pip install -r ./pytorch-lightning/requirements/examples.txt -U --no-cache-dir && \
+    pip install ./pytorch-lightning --no-cache-dir && \
+    rm -rf pytorch-lightning
 
 RUN python --version && \
     pip --version && \

diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile
@@ -21,12 +21,14 @@ MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>
 
 ARG LIGHTNING_VERSION=""
 
-COPY ./ ./pytorch-lightning/
+COPY ./ /home/pytorch-lightning/
 
 # install dependencies
 RUN \
-    # Disable cache
-    #conda install "pip>20.1" && \
+    cd /home && \
+    mv pytorch-lightning/notebooks . && \
+    mv pytorch-lightning/pl_examples . && \
+    # replace by specific version if asked
     if [ ! -z "$LIGHTNING_VERSION" ] ; then \
         rm -rf pytorch-lightning ; \
         wget https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --progress=bar:force:noscroll ; \

diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst
@@ -1039,6 +1039,7 @@ This is the pseudocode to describe how all the hooks are called during a call to
         teardown()
 
     def train_loop():
+        on_epoch_start()
         on_train_epoch_start()
         train_outs = []
         for train_batch in train_dataloader():
@@ -1062,12 +1063,15 @@ This is the pseudocode to describe how all the hooks are called during a call to
                 val_loop()
 
         # end training epoch
-        logs = training_epoch_end(outs)
+        outs = training_epoch_end(outs)
+        on_train_epoch_end(outs)
+        on_epoch_end()
 
     def val_loop():
         model.eval()
         torch.set_grad_enabled(False)
 
+        on_epoch_start()
         on_validation_epoch_start()
         val_outs = []
         for val_batch in val_dataloader():
@@ -1081,6 +1085,7 @@ This is the pseudocode to describe how all the hooks are called during a call to
 
         validation_epoch_end(val_outs)
         on_validation_epoch_end()
+        on_epoch_end()
 
         # set up for train
         model.train()
@@ -1108,12 +1113,12 @@ manual_backward
 on_after_backward
 ~~~~~~~~~~~~~~~~~
 
-.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_after_backward
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_after_backward
     :noindex:
 
 on_before_zero_grad
 ~~~~~~~~~~~~~~~~~~~
-.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_before_zero_grad
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_before_zero_grad
     :noindex:
 
 on_fit_start
@@ -1132,15 +1137,38 @@ on_fit_end
 on_load_checkpoint
 ~~~~~~~~~~~~~~~~~~
 
-.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_load_checkpoint
+.. automethod:: pytorch_lightning.core.hooks.CheckpointHooks.on_load_checkpoint
     :noindex:
 
 on_save_checkpoint
 ~~~~~~~~~~~~~~~~~~
 
-.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_save_checkpoint
+.. automethod:: pytorch_lightning.core.hooks.CheckpointHooks.on_save_checkpoint
     :noindex:
 
+on_train_start
+~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_train_start
+    :noindex:
+
+on_train_end
+~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_train_end
+    :noindex:
+
+on_validation_start
+~~~~~~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_start
+    :noindex:
+
+on_validation_end
+~~~~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_end
+    :noindex:
 
 on_pretrain_routine_start
 ~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1178,6 +1206,11 @@ on_test_epoch_end
 .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_epoch_end
     :noindex:
 
+on_test_end
+~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_end
+    :noindex:
 
 on_train_batch_start
 ~~~~~~~~~~~~~~~~~~~~
@@ -1191,6 +1224,18 @@ on_train_batch_end
 .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_train_batch_end
     :noindex:
 
+on_epoch_start
+~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_epoch_start
+    :noindex:
+
+on_epoch_end
+~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_epoch_end
+    :noindex:
+
 on_train_epoch_start
 ~~~~~~~~~~~~~~~~~~~~
 
@@ -1227,6 +1272,36 @@ on_validation_epoch_end
 .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_epoch_end
     :noindex:
 
+on_post_move_to_device
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_post_move_to_device
+    :noindex:
+
+on_validation_model_eval
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_model_eval
+    :noindex:
+
+on_validation_model_train
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_model_train
+    :noindex:
+
+on_test_model_eval
+~~~~~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_model_eval
+    :noindex:
+
+on_test_model_train
+~~~~~~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_model_train
+    :noindex:
+
 optimizer_step
 ~~~~~~~~~~~~~~
 
@@ -1266,19 +1341,19 @@ teardown
 train_dataloader
 ~~~~~~~~~~~~~~~~
 
-.. automethod:: pytorch_lightning.core.lightning.LightningModule.train_dataloader
+.. automethod:: pytorch_lightning.core.hooks.DataHooks.train_dataloader
     :noindex:
 
 val_dataloader
 ~~~~~~~~~~~~~~
 
-.. automethod:: pytorch_lightning.core.lightning.LightningModule.val_dataloader
+.. automethod:: pytorch_lightning.core.hooks.DataHooks.val_dataloader
     :noindex:
 
 test_dataloader
 ~~~~~~~~~~~~~~~
 
-.. automethod:: pytorch_lightning.core.lightning.LightningModule.test_dataloader
+.. automethod:: pytorch_lightning.core.hooks.DataHooks.test_dataloader
     :noindex:
 
 transfer_batch_to_device

diff --git a/docs/source/ecosystem/asr_nlp_tts.rst b/docs/source/ecosystem/asr_nlp_tts.rst
@@ -270,12 +270,12 @@ with PyTorch Lightning since every NeMo model is a Lightning Module.
                 log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len
             )
             wer_num, wer_denom = self._wer(predictions, transcript, transcript_len)
-            tensorboard_logs = {
+            self.log_dict({
                 'train_loss': loss_value,
                 'training_batch_wer': wer_num / wer_denom,
                 'learning_rate': self._optimizer.param_groups[0]['lr'],
-            }
-            return {'loss': loss_value, 'log': tensorboard_logs}
+            })
+            return loss_value
 
 Neural Types in NeMo ASR
 ------------------------
@@ -539,8 +539,8 @@ since every NeMo model is a Lightning Module.
             logits = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)
 
             loss = self.loss(logits=logits, labels=labels, loss_mask=loss_mask)
-            tensorboard_logs = {'train_loss': loss, 'lr': self._optimizer.param_groups[0]['lr']}
-            return {'loss': loss, 'log': tensorboard_logs}
+            self.log_dict({'train_loss': loss, 'lr': self._optimizer.param_groups[0]['lr']})
+            return loss
         ...
 
 Neural Types in NeMo NLP