
Commit b046f18

Merge branch 'master' into multi_opt
2 parents ab4efc7 + d568533

15 files changed: +168 additions, -35 deletions

.drone.yml

Lines changed: 1 addition & 0 deletions

@@ -30,6 +30,7 @@ steps:
       MKL_THREADING_LAYER: GNU

   commands:
+    - set -e
     - python --version
     - pip --version
     - nvidia-smi

CHANGELOG.md

Lines changed: 3 additions & 2 deletions

@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


-## [1.1.3rc] - 2020-12-29
+## [1.1.3] - 2021-01-05

 ### Added

@@ -25,12 +25,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

+- Skip restore from `resume_from_checkpoint` in while `testing` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161))
+
 - Allowed `log_momentum` for adaptive optimizers in `LearningRateMonitor` ([#5333](https://github.com/PyTorchLightning/pytorch-lightning/pull/5333))

 - Disabled checkpointing, earlystopping and logger with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277))


-
 ## [1.1.2] - 2020-12-23

 ### Added

docs/source/transfer_learning.rst

Lines changed: 11 additions & 5 deletions

@@ -52,16 +52,22 @@ Example: Imagenet (computer Vision)

     class ImagenetTransferLearning(LightningModule):
         def __init__(self):
+            super().__init__()
+
             # init a pretrained resnet
-            num_target_classes = 10
-            self.feature_extractor = models.resnet50(pretrained=True)
-            self.feature_extractor.eval()
+            backbone = models.resnet50(pretrained=True)
+            num_filters = backbone.fc.in_features
+            layers = list(backbone.children())[:-1]
+            self.feature_extractor = torch.nn.Sequential(*layers)

             # use the pretrained model to classify cifar-10 (10 image classes)
-            self.classifier = nn.Linear(2048, num_target_classes)
+            num_target_classes = 10
+            self.classifier = nn.Linear(num_filters, num_target_classes)

         def forward(self, x):
-            representations = self.feature_extractor(x)
+            self.feature_extractor.eval()
+            with torch.no_grad():
+                representations = self.feature_extractor(x).flatten(1)
             x = self.classifier(representations)
             ...
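Why the new `.flatten(1)` call matters: with the final `fc` layer removed, the ResNet backbone ends in an adaptive average pool and returns a 4-D tensor, which has to be flattened before the linear classifier. A minimal shape-check sketch, assuming torchvision is installed (this snippet is illustrative and not part of the commit):

    import torch
    from torchvision import models

    backbone = models.resnet50(pretrained=False)
    feature_extractor = torch.nn.Sequential(*list(backbone.children())[:-1])

    x = torch.randn(2, 3, 224, 224)
    feats = feature_extractor(x)   # shape (2, 2048, 1, 1) after the average-pool layer
    feats = feats.flatten(1)       # shape (2, 2048), matching backbone.fc.in_features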

pytorch_lightning/callbacks/model_checkpoint.py

Lines changed: 1 addition & 0 deletions

@@ -208,6 +208,7 @@ def on_save_checkpoint(self, trainer, pl_module) -> Dict[str, Any]:
             "best_model_score": self.best_model_score,
             "best_model_path": self.best_model_path,
             "current_score": self.current_score,
+            "dirpath": self.dirpath
         }

     def on_load_checkpoint(self, checkpointed_state: Dict[str, Any]):
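With this change, the state that `ModelCheckpoint.on_save_checkpoint` returns also records the directory the callback was writing to. A rough, hedged picture of that state as a runnable snippet (paths and scores are made up; only the keys mirror the diff above):

    import torch

    # illustrative only; real values depend on the run and on the monitored metric
    checkpoint_callback_state = {
        "best_model_score": torch.tensor(0.123),
        "best_model_path": "/tmp/checkpoints/epoch=4-step=99.ckpt",
        "current_score": torch.tensor(0.154),
        "dirpath": "/tmp/checkpoints",  # newly persisted: where checkpoints were written
    }
    print(checkpoint_callback_state["dirpath"])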

pytorch_lightning/core/lightning.py

Lines changed: 14 additions & 6 deletions

@@ -14,15 +14,15 @@

 """nn.Module with additional great features."""

-from abc import ABC
-from argparse import Namespace
 import collections
 import copy
 import inspect
 import os
-from pathlib import Path
 import re
 import tempfile
+from abc import ABC
+from argparse import Namespace
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union

 import torch
@@ -1331,9 +1331,17 @@ def tbptt_split_batch(self, batch, split_size):

         return splits

-    def summarize(self, mode: str = ModelSummary.MODE_DEFAULT) -> ModelSummary:
-        model_summary = ModelSummary(self, mode=mode)
-        log.info("\n" + str(model_summary))
+    def summarize(self, mode: Optional[str] = ModelSummary.MODE_DEFAULT) -> Optional[ModelSummary]:
+        model_summary = None
+
+        if mode in ModelSummary.MODES:
+            model_summary = ModelSummary(self, mode=mode)
+            log.info("\n" + str(model_summary))
+        elif mode is not None:
+            raise MisconfigurationException(
+                f"`mode` can be None, {', '.join(ModelSummary.MODES)}, got {mode}"
+            )
+
         return model_summary

     def freeze(self) -> None:
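Based on the new signature, `summarize` now tolerates `mode=None` and validates the mode string instead of assuming it is valid. A hedged usage sketch (assumes `ModelSummary.MODES` contains the usual "top"/"full" strings; the tiny module here is only for illustration):

    import torch
    from torch import nn
    from pytorch_lightning import LightningModule
    from pytorch_lightning.utilities.exceptions import MisconfigurationException

    class TinyModel(LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(8, 2)

        def forward(self, x):
            return self.layer(x)

    model = TinyModel()
    summary = model.summarize(mode="top")      # logs and returns a ModelSummary
    assert model.summarize(mode=None) is None  # None is now accepted and simply returns None

    try:
        model.summarize(mode="oops")           # an unrecognized mode now raises
    except MisconfigurationException as err:
        print(err)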

pytorch_lightning/metrics/classification/precision_recall.py

Lines changed: 1 addition & 1 deletion

@@ -207,7 +207,7 @@ def update(self, preds: torch.Tensor, target: torch.Tensor):

     def compute(self):
         """
-        Computes accuracy over state.
+        Computes recall over state.
         """
         if self.average == 'micro':
             return self.true_positives.sum().float() / (self.actual_positives.sum() + METRIC_EPS)
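The corrected docstring matches what the micro branch actually computes: recall = true positives / actual positives, with a small epsilon for numerical stability. A tiny standalone sketch of that formula (the epsilon value is a stand-in for the library's `METRIC_EPS` constant, not taken from the commit):

    import torch

    METRIC_EPS = 1e-6  # stand-in for the library constant of the same name

    # per-class counts as accumulated in the metric state
    true_positives = torch.tensor([3., 1., 2.])
    actual_positives = torch.tensor([4., 2., 2.])

    micro_recall = true_positives.sum().float() / (actual_positives.sum() + METRIC_EPS)
    print(micro_recall)  # ~0.75: 6 of the 8 actual positives were recovered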

pytorch_lightning/plugins/rpc_plugin.py

Lines changed: 8 additions & 3 deletions

@@ -12,16 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from typing import Any, Optional
+from typing import Optional

 import torch

 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
-from pytorch_lightning.utilities import RPC_AVAILABLE
+from pytorch_lightning.utilities import _module_available, RPC_AVAILABLE

+DEFAULT_RPC_TIMEOUT_SEC = 60.
 if RPC_AVAILABLE:
     from torch.distributed import rpc
+    if _module_available("torch.distributed.rpc.constants") and hasattr(torch.distributed.rpc.constants, "DEFAULT_RPC_TIMEOUT_SEC"):
+        from torch.distributed.rpc.constants import DEFAULT_RPC_TIMEOUT_SEC


 class RPCPlugin(DDPPlugin):
@@ -33,7 +36,8 @@ class RPCPlugin(DDPPlugin):
     that need to be addressed when using RPC communication when building custom RPC Plugins.
     """

-    def __init__(self, **kwargs):
+    def __init__(self, rpc_timeout_sec: float = DEFAULT_RPC_TIMEOUT_SEC, **kwargs):
+        self.rpc_timeout_sec = rpc_timeout_sec
         self.rpc_initialized = False
         super().__init__(**kwargs)

@@ -42,6 +46,7 @@ def init_rpc_connection(self,
                             world_size: int) -> None:
         os.environ['MASTER_PORT'] = os.getenv('RPC_MASTER_PORT', '15000')
         rpc.init_rpc(f"worker{global_rank}", rank=global_rank, world_size=world_size)
+        rpc._set_rpc_timeout(self.rpc_timeout_sec)
         self.rpc_initialized = True

     def rpc_save_model(self,
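With the new constructor argument, the RPC timeout can be tuned when the plugin is created rather than being fixed to the torch default. A hedged sketch of how this might be wired into a `Trainer` (passing plugins via the `plugins` argument is assumed from the `DDPPlugin` base; real sequential-model setups typically use an `RPCPlugin` subclass):

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins.rpc_plugin import RPCPlugin

    # raise the RPC timeout from the 60 s fallback to 5 minutes
    plugin = RPCPlugin(rpc_timeout_sec=300.0)

    trainer = Trainer(gpus=2, accelerator="ddp", plugins=[plugin])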

pytorch_lightning/trainer/connectors/checkpoint_connector.py

Lines changed: 2 additions & 1 deletion

@@ -21,6 +21,7 @@

 import pytorch_lightning
 from pytorch_lightning import _logger as log
+from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, OMEGACONF_AVAILABLE, rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem
@@ -63,7 +64,7 @@ def restore_weights(self, model: LightningModule) -> None:
             rank_zero_info(f'restored hpc model from: {checkpoint_path}')

         # 2. Attempt to restore states from `resume_from_checkpoint` file
-        elif self.trainer.resume_from_checkpoint is not None:
+        elif self.trainer.resume_from_checkpoint is not None and not self.trainer.testing:
             self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer.on_gpu)

         # wait for all to catch up
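The added guard means a `resume_from_checkpoint` path is only restored when fitting; when the same `Trainer` runs in testing mode the restore step is skipped. A hedged sketch of the scenario this addresses (`model`, `train_dataloader`, and `test_dataloader` are placeholders for any LightningModule and DataLoaders, not part of the commit):

    from pytorch_lightning import Trainer

    # resume_from_checkpoint is honoured by .fit(), but after this change
    # it is no longer re-applied while the trainer is in testing mode
    trainer = Trainer(resume_from_checkpoint="last.ckpt", max_epochs=10)
    trainer.fit(model, train_dataloader)                     # restores training state from last.ckpt
    trainer.test(model, test_dataloaders=test_dataloader)    # skips the resume path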

pytorch_lightning/trainer/trainer.py

Lines changed: 2 additions & 2 deletions

@@ -311,7 +311,6 @@ def __init__(
         self.plugin_connector = PluginConnector(self)

         # training state
-        self.weights_summary = weights_summary
         self.model = None
         self.shown_warnings = set()

@@ -374,7 +373,8 @@ def __init__(
             max_steps,
             min_steps,
             num_sanity_val_steps,
-            automatic_optimization
+            automatic_optimization,
+            weights_summary,
         )
         self.evaluation_loop.on_trainer_init()

pytorch_lightning/trainer/training_loop.py

Lines changed: 17 additions & 7 deletions

@@ -49,7 +49,14 @@ def __init__(self, trainer):
         self._cur_grad_norm_dict = None

     def on_trainer_init(
-        self, max_epochs, min_epochs, max_steps, min_steps, num_sanity_val_steps, automatic_optimization
+        self,
+        max_epochs,
+        min_epochs,
+        max_steps,
+        min_steps,
+        num_sanity_val_steps,
+        automatic_optimization,
+        weights_summary,
     ):
         self.trainer.global_step = 0
         self.trainer.current_epoch = 0
@@ -73,6 +80,12 @@ def on_trainer_init(
         else:
             self.trainer.num_sanity_val_steps = num_sanity_val_steps

+        self.trainer.weights_summary = weights_summary
+        if weights_summary is not None and weights_summary not in ModelSummary.MODES:
+            raise MisconfigurationException(
+                f"`weights_summary` can be None, {', '.join(ModelSummary.MODES)}, got {weights_summary}"
+            )
+
     @property
     def num_optimizers(self):
         num_optimizers = len(self.get_optimizers_iterable())
@@ -161,17 +174,14 @@ def setup_training(self, model: LightningModule):
         ref_model.on_pretrain_routine_start()

         # print model summary
-        if self.trainer.is_global_zero and self.trainer.weights_summary is not None and not self.trainer.testing:
-            if self.trainer.weights_summary in ModelSummary.MODES:
-                ref_model.summarize(mode=self.trainer.weights_summary)
-            else:
-                raise MisconfigurationException("weights_summary can be None, " + ", ".join(ModelSummary.MODES))
+        if self.trainer.is_global_zero and not self.trainer.testing:
+            ref_model.summarize(mode=self.trainer.weights_summary)

         # track model now.
         # if cluster resets state, the model will update with the saved weights
         self.trainer.model = model

-        # restore training and model before hpc is called
+        # restore training state and model weights before hpc is called
         self.trainer.checkpoint_connector.restore_weights(model)

         # on pretrain routine end
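Together with the `trainer.py` change above, the `weights_summary` value is now validated once at `Trainer` construction instead of at the start of training, and the summary call itself relies on `summarize` handling `None`. A hedged usage sketch (assuming `ModelSummary.MODES` is the usual "full"/"top" pair):

    from pytorch_lightning import Trainer
    from pytorch_lightning.utilities.exceptions import MisconfigurationException

    Trainer(weights_summary="top")     # fine: summary of top-level modules
    Trainer(weights_summary=None)      # fine: no summary is printed
    try:
        Trainer(weights_summary="typo")  # now rejected at construction time
    except MisconfigurationException as err:
        print(err)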
