
Commit fdcecb9: Merge branch 'master' into bugfix/dummy-logger
2 parents 5d6937a + a6c98c4

File tree: 13 files changed (+181, -79 lines)

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
@@ -89,9 +89,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Do not print top-k verbose log with `ModelCheckpoint(monitor=None)` ([#6109](https://github.com/PyTorchLightning/pytorch-lightning/pull/6109))
 
 
+- Fixed `ModelCheckpoint(monitor=None, save_last=True)` not saving checkpoints ([#6136](https://github.com/PyTorchLightning/pytorch-lightning/pull/6136))
+
+
+- Fixed `ModelCheckpoint(save_top_k=0, save_last=True)` not saving the `last` checkpoint ([#6136](https://github.com/PyTorchLightning/pytorch-lightning/pull/6136))
+
+
 - Expose DeepSpeed loss parameters to allow users to fix loss instability ([#6115](https://github.com/PyTorchLightning/pytorch-lightning/pull/6115))
 
 
+- Fixed `AttributeError` when `logger=None` on TPU ([#6221](https://github.com/PyTorchLightning/pytorch-lightning/pull/6221))
+
+
 - Fixed `ModelPruning(make_pruning_permanent=True)` pruning buffers getting removed when saved during training ([#6073](https://github.com/PyTorchLightning/pytorch-lightning/pull/6073))
 
 
@@ -110,6 +119,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed PyTorch Profiler with `emit_nvtx` ([#6260](https://github.com/PyTorchLightning/pytorch-lightning/pull/6260))
 
 
+- Fixed `Trainer` not resetting `lightning_optimizers` when calling `Trainer.fit()` multiple times ([#6372](https://github.com/PyTorchLightning/pytorch-lightning/pull/6372))
+
+
 - Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398))
 
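For reference, a minimal sketch of the `ModelCheckpoint` configuration the two #6136 entries describe; the callback arguments are real options from this commit, while the surrounding `Trainer` settings are arbitrary placeholders:

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint

    # previously this combination saved nothing at all; it now writes only the `last.ckpt` file
    last_only = ModelCheckpoint(monitor=None, save_top_k=0, save_last=True)

    trainer = Trainer(callbacks=[last_only], max_epochs=3)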

docs/source/extensions/logging.rst

Lines changed: 19 additions & 0 deletions
@@ -294,6 +294,25 @@ Some loggers also allow logging the hyperparams used in the experiment. For inst
 when using the TestTubeLogger or the TensorBoardLogger, all hyperparams will show
 in the `hparams tab <https://pytorch.org/docs/stable/tensorboard.html#torch.utils.tensorboard.writer.SummaryWriter.add_hparams>`_.
 
+.. note::
+    If you want to track a metric in the tensorboard hparams tab, log scalars to the key ``hp_metric``. If tracking multiple metrics, initialize ``TensorBoardLogger`` with ``default_hp_metric=False`` and call ``log_hyperparams`` only once with your metric keys and initial values. Subsequent updates can simply be logged to the metric keys. Refer to the following for examples on how to setup proper hyperparams metrics tracking within :doc:`LightningModule <../common/lightning_module>`.
+
+    .. code-block:: python
+
+        # Using default_hp_metric
+        def validation_step(self, batch, batch_idx):
+            self.log("hp_metric", some_scalar)
+
+        # Using custom or multiple metrics (default_hp_metric=False)
+        def on_train_start(self):
+            self.logger.log_hyperparams(self.hparams, {"hp/metric_1": 0, "hp/metric_2": 0})
+
+        def validation_step(self, batch, batch_idx):
+            self.log("hp/metric_1", some_scalar_1)
+            self.log("hp/metric_2", some_scalar_2)
+
+    In the example, using `hp/` as a prefix allows for the metrics to be grouped under "hp" in the tensorboard scalar tab where you can collapse them.
+
 ----------
 
 *************

notebooks/06-mnist-tpu-training.ipynb

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@
     "id": "AYGWh10lRaF1"
    },
    "source": [
-    "! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp36-cp36m-linux_x86_64.whl"
+    "! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp37-cp37m-linux_x86_64.whl"
    ],
    "execution_count": null,
    "outputs": []

pytorch_lightning/callbacks/model_checkpoint.py

Lines changed: 81 additions & 55 deletions
@@ -189,7 +189,7 @@ def on_validation_end(self, trainer, pl_module):
         """
         checkpoints can be saved at the end of the val loop
         """
-        self.save_checkpoint(trainer, pl_module)
+        self.save_checkpoint(trainer)
 
     def on_save_checkpoint(self, trainer, pl_module, checkpoint: Dict[str, Any]) -> Dict[str, Any]:
         return {
@@ -204,12 +204,18 @@ def on_load_checkpoint(self, callback_state: Dict[str, Any]):
         self.best_model_score = callback_state["best_model_score"]
         self.best_model_path = callback_state["best_model_path"]
 
-    def save_checkpoint(self, trainer, pl_module):
+    def save_checkpoint(self, trainer, unused: Optional = None):
         """
         Performs the main logic around saving a checkpoint.
         This method runs on all ranks, it is the responsibility of `self.save_function`
         to handle correct behaviour in distributed training, i.e., saving only on rank 0.
         """
+        if unused is not None:
+            rank_zero_warn(
+                "`ModelCheckpoint.save_checkpoint` signature has changed in v1.3. The `pl_module` parameter"
+                " has been removed. Support for the old signature will be removed in v1.5", DeprecationWarning
+            )
+
         epoch = trainer.current_epoch
         global_step = trainer.global_step
 
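The effect of this shim is that the old two-argument call keeps working but emits a `DeprecationWarning` until the removal in v1.5. A rough sketch of both call styles, assuming an already configured callback, trainer and module:

    checkpoint_cb = ModelCheckpoint(dirpath="checkpoints/")

    checkpoint_cb.save_checkpoint(trainer)             # new signature
    checkpoint_cb.save_checkpoint(trainer, pl_module)  # old signature: still saves, but warns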
@@ -218,7 +224,6 @@ def save_checkpoint(self, trainer, pl_module):
             trainer.fast_dev_run  # disable checkpointing with fast_dev_run
             or trainer.state != TrainerState.FITTING  # don't save anything during non-fit
             or trainer.sanity_checking  # don't save anything during sanity check
-            or self.save_top_k == 0  # no models are saved
             or self.period < 1  # no models are saved
             or (epoch + 1) % self.period  # skip epoch
             or self._last_global_step_saved == global_step  # already saved at the last step
@@ -236,28 +241,33 @@ def save_checkpoint(self, trainer, pl_module):
 
         # callback supports multiple simultaneous modes
         # here we call each mode sequentially
-        # Mode 1: save all checkpoints OR only the top k
-        if self.save_top_k:
-            self._save_top_k_checkpoints(trainer, pl_module, monitor_candidates)
-
-        # Mode 2: save the last checkpoint
+        # Mode 1: save the top k checkpoints
+        self._save_top_k_checkpoint(trainer, monitor_candidates)
+        # Mode 2: save monitor=None checkpoints
+        self._save_none_monitor_checkpoint(trainer, monitor_candidates)
+        # Mode 3: save last checkpoints
         self._save_last_checkpoint(trainer, monitor_candidates)
 
     def __validate_init_configuration(self):
         if self.save_top_k is not None and self.save_top_k < -1:
             raise MisconfigurationException(f'Invalid value for save_top_k={self.save_top_k}. Must be None or >= -1')
         if self.monitor is None:
             # None: save last epoch, -1: save all epochs, 0: nothing is saved
-            if self.save_top_k not in [None, -1, 0]:
+            if self.save_top_k not in (None, -1, 0):
                 raise MisconfigurationException(
                     f'ModelCheckpoint(save_top_k={self.save_top_k}, monitor=None) is not a valid'
                     ' configuration. No quantity for top_k to track.'
                 )
             if self.save_last:
                 rank_zero_warn(
-                    'ModelCheckpoint(save_last=True, monitor=None) is a redundant configuration.'
+                    'ModelCheckpoint(save_last=True, save_top_k=None, monitor=None) is a redundant configuration.'
                     ' You can save the last checkpoint with ModelCheckpoint(save_top_k=None, monitor=None).'
                 )
+            if self.save_top_k == -1 and self.save_last:
+                rank_zero_info(
+                    'ModelCheckpoint(save_last=True, save_top_k=-1, monitor=None)'
+                    ' will duplicate the last checkpoint saved.'
+                )
 
     def __init_ckpt_dir(self, dirpath, filename, save_top_k):
 
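With the dispatch split into three helpers, each mode is gated only by its own guard (see `_save_top_k_checkpoint`, `_save_none_monitor_checkpoint` and `_save_last_checkpoint` below). A sketch of which configuration reaches which helper, for illustration only:

    from pytorch_lightning.callbacks import ModelCheckpoint

    # Mode 1: monitor set and save_top_k != 0 -> _save_top_k_checkpoint
    ModelCheckpoint(monitor="val_loss", save_top_k=3)

    # Mode 2: monitor=None and save_top_k != 0 -> _save_none_monitor_checkpoint
    ModelCheckpoint(monitor=None)

    # Mode 3: save_last=True additionally writes `last.ckpt` -> _save_last_checkpoint
    ModelCheckpoint(monitor="val_loss", save_top_k=3, save_last=True)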
@@ -293,7 +303,16 @@ def _del_model(self, filepath: str):
             self._fs.rm(filepath)
             log.debug(f"Removed checkpoint: {filepath}")
 
-    def _save_model(self, filepath: str, trainer):
+    def _save_model(self, trainer, filepath: str):
+        if trainer.training_type_plugin.rpc_enabled:
+            # RPCPlugin manages saving all model states
+            # TODO: the rpc plugin should wrap trainer.save_checkpoint
+            # instead of us having to do it here manually
+            trainer.training_type_plugin.rpc_save_model(trainer, self._do_save, filepath)
+        else:
+            self._do_save(trainer, filepath)
+
+    def _do_save(self, trainer, filepath: str):
         # in debugging, track when we save checkpoints
         trainer.dev_debugger.track_checkpointing_history(filepath)
 
@@ -307,7 +326,7 @@ def _save_model(self, filepath: str, trainer):
         else:
             raise ValueError(".save_function() not set")
 
-    def check_monitor_top_k(self, current) -> bool:
+    def check_monitor_top_k(self, current: torch.Tensor) -> bool:
         if current is None:
             return False
 
@@ -462,17 +481,17 @@ def _validate_monitor_key(self, trainer):
 
     def _get_metric_interpolated_filepath_name(
         self,
-        ckpt_name_metrics: Dict[str, Any],
+        monitor_candidates: Dict[str, Any],
         epoch: int,
         step: int,
         trainer,
         del_filepath: Optional[str] = None,
     ) -> str:
-        filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics)
+        filepath = self.format_checkpoint_name(epoch, step, monitor_candidates)
 
         version_cnt = self.STARTING_VERSION
         while self.file_exists(filepath, trainer) and filepath != del_filepath:
-            filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt)
+            filepath = self.format_checkpoint_name(epoch, step, monitor_candidates, ver=version_cnt)
             version_cnt += 1
 
         return filepath
@@ -482,47 +501,32 @@ def _monitor_candidates(self, trainer):
         monitor_candidates.update(step=trainer.global_step, epoch=trainer.current_epoch)
         return monitor_candidates
 
-    def _save_last_checkpoint(self, trainer, ckpt_name_metrics):
-        should_save_last = self.monitor is None or self.save_last
-        if not should_save_last:
+    def _save_last_checkpoint(self, trainer, monitor_candidates: Dict[str, Any]):
+        if not self.save_last:
             return
 
-        # when user ALSO asked for the 'last.ckpt' change the name
-        if self.save_last:
-            last_filepath = self._format_checkpoint_name(
-                self.CHECKPOINT_NAME_LAST,
-                trainer.current_epoch,
-                trainer.global_step,
-                ckpt_name_metrics,
-            )
-            last_filepath = os.path.join(self.dirpath, f"{last_filepath}{self.FILE_EXTENSION}")
-        else:
-            last_filepath = self._get_metric_interpolated_filepath_name(
-                ckpt_name_metrics,
-                trainer.current_epoch,
-                trainer.global_step,
-                trainer,
-            )
+        filepath = self._format_checkpoint_name(
+            self.CHECKPOINT_NAME_LAST,
+            trainer.current_epoch,
+            trainer.global_step,
+            monitor_candidates,
+        )
+        filepath = os.path.join(self.dirpath, f"{filepath}{self.FILE_EXTENSION}")
 
-        if trainer.training_type_plugin.rpc_enabled:
-            # RPCPlugin manages saving all model states
-            trainer.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer)
-        else:
-            self._save_model(last_filepath, trainer)
-        if (
-            self.last_model_path and self.last_model_path != last_filepath
-            and (self.save_top_k != -1 or self.save_last) and trainer.is_global_zero
-        ):
+        self._save_model(trainer, filepath)
+
+        if self.last_model_path and self.last_model_path != filepath and trainer.is_global_zero:
             self._del_model(self.last_model_path)
-        self.last_model_path = last_filepath
 
-        if self.monitor is None:
-            self.best_model_path = self.last_model_path
+        self.last_model_path = filepath
+
+    def _save_top_k_checkpoint(self, trainer, monitor_candidates: Dict[str, Any]):
+        if self.monitor is None or self.save_top_k == 0:
+            return
 
-    def _save_top_k_checkpoints(self, trainer, pl_module, metrics):
-        current = metrics.get(self.monitor)
-        epoch = metrics.get("epoch")
-        step = metrics.get("step")
+        current = monitor_candidates.get(self.monitor)
+        epoch = monitor_candidates.get("epoch")
+        step = monitor_candidates.get("step")
 
         # when `val_loss` is being logged and no ModelCheckpoint is being provided
         # `val_loss` will be selected for monitor and need to be reduced to
@@ -533,15 +537,37 @@ def _save_top_k_checkpoints(self, trainer, pl_module, metrics):
             current = trainer.training_type_plugin.reduce(current, reduce_op="mean")
 
         if self.check_monitor_top_k(current):
-            self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics)
-        elif self.monitor is not None and self.verbose:
+            self._update_best_and_save(current, epoch, step, trainer, monitor_candidates)
+        elif self.verbose:
             rank_zero_info(f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}")
 
+    def _save_none_monitor_checkpoint(self, trainer, monitor_candidates: Dict[str, Any]):
+        if self.monitor is not None or self.save_top_k == 0:
+            return
+
+        filepath = self._get_metric_interpolated_filepath_name(
+            monitor_candidates,
+            trainer.current_epoch,
+            trainer.global_step,
+            trainer,
+        )
+        self._save_model(trainer, filepath)
+
+        if (
+            self.save_top_k is None
+            and self.best_model_path
+            and self.best_model_path != filepath
+            and trainer.is_global_zero
+        ):
+            self._del_model(self.best_model_path)
+
+        self.best_model_path = filepath
+
     def _is_valid_monitor_key(self, metrics):
         return self.monitor in metrics or len(metrics) == 0
 
     def _update_best_and_save(
-        self, current: torch.Tensor, epoch: int, step: int, trainer, pl_module, ckpt_name_metrics
+        self, current: torch.Tensor, epoch: int, step: int, trainer, monitor_candidates: Dict[str, Any]
     ):
         k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k
 
@@ -554,7 +580,7 @@ def _update_best_and_save(
         if isinstance(current, torch.Tensor) and torch.isnan(current):
             current = torch.tensor(float('inf' if self.mode == "min" else '-inf'))
 
-        filepath = self._get_metric_interpolated_filepath_name(ckpt_name_metrics, epoch, step, trainer, del_filepath)
+        filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, epoch, step, trainer, del_filepath)
 
         # save the current score
         self.current_score = current
@@ -575,7 +601,7 @@ def _update_best_and_save(
                 f"Epoch {epoch:d}, global step {step:d}: {self.monitor} reached {current:0.5f}"
                 f' (best {self.best_model_score:0.5f}), saving model to "{filepath}" as top {k}'
             )
-        self._save_model(filepath, trainer)
+        self._save_model(trainer, filepath)
 
         if del_filepath is not None and filepath != del_filepath:
             self._del_model(del_filepath)

pytorch_lightning/loggers/tensorboard.py

Lines changed: 7 additions & 4 deletions
@@ -46,10 +46,13 @@ class TensorBoardLogger(LightningLoggerBase):
     preinstalled.
 
     Example:
-        >>> from pytorch_lightning import Trainer
-        >>> from pytorch_lightning.loggers import TensorBoardLogger
-        >>> logger = TensorBoardLogger("tb_logs", name="my_model")
-        >>> trainer = Trainer(logger=logger)
+
+    .. testcode::
+
+        from pytorch_lightning import Trainer
+        from pytorch_lightning.loggers import TensorBoardLogger
+        logger = TensorBoardLogger("tb_logs", name="my_model")
+        trainer = Trainer(logger=logger)
 
     Args:
         save_dir: Save directory
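The docstring example above is converted from a doctest into a `.. testcode::` block. Tying it to the logging docs change earlier in this commit, a short sketch of the `default_hp_metric=False` setup that the new note recommends; the directory and experiment names are placeholders:

    from pytorch_lightning import Trainer
    from pytorch_lightning.loggers import TensorBoardLogger

    # disable the automatic hp_metric so multiple hparams metrics can be registered once
    logger = TensorBoardLogger("tb_logs", name="my_model", default_hp_metric=False)
    trainer = Trainer(logger=logger)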

pytorch_lightning/plugins/training_type/rpc.py

Lines changed: 4 additions & 4 deletions
@@ -13,7 +13,7 @@
 # limitations under the License.
 import os
 from contextlib import suppress
-from typing import List, Optional
+from typing import List, Optional, Callable
 
 import torch
 
@@ -63,15 +63,15 @@ def init_rpc_connection(self, global_rank: int, world_size: int) -> None:
         rpc._set_rpc_timeout(self.rpc_timeout_sec)
         self._is_rpc_initialized = True
 
-    def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None:
+    def rpc_save_model(self, trainer, save_model_fn: Callable, filepath: str) -> None:
         """
         Override to save model to disk.
         This is required as the main process will be required to handle aggregating model states from RPC processes.
 
         Args:
-            save_model_fn: The saving function to save final model.
-            last_filepath: The filepath to save the model to.
             trainer: The trainer object.
+            save_model_fn: The saving function to save final model.
+            filepath: The filepath to save the model to.
         """
         raise NotImplementedError
 
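Plugin authors overriding this hook need to switch the argument order from `(save_model_fn, last_filepath, trainer)` to `(trainer, save_model_fn, filepath)`. A minimal sketch of an override under the new signature; the subclass and its body are hypothetical, only the hook signature comes from this diff:

    from typing import Callable

    from pytorch_lightning.plugins.training_type.rpc import RPCPlugin

    class MyRPCPlugin(RPCPlugin):  # hypothetical subclass
        def rpc_save_model(self, trainer, save_model_fn: Callable, filepath: str) -> None:
            # aggregate worker state here if needed, then delegate to the checkpoint callback
            save_model_fn(trainer, filepath)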

pytorch_lightning/plugins/training_type/rpc_sequential.py

Lines changed: 3 additions & 3 deletions
@@ -13,7 +13,7 @@
 # limitations under the License
 import logging
 import os
-from typing import List, Optional
+from typing import List, Optional, Callable
 
 import torch
 import torch.distributed as torch_distrib
@@ -266,7 +266,7 @@ def configure_ddp(self):
         self._model.require_backward_grad_sync = False
 
     @rank_zero_only
-    def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None:
+    def rpc_save_model(self, trainer, save_model_fn: Callable, filepath: str) -> None:
         model = self.lightning_module
         if not hasattr(model.sequential_module, "foreach_worker"):
             return
@@ -275,7 +275,7 @@ def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None:
             save_layers_on_all_rank_zero_workers, {"gpus_per_model": self.gpus_per_model}, include_self=True
         )
         model.sequential_module = load_sequential_from_saved_layers(self.gpus_per_model)
-        save_model_fn(last_filepath, trainer)
+        save_model_fn(trainer, filepath)
         model.sequential_module = current_layers
 
     def worker_optimizer_step(self, model: LightningModule, opt_idx: int, *args, **kwargs) -> None:

pytorch_lightning/plugins/training_type/tpu_spawn.py

Lines changed: 1 addition & 1 deletion
@@ -262,7 +262,7 @@ def __load_weights_on_main_process(self) -> None:
         self._model = model
 
     def _close_logger(self, trainer) -> None:
-        if hasattr(trainer, "logger"):
+        if trainer.logger is not None:
             trainer.logger.finalize("success")
 
     @property
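The old `hasattr` guard always passed, because `Trainer` defines a `logger` attribute even when no logger is configured (it is then `None`), so the subsequent `finalize` call raised the `AttributeError` noted in the CHANGELOG. A tiny sketch of the difference, using a stand-in object rather than a real `Trainer`:

    class FakeTrainer:  # stand-in, not the real Trainer
        logger = None

    t = FakeTrainer()
    hasattr(t, "logger")   # True, so the old guard went on to call t.logger.finalize(...)
    t.logger is not None   # False, so the new guard skips finalize entirely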
