
Commit 4654833

Add support for logging the model checkpoints to MLFlowLogger (#15246)

Authored by: AlessioQuercia
Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: Jirka <[email protected]>
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent 6714ca7 commit 4654833

File tree

5 files changed: +132 -15 lines

src/pytorch_lightning/CHANGELOG.md
Lines changed: 6 additions & 2 deletions

@@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Add an axes argument `ax` to the `.lr_find().plot()` to enable writing to a user-defined axes in a matplotlib figure ([#15652](https://github.com/Lightning-AI/lightning/pull/15652))


+- Added `log_model` parameter to `MLFlowLogger` ([#9187](https://github.com/PyTorchLightning/pytorch-lightning/pull/9187))
+
+
 - Added a check to validate that wrapped FSDP models are used while initializing optimizers ([#15301](https://github.com/Lightning-AI/lightning/pull/15301))


@@ -56,6 +59,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Enhanced `reduce_boolean_decision` to accommodate `any`-analogous semantics expected by the `EarlyStopping` callback ([#15253](https://github.com/Lightning-AI/lightning/pull/15253))


+- Fixed the automatic fallback from `Trainer(strategy="ddp_spawn", ...)` to `Trainer(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
+
+
 -

 ## [1.8.1] - 2022-11-10
@@ -80,8 +86,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed manual optimization raising `AttributeError` with Bagua Strategy ([#12534](https://github.com/PyTorchLightning/pytorch-lightning/issues/12534))
 - Fixed the import of `pytorch_lightning` causing a warning 'Redirects are currently not supported in Windows or MacOs' ([#15610](https://github.com/PyTorchLightning/pytorch-lightning/issues/15610))

-- Fixed the automatic fallback from `Trainer(strategy="ddp_spawn", ...)` to `Trainer(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
-

 ## [1.8.0] - 2022-11-01

src/pytorch_lightning/loggers/logger.py
Lines changed: 2 additions & 2 deletions

@@ -26,7 +26,7 @@
 from torch import Tensor

 import pytorch_lightning as pl
-from pytorch_lightning.callbacks import Checkpoint
+from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
 from pytorch_lightning.utilities.rank_zero import rank_zero_only


@@ -58,7 +58,7 @@ def get_experiment() -> Callable:
 class Logger(ABC):
     """Base class for experiment loggers."""

-    def after_save_checkpoint(self, checkpoint_callback: Checkpoint) -> None:
+    def after_save_checkpoint(self, checkpoint_callback: ModelCheckpoint) -> None:
         """Called after model checkpoint callback saves a new checkpoint.

         Args:

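For context, `after_save_checkpoint` is the hook a logger overrides to be notified of newly saved checkpoints; this commit narrows its annotation from the bare `Checkpoint` base class to `ModelCheckpoint`, whose attributes (e.g. `best_model_path`, `save_top_k`) the loggers rely on. A minimal sketch of a custom logger using the hook — the `PrintingLogger` class and its body are illustrative, not part of this commit:

```python
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.loggers.logger import Logger


class PrintingLogger(Logger):
    """Illustrative logger that only reports newly saved checkpoints."""

    @property
    def name(self):
        return "printing"

    @property
    def version(self):
        return "0"

    def log_hyperparams(self, params):
        pass

    def log_metrics(self, metrics, step=None):
        pass

    def after_save_checkpoint(self, checkpoint_callback: ModelCheckpoint) -> None:
        # `best_model_path` exists on ModelCheckpoint, which is why the
        # hook's annotation was narrowed in this commit.
        print(f"checkpoint saved, current best: {checkpoint_callback.best_model_path}")
```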
src/pytorch_lightning/loggers/mlflow.py
Lines changed: 81 additions & 1 deletion

@@ -18,14 +18,20 @@
 import logging
 import os
 import re
+import tempfile
 from argparse import Namespace
+from pathlib import Path
 from time import time
 from typing import Any, Dict, Mapping, Optional, Union

+import torch
+import yaml
 from lightning_utilities.core.imports import module_available
+from typing_extensions import Literal

+from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
 from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment
-from pytorch_lightning.utilities.logger import _add_prefix, _convert_params, _flatten_dict
+from pytorch_lightning.utilities.logger import _add_prefix, _convert_params, _flatten_dict, _scan_checkpoints
 from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn

 log = logging.getLogger(__name__)
@@ -108,6 +114,15 @@ def any_lightning_module_function_or_hook(self):
         save_dir: A path to a local directory where the MLflow runs get saved.
             Defaults to `./mlflow` if `tracking_uri` is not provided.
             Has no effect if `tracking_uri` is provided.
+        log_model: Log checkpoints created by :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint`
+            as MLFlow artifacts.
+
+            * if ``log_model == 'all'``, checkpoints are logged during training.
+            * if ``log_model == True``, checkpoints are logged at the end of training, except when
+              :paramref:`~pytorch_lightning.callbacks.Checkpoint.save_top_k` ``== -1``,
+              which also logs every checkpoint during training.
+            * if ``log_model == False`` (default), no checkpoint is logged.
+
         prefix: A string to put at the beginning of metric keys.
         artifact_location: The location to store run artifacts. If not provided, the server picks an appropriate
             default.
@@ -127,6 +142,7 @@ def __init__(
         tracking_uri: Optional[str] = os.getenv("MLFLOW_TRACKING_URI"),
         tags: Optional[Dict[str, Any]] = None,
         save_dir: Optional[str] = "./mlruns",
+        log_model: Literal[True, False, "all"] = False,
         prefix: str = "",
         artifact_location: Optional[str] = None,
         run_id: Optional[str] = None,
@@ -145,6 +161,9 @@ def __init__(
         self._run_name = run_name
         self._run_id = run_id
         self.tags = tags
+        self._log_model = log_model
+        self._logged_model_time: Dict[str, float] = {}
+        self._checkpoint_callback: Optional[ModelCheckpoint] = None
         self._prefix = prefix
         self._artifact_location = artifact_location

@@ -261,6 +280,11 @@ def finalize(self, status: str = "success") -> None:
             status = "FINISHED"
         elif status == "failed":
             status = "FAILED"
+
+        # log checkpoints as artifacts
+        if self._checkpoint_callback:
+            self._scan_and_log_checkpoints(self._checkpoint_callback)
+
         if self.experiment.get_run(self.run_id):
             self.experiment.set_terminated(self.run_id, status)

@@ -292,3 +316,59 @@ def version(self) -> Optional[str]:
             The run id.
         """
         return self.run_id
+
+    def after_save_checkpoint(self, checkpoint_callback: ModelCheckpoint) -> None:
+        # log checkpoints as artifacts
+        if self._log_model == "all" or self._log_model is True and checkpoint_callback.save_top_k == -1:
+            self._scan_and_log_checkpoints(checkpoint_callback)
+        elif self._log_model is True:
+            self._checkpoint_callback = checkpoint_callback
+
+    def _scan_and_log_checkpoints(self, checkpoint_callback: ModelCheckpoint) -> None:
+        # get checkpoints to be saved with associated score
+        checkpoints = _scan_checkpoints(checkpoint_callback, self._logged_model_time)
+
+        # log iteratively all new checkpoints
+        for t, p, s, tag in checkpoints:
+            metadata = {
+                # Ensure .item() is called to store Tensor contents
+                "score": s.item() if isinstance(s, torch.Tensor) else s,
+                "original_filename": Path(p).name,
+                "Checkpoint": {
+                    k: getattr(checkpoint_callback, k)
+                    for k in [
+                        "monitor",
+                        "mode",
+                        "save_last",
+                        "save_top_k",
+                        "save_weights_only",
+                        "_every_n_train_steps",
+                        "_every_n_val_epochs",
+                    ]
+                    # ensure it does not break if `Checkpoint` args change
+                    if hasattr(checkpoint_callback, k)
+                },
+            }
+            aliases = ["latest", "best"] if p == checkpoint_callback.best_model_path else ["latest"]
+
+            # Artifact path on mlflow
+            artifact_path = f"model/checkpoints/{Path(p).stem}"
+
+            # Log the checkpoint
+            self.experiment.log_artifact(self._run_id, p, artifact_path)
+
+            # Create a temporary directory to log on mlflow
+            with tempfile.TemporaryDirectory(prefix="test", suffix="test", dir=os.getcwd()) as tmp_dir:
+                # Log the metadata
+                with open(f"{tmp_dir}/metadata.yaml", "w") as tmp_file_metadata:
+                    yaml.dump(metadata, tmp_file_metadata, default_flow_style=False)
+
+                # Log the aliases
+                with open(f"{tmp_dir}/aliases.txt", "w") as tmp_file_aliases:
+                    tmp_file_aliases.write(str(aliases))
+
+                # Log the metadata and aliases
+                self.experiment.log_artifacts(self._run_id, tmp_dir, artifact_path)
+
+            # remember logged models - timestamp needed in case filename didn't change (last.ckpt or custom name)
+            self._logged_model_time[p] = t

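A quick usage sketch of the new parameter (assumptions beyond this diff: the `BoringModel` demo module and a local `./mlruns` store; any `LightningModule` would do):

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.demos.boring_classes import BoringModel
from pytorch_lightning.loggers import MLFlowLogger

# "all" uploads every checkpoint as it is saved; True uploads only at the
# end of training (unless save_top_k == -1); False (default) uploads nothing.
mlf_logger = MLFlowLogger(experiment_name="ckpt-demo", save_dir="./mlruns", log_model="all")

trainer = Trainer(
    logger=mlf_logger,
    callbacks=[ModelCheckpoint(save_top_k=-1)],  # keep a checkpoint per epoch
    max_epochs=2,
)
trainer.fit(BoringModel())
# Each checkpoint ends up under the run's artifacts at
# model/checkpoints/<filename-stem>/, next to metadata.yaml and aliases.txt.
```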
src/pytorch_lightning/loggers/wandb.py
Lines changed: 5 additions & 10 deletions

@@ -25,7 +25,7 @@
 from torch import Tensor

 from lightning_lite.utilities.types import _PATH
-from pytorch_lightning.callbacks import Checkpoint
+from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
 from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.logger import (
@@ -331,7 +331,7 @@ def __init__(
         self._prefix = prefix
         self._experiment = experiment
         self._logged_model_time: Dict[str, float] = {}
-        self._checkpoint_callback: Optional[Checkpoint] = None
+        self._checkpoint_callback: Optional[ModelCheckpoint] = None

         # paths are processed as strings
         if save_dir is not None:
@@ -513,14 +513,9 @@ def version(self) -> Optional[str]:
         # don't create an experiment if we don't have one
         return self._experiment.id if self._experiment else self._id

-    def after_save_checkpoint(self, checkpoint_callback: Checkpoint) -> None:
+    def after_save_checkpoint(self, checkpoint_callback: ModelCheckpoint) -> None:
         # log checkpoints as artifacts
-        if (
-            self._log_model == "all"
-            or self._log_model is True
-            and hasattr(checkpoint_callback, "save_top_k")
-            and checkpoint_callback.save_top_k == -1
-        ):
+        if self._log_model == "all" or self._log_model is True and checkpoint_callback.save_top_k == -1:
             self._scan_and_log_checkpoints(checkpoint_callback)
         elif self._log_model is True:
             self._checkpoint_callback = checkpoint_callback
@@ -574,7 +569,7 @@ def finalize(self, status: str) -> None:
         if self._checkpoint_callback and self._experiment is not None:
             self._scan_and_log_checkpoints(self._checkpoint_callback)

-    def _scan_and_log_checkpoints(self, checkpoint_callback: Checkpoint) -> None:
+    def _scan_and_log_checkpoints(self, checkpoint_callback: ModelCheckpoint) -> None:
         # get checkpoints to be saved with associated score
         checkpoints = _scan_checkpoints(checkpoint_callback, self._logged_model_time)

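One subtlety in the simplified condition shared by both loggers: Python's `and` binds tighter than `or`, so the one-liner parses as "always when 'all', or when True and every checkpoint is kept". The commit can also safely drop the `hasattr(checkpoint_callback, "save_top_k")` guard once the parameter is annotated as `ModelCheckpoint`. A self-contained sketch (the `_should_log_now` helper is hypothetical, named here only for illustration):

```python
def _should_log_now(log_model, save_top_k: int) -> bool:
    # Parsed as: (log_model == "all") or ((log_model is True) and (save_top_k == -1))
    return log_model == "all" or log_model is True and save_top_k == -1


assert _should_log_now("all", 2)        # "all": upload during training
assert _should_log_now(True, -1)        # True + keep-everything: also upload now
assert not _should_log_now(True, 2)     # True otherwise: deferred to finalize()
assert not _should_log_now(False, -1)   # False: never uploaded
```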
tests/tests_pytorch/loggers/test_mlflow.py
Lines changed: 38 additions & 0 deletions

@@ -277,3 +277,41 @@ def test_mlflow_logger_finalize_when_exception(*_):
     assert logger._initialized
     logger.finalize("failed")
     logger.experiment.set_terminated.assert_called_once_with(logger.run_id, "FAILED")
+
+
+@mock.patch("pytorch_lightning.loggers.mlflow.mlflow")
+@mock.patch("pytorch_lightning.loggers.mlflow.MlflowClient")
+@pytest.mark.parametrize("log_model", ["all", True, False])
+def test_mlflow_log_model(client, _, tmpdir, log_model):
+    """Test that the logger creates the folders and files in the right place."""
+    # Get model, logger, trainer and train
+    model = BoringModel()
+    logger = MLFlowLogger("test", save_dir=tmpdir, log_model=log_model)
+    logger = mock_mlflow_run_creation(logger, experiment_id="test-id")
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        logger=logger,
+        max_epochs=2,
+        limit_train_batches=3,
+        limit_val_batches=3,
+    )
+    trainer.fit(model)
+
+    if log_model == "all":
+        # Checkpoint log
+        assert client.return_value.log_artifact.call_count == 2
+        # Metadata and aliases log
+        assert client.return_value.log_artifacts.call_count == 2
+
+    elif log_model is True:
+        # Checkpoint log
+        client.return_value.log_artifact.assert_called_once()
+        # Metadata and aliases log
+        client.return_value.log_artifacts.assert_called_once()
+
+    elif log_model is False:
+        # Checkpoint log
+        assert not client.return_value.log_artifact.called
+        # Metadata and aliases log
+        assert not client.return_value.log_artifacts.called

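After a real (un-mocked) run, the uploaded files can be pulled back with the MLflow client. A sketch, assuming `run_id` comes from a logger like the one in the earlier usage example and that the installed `mlflow` still exposes `MlflowClient.download_artifacts` (newer releases move this functionality to `mlflow.artifacts`):

```python
from mlflow.tracking import MlflowClient

client = MlflowClient(tracking_uri="file:./mlruns")
run_id = mlf_logger.run_id  # from the usage sketch above

# Each checkpoint directory also holds metadata.yaml (score, original
# filename, ModelCheckpoint settings) and aliases.txt (["latest"] or
# ["latest", "best"]).
for artifact in client.list_artifacts(run_id, "model/checkpoints"):
    local_path = client.download_artifacts(run_id, artifact.path, dst_path="./restored")
    print(artifact.path, "->", local_path)
```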