Better errors for logging corner cases (#13164)

carmocca · web-flow · commit b1e38bfd798d · 2022-06-28T16:59:31.000+01:00
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
@@ -49,6 +49,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/PyTorchLightning/pytorch-lightning/pull/12902))
 
 
+- Show a better error message when a Metric that does not return a Tensor is logged ([#13164](https://github.com/PyTorchLightning/pytorch-lightning/pull/13164))
+
+
 - Added missing `predict_dataset` argument in `LightningDataModule.from_datasets` to create predict dataloaders ([#12942](https://github.com/PyTorchLightning/pytorch-lightning/pull/12942))
 
 
@@ -123,6 +126,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - `DataLoader` instantiated inside a `*_dataloader` hook will not set the passed arguments as attributes anymore ([#12981](https://github.com/PyTorchLightning/pytorch-lightning/pull/12981))
 
 
+- When a multi-element tensor is logged, an error is now raised instead of silently taking the mean of all elements ([#13164](https://github.com/PyTorchLightning/pytorch-lightning/pull/13164))
+
+
 - The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name  ([#12604](https://github.com/PyTorchLightning/pytorch-lightning/pull/12604))
 
 
diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
@@ -396,6 +396,7 @@ def log(
             )
 
         value = apply_to_collection(value, numbers.Number, self.__to_tensor)
+        apply_to_collection(value, torch.Tensor, self.__check_numel_1, name)
 
         if self.trainer._logger_connector.should_reset_tensors(self._current_fx_name):
             # if we started a new epoch (running its first batch) the hook name has changed
@@ -518,11 +519,10 @@ def log_dict(
             )
 
     @staticmethod
-    def __check_not_nested(value: dict, name: str) -> dict:
+    def __check_not_nested(value: dict, name: str) -> None:
         # self-imposed restriction. for simplicity
         if any(isinstance(v, dict) for v in value.values()):
             raise ValueError(f"`self.log({name}, {value})` was called, but nested dictionaries cannot be logged")
-        return value
 
     @staticmethod
     def __check_allowed(v: Any, name: str, value: Any) -> None:
@@ -531,6 +531,14 @@ def __check_allowed(v: Any, name: str, value: Any) -> None:
     def __to_tensor(self, value: numbers.Number) -> Tensor:
         return torch.tensor(value, device=self.device)
 
+    @staticmethod
+    def __check_numel_1(value: torch.Tensor, name: str) -> None:
+        if not torch.numel(value) == 1:
+            raise ValueError(
+                f"`self.log({name}, {value})` was called, but the tensor must have a single element."
+                f" You can try doing `self.log({name}, {value}.mean())`"
+            )
+
     def log_grad_norm(self, grad_norm_dict: Dict[str, float]) -> None:
         """Override this method to change the default behaviour of ``log_grad_norm``.
 
diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py
@@ -244,12 +244,12 @@ def update(self, value: _IN_METRIC, batch_size: int) -> None:  # type: ignore[ov
             # perform accumulation with reduction
             if self.meta.is_mean_reduction:
                 # do not use `+=` as it doesn't do type promotion
-                self.value = self.value + value.mean() * batch_size
+                self.value = self.value + value * batch_size
                 self.cumulated_batch_size = self.cumulated_batch_size + batch_size
             elif self.meta.is_max_reduction or self.meta.is_min_reduction:
-                self.value = self.meta.reduce_fx(self.value, value.mean())
+                self.value = self.meta.reduce_fx(self.value, value)
             elif self.meta.is_sum_reduction:
-                self.value = self.value + value.mean()
+                self.value = self.value + value
         else:
             value = cast(Metric, value)
             self.value = value
@@ -528,8 +528,14 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]:
                 result_metric.compute()
                 result_metric.meta.sync.should = should
             cache = result_metric._computed
-        if cache is not None and not result_metric.meta.enable_graph:
-            return cache.detach()
+        if cache is not None:
+            if not isinstance(cache, torch.Tensor):
+                raise ValueError(
+                    f"The `.compute()` return of the metric logged as {result_metric.meta.name!r} must be a tensor."
+                    f" Found {cache}"
+                )
+            if not result_metric.meta.enable_graph:
+                return cache.detach()
         return cache
 
     def valid_items(self) -> Generator:
diff --git a/tests/tests_pytorch/core/test_metric_result_integration.py b/tests/tests_pytorch/core/test_metric_result_integration.py
@@ -629,3 +629,33 @@ def test_result_metric_max_min(reduce_fx, expected):
     rm = _ResultMetric(metadata, is_tensor=True)
     rm.update(torch.tensor(expected), 1)
     assert rm.compute() == expected
+
+
+def test_compute_not_a_tensor_raises():
+    class RandomMetric(Metric):
+        def update(self):
+            pass
+
+        def compute(self):
+            return torch.tensor(1.0), torch.tensor(2.0)
+
+    class MyModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.metric = RandomMetric()
+
+        def on_train_start(self):
+            self.log("foo", self.metric)
+
+    model = MyModel()
+    trainer = Trainer(
+        limit_train_batches=1,
+        limit_val_batches=0,
+        max_epochs=1,
+        enable_progress_bar=False,
+        enable_checkpointing=False,
+        logger=False,
+        enable_model_summary=False,
+    )
+    with pytest.raises(ValueError, match=r"compute\(\)` return of.*foo' must be a tensor"):
+        trainer.fit(model)
diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
@@ -631,7 +631,16 @@ class TestModel(BoringModel):
         def training_step(self, batch, batch_idx):
             self.log("foo/dataloader_idx_0", -1)
 
-    trainer = Trainer(default_root_dir=tmpdir)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        max_epochs=1,
+        enable_progress_bar=False,
+        enable_checkpointing=False,
+        logger=False,
+        enable_model_summary=False,
+    )
     model = TestModel()
     with pytest.raises(MisconfigurationException, match="`self.log` with the key `foo/dataloader_idx_0`"):
         trainer.fit(model)
@@ -640,7 +649,6 @@ class TestModel(BoringModel):
         def training_step(self, batch, batch_idx):
             self.log("foo", Accuracy())
 
-    trainer = Trainer(default_root_dir=tmpdir)
     model = TestModel()
     with pytest.raises(MisconfigurationException, match="fix this by setting an attribute for the metric in your"):
         trainer.fit(model)
@@ -653,7 +661,6 @@ def __init__(self):
         def training_step(self, batch, batch_idx):
             self.log("foo", Accuracy())
 
-    trainer = Trainer(default_root_dir=tmpdir)
     model = TestModel()
     with pytest.raises(
         MisconfigurationException,
@@ -667,7 +674,6 @@ def training_step(self, *args):
             self.log("foo", -1, prog_bar=True)
             return super().training_step(*args)
 
-    trainer = Trainer(default_root_dir=tmpdir)
     model = TestModel()
     with pytest.raises(MisconfigurationException, match=r"self.log\(foo, ...\)` twice in `training_step`"):
         trainer.fit(model)
@@ -677,11 +683,18 @@ def training_step(self, *args):
             self.log("foo", -1, reduce_fx=torch.argmax)
             return super().training_step(*args)
 
-    trainer = Trainer(default_root_dir=tmpdir)
     model = TestModel()
     with pytest.raises(MisconfigurationException, match=r"reduce_fx={min,max,mean,sum}\)` are supported"):
         trainer.fit(model)
 
+    class TestModel(BoringModel):
+        def on_train_start(self):
+            self.log("foo", torch.tensor([1.0, 2.0]))
+
+    model = TestModel()
+    with pytest.raises(ValueError, match="tensor must have a single element"):
+        trainer.fit(model)
+
 
 def test_sanity_metrics_are_reset(tmpdir):
     class TestModel(BoringModel):