From 0d2997fdad829879f31facbaf0f2e9d5ead0bd91 Mon Sep 17 00:00:00 2001 From: krishna Date: Wed, 22 Jun 2022 12:24:03 +0200 Subject: [PATCH 01/18] add warning --- .../trainer/connectors/logger_connector/result.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index b1d8a064e122e..8250d593da67c 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -516,17 +516,16 @@ def fn(result_metric: _ResultMetric, v: Tensor) -> None: apply_to_collections(self[key], value, _ResultMetric, fn) @staticmethod - def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: + def _get_cache(result_metric: _ResultMetric, on_step: bool, result_key: str) -> Optional[Tensor]: cache = None if on_step and result_metric.meta.on_step: cache = result_metric._forward_cache elif not on_step and result_metric.meta.on_epoch: if result_metric._computed is None: # always reduce on epoch end - should = result_metric.meta.sync.should - result_metric.meta.sync.should = True + warning_cache.warn(f"Please set sync_dist to True {result_key}") result_metric.compute() - result_metric.meta.sync.should = should + cache = result_metric._computed if cache is not None and not result_metric.meta.enable_graph: return cache.detach() @@ -554,10 +553,10 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> Tuple[str def metrics(self, on_step: bool) -> _METRICS: metrics = _METRICS(callback={}, log={}, pbar={}) - for _, result_metric in self.valid_items(): - + for result_key, result_metric in self.valid_items(): + # extract forward_cache or computed from the _ResultMetric. ignore when the output is None - value = apply_to_collection(result_metric, _ResultMetric, self._get_cache, on_step, include_none=False) + value = apply_to_collection(result_metric, _ResultMetric, self._get_cache, on_step, result_key, include_none=False) # convert metric collection to dict container. if isinstance(value, _ResultMetricCollection): From f52490d81b120c41a4705e2ff797d6db8aad7ec9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Jun 2022 10:27:47 +0000 Subject: [PATCH 02/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../trainer/connectors/logger_connector/result.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 8250d593da67c..e9b0d4d6ce1df 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -554,9 +554,11 @@ def metrics(self, on_step: bool) -> _METRICS: metrics = _METRICS(callback={}, log={}, pbar={}) for result_key, result_metric in self.valid_items(): - + # extract forward_cache or computed from the _ResultMetric. ignore when the output is None - value = apply_to_collection(result_metric, _ResultMetric, self._get_cache, on_step, result_key, include_none=False) + value = apply_to_collection( + result_metric, _ResultMetric, self._get_cache, on_step, result_key, include_none=False + ) # convert metric collection to dict container. 
if isinstance(value, _ResultMetricCollection): From ff7d9248d172ecad2c10b24877888706062ad48b Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Wed, 6 Jul 2022 07:02:00 -0400 Subject: [PATCH 03/18] add test --- .../connectors/logger_connector/result.py | 8 +++++--- .../logging_/test_distributed_logging.py | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index e9b0d4d6ce1df..1b73b00adb408 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -516,14 +516,16 @@ def fn(result_metric: _ResultMetric, v: Tensor) -> None: apply_to_collections(self[key], value, _ResultMetric, fn) @staticmethod - def _get_cache(result_metric: _ResultMetric, on_step: bool, result_key: str) -> Optional[Tensor]: + def _get_cache(result_metric: _ResultMetric, result_key: str, on_step: bool) -> Optional[Tensor]: cache = None if on_step and result_metric.meta.on_step: cache = result_metric._forward_cache elif not on_step and result_metric.meta.on_epoch: if result_metric._computed is None: # always reduce on epoch end - warning_cache.warn(f"Please set sync_dist to True {result_key}") + print(f"ABBABAB {result_metric.meta.sync.should}") + if not result_metric.meta.sync.should: + warning_cache.warn(f"Please set sync_dist to True {result_key}") result_metric.compute() cache = result_metric._computed @@ -557,7 +559,7 @@ def metrics(self, on_step: bool) -> _METRICS: # extract forward_cache or computed from the _ResultMetric. ignore when the output is None value = apply_to_collection( - result_metric, _ResultMetric, self._get_cache, on_step, result_key, include_none=False + result_metric, _ResultMetric, self._get_cache, result_key, on_step, include_none=False ) # convert metric collection to dict container. 
diff --git a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py index ff950f7a8f679..3fbdd10ee4a3d 100644 --- a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py @@ -20,6 +20,7 @@ from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loggers.logger import Logger from tests_pytorch.helpers.runif import RunIf +import pytest class AllRankLogger(Logger): @@ -194,3 +195,19 @@ def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1} trainer.predict(model) assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1, "predict": 1} + +def test_logger_sync_dist(): + # Suggestions + # Imporve warning + class CustomBoringModel(BoringModel): + def training_epoch_end(self, *args, **kwargs): + super().training_epoch_end(*args, **kwargs) + self.log("global_rank", self.global_rank, sync_dist=False) + + model = CustomBoringModel() + trainer = Trainer(fast_dev_run = 1, accelerator="cpu", strategy="ddp", devices=2) + + with pytest.warns(UserWarning, match="Please set sync_dist to True global_rank"): + trainer.fit(model) + + assert trainer.callback_metrics["global_rank"] == 0 \ No newline at end of file From 19ee6b6b88cd495a77da6d1a6da2023889c55bf5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Jul 2022 11:04:26 +0000 Subject: [PATCH 04/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../trainer/connectors/logger_connector/result.py | 2 +- .../trainer/logging_/test_distributed_logging.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 1b73b00adb408..ff404938c317e 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -516,7 +516,7 @@ def fn(result_metric: _ResultMetric, v: Tensor) -> None: apply_to_collections(self[key], value, _ResultMetric, fn) @staticmethod - def _get_cache(result_metric: _ResultMetric, result_key: str, on_step: bool) -> Optional[Tensor]: + def _get_cache(result_metric: _ResultMetric, result_key: str, on_step: bool) -> Optional[Tensor]: cache = None if on_step and result_metric.meta.on_step: cache = result_metric._forward_cache diff --git a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py index 3fbdd10ee4a3d..cfed4a9e4948d 100644 --- a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py @@ -15,12 +15,13 @@ from typing import Any, Dict, Optional, Union from unittest.mock import Mock +import pytest + import pytorch_lightning as pl from pytorch_lightning import Callback, Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loggers.logger import Logger from tests_pytorch.helpers.runif import RunIf -import pytest class AllRankLogger(Logger): @@ -196,6 +197,7 @@ def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> trainer.predict(model) assert trainer.logger.logs == {"fit": 
1, "validate": 1, "test": 1, "predict": 1} + def test_logger_sync_dist(): # Suggestions # Imporve warning @@ -205,9 +207,9 @@ def training_epoch_end(self, *args, **kwargs): self.log("global_rank", self.global_rank, sync_dist=False) model = CustomBoringModel() - trainer = Trainer(fast_dev_run = 1, accelerator="cpu", strategy="ddp", devices=2) + trainer = Trainer(fast_dev_run=1, accelerator="cpu", strategy="ddp", devices=2) with pytest.warns(UserWarning, match="Please set sync_dist to True global_rank"): trainer.fit(model) - assert trainer.callback_metrics["global_rank"] == 0 \ No newline at end of file + assert trainer.callback_metrics["global_rank"] == 0 From 07814f625685050f85e4ec3d97315e6993bcfa88 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 12 Jul 2022 17:41:08 +0530 Subject: [PATCH 05/18] fix test --- .../connectors/logger_connector/result.py | 16 ++++++++-------- .../trainer/logging_/test_distributed_logging.py | 4 +--- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index ff404938c317e..4a831655a76e1 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -516,19 +516,21 @@ def fn(result_metric: _ResultMetric, v: Tensor) -> None: apply_to_collections(self[key], value, _ResultMetric, fn) @staticmethod - def _get_cache(result_metric: _ResultMetric, result_key: str, on_step: bool) -> Optional[Tensor]: + def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: cache = None if on_step and result_metric.meta.on_step: cache = result_metric._forward_cache elif not on_step and result_metric.meta.on_epoch: if result_metric._computed is None: - # always reduce on epoch end - print(f"ABBABAB {result_metric.meta.sync.should}") if not result_metric.meta.sync.should: - warning_cache.warn(f"Please set sync_dist to True {result_key}") + warning_cache.warn( + f"It is recommended to use `self.log({result_metric.meta.name}, sync_dist=True)` when logging" + " on epoch level in distributed setting to accumulate the metric across devices." + ) result_metric.compute() cache = result_metric._computed + if cache is not None and not result_metric.meta.enable_graph: return cache.detach() return cache @@ -555,12 +557,10 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> Tuple[str def metrics(self, on_step: bool) -> _METRICS: metrics = _METRICS(callback={}, log={}, pbar={}) - for result_key, result_metric in self.valid_items(): + for _, result_metric in self.valid_items(): # extract forward_cache or computed from the _ResultMetric. ignore when the output is None - value = apply_to_collection( - result_metric, _ResultMetric, self._get_cache, result_key, on_step, include_none=False - ) + value = apply_to_collection(result_metric, _ResultMetric, self._get_cache, on_step, include_none=False) # convert metric collection to dict container. 
if isinstance(value, _ResultMetricCollection): diff --git a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py index cfed4a9e4948d..6ad828eb0e29b 100644 --- a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py @@ -199,8 +199,6 @@ def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> def test_logger_sync_dist(): - # Suggestions - # Imporve warning class CustomBoringModel(BoringModel): def training_epoch_end(self, *args, **kwargs): super().training_epoch_end(*args, **kwargs) @@ -209,7 +207,7 @@ def training_epoch_end(self, *args, **kwargs): model = CustomBoringModel() trainer = Trainer(fast_dev_run=1, accelerator="cpu", strategy="ddp", devices=2) - with pytest.warns(UserWarning, match="Please set sync_dist to True global_rank"): + with pytest.warns(UserWarning, match="It is recommended to use .* sync_dist=True"): trainer.fit(model) assert trainer.callback_metrics["global_rank"] == 0 From 2ac71cb388ff59928b154914cfabafedbdb2711b Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 12 Jul 2022 17:44:36 +0530 Subject: [PATCH 06/18] fix --- .../trainer/connectors/logger_connector/result.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 4a831655a76e1..e8dc50a6b5ffd 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -524,8 +524,8 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: if result_metric._computed is None: if not result_metric.meta.sync.should: warning_cache.warn( - f"It is recommended to use `self.log({result_metric.meta.name}, sync_dist=True)` when logging" - " on epoch level in distributed setting to accumulate the metric across devices." + f"It is recommended to use `self.log({result_metric.meta.name!r}, sync_dist=True)` when" + " logging on epoch level in distributed setting to accumulate the metric across devices." ) result_metric.compute() From 0e2acc7e513e7dbd6a24bc8bd9d42d18aeb9a064 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 12 Jul 2022 17:49:30 +0530 Subject: [PATCH 07/18] fix rebase --- .../trainer/connectors/logger_connector/result.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 8f3dca7c387e1..260c5b7d6c63f 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -531,8 +531,14 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: cache = result_metric._computed - if cache is not None and not result_metric.meta.enable_graph: - return cache.detach() + if cache is not None: + if not isinstance(cache, torch.Tensor): + raise ValueError( + f"The `.compute()` return of the metric logged as {result_metric.meta.name!r} must be a tensor." 
+ f" Found {cache}" + ) + if not result_metric.meta.enable_graph: + return cache.detach() return cache From fabae4779afcdc475f8ac7daf0ee3446152c3f45 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Tue, 12 Jul 2022 08:29:59 -0400 Subject: [PATCH 08/18] update change log --- src/pytorch_lightning/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 0d42f819d9d1d..b8f5ba860c3af 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -139,7 +139,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name ([#12604](https://github.com/PyTorchLightning/pytorch-lightning/pull/12604)) -- +- Raised warning instead of forcing `sync_dist=True` on `epoch_end` ([13210](https://github.com/Lightning-AI/lightning/issues/13210)) ### Deprecated From bf87e61466597ce47d08decff8f60aa48025a02d Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Tue, 12 Jul 2022 08:31:38 -0400 Subject: [PATCH 09/18] fix_pr --- src/pytorch_lightning/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index b8f5ba860c3af..592888bc73cbc 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -139,7 +139,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name ([#12604](https://github.com/PyTorchLightning/pytorch-lightning/pull/12604)) -- Raised warning instead of forcing `sync_dist=True` on `epoch_end` ([13210](https://github.com/Lightning-AI/lightning/issues/13210)) +- Raised warning instead of forcing `sync_dist=True` on `epoch_end` ([13364](https://github.com/Lightning-AI/lightning/pull/13364)) ### Deprecated From f4e059bf8a39ee709b9b4d063ba7102c12e9cc97 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 12 Jul 2022 18:03:03 +0530 Subject: [PATCH 10/18] nit --- src/pytorch_lightning/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 592888bc73cbc..deb17cca86f78 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -139,7 +139,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name ([#12604](https://github.com/PyTorchLightning/pytorch-lightning/pull/12604)) -- Raised warning instead of forcing `sync_dist=True` on `epoch_end` ([13364](https://github.com/Lightning-AI/lightning/pull/13364)) +- Raised a warning instead of forcing `sync_dist=True` on epoch end ([13364](https://github.com/Lightning-AI/lightning/pull/13364)) ### Deprecated From 352a34112074abd9ab6c064bb55fdd4eb51d941a Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Wed, 13 Jul 2022 13:47:25 +0200 Subject: [PATCH 11/18] Update src/pytorch_lightning/trainer/connectors/logger_connector/result.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- .../trainer/connectors/logger_connector/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 260c5b7d6c63f..2e047bbc08a20 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -524,7 +524,7 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: if result_metric._computed is None: if not result_metric.meta.sync.should: warning_cache.warn( - f"It is recommended to use `self.log({result_metric.meta.name!r}, sync_dist=True)` when" + f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)` when" " logging on epoch level in distributed setting to accumulate the metric across devices." ) result_metric.compute() From 5140c2bc58cb64a58de764a6f1f7c570dc966fe9 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Sat, 16 Jul 2022 16:42:12 -0400 Subject: [PATCH 12/18] commit suggestions --- .../trainer/connectors/logger_connector/result.py | 9 +++++---- .../trainer/logging_/test_distributed_logging.py | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 2e047bbc08a20..b4a00b72f3038 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -24,11 +24,12 @@ from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device from pytorch_lightning.utilities.data import extract_batch_size +from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.metrics import metrics_to_scalars from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.warnings import WarningCache +from pytorch_lightning.utilities.warnings import WarningCache, PossibleUserWarning _IN_METRIC = Union[Metric, Tensor] # Do not include scalars as they were converted to tensors _OUT_METRIC = Union[Tensor, Dict[str, Tensor]] @@ -522,11 +523,11 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: cache = result_metric._forward_cache elif not on_step and result_metric.meta.on_epoch: if
result_metric._computed is None: - if not result_metric.meta.sync.should: + if not result_metric.meta.sync.should and distributed_available(): warning_cache.warn( f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)` when" - " logging on epoch level in distributed setting to accumulate the metric across devices." - ) + " logging on epoch level in distributed setting to accumulate the metric across devices.", + category=PossibleUserWarning) result_metric.compute() cache = result_metric._computed diff --git a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py index 6ad828eb0e29b..9abfd42f03669 100644 --- a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py @@ -21,6 +21,7 @@ from pytorch_lightning import Callback, Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loggers.logger import Logger +from pytorch_lightning.utilities.warnings import PossibleUserWarning from tests_pytorch.helpers.runif import RunIf @@ -200,14 +201,13 @@ def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> def test_logger_sync_dist(): class CustomBoringModel(BoringModel): - def training_epoch_end(self, *args, **kwargs): - super().training_epoch_end(*args, **kwargs) + def on_train_epoch_end(self): self.log("global_rank", self.global_rank, sync_dist=False) model = CustomBoringModel() trainer = Trainer(fast_dev_run=1, accelerator="cpu", strategy="ddp", devices=2) - with pytest.warns(UserWarning, match="It is recommended to use .* sync_dist=True"): + with pytest.warns(PossibleUserWarning, match="It is recommended to use .* sync_dist=True"): trainer.fit(model) assert trainer.callback_metrics["global_rank"] == 0 From db2d06c4b250f285f45378f5d86664d9535e7004 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 16 Jul 2022 20:46:40 +0000 Subject: [PATCH 13/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../trainer/connectors/logger_connector/result.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index b4a00b72f3038..c9c3c897ffdd7 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -29,7 +29,7 @@ from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.metrics import metrics_to_scalars from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.warnings import WarningCache, PossibleUserWarning +from pytorch_lightning.utilities.warnings import PossibleUserWarning, WarningCache _IN_METRIC = Union[Metric, Tensor] # Do not include scalars as they were converted to tensors _OUT_METRIC = Union[Tensor, Dict[str, Tensor]] @@ -527,7 +527,8 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: warning_cache.warn( f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)` when" " logging on epoch level in distributed setting to accumulate the metric across devices.", - category=PossibleUserWarning) + category=PossibleUserWarning, + ) 
result_metric.compute() cache = result_metric._computed From 26acd4590d3afa5749098a2a3a2c7fa5c405942f Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Sat, 16 Jul 2022 18:01:16 -0400 Subject: [PATCH 14/18] improve error message --- .../tests_pytorch/trainer/logging_/test_distributed_logging.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py index 9abfd42f03669..c12669be87c4d 100644 --- a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py @@ -207,7 +207,8 @@ def on_train_epoch_end(self): model = CustomBoringModel() trainer = Trainer(fast_dev_run=1, accelerator="cpu", strategy="ddp", devices=2) - with pytest.warns(PossibleUserWarning, match="It is recommended to use .* sync_dist=True"): + with pytest.warns(PossibleUserWarning, match="It is recommended to use `self.log('global_rank', ..., sync_dist=True)` when " + "logging on epoch level in distributed setting to accumulate the metric across devices."): trainer.fit(model) assert trainer.callback_metrics["global_rank"] == 0 From 68735d7b403c052b7726a3fc23a045a637af77be Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 16 Jul 2022 22:02:57 +0000 Subject: [PATCH 15/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../trainer/logging_/test_distributed_logging.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py index c12669be87c4d..c074af5038fda 100644 --- a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py @@ -207,8 +207,11 @@ def on_train_epoch_end(self): model = CustomBoringModel() trainer = Trainer(fast_dev_run=1, accelerator="cpu", strategy="ddp", devices=2) - with pytest.warns(PossibleUserWarning, match="It is recommended to use `self.log('global_rank', ..., sync_dist=True)` when " - "logging on epoch level in distributed setting to accumulate the metric across devices."): + with pytest.warns( + PossibleUserWarning, + match="It is recommended to use `self.log('global_rank', ..., sync_dist=True)` when " + "logging on epoch level in distributed setting to accumulate the metric across devices.", + ): trainer.fit(model) assert trainer.callback_metrics["global_rank"] == 0 From 677a3a63d055337718d45a93c2f3311111c3e10c Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 18 Jul 2022 14:16:06 +0530 Subject: [PATCH 16/18] fix test --- .../trainer/logging_/test_distributed_logging.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py index c074af5038fda..bfa79dcfc6a24 100644 --- a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py @@ -23,6 +23,7 @@ from pytorch_lightning.loggers.logger import Logger from pytorch_lightning.utilities.warnings import PossibleUserWarning from tests_pytorch.helpers.runif import RunIf +from tests_pytorch.helpers.utils import no_warning_call class AllRankLogger(Logger): @@ -206,11 
+207,16 @@ def on_train_epoch_end(self): model = CustomBoringModel() trainer = Trainer(fast_dev_run=1, accelerator="cpu", strategy="ddp", devices=2) - with pytest.warns( - PossibleUserWarning, - match="It is recommended to use `self.log('global_rank', ..., sync_dist=True)` when " - "logging on epoch level in distributed setting to accumulate the metric across devices.", + PossibleUserWarning, match=r"recommended to use `self.log\('global_rank', ..., sync_dist=True\)`" + ): + trainer.fit(model) + + assert trainer.callback_metrics["global_rank"] == 0 + + trainer = Trainer(fast_dev_run=1, accelerator="cpu", devices=1) + with no_warning_call( + PossibleUserWarning, match=r"recommended to use `self.log\('global_rank', ..., sync_dist=True\)`" ): trainer.fit(model) From e497b2ed6385a3add00dbc37aad2239fe9a13e0a Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 18 Jul 2022 18:54:28 +0530 Subject: [PATCH 17/18] block for ft and use unit test --- .../connectors/logger_connector/result.py | 18 ++++++--- .../core/test_metric_result_integration.py | 4 +- .../logging_/test_distributed_logging.py | 40 ++++++++++--------- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index c9c3c897ffdd7..05bc62c98f92d 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -26,6 +26,7 @@ from pytorch_lightning.utilities.data import extract_batch_size from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _fault_tolerant_training from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.metrics import metrics_to_scalars from pytorch_lightning.utilities.rank_zero import rank_zero_warn @@ -523,13 +524,20 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: cache = result_metric._forward_cache elif not on_step and result_metric.meta.on_epoch: if result_metric._computed is None: + should = result_metric.meta.sync.should if not result_metric.meta.sync.should and distributed_available(): - warning_cache.warn( - f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)` when" - " logging on epoch level in distributed setting to accumulate the metric across devices.", - category=PossibleUserWarning, - ) + if _fault_tolerant_training(): + # make sure to always sync across devices when fault-tolerant is used + result_metric.meta.sync.should = True + else: + warning_cache.warn( + f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)`" + " when logging on epoch level in distributed setting to accumulate the metric across" + " devices.", + category=PossibleUserWarning, + ) result_metric.compute() + result_metric.meta.sync.should = should cache = result_metric._computed diff --git a/tests/tests_pytorch/core/test_metric_result_integration.py b/tests/tests_pytorch/core/test_metric_result_integration.py index 12247e27e8c9d..41672d2ff7577 100644 --- a/tests/tests_pytorch/core/test_metric_result_integration.py +++ b/tests/tests_pytorch/core/test_metric_result_integration.py @@ -456,6 +456,8 @@ def on_train_epoch_end(self) -> None: "limit_val_batches": 0, "accelerator": accelerator, "devices": devices, + 
"enable_progress_bar": False, + "enable_model_summary": False, } trainer_kwargs.update(kwargs) trainer = Trainer(**trainer_kwargs) @@ -471,7 +473,7 @@ def on_train_epoch_end(self) -> None: ) ckpt_path = os.path.join(tmpdir, ".pl_auto_save.ckpt") - trainer = Trainer(**trainer_kwargs, enable_progress_bar=False, enable_model_summary=False) + trainer = Trainer(**trainer_kwargs) trainer.fit(model, ckpt_path=ckpt_path) assert model.has_validated_sum diff --git a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py index bfa79dcfc6a24..006248c95219c 100644 --- a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py @@ -13,14 +13,22 @@ # limitations under the License. import os from typing import Any, Dict, Optional, Union +from unittest import mock from unittest.mock import Mock import pytest +import torch import pytorch_lightning as pl from pytorch_lightning import Callback, Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loggers.logger import Logger +from pytorch_lightning.trainer.connectors.logger_connector.result import ( + _Metadata, + _ResultCollection, + _ResultMetric, + _Sync, +) from pytorch_lightning.utilities.warnings import PossibleUserWarning from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.utils import no_warning_call @@ -200,24 +208,20 @@ def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1, "predict": 1} -def test_logger_sync_dist(): - class CustomBoringModel(BoringModel): - def on_train_epoch_end(self): - self.log("global_rank", self.global_rank, sync_dist=False) +@pytest.mark.parametrize("distributed_env", [True, False]) +def test_logger_sync_dist(distributed_env): + # self.log('bar', 7, ..., sync_dist=False) + meta = _Metadata("foo", "bar") + meta.sync = _Sync(_should=False) + result_metric = _ResultMetric(metadata=meta, is_tensor=True) + result_metric.update(torch.tensor(7.0), 10) - model = CustomBoringModel() - trainer = Trainer(fast_dev_run=1, accelerator="cpu", strategy="ddp", devices=2) - with pytest.warns( - PossibleUserWarning, match=r"recommended to use `self.log\('global_rank', ..., sync_dist=True\)`" - ): - trainer.fit(model) - - assert trainer.callback_metrics["global_rank"] == 0 + warning_ctx = pytest.warns if distributed_env else no_warning_call - trainer = Trainer(fast_dev_run=1, accelerator="cpu", devices=1) - with no_warning_call( - PossibleUserWarning, match=r"recommended to use `self.log\('global_rank', ..., sync_dist=True\)`" + with mock.patch( + "pytorch_lightning.trainer.connectors.logger_connector.result.distributed_available", + return_value=distributed_env, ): - trainer.fit(model) - - assert trainer.callback_metrics["global_rank"] == 0 + with warning_ctx(PossibleUserWarning, match=r"recommended to use `self.log\('bar', ..., sync_dist=True\)`"): + value = _ResultCollection._get_cache(result_metric, on_step=False) + assert value == 7.0 From 0f39c807ea28b1ae7c703c9def361e23fd586281 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 19 Jul 2022 13:31:48 +0530 Subject: [PATCH 18/18] move test --- .../connectors/logger_connector/result.py | 5 ++- .../core/test_metric_result_integration.py | 21 +++++++++++++ .../logging_/test_distributed_logging.py | 31 ------------------- 3 files changed, 25 insertions(+), 32 deletions(-) diff --git 
a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 05bc62c98f92d..1a8020f91f4f6 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -526,8 +526,11 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: if result_metric._computed is None: should = result_metric.meta.sync.should if not result_metric.meta.sync.should and distributed_available(): + # ensure sync happens for FT since during a failure, the metrics are synced and saved to the + # checkpoint, so during restart, metrics on rank 0 are from the accumulated ones from the previous + # run, and on other ranks, they are 0. So we need to make sure they are synced in further training + # to ensure correct calculation. if _fault_tolerant_training(): - # make sure to always sync across devices when fault-tolerant is used result_metric.meta.sync.should = True else: warning_cache.warn( diff --git a/tests/tests_pytorch/core/test_metric_result_integration.py b/tests/tests_pytorch/core/test_metric_result_integration.py index 41672d2ff7577..cb8a51c5bf9ba 100644 --- a/tests/tests_pytorch/core/test_metric_result_integration.py +++ b/tests/tests_pytorch/core/test_metric_result_integration.py @@ -34,7 +34,9 @@ _ResultMetric, _Sync, ) +from pytorch_lightning.utilities.warnings import PossibleUserWarning from tests_pytorch.helpers.runif import RunIf +from tests_pytorch.helpers.utils import no_warning_call class DummyMetric(Metric): @@ -661,3 +663,22 @@ def on_train_start(self): ) with pytest.raises(ValueError, match=r"compute\(\)` return of.*foo' must be a tensor"): trainer.fit(model) + + +@pytest.mark.parametrize("distributed_env", [True, False]) +def test_logger_sync_dist(distributed_env): + # self.log('bar', 7, ..., sync_dist=False) + meta = _Metadata("foo", "bar") + meta.sync = _Sync(_should=False) + result_metric = _ResultMetric(metadata=meta, is_tensor=True) + result_metric.update(torch.tensor(7.0), 10) + + warning_ctx = pytest.warns if distributed_env else no_warning_call + + with mock.patch( + "pytorch_lightning.trainer.connectors.logger_connector.result.distributed_available", + return_value=distributed_env, + ): + with warning_ctx(PossibleUserWarning, match=r"recommended to use `self.log\('bar', ..., sync_dist=True\)`"): + value = _ResultCollection._get_cache(result_metric, on_step=False) + assert value == 7.0 diff --git a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py index 006248c95219c..ff950f7a8f679 100644 --- a/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_distributed_logging.py @@ -13,25 +13,13 @@ # limitations under the License. 
import os from typing import Any, Dict, Optional, Union -from unittest import mock from unittest.mock import Mock -import pytest -import torch - import pytorch_lightning as pl from pytorch_lightning import Callback, Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loggers.logger import Logger -from pytorch_lightning.trainer.connectors.logger_connector.result import ( - _Metadata, - _ResultCollection, - _ResultMetric, - _Sync, -) -from pytorch_lightning.utilities.warnings import PossibleUserWarning from tests_pytorch.helpers.runif import RunIf -from tests_pytorch.helpers.utils import no_warning_call class AllRankLogger(Logger): @@ -206,22 +194,3 @@ def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1} trainer.predict(model) assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1, "predict": 1} - - -@pytest.mark.parametrize("distributed_env", [True, False]) -def test_logger_sync_dist(distributed_env): - # self.log('bar', 7, ..., sync_dist=False) - meta = _Metadata("foo", "bar") - meta.sync = _Sync(_should=False) - result_metric = _ResultMetric(metadata=meta, is_tensor=True) - result_metric.update(torch.tensor(7.0), 10) - - warning_ctx = pytest.warns if distributed_env else no_warning_call - - with mock.patch( - "pytorch_lightning.trainer.connectors.logger_connector.result.distributed_available", - return_value=distributed_env, - ): - with warning_ctx(PossibleUserWarning, match=r"recommended to use `self.log\('bar', ..., sync_dist=True\)`"): - value = _ResultCollection._get_cache(result_metric, on_step=False) - assert value == 7.0
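
Taken together, the series changes `_ResultCollection._get_cache` so that an epoch-level metric logged without `sync_dist=True` in a distributed run emits a `PossibleUserWarning` instead of silently forcing the sync (fault-tolerant training still forces it, per PATCH 17-18). What follows is a minimal sketch of that end-state behavior, not part of the patch series itself: the model class and the "my_metric" key are illustrative only, and it assumes a PyTorch Lightning build with the final patch applied.

import pytorch_lightning as pl
from pytorch_lightning.demos.boring_classes import BoringModel


class SyncWarningModel(BoringModel):
    def on_train_epoch_end(self):
        # Epoch-level log without sync_dist=True: with more than one device this
        # now triggers the PossibleUserWarning added by this series.
        self.log("my_metric", 1.0)


trainer = pl.Trainer(fast_dev_run=1, accelerator="cpu", devices=2, strategy="ddp_spawn")
trainer.fit(SyncWarningModel())
# Expected warning (wording from the final patch):
# It is recommended to use `self.log('my_metric', ..., sync_dist=True)` when logging
# on epoch level in distributed setting to accumulate the metric across devices.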