From b8f9301e5eabe8c46f51a1a32eb2736b9f77cd43 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 25 Aug 2021 21:29:12 -0700
Subject: [PATCH 1/3] Move logging GPU metrics out of the core trainer

---
 .../trainer/connectors/callback_connector.py | 10 ++++++++
 .../logger_connector/logger_connector.py     | 25 +------------------
 pytorch_lightning/trainer/trainer.py         |  3 ++-
 3 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py
index 4bdafbf97690b..cde392a78698e 100644
--- a/pytorch_lightning/trainer/connectors/callback_connector.py
+++ b/pytorch_lightning/trainer/connectors/callback_connector.py
@@ -16,6 +16,7 @@
 from typing import Dict, List, Optional, Union
 
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint, ProgressBar, ProgressBarBase
+from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor
 from pytorch_lightning.callbacks.timer import Timer
 from pytorch_lightning.utilities import rank_zero_info
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -34,6 +35,7 @@ def on_trainer_init(
         default_root_dir: Optional[str],
         weights_save_path: Optional[str],
         stochastic_weight_avg: bool,
+        log_gpu_memory: Optional[str],
         max_time: Optional[Union[str, timedelta, Dict[str, int]]] = None,
     ):
         # init folder paths for checkpoint + weights save callbacks
@@ -60,6 +62,9 @@ def on_trainer_init(
         # init progress bar
         self.trainer._progress_bar_callback = self.configure_progress_bar(progress_bar_refresh_rate, process_position)
 
+        # init gpu memory stats callback
+        self._configure_gpu_memory_callback(log_gpu_memory)
+
         # push all checkpoint callbacks to the end
         # it is important that these are the last callbacks to run
         self.trainer.callbacks = self._reorder_callbacks(self.trainer.callbacks)
@@ -114,6 +119,11 @@ def configure_progress_bar(self, refresh_rate=None, process_position=0):
 
         return progress_bar_callback
 
+    def _configure_gpu_memory_callback(self, log_gpu_memory: Optional[str]) -> None:
+        if log_gpu_memory and not any([c for c in self.trainer.callbacks if isinstance(c, GPUStatsMonitor)]):
+            gpu_stats_monitor = GPUStatsMonitor()
+            self.trainer.callbacks.append(gpu_stats_monitor)
+
     def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, Dict[str, int]]] = None) -> None:
         if max_time is None:
             return
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index a965699510689..104eae90f9664 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -20,23 +20,20 @@
 from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection, TensorBoardLogger
 from pytorch_lightning.trainer.connectors.logger_connector.result import _METRIC, MetricSource
 from pytorch_lightning.trainer.states import RunningStage, TrainerFn
-from pytorch_lightning.utilities import DeviceType, memory
 from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device
 from pytorch_lightning.utilities.metrics import metrics_to_scalars
 from pytorch_lightning.utilities.types import _EVALUATE_OUTPUT
 
 
 class LoggerConnector:
-    def __init__(self, trainer: "pl.Trainer", log_gpu_memory: Optional[str] = None) -> None:
+    def __init__(self, trainer: "pl.Trainer") -> None:
         self.trainer = trainer
-        self.log_gpu_memory = log_gpu_memory
         self.eval_loop_results = []
         self._val_log_step: int = 0
         self._test_log_step: int = 0
         self._progress_bar_metrics: Dict[str, float] = {}
         self._logged_metrics: Dict[str, _METRIC] = {}
         self._callback_metrics: Dict[str, _METRIC] = {}
-        self._gpus_metrics: Dict[str, str] = {}
         self._epoch_end_reached = False
         self._current_fx: Optional[str] = None
         self._batch_idx: Optional[int] = None
@@ -211,8 +208,6 @@ def update_train_step_metrics(self) -> None:
         if self.trainer.fit_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization:
             return
 
-        self._log_gpus_metrics()
-
         # when metrics should be logged
         assert not self._epoch_end_reached
         if self.should_update_logs or self.trainer.fast_dev_run:
@@ -226,17 +221,6 @@ def update_train_epoch_metrics(self) -> None:
         # reset result collection for next epoch
         self.trainer._results.reset(metrics=True)
 
-    def _log_gpus_metrics(self):
-        for key, mem in self.gpus_metrics.items():
-            if self.log_gpu_memory == "min_max":
-                self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True)
-            else:
-                gpu_id = int(key.split("/")[0].split(":")[1])
-                if gpu_id in self.trainer.accelerator_connector.parallel_device_ids:
-                    self.trainer.lightning_module.log(
-                        key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False
-                    )
-
     """
     Utilities and properties
     """
@@ -292,13 +276,6 @@ def metrics(self) -> Dict[MetricSource, Dict[str, _METRIC]]:
         on_step = not self._epoch_end_reached
         return self.trainer._results.metrics(on_step)
 
-    @property
-    def gpus_metrics(self) -> Dict[str, str]:
-        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
-            mem_map = memory.get_memory_profile(self.log_gpu_memory)
-            self._gpus_metrics.update(mem_map)
-        return self._gpus_metrics
-
     @property
     def callback_metrics(self) -> Dict[str, _METRIC]:
         if self.trainer._results:
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 7ebbc55ae7ac9..d07e61360f574 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -365,7 +365,7 @@ def __init__(
             amp_level,
             plugins,
         )
-        self.logger_connector = LoggerConnector(self, log_gpu_memory)
+        self.logger_connector = LoggerConnector(self)
         self.model_connector = ModelConnector(self)
         self.callback_connector = CallbackConnector(self)
         self.debugging_connector = DebuggingConnector(self)
@@ -414,6 +414,7 @@ def __init__(
             default_root_dir,
             weights_save_path,
             stochastic_weight_avg,
+            log_gpu_memory,
             max_time,
         )
 

From 2b7858c5f55f9117d8313c06adba0e41218a6e56 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 25 Aug 2021 21:49:52 -0700
Subject: [PATCH 2/3] go for direct deprecation instead

---
 CHANGELOG.md                                  |  5 ++-
 .../trainer/connectors/callback_connector.py  | 10 ------
 .../logger_connector/logger_connector.py      | 31 ++++++++++++++++++-
 pytorch_lightning/trainer/trainer.py          |  7 +++--
 tests/deprecated_api/test_remove_1-7.py       |  7 +++++
 5 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5232cc793163f..92d9e899fa0df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -147,7 +147,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Deprecated `DataModule` properties: `train_transforms`, `val_transforms`, `test_transforms`, `size`, `dims` ([#8851](https://github.com/PyTorchLightning/pytorch-lightning/pull/8851))
 
 
-- Deprecated `prepare_data_per_node` flag on Trainer and set it as a property of `DataHooks`, accessible in the `LightningModule` and `LightningDataModule` [#8958](https://github.com/PyTorchLightning/pytorch-lightning/pull/8958)
+- Deprecated `prepare_data_per_node` flag on Trainer and set it as a property of `DataHooks`, accessible in the `LightningModule` and `LightningDataModule` ([#8958](https://github.com/PyTorchLightning/pytorch-lightning/pull/8958))
+
+
+- Deprecated `log_gpu_memory` flag on the Trainer in favor of passing the `GPUStatsMonitor` callback to the Trainer ([#9124](https://github.com/PyTorchLightning/pytorch-lightning/pull/9124/))
 
 
 ### Removed
diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py
index cde392a78698e..4bdafbf97690b 100644
--- a/pytorch_lightning/trainer/connectors/callback_connector.py
+++ b/pytorch_lightning/trainer/connectors/callback_connector.py
@@ -16,7 +16,6 @@
 from typing import Dict, List, Optional, Union
 
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint, ProgressBar, ProgressBarBase
-from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor
 from pytorch_lightning.callbacks.timer import Timer
 from pytorch_lightning.utilities import rank_zero_info
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -35,7 +34,6 @@ def on_trainer_init(
         default_root_dir: Optional[str],
         weights_save_path: Optional[str],
         stochastic_weight_avg: bool,
-        log_gpu_memory: Optional[str],
         max_time: Optional[Union[str, timedelta, Dict[str, int]]] = None,
     ):
         # init folder paths for checkpoint + weights save callbacks
@@ -62,9 +60,6 @@ def on_trainer_init(
         # init progress bar
         self.trainer._progress_bar_callback = self.configure_progress_bar(progress_bar_refresh_rate, process_position)
 
-        # init gpu memory stats callback
-        self._configure_gpu_memory_callback(log_gpu_memory)
-
         # push all checkpoint callbacks to the end
         # it is important that these are the last callbacks to run
         self.trainer.callbacks = self._reorder_callbacks(self.trainer.callbacks)
@@ -119,11 +114,6 @@ def configure_progress_bar(self, refresh_rate=None, process_position=0):
 
         return progress_bar_callback
 
-    def _configure_gpu_memory_callback(self, log_gpu_memory: Optional[str]) -> None:
-        if log_gpu_memory and not any([c for c in self.trainer.callbacks if isinstance(c, GPUStatsMonitor)]):
-            gpu_stats_monitor = GPUStatsMonitor()
-            self.trainer.callbacks.append(gpu_stats_monitor)
-
     def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, Dict[str, int]]] = None) -> None:
         if max_time is None:
             return
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 104eae90f9664..9dbe405094d79 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -20,20 +20,29 @@
 from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection, TensorBoardLogger
 from pytorch_lightning.trainer.connectors.logger_connector.result import _METRIC, MetricSource
 from pytorch_lightning.trainer.states import RunningStage, TrainerFn
+from pytorch_lightning.utilities import DeviceType, memory
 from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device
 from pytorch_lightning.utilities.metrics import metrics_to_scalars
 from pytorch_lightning.utilities.types import _EVALUATE_OUTPUT
+from pytorch_lightning.utilities.warning import rank_zero_deprecation
 
 
 class LoggerConnector:
-    def __init__(self, trainer: "pl.Trainer") -> None:
+    def __init__(self, trainer: "pl.Trainer", log_gpu_memory: Optional[str] = None) -> None:
         self.trainer = trainer
+        if log_gpu_memory is not None:
+            rank_zero_deprecation(
+                "Setting `log_gpu_memory` with the trainer flag is deprecated and will be removed in v1.7.0! "
+                "Please monitor GPU stats with the `GPUStatsMonitor` callback directly instead."
+            )
+        self.log_gpu_memory = log_gpu_memory
         self.eval_loop_results = []
         self._val_log_step: int = 0
         self._test_log_step: int = 0
         self._progress_bar_metrics: Dict[str, float] = {}
         self._logged_metrics: Dict[str, _METRIC] = {}
         self._callback_metrics: Dict[str, _METRIC] = {}
+        self._gpus_metrics: Dict[str, str] = {}
         self._epoch_end_reached = False
         self._current_fx: Optional[str] = None
         self._batch_idx: Optional[int] = None
@@ -208,6 +217,8 @@ def update_train_step_metrics(self) -> None:
         if self.trainer.fit_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization:
             return
 
+        self._log_gpus_metrics()
+
         # when metrics should be logged
         assert not self._epoch_end_reached
         if self.should_update_logs or self.trainer.fast_dev_run:
@@ -221,6 +232,17 @@ def update_train_epoch_metrics(self) -> None:
         # reset result collection for next epoch
         self.trainer._results.reset(metrics=True)
 
+    def _log_gpus_metrics(self):
+        for key, mem in self.gpus_metrics.items():
+            if self.log_gpu_memory == "min_max":
+                self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True)
+            else:
+                gpu_id = int(key.split("/")[0].split(":")[1])
+                if gpu_id in self.trainer.accelerator_connector.parallel_device_ids:
+                    self.trainer.lightning_module.log(
+                        key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False
+                    )
+
     """
     Utilities and properties
     """
@@ -276,6 +298,13 @@ def metrics(self) -> Dict[MetricSource, Dict[str, _METRIC]]:
         on_step = not self._epoch_end_reached
         return self.trainer._results.metrics(on_step)
 
+    @property
+    def gpus_metrics(self) -> Dict[str, str]:
+        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
+            mem_map = memory.get_memory_profile(self.log_gpu_memory)
+            self._gpus_metrics.update(mem_map)
+        return self._gpus_metrics
+
     @property
     def callback_metrics(self) -> Dict[str, _METRIC]:
         if self.trainer._results:
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index d07e61360f574..c1a3f0bcd0a32 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -236,6 +236,10 @@ def __init__(
 
             log_gpu_memory: None, 'min_max', 'all'. Might slow performance
 
+                .. deprecated:: v1.5
+                    Deprecated in v1.5.0 and will be removed in v1.7.0
+                    Please use the ``GPUStatsMonitor`` callback directly instead.
+
             log_every_n_steps: How often to log within steps (defaults to every 50 steps).
 
             prepare_data_per_node: If True, each LOCAL_RANK=0 will call prepare data.
@@ -365,7 +369,7 @@ def __init__(
             amp_level,
             plugins,
         )
-        self.logger_connector = LoggerConnector(self)
+        self.logger_connector = LoggerConnector(self, log_gpu_memory)
         self.model_connector = ModelConnector(self)
         self.callback_connector = CallbackConnector(self)
         self.debugging_connector = DebuggingConnector(self)
@@ -414,7 +418,6 @@ def __init__(
             default_root_dir,
             weights_save_path,
             stochastic_weight_avg,
-            log_gpu_memory,
             max_time,
         )
 
diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py
index 7581bf2b0c142..5fd26602f4b25 100644
--- a/tests/deprecated_api/test_remove_1-7.py
+++ b/tests/deprecated_api/test_remove_1-7.py
@@ -87,3 +87,10 @@ def test_v1_7_0_trainer_prepare_data_per_node(tmpdir):
         match="Setting `prepare_data_per_node` with the trainer flag is deprecated and will be removed in v1.7.0!"
     ):
         _ = Trainer(prepare_data_per_node=False)
+
+
+def test_v1_7_0_trainer_log_gpu_memory(tmpdir):
+    with pytest.deprecated_call(
+        match="Setting `log_gpu_memory` with the trainer flag is deprecated and will be removed in v1.7.0!"
+    ):
+        _ = Trainer(log_gpu_memory="min_max")

From 0bba67bb4e49e48035e5c4392ce482f26e63daad Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 25 Aug 2021 21:53:14 -0700
Subject: [PATCH 3/3] Update logger_connector.py

---
 .../trainer/connectors/logger_connector/logger_connector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 9dbe405094d79..1392c07282942 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -24,7 +24,7 @@
 from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device
 from pytorch_lightning.utilities.metrics import metrics_to_scalars
 from pytorch_lightning.utilities.types import _EVALUATE_OUTPUT
-from pytorch_lightning.utilities.warning import rank_zero_deprecation
+from pytorch_lightning.utilities.warnings import rank_zero_deprecation
 
 
 class LoggerConnector:
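
Migration sketch (illustrative, not part of the patch series): once the `log_gpu_memory` Trainer flag is
deprecated, user code is expected to attach the `GPUStatsMonitor` callback itself, as the deprecation message
above suggests. The constructor arguments shown below are assumptions based on the callback's API of that era
and are optional.

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import GPUStatsMonitor

    # Before (now emits a deprecation warning):
    #   trainer = Trainer(gpus=1, log_gpu_memory="min_max")

    # After: pass the callback explicitly; it logs GPU stats through the standard logging hooks.
    gpu_stats = GPUStatsMonitor(memory_utilization=True, gpu_utilization=True)
    trainer = Trainer(gpus=1, callbacks=[gpu_stats])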