From 819f2135c91b626202f973f3268cc0326e662281 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 18:02:26 +0000 Subject: [PATCH 01/13] move value to cpu to save memory --- pytorch_lightning/core/step_result.py | 4 ++++ pytorch_lightning/utilities/memory.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 0eca72095e0e0..4e9f0b6d53291 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -136,6 +136,10 @@ def log( if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) + # no need to keep on gpu + if isinstance(value, torch.Tensor) and value.is_cuda: + value = value.cpu() + if 'meta' not in self: self.__setitem__('meta', {}) diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 1d3b8d27807f0..744c35b2bfc08 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -35,7 +35,8 @@ def recursive_detach(in_dict: dict) -> dict: if isinstance(v, dict): out_dict.update({k: recursive_detach(v)}) elif callable(getattr(v, 'detach', None)): - out_dict.update({k: v.detach()}) + # detach and move to cpu + out_dict.update({k: v..detach().cpu()}) else: out_dict.update({k: v}) return out_dict From 2386c0ff71de3238a16e0fa36a9483d32f916ca2 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 18:38:46 +0000 Subject: [PATCH 02/13] update --- pytorch_lightning/utilities/memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 744c35b2bfc08..143625f595a2c 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -36,7 +36,7 @@ def recursive_detach(in_dict: dict) -> dict: out_dict.update({k: recursive_detach(v)}) elif callable(getattr(v, 'detach', None)): # detach and move to cpu - out_dict.update({k: v..detach().cpu()}) + out_dict.update({k: v.detach().cpu()}) else: out_dict.update({k: v}) return out_dict From 477cd36a167b8946becbe304f25931f7b692ca00 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 18:42:15 +0000 Subject: [PATCH 03/13] move to cpu --- pytorch_lightning/core/step_result.py | 5 +++++ .../connectors/logger_connector/epoch_result_store.py | 3 +++ pytorch_lightning/trainer/training_loop.py | 1 + 3 files changed, 9 insertions(+) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 4e9f0b6d53291..ae47d35035a58 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -399,6 +399,11 @@ def detach(self): if isinstance(v, torch.Tensor): self.__setitem__(k, v.detach()) + def cpu(self): + for k, v in self.items(): + if isinstance(v, torch.Tensor): + self.__setitem__(k, v.cpu()) + def __repr__(self): self_copy = self.copy() diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 2980b037c95f7..0798260189633 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -392,6 +392,9 @@ def cache_result(self) -> None: # attach capture batch_size Result.attach_batch_size(self._batch_size, hook_result) + hook_result.detach() + 
hook_result.cpu() + self._internals[fx_name].append( hook_result, dataloader_idx=dataloader_idx, diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 2f66f5b1a600e..d50169f8826af 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -426,6 +426,7 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch): # track metrics without grads for epoch reduction training_step_output_for_epoch_end = copy(result) training_step_output_for_epoch_end.detach() + training_step_output_for_epoch_end.cpu() # what flows back into the system training_step_output = result From abbd672e21b28691057a8f79f35d5572757a2bbf Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 19:13:40 +0000 Subject: [PATCH 04/13] try something --- pytorch_lightning/utilities/memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 143625f595a2c..40a8e9b8c5c5e 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -36,7 +36,7 @@ def recursive_detach(in_dict: dict) -> dict: out_dict.update({k: recursive_detach(v)}) elif callable(getattr(v, 'detach', None)): # detach and move to cpu - out_dict.update({k: v.detach().cpu()}) + out_dict.update({k: v.detach()}) else: out_dict.update({k: v}) return out_dict From fc44142bfe34b3c0fa3cfb1764efb29a4d0f0a72 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 19:48:56 +0000 Subject: [PATCH 05/13] update --- pytorch_lightning/utilities/memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 40a8e9b8c5c5e..9b9408212c652 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -35,7 +35,7 @@ def recursive_detach(in_dict: dict) -> dict: if isinstance(v, dict): out_dict.update({k: recursive_detach(v)}) elif callable(getattr(v, 'detach', None)): - # detach and move to cpu + # detach out_dict.update({k: v.detach()}) else: out_dict.update({k: v}) From 858ea607ae37bb3c4d98ada7e7f672155079cc73 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 09:33:02 +0000 Subject: [PATCH 06/13] update --- pytorch_lightning/trainer/trainer.py | 19 +++++++++++++++---- pytorch_lightning/utilities/memory.py | 8 +++++--- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 2d4e2c0d9e4bd..46e8f2f13002d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -60,6 +60,7 @@ from pytorch_lightning.plugins.plugin_connector import PluginConnector from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator +from pytorch_lightning.utilities.memory import recursive_detach # warnings to ignore in trainer warnings.filterwarnings( @@ -603,12 +604,11 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): # log step metrics step_metrics = self.evaluation_loop.log_evaluation_step_metrics(batch, batch_idx) - if step_metrics is not None: - dl_step_metrics.append(step_metrics) + # track epoch level outputs + self.track_output_for_epoch_end(dl_step_metrics, step_metrics) # track epoch level outputs - if output is not None: - dl_outputs.append(output) + 
self.track_output_for_epoch_end(dl_outputs, output) self.evaluation_loop.outputs.append(dl_outputs) self.evaluation_loop.step_metrics.append(dl_step_metrics) @@ -634,6 +634,17 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): return eval_loop_results, deprecated_eval_results + def track_output_for_epoch_end(self, outputs, output): + if output is not None: + if isinstance(output, Result): + output.detach() + output.cpu() + elif isinstance(output, dict): + output = recursive_detach(output, to_cpu=True) + elif isinstance(output, torch.Tensor) and output.is_cuda: + output = output.cpu() + outputs.append(output) + def run_test(self): # only load test dataloader for testing # self.reset_test_dataloader(ref_model) diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 9b9408212c652..a88f9ca5538ab 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -17,7 +17,7 @@ import torch -def recursive_detach(in_dict: dict) -> dict: +def recursive_detach(in_dict: dict, to_cpu=False) -> dict: """Detach all tensors in `in_dict`. May operate recursively if some of the values in `in_dict` are dictionaries @@ -26,6 +26,7 @@ def recursive_detach(in_dict: dict) -> dict: Args: in_dict: + to_cpu: Wheter to move tensor to cpu Return: out_dict: @@ -36,8 +37,9 @@ def recursive_detach(in_dict: dict) -> dict: out_dict.update({k: recursive_detach(v)}) elif callable(getattr(v, 'detach', None)): # detach - out_dict.update({k: v.detach()}) - else: + v = v.detach() + if to_cpu: + v = v.cpu() out_dict.update({k: v}) return out_dict From 30afb9383858d73acf6f92018701e7f534bb1357 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 09:48:54 +0000 Subject: [PATCH 07/13] add back out_dict.update({k: v}) --- pytorch_lightning/utilities/memory.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index a88f9ca5538ab..7331dc565bc31 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -41,6 +41,8 @@ def recursive_detach(in_dict: dict, to_cpu=False) -> dict: if to_cpu: v = v.cpu() out_dict.update({k: v}) + else: + out_dict.update({k: v}) return out_dict From d9caa5d8cf48dc7db3be0b69cadf7c7e82e058e4 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 10:18:05 +0000 Subject: [PATCH 08/13] add move_metrics_to_cpu --- pytorch_lightning/core/step_result.py | 4 ---- .../logger_connector/epoch_result_store.py | 3 ++- .../logger_connector/logger_connector.py | 4 +++- pytorch_lightning/trainer/trainer.py | 19 +++++++++++++++---- pytorch_lightning/trainer/training_loop.py | 3 ++- 5 files changed, 22 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index ae47d35035a58..da6653bea06a2 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -136,10 +136,6 @@ def log( if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) - # no need to keep on gpu - if isinstance(value, torch.Tensor) and value.is_cuda: - value = value.cpu() - if 'meta' not in self: self.__setitem__('meta', {}) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 0798260189633..9f8d029d9bef4 100644 --- 
a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -393,7 +393,8 @@ def cache_result(self) -> None: Result.attach_batch_size(self._batch_size, hook_result) hook_result.detach() - hook_result.cpu() + if self.trainer.move_metrics_to_cpu: + hook_result.cpu() self._internals[fx_name].append( hook_result, diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 946064660f818..79b8cc69e641a 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -93,7 +93,7 @@ def cache_logged_metrics(self) -> Union[EpochResultStore, None]: if self._current_stage is not None: self._cached_results[self._current_stage].cache_result() - def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps): + def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps, move_metrics_to_cpu): # logging self.configure_logger(logger) # todo: IDE is complaining, these shall be initialized in the Trainer init at leas as placeholders @@ -101,6 +101,8 @@ def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps): self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps self.trainer.log_every_n_steps = log_every_n_steps + self.trainer.move_metrics_to_cpu = move_metrics_to_cpu + @property def should_flush_logs(self): should_flush = (self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0 diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 46e8f2f13002d..f2db41d29e3e3 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -136,6 +136,7 @@ def __init__( amp_level: str = 'O2', distributed_backend: Optional[str] = None, automatic_optimization: bool = True, + move_metrics_to_cpu: bool = False, ): r""" Customize every aspect of training via flags @@ -273,6 +274,9 @@ def __init__( stored in a different place than the logs written in `default_root_dir`. Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/' Defaults to `default_root_dir`. + + move_metrics_to_cpu: Whether to force internal logged metrics to be moved to cpu. + This can save some gpu memory, but can make training slower. Use with attention. 
""" super().__init__() @@ -364,7 +368,12 @@ def __init__( self.profile_connector.on_trainer_init(profiler) # init logger flags - self.logger_connector.on_trainer_init(logger, flush_logs_every_n_steps, log_every_n_steps) + self.logger_connector.on_trainer_init( + logger, + flush_logs_every_n_steps, + log_every_n_steps, + move_metrics_to_cpu + ) # init debugging flags self.debugging_connector.on_init_start( @@ -636,12 +645,14 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): def track_output_for_epoch_end(self, outputs, output): if output is not None: + move_metrics_to_cpu = self.trainer.move_metrics_to_cpu if isinstance(output, Result): output.detach() - output.cpu() + if move_metrics_to_cpu: + output.cpu() elif isinstance(output, dict): - output = recursive_detach(output, to_cpu=True) - elif isinstance(output, torch.Tensor) and output.is_cuda: + output = recursive_detach(output, to_cpu=move_metrics_to_cpu) + elif isinstance(output, torch.Tensor) and output.is_cuda and move_metrics_to_cpu: output = output.cpu() outputs.append(output) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index d50169f8826af..e3bc81f4531a2 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -426,7 +426,8 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch): # track metrics without grads for epoch reduction training_step_output_for_epoch_end = copy(result) training_step_output_for_epoch_end.detach() - training_step_output_for_epoch_end.cpu() + if self.trainer.move_metrics_to_cpu: + training_step_output_for_epoch_end.cpu() # what flows back into the system training_step_output = result From a57dcfe2ebdfd437bca91fb34252f2f39327f446 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 10:35:43 +0000 Subject: [PATCH 09/13] update --- pytorch_lightning/trainer/trainer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f2db41d29e3e3..1436be388286c 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -645,14 +645,13 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): def track_output_for_epoch_end(self, outputs, output): if output is not None: - move_metrics_to_cpu = self.trainer.move_metrics_to_cpu if isinstance(output, Result): output.detach() - if move_metrics_to_cpu: + if self.move_metrics_to_cpu: output.cpu() elif isinstance(output, dict): - output = recursive_detach(output, to_cpu=move_metrics_to_cpu) - elif isinstance(output, torch.Tensor) and output.is_cuda and move_metrics_to_cpu: + output = recursive_detach(output, to_cpu=self.move_metrics_to_cpu) + elif isinstance(output, torch.Tensor) and output.is_cuda and self.move_metrics_to_cpu: output = output.cpu() outputs.append(output) From be4e0f333380aeca6f266d6a39e101779dceba69 Mon Sep 17 00:00:00 2001 From: chaton Date: Tue, 10 Nov 2020 18:34:40 +0000 Subject: [PATCH 10/13] Update pytorch_lightning/utilities/memory.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pytorch_lightning/utilities/memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 7331dc565bc31..16c0ede1e5413 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -17,7 +17,7 @@ import 
torch -def recursive_detach(in_dict: dict, to_cpu=False) -> dict: +def recursive_detach(in_dict: dict, to_cpu: bool = False) -> dict: """Detach all tensors in `in_dict`. May operate recursively if some of the values in `in_dict` are dictionaries From d41273bc989a86de49e6ce8d7ad9c3eb3bd43a6b Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 18:35:47 +0000 Subject: [PATCH 11/13] resolve comments --- pytorch_lightning/trainer/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 1436be388286c..4ef83dc7de544 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -614,10 +614,10 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): step_metrics = self.evaluation_loop.log_evaluation_step_metrics(batch, batch_idx) # track epoch level outputs - self.track_output_for_epoch_end(dl_step_metrics, step_metrics) + dl_step_metrics = self.track_output_for_epoch_end(dl_step_metrics, step_metrics) # track epoch level outputs - self.track_output_for_epoch_end(dl_outputs, output) + dl_outputs = self.track_output_for_epoch_end(dl_outputs, output) self.evaluation_loop.outputs.append(dl_outputs) self.evaluation_loop.step_metrics.append(dl_step_metrics) @@ -654,6 +654,7 @@ def track_output_for_epoch_end(self, outputs, output): elif isinstance(output, torch.Tensor) and output.is_cuda and self.move_metrics_to_cpu: output = output.cpu() outputs.append(output) + return outputs def run_test(self): # only load test dataloader for testing From 873e9c088d12b5103a5c4841b45f9e3da7945e43 Mon Sep 17 00:00:00 2001 From: chaton Date: Tue, 10 Nov 2020 19:38:57 +0000 Subject: [PATCH 12/13] Update pytorch_lightning/core/step_result.py Co-authored-by: Jirka Borovec --- pytorch_lightning/core/step_result.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index da6653bea06a2..8f8a517d544f0 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -396,6 +396,7 @@ def detach(self): self.__setitem__(k, v.detach()) def cpu(self): + """Move all self attributes to CPU.""" for k, v in self.items(): if isinstance(v, torch.Tensor): self.__setitem__(k, v.cpu()) From 5e80eb9ddfb2677cd550dc2899ef94d842334c1b Mon Sep 17 00:00:00 2001 From: chaton Date: Tue, 10 Nov 2020 19:39:09 +0000 Subject: [PATCH 13/13] Update pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py Co-authored-by: Jirka Borovec --- .../trainer/connectors/logger_connector/logger_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 79b8cc69e641a..6a6a3229b8061 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -93,7 +93,7 @@ def cache_logged_metrics(self) -> Union[EpochResultStore, None]: if self._current_stage is not None: self._cached_results[self._current_stage].cache_result() - def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps, move_metrics_to_cpu): + def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool): # logging self.configure_logger(logger) # todo: IDE is complaining, 
these shall be initialized in the Trainer init at leas as placeholders
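Taken together, the series adds an opt-in Trainer flag, move_metrics_to_cpu (default False, threaded through LoggerConnector.on_trainer_init in PATCH 08), plus a to_cpu keyword on recursive_detach in pytorch_lightning/utilities/memory.py. The standalone sketch below illustrates the detach-and-offload pattern the series converges on as of PATCH 10; it is illustrative rather than a verbatim copy of the library helper (unlike the patched code it also forwards to_cpu on the nested recursive call), and the demo dictionary at the bottom is invented for the example.

import torch


def recursive_detach(in_dict: dict, to_cpu: bool = False) -> dict:
    """Detach every tensor in ``in_dict``, optionally moving it to CPU.

    Recurses into nested dictionaries; values without a callable ``detach``
    attribute are passed through unchanged.
    """
    out_dict = {}
    for k, v in in_dict.items():
        if isinstance(v, dict):
            # recurse into sub-dicts, forwarding the to_cpu choice
            out_dict[k] = recursive_detach(v, to_cpu=to_cpu)
        elif callable(getattr(v, 'detach', None)):
            # cut the autograd graph, then optionally offload to host memory
            v = v.detach()
            if to_cpu:
                v = v.cpu()
            out_dict[k] = v
        else:
            out_dict[k] = v
    return out_dict


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    metrics = {
        "loss": torch.ones(1, device=device, requires_grad=True),
        "meta": {"step": torch.tensor(3, device=device)},
    }
    detached = recursive_detach(metrics, to_cpu=True)
    assert not detached["loss"].requires_grad
    assert detached["loss"].device.type == "cpu"
    assert detached["meta"]["step"].device.type == "cpu"

In user code the same behaviour is enabled with Trainer(move_metrics_to_cpu=True); as the docstring added in PATCH 08 notes, this trades some speed for lower GPU memory by moving logged metrics, cached hook results and epoch-end outputs off the GPU.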