From 8bf41f1814af1a0f6c23185e4dd6f356a139b0da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Mar 2021 00:18:38 +0100 Subject: [PATCH 01/13] Improve DummyLogger (#6398) * fix dummy logger * docs * update docs * add changelog * add none return annotation * return empty string for name, version --- CHANGELOG.md | 11 ++++++-- pytorch_lightning/loggers/base.py | 34 +++++++++++++----------- tests/loggers/test_base.py | 9 +++++++ tests/trainer/flags/test_fast_dev_run.py | 1 + 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d192f814c4081..4d2f403739b47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.2.6] - 2021-03-30 + +### Changed + +- + +### Fixed + +- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) ## [1.2.5] - 2021-03-23 @@ -13,7 +22,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/PyTorchLightning/pytorch-lightning/pull/6576)) - Refactored setup for typing friendly ([#6590](https://github.com/PyTorchLightning/pytorch-lightning/pull/6590)) - ### Fixed - Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587)) @@ -36,7 +44,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed broadcast to use PyTorch `broadcast_object_list` and add `reduce_decision` ([#6410](https://github.com/PyTorchLightning/pytorch-lightning/pull/6410)) - Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/PyTorchLightning/pytorch-lightning/pull/6380)) - Fixed DeepSpeed additional memory use on rank 0 when default device not set early enough ([#6460](https://github.com/PyTorchLightning/pytorch-lightning/pull/6460)) -- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) - Fixed an issue with `Tuner.scale_batch_size` not finding the batch size attribute in the datamodule ([#5968](https://github.com/PyTorchLightning/pytorch-lightning/pull/5968)) - Fixed an exception in the layer summary when the model contains torch.jit scripted submodules ([#6511](https://github.com/PyTorchLightning/pytorch-lightning/pull/6511)) - Fixed when Train loop config was run during `Trainer.predict` ([#6541](https://github.com/PyTorchLightning/pytorch-lightning/pull/6541)) diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index 4fdb5e8c437bf..035a42338fe68 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -279,12 +279,14 @@ def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: return params @abstractmethod - def log_hyperparams(self, params: argparse.Namespace): + def log_hyperparams(self, params: argparse.Namespace, *args, **kwargs): """ Record hyperparameters. 
Args: params: :class:`~argparse.Namespace` containing the hyperparameters + args: Optional positional arguments, depends on the specific logger being used + kwargs: Optional keywoard arguments, depends on the specific logger being used """ def log_graph(self, model: LightningModule, input_array=None) -> None: @@ -418,41 +420,41 @@ def nop(*args, **kw): def __getattr__(self, _): return self.nop - def __getitem__(self, idx): - # enables self.logger[0].experiment.add_image - # and self.logger.experiment[0].add_image(...) + def __getitem__(self, idx) -> "DummyExperiment": + # enables self.logger.experiment[0].add_image(...) return self class DummyLogger(LightningLoggerBase): - """ Dummy logger for internal use. Is usefull if we want to disable users - logger for a feature, but still secure that users code can run """ + """ + Dummy logger for internal use. It is useful if we want to disable user's + logger for a feature, but still ensure that user code can run + """ def __init__(self): super().__init__() self._experiment = DummyExperiment() @property - def experiment(self): + def experiment(self) -> DummyExperiment: return self._experiment - @rank_zero_only - def log_metrics(self, metrics, step): + def log_metrics(self, *args, **kwargs) -> None: pass - @rank_zero_only - def log_hyperparams(self, params): + def log_hyperparams(self, *args, **kwargs) -> None: pass @property - def name(self): - pass + def name(self) -> str: + return "" @property - def version(self): - pass + def version(self) -> str: + return "" - def __getitem__(self, idx): + def __getitem__(self, idx) -> "DummyLogger": + # enables self.logger[0].experiment.add_image(...) return self diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index c48fef5e04b49..cf3a0cb74b3f4 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -229,15 +229,24 @@ def log_metrics(self, metrics, step): def test_dummyexperiment_support_indexing(): + """ Test that the DummyExperiment can imitate indexing the experiment in a LoggerCollection. """ experiment = DummyExperiment() assert experiment[0] == experiment def test_dummylogger_support_indexing(): + """ Test that the DummyLogger can imitate indexing of a LoggerCollection. """ logger = DummyLogger() assert logger[0] == logger +def test_dummylogger_noop_method_calls(): + """ Test that the DummyLogger methods can be called with arbitrary arguments. 
""" + logger = DummyLogger() + logger.log_hyperparams("1", 2, three="three") + logger.log_metrics("1", 2, three="three") + + def test_np_sanitization(): class CustomParamsLogger(CustomLogger): diff --git a/tests/trainer/flags/test_fast_dev_run.py b/tests/trainer/flags/test_fast_dev_run.py index 221951e788284..bcfdd6247d550 100644 --- a/tests/trainer/flags/test_fast_dev_run.py +++ b/tests/trainer/flags/test_fast_dev_run.py @@ -71,6 +71,7 @@ def test_step(self, batch, batch_idx): checkpoint_callback = ModelCheckpoint() early_stopping_callback = EarlyStopping() trainer_config = dict( + default_root_dir=tmpdir, fast_dev_run=fast_dev_run, val_check_interval=2, logger=True, From 4e19a5b4f1307a83410fc72dd4dc3721ccd7a9a8 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 25 Mar 2021 18:50:49 +0530 Subject: [PATCH 02/13] Add on_epoch_start to run at the beginning of every loop irrespective of train/val/test (#6498) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update docs * add hook and update docs * update tests * chlog * Update CHANGELOG.md Co-authored-by: Adrian Wälchli * chlog Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 2 +- docs/source/common/lightning_module.rst | 91 +++++++++++++++++-- docs/source/extensions/callbacks.rst | 12 +++ docs/source/extensions/logging.rst | 2 +- pytorch_lightning/callbacks/base.py | 4 +- .../gradient_accumulation_scheduler.py | 2 +- pytorch_lightning/callbacks/progress.py | 6 +- pytorch_lightning/core/hooks.py | 4 +- pytorch_lightning/core/lightning.py | 11 ++- pytorch_lightning/trainer/callback_hook.py | 4 +- pytorch_lightning/trainer/evaluation_loop.py | 2 + pytorch_lightning/trainer/training_loop.py | 6 +- tests/callbacks/test_callbacks.py | 3 + tests/models/test_hooks.py | 3 + .../logging_/test_eval_loop_logging_1_0.py | 15 ++- 15 files changed, 135 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d2f403739b47..524e57ac48e03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- +- Changed the behavior of `on_epoch_start` to run at the beginning of validation & test epoch ([#6498](https://github.com/PyTorchLightning/pytorch-lightning/pull/6498)) ### Fixed diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index c02f23ac60d09..7f0df33a351e4 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -1039,6 +1039,7 @@ This is the pseudocode to describe how all the hooks are called during a call to teardown() def train_loop(): + on_epoch_start() on_train_epoch_start() train_outs = [] for train_batch in train_dataloader(): @@ -1062,12 +1063,15 @@ This is the pseudocode to describe how all the hooks are called during a call to val_loop() # end training epoch - logs = training_epoch_end(outs) + outs = training_epoch_end(outs) + on_train_epoch_end(outs) + on_epoch_end() def val_loop(): model.eval() torch.set_grad_enabled(False) + on_epoch_start() on_validation_epoch_start() val_outs = [] for val_batch in val_dataloader(): @@ -1081,6 +1085,7 @@ This is the pseudocode to describe how all the hooks are called during a call to validation_epoch_end(val_outs) on_validation_epoch_end() + on_epoch_end() # set up for train model.train() @@ -1108,12 +1113,12 @@ manual_backward on_after_backward ~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_after_backward +.. 
automethod:: pytorch_lightning.core.hooks.ModelHooks.on_after_backward :noindex: on_before_zero_grad ~~~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_before_zero_grad +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_before_zero_grad :noindex: on_fit_start @@ -1132,15 +1137,38 @@ on_fit_end on_load_checkpoint ~~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_load_checkpoint +.. automethod:: pytorch_lightning.core.hooks.CheckpointHooks.on_load_checkpoint :noindex: on_save_checkpoint ~~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.on_save_checkpoint +.. automethod:: pytorch_lightning.core.hooks.CheckpointHooks.on_save_checkpoint :noindex: +on_train_start +~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_train_start + :noindex: + +on_train_end +~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_train_end + :noindex: + +on_validation_start +~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_start + :noindex: + +on_validation_end +~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_end + :noindex: on_pretrain_routine_start ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1178,6 +1206,11 @@ on_test_epoch_end .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_epoch_end :noindex: +on_test_end +~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_end + :noindex: on_train_batch_start ~~~~~~~~~~~~~~~~~~~~ @@ -1191,6 +1224,18 @@ on_train_batch_end .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_train_batch_end :noindex: +on_epoch_start +~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_epoch_start + :noindex: + +on_epoch_end +~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_epoch_end + :noindex: + on_train_epoch_start ~~~~~~~~~~~~~~~~~~~~ @@ -1227,6 +1272,36 @@ on_validation_epoch_end .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_epoch_end :noindex: +on_post_move_to_device +~~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_post_move_to_device + :noindex: + +on_validation_model_eval +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_model_eval + :noindex: + +on_validation_model_train +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_validation_model_train + :noindex: + +on_test_model_eval +~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_model_eval + :noindex: + +on_test_model_train +~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_model_train + :noindex: + optimizer_step ~~~~~~~~~~~~~~ @@ -1266,19 +1341,19 @@ teardown train_dataloader ~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.train_dataloader +.. automethod:: pytorch_lightning.core.hooks.DataHooks.train_dataloader :noindex: val_dataloader ~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.val_dataloader +.. automethod:: pytorch_lightning.core.hooks.DataHooks.val_dataloader :noindex: test_dataloader ~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.lightning.LightningModule.test_dataloader +.. 
automethod:: pytorch_lightning.core.hooks.DataHooks.test_dataloader :noindex: transfer_batch_to_device diff --git a/docs/source/extensions/callbacks.rst b/docs/source/extensions/callbacks.rst index 63a221a06119f..73691c6dd76f5 100644 --- a/docs/source/extensions/callbacks.rst +++ b/docs/source/extensions/callbacks.rst @@ -349,3 +349,15 @@ on_load_checkpoint .. automethod:: pytorch_lightning.callbacks.Callback.on_load_checkpoint :noindex: + +on_after_backward +^^^^^^^^^^^^^^^^^ + +.. automethod:: pytorch_lightning.callbacks.Callback.on_after_backward + :noindex: + +on_before_zero_grad +^^^^^^^^^^^^^^^^^^^ + +.. automethod:: pytorch_lightning.callbacks.Callback.on_before_zero_grad + :noindex: diff --git a/docs/source/extensions/logging.rst b/docs/source/extensions/logging.rst index bfeed22fd4e66..1ac6e698ccbd3 100644 --- a/docs/source/extensions/logging.rst +++ b/docs/source/extensions/logging.rst @@ -90,7 +90,7 @@ The :func:`~~pytorch_lightning.core.lightning.LightningModule.log` method has a .. note:: - Setting ``on_epoch=True`` will cache all your logged values during the full training epoch and perform a - reduction `on_epoch_end`. We recommend using the :doc:`metrics <../extensions/metrics>` API when working with custom reduction. + reduction in ``on_train_epoch_end``. We recommend using the :doc:`metrics <../extensions/metrics>` API when working with custom reduction. - Setting both ``on_step=True`` and ``on_epoch=True`` will create two keys per metric you log with suffix ``_step`` and ``_epoch``, respectively. You can refer to these keys e.g. in the `monitor` diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index d53acf0f7030d..76e23a3118dcb 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -102,11 +102,11 @@ def on_test_epoch_end(self, trainer, pl_module: LightningModule) -> None: pass def on_epoch_start(self, trainer, pl_module: LightningModule) -> None: - """Called when the epoch begins.""" + """Called when either of train/val/test epoch begins.""" pass def on_epoch_end(self, trainer, pl_module: LightningModule) -> None: - """Called when the epoch ends.""" + """Called when either of train/val/test epoch ends.""" pass def on_batch_start(self, trainer, pl_module: LightningModule) -> None: diff --git a/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py b/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py index 0af7d61bf5dec..b1885087f4da0 100644 --- a/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py +++ b/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py @@ -74,7 +74,7 @@ def __init__(self, scheduling: Dict[int, int]): def going_to_accumulate_grad_batches(self): return any([v > 1 for v in self.scheduling.values()]) - def on_epoch_start(self, trainer, pl_module): + def on_train_epoch_start(self, trainer, pl_module): epoch = trainer.current_epoch for i in reversed(range(len(self.epochs))): if epoch >= self.epochs[i]: diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 587fee95e9cd0..46331e004c1c7 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -192,7 +192,7 @@ def on_init_end(self, trainer): def on_train_start(self, trainer, pl_module): self._train_batch_idx = trainer.batch_idx - def on_epoch_start(self, trainer, pl_module): + def on_train_epoch_start(self, trainer, pl_module): self._train_batch_idx = 0 def on_train_batch_end(self, trainer, pl_module, 
outputs, batch, batch_idx, dataloader_idx): @@ -383,8 +383,8 @@ def on_train_start(self, trainer, pl_module): super().on_train_start(trainer, pl_module) self.main_progress_bar = self.init_train_tqdm() - def on_epoch_start(self, trainer, pl_module): - super().on_epoch_start(trainer, pl_module) + def on_train_epoch_start(self, trainer, pl_module): + super().on_train_epoch_start(trainer, pl_module) total_train_batches = self.total_train_batches total_val_batches = self.total_val_batches if total_train_batches != float('inf'): diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 2e1ea31871e03..79295c7c81dc1 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -224,13 +224,13 @@ def on_predict_model_eval(self) -> None: def on_epoch_start(self) -> None: """ - Called in the training loop at the very beginning of the epoch. + Called when either of train/val/test epoch begins. """ # do something when the epoch starts def on_epoch_end(self) -> None: """ - Called in the training loop at the very end of the epoch. + Called when either of train/val/test epoch ends. """ # do something when the epoch ends diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d1a0a87c37f33..137f65baf71cb 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -706,10 +706,13 @@ def validation_step(self, *args, **kwargs): .. code-block:: python # pseudocode of order - out = validation_step() - if defined('validation_step_end'): - out = validation_step_end(out) - out = validation_epoch_end(out) + val_outs = [] + for val_batch in val_data: + out = validation_step(val_batch) + if defined('validation_step_end'): + out = validation_step_end(out) + val_outs.append(out) + val_outs = validation_epoch_end(val_outs) .. 
code-block:: python diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index d33338055a5b1..bbd968fba061e 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -105,12 +105,12 @@ def on_test_epoch_end(self): callback.on_test_epoch_end(self, self.lightning_module) def on_epoch_start(self): - """Called when the epoch begins.""" + """Called when either of train/val/test epoch begins.""" for callback in self.callbacks: callback.on_epoch_start(self, self.lightning_module) def on_epoch_end(self): - """Called when the epoch ends.""" + """Called when either of train/val/test epoch ends.""" for callback in self.callbacks: callback.on_epoch_end(self, self.lightning_module) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index e1b3688ef36e6..c7eb7e0c90ad0 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -120,6 +120,8 @@ def setup(self, model, max_batches, dataloaders): self._predictions = [[] for _ in range(self.num_dataloaders)] def on_evaluation_epoch_start(self, *args, **kwargs): + self.trainer.call_hook('on_epoch_start', *args, **kwargs) + if self.trainer.testing: self.trainer.call_hook('on_test_epoch_start', *args, **kwargs) else: diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index c3afe14285d9f..36e1f6799437e 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -189,7 +189,7 @@ def on_train_epoch_start(self, epoch): self.trainer.train_dataloader.sampler.set_epoch(epoch) # changing gradient according accumulation_scheduler - self.trainer.accumulation_scheduler.on_epoch_start(self.trainer, self.trainer.lightning_module) + self.trainer.accumulation_scheduler.on_train_epoch_start(self.trainer, self.trainer.lightning_module) # stores accumulated grad fractions per batch self.accumulated_loss = TensorRunningAccum(window_length=self.trainer.accumulate_grad_batches) @@ -555,7 +555,7 @@ def run_training_epoch(self): self.increment_accumulated_grad_global_step() # epoch end hook - self.run_on_epoch_end_hook(epoch_output) + self.on_train_epoch_end(epoch_output) # log epoch metrics self.trainer.logger_connector.log_train_epoch_end_metrics( @@ -798,7 +798,7 @@ def update_train_loop_lr_schedulers(self, monitor_metrics=None): # update lr self.trainer.optimizer_connector.update_learning_rates(interval="step", monitor_metrics=monitor_metrics) - def run_on_epoch_end_hook(self, epoch_output): + def on_train_epoch_end(self, epoch_output): # inform logger the batch loop has finished self.trainer.logger_connector.on_train_epoch_end() diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 8d01841f3636c..4b3aab7638e3d 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -61,6 +61,7 @@ def test_trainer_callback_system(torch_save, tmpdir): call.on_pretrain_routine_end(trainer, model), call.on_sanity_check_start(trainer, model), call.on_validation_start(trainer, model), + call.on_epoch_start(trainer, model), call.on_validation_epoch_start(trainer, model), call.on_validation_batch_start(trainer, model, ANY, 0, 0), call.on_validation_batch_end(trainer, model, ANY, ANY, 0, 0), @@ -92,6 +93,7 @@ def test_trainer_callback_system(torch_save, tmpdir): call.on_train_epoch_end(trainer, model, ANY), call.on_epoch_end(trainer, model), 
call.on_validation_start(trainer, model), + call.on_epoch_start(trainer, model), call.on_validation_epoch_start(trainer, model), call.on_validation_batch_start(trainer, model, ANY, 0, 0), call.on_validation_batch_end(trainer, model, ANY, ANY, 0, 0), @@ -115,6 +117,7 @@ def test_trainer_callback_system(torch_save, tmpdir): call.on_before_accelerator_backend_setup(trainer, model), call.on_fit_start(trainer, model), call.on_test_start(trainer, model), + call.on_epoch_start(trainer, model), call.on_test_epoch_start(trainer, model), call.on_test_batch_start(trainer, model, ANY, 0, 0), call.on_test_batch_end(trainer, model, ANY, ANY, 0, 0), diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 62a252eaa3128..0da13ecbd8867 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -435,6 +435,7 @@ def teardown(self, stage: str): 'on_pretrain_routine_end', 'on_validation_model_eval', 'on_validation_start', + 'on_epoch_start', 'on_validation_epoch_start', 'on_validation_batch_start', 'on_validation_batch_end', @@ -457,6 +458,7 @@ def teardown(self, stage: str): 'on_epoch_end', 'on_validation_model_eval', 'on_validation_start', + 'on_epoch_start', 'on_validation_epoch_start', 'on_validation_batch_start', 'on_validation_batch_end', @@ -479,6 +481,7 @@ def teardown(self, stage: str): 'on_fit_start', 'on_test_model_eval', 'on_test_start', + 'on_epoch_start', 'on_test_epoch_start', 'on_test_batch_start', 'on_test_batch_end', diff --git a/tests/trainer/logging_/test_eval_loop_logging_1_0.py b/tests/trainer/logging_/test_eval_loop_logging_1_0.py index 765fab229f6cf..79bdecae46424 100644 --- a/tests/trainer/logging_/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_/test_eval_loop_logging_1_0.py @@ -496,9 +496,15 @@ def on_validation_start(self, trainer, pl_module): ) def on_epoch_start(self, trainer, pl_module): - self.make_logging( - pl_module, 'on_epoch_start', 2, on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices - ) + if trainer.validating: + self.make_logging( + pl_module, + 'on_epoch_start', + 2, + on_steps=self.choices, + on_epochs=self.choices, + prob_bars=self.choices + ) def on_validation_epoch_start(self, trainer, pl_module): self.make_logging( @@ -540,7 +546,7 @@ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, self.count += 1 def on_epoch_end(self, trainer, pl_module): - if not trainer.training: + if trainer.validating: self.make_logging( pl_module, 'on_epoch_end', 8, on_steps=[False], on_epochs=self.choices, prob_bars=self.choices ) @@ -578,7 +584,6 @@ def validation_step(self, batch, batch_idx): callbacks=[test_callback], ) trainer.fit(model) - trainer.test() assert test_callback.funcs_called_count["on_epoch_start"] == 1 # assert test_callback.funcs_called_count["on_batch_start"] == 1 From c02db78e591054ae40306a503661aa0b4305a9c1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Mar 2021 23:05:04 +0100 Subject: [PATCH 03/13] update coverage config (#6524) * update coverage config * parallel * parallel * Apply suggestions from code review * Apply suggestions from code review * paralel * paralel * paralel * combine * combine * . * .. * .. * .. * rev * cb * cb * drop * drop * . * .. * ... * ... * ... * . 
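For reference, a rough local equivalent of the coverage workflow these CI steps switch to. This is only a sketch: the pytest targets below mirror the ones used in the workflows but are illustrative, and the combine step is only relevant when parallel data files are produced.

    # run the test suite under coverage, as the updated CI jobs do
    coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v
    # accumulate a further run (e.g. a subset such as tests/trainer) into the same .coverage data file
    coverage run --source pytorch_lightning --append -m pytest tests/trainer -v
    # merge parallel data files (.coverage.*), if any were produced
    coverage combine
    # summarize locally and export the XML report uploaded to codecov
    coverage report -m
    coverage xml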
--- .github/workflows/ci_test-base.yml | 2 +- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/ci_test-full.yml | 2 +- azure-pipelines.yml | 2 +- requirements/test.txt | 4 ++-- setup.cfg | 5 ----- tests/special_tests.sh | 2 +- 7 files changed, 7 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index 0e84642e2f810..77363992718af 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -68,7 +68,7 @@ jobs: - name: Test Package [only] run: | # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003 - python -m pytest pytorch_lightning -v --cov=pytorch_lightning --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml + coverage run --source pytorch_lightning -m pytest pytorch_lightning -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v2 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 419580b71cd10..94cc73f0dc184 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -44,7 +44,7 @@ jobs: - name: Tests run: | # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003 - python -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml + coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml shell: bash -l {0} - name: Upload pytest test results diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index ff8fba06adee6..5fe142577b2c0 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -134,7 +134,7 @@ jobs: - name: Tests run: | # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 - coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml + coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Examples run: | diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1447176c7ea70..fecf8f1a776ce 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -78,7 +78,7 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | diff --git a/requirements/test.txt b/requirements/test.txt index 84ddb2f981b54..48ed4727ecd51 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,8 +1,8 @@ coverage>=5.2 codecov>=2.1 pytest>=6.0 -pytest-cov>2.10 -# pytest-xdist +#pytest-cov>2.10 +#pytest-xdist flake8>=3.6 check-manifest twine==3.2 diff --git a/setup.cfg b/setup.cfg index 4c478dccb709e..528c4ffa9e214 
100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,11 +47,6 @@ omit = pytorch_lightning/utilities/xla_device_utils.py pytorch_lightning/utilities/distributed.py pytorch_lightning/tuner/auto_gpu_select.py - # TODO: temporary, until accelerator refactor is finished - pytorch_lightning/accelerators/accelerator.py - pytorch_lightning/plugins/training_type/*.py - pytorch_lightning/plugins/precision/*.py - pytorch_lightning/plugins/base_plugin.py [flake8] diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 43658721e9226..b00936398489d 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -14,7 +14,7 @@ # Running special tests set -e export PL_RUNNING_SPECIAL_TESTS=1 -DEFAULTS="-m coverage run --source pytorch_lightning -a -m pytest --verbose --capture=no" +DEFAULTS="-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no" python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu From 4aa9be21f832a9959739d04580744bf19aea0d60 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Fri, 19 Mar 2021 14:38:49 -0700 Subject: [PATCH 04/13] Automatically set sync_batchnorm for training_type_plugin (#6536) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Roger Shieh Co-authored-by: Kaushik Bokka --- .../connectors/accelerator_connector.py | 5 +++ tests/plugins/test_custom_plugin.py | 41 +++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 tests/plugins/test_custom_plugin.py diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7d5e5fb9c358c..83eddfed6c4dc 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -425,6 +425,11 @@ def resolve_training_type_plugin(self, training_type: TrainingTypePlugin) -> Tra if hasattr(training_type, 'num_nodes') and getattr(training_type, 'num_nodes') is None: training_type.num_nodes = self.num_nodes + # Automatically set sync_batchnorm if None. + # Useful for custom plugins. + if hasattr(training_type, 'sync_batchnorm') and getattr(training_type, 'sync_batchnorm') is None: + training_type.sync_batchnorm = self.sync_batchnorm + return training_type def select_accelerator(self) -> Accelerator: diff --git a/tests/plugins/test_custom_plugin.py b/tests/plugins/test_custom_plugin.py new file mode 100644 index 0000000000000..872b49ef48635 --- /dev/null +++ b/tests/plugins/test_custom_plugin.py @@ -0,0 +1,41 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pytorch_lightning import Trainer +from pytorch_lightning.plugins import DDPPlugin +from tests.helpers import BoringModel +from tests.helpers.runif import RunIf + + +class CustomParallelPlugin(DDPPlugin): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + # Set to None so it will be overwritten by the accelerator connector. + self.sync_batchnorm = None + + +@RunIf(skip_windows=True) +def test_sync_batchnorm_set(tmpdir): + """Tests if sync_batchnorm is automatically set for custom plugin.""" + model = BoringModel() + plugin = CustomParallelPlugin() + assert plugin.sync_batchnorm is None + trainer = Trainer( + max_epochs=1, + plugins=[plugin], + default_root_dir=tmpdir, + sync_batchnorm=True, + ) + trainer.fit(model) + assert plugin.sync_batchnorm is True From 836d02aa3506e45a329f320ff41ab19b418965e7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 29 Mar 2021 23:01:04 +0200 Subject: [PATCH 05/13] Add RunIf --- tests/helpers/runif.py | 184 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/helpers/runif.py diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py new file mode 100644 index 0000000000000..5483e33d9cddb --- /dev/null +++ b/tests/helpers/runif.py @@ -0,0 +1,184 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import sys +from distutils.version import LooseVersion +from typing import Optional + +import pytest +import torch +from pkg_resources import get_distribution + +from pytorch_lightning.utilities import ( + _APEX_AVAILABLE, + _DEEPSPEED_AVAILABLE, + _FAIRSCALE_AVAILABLE, + _FAIRSCALE_PIPE_AVAILABLE, + _HOROVOD_AVAILABLE, + _NATIVE_AMP_AVAILABLE, + _RPC_AVAILABLE, + _TORCH_QUANTIZE_AVAILABLE, + _TPU_AVAILABLE, +) + +try: + from horovod.common.util import nccl_built + nccl_built() +except (ImportError, ModuleNotFoundError, AttributeError): + _HOROVOD_NCCL_AVAILABLE = False +finally: + _HOROVOD_NCCL_AVAILABLE = True + + +class RunIf: + """ + RunIf wrapper for simple marking specific cases, fully compatible with pytest.mark:: + + @RunIf(min_torch="0.0") + @pytest.mark.parametrize("arg1", [1, 2.0]) + def test_wrapper(arg1): + assert arg1 > 0.0 + """ + + def __new__( + self, + *args, + min_gpus: int = 0, + min_torch: Optional[str] = None, + max_torch: Optional[str] = None, + min_python: Optional[str] = None, + quantization: bool = False, + amp_apex: bool = False, + amp_native: bool = False, + tpu: bool = False, + horovod: bool = False, + horovod_nccl: bool = False, + skip_windows: bool = False, + special: bool = False, + rpc: bool = False, + fairscale: bool = False, + fairscale_pipe: bool = False, + deepspeed: bool = False, + **kwargs + ): + """ + Args: + args: native pytest.mark.skipif arguments + min_gpus: min number of gpus required to run test + min_torch: minimum pytorch version to run test + max_torch: maximum pytorch version to run test + min_python: minimum python version required to run test + quantization: if `torch.quantization` package is required to run test + amp_apex: NVIDIA Apex is installed + amp_native: if native PyTorch native AMP is supported + tpu: if TPU is available + horovod: if Horovod is installed + horovod_nccl: if Horovod is installed with NCCL support + skip_windows: skip test for Windows platform (typically fo some limited torch functionality) + special: running in special mode, outside pytest suit + rpc: requires Remote Procedure Call (RPC) + fairscale: if `fairscale` module is required to run the test + deepspeed: if `deepspeed` module is required to run the test + kwargs: native pytest.mark.skipif keyword arguments + """ + conditions = [] + reasons = [] + + if min_gpus: + conditions.append(torch.cuda.device_count() < min_gpus) + reasons.append(f"GPUs>={min_gpus}") + + if min_torch: + torch_version = LooseVersion(get_distribution("torch").version) + conditions.append(torch_version < LooseVersion(min_torch)) + reasons.append(f"torch>={min_torch}") + + if max_torch: + torch_version = LooseVersion(get_distribution("torch").version) + conditions.append(torch_version >= LooseVersion(max_torch)) + reasons.append(f"torch<{max_torch}") + + if min_python: + py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + conditions.append(py_version < LooseVersion(min_python)) + reasons.append(f"python>={min_python}") + + if quantization: + _miss_default = 'fbgemm' not in torch.backends.quantized.supported_engines + conditions.append(not _TORCH_QUANTIZE_AVAILABLE or _miss_default) + reasons.append("PyTorch quantization") + + if amp_native: + conditions.append(not _NATIVE_AMP_AVAILABLE) + reasons.append("native AMP") + + if amp_apex: + conditions.append(not _APEX_AVAILABLE) + reasons.append("NVIDIA Apex") + + if skip_windows: + conditions.append(sys.platform == "win32") + reasons.append("unimplemented on Windows") + + if tpu: + 
conditions.append(not _TPU_AVAILABLE) + reasons.append("TPU") + + if horovod: + conditions.append(not _HOROVOD_AVAILABLE) + reasons.append("Horovod") + + if horovod_nccl: + conditions.append(not _HOROVOD_NCCL_AVAILABLE) + reasons.append("Horovod with NCCL") + + if special: + env_flag = os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') + conditions.append(env_flag != '1') + reasons.append("Special execution") + + if rpc: + conditions.append(not _RPC_AVAILABLE) + reasons.append("RPC") + + if fairscale: + conditions.append(not _FAIRSCALE_AVAILABLE) + reasons.append("Fairscale") + + if fairscale_pipe: + conditions.append(not _FAIRSCALE_PIPE_AVAILABLE) + reasons.append("Fairscale Pipe") + + if deepspeed: + conditions.append(not _DEEPSPEED_AVAILABLE) + reasons.append("Deepspeed") + + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] + return pytest.mark.skipif( + *args, + condition=any(conditions), + reason=f"Requires: [{' + '.join(reasons)}]", + **kwargs, + ) + + +@RunIf(min_torch="99") +def test_always_skip(): + exit(1) + + +@pytest.mark.parametrize("arg1", [0.5, 1.0, 2.0]) +@RunIf(min_torch="0.0") +def test_wrapper(arg1: float): + assert arg1 > 0.0 From 4df060bc8d6dba95e0853daf858a1301faeb1f83 Mon Sep 17 00:00:00 2001 From: Shengyao Zhuang <46237844+ArvinZhuang@users.noreply.github.com> Date: Thu, 25 Mar 2021 19:37:58 +1000 Subject: [PATCH 06/13] Match the number of outputs of backward with forward for AllGatherGrad (#6625) --- pytorch_lightning/utilities/distributed.py | 2 +- tests/utilities/test_all_gather_grad.py | 23 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 9e47af26f53d5..3877f774b7cd8 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -187,7 +187,7 @@ def backward(ctx, *grad_output): torch.distributed.all_reduce(grad_output, op=torch.distributed.ReduceOp.SUM, async_op=False, group=ctx.group) - return grad_output[torch.distributed.get_rank()] + return grad_output[torch.distributed.get_rank()], None def all_gather_ddp_if_available( diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index f82cfc94bcce2..86b977cfff029 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -96,3 +96,26 @@ def training_epoch_end(self, outputs) -> None: trainer.fit(model) assert model.training_epoch_end_called + + +@RunIf(min_gpus=2, skip_windows=True, special=True) +def test_all_gather_sync_grads(tmpdir): + + class TestModel(BoringModel): + + training_step_called = False + + def training_step(self, batch, batch_idx): + self.training_step_called = True + tensor = torch.rand(2, 2, requires_grad=True, device=self.device) + gathered_tensor = self.all_gather(tensor, sync_grads=True) + assert gathered_tensor.shape == torch.Size([2, 2, 2]) + + loss = gathered_tensor.sum() + + return loss + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2) + trainer.fit(model) + assert model.training_step_called From 014a6b7d026c8e9a73db0428643cd3e63ca744b8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 22 Mar 2021 17:49:01 +0100 Subject: [PATCH 07/13] hotfix: mock examples (#6632) * mock examples * drop from GA --- azure-pipelines.yml | 2 ++ pl_examples/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 
fecf8f1a776ce..f944cad2ae09d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -103,4 +103,6 @@ jobs: # cd pl_examples/basic_examples # bash submit_ddp_job.sh # bash submit_ddp2_job.sh + env: + PL_USE_MOCKED_MNIST: "1" displayName: 'Examples' diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py index ffd60f9ed71af..150ac309ddceb 100644 --- a/pl_examples/__init__.py +++ b/pl_examples/__init__.py @@ -15,10 +15,10 @@ _DATASETS_PATH = os.path.join(_PACKAGE_ROOT, 'Datasets') _TORCHVISION_AVAILABLE = _module_available("torchvision") -_TORCHVISION_MNIST_AVAILABLE = True +_TORCHVISION_MNIST_AVAILABLE = not bool(os.environ.get("PL_USE_MOCKED_MNIST", False)) _DALI_AVAILABLE = _module_available("nvidia.dali") -if _TORCHVISION_AVAILABLE: +if _TORCHVISION_MNIST_AVAILABLE: try: from torchvision.datasets.mnist import MNIST MNIST(_DATASETS_PATH, download=True) From a1829bc4058b448577338ef80a2144e955b7d6b4 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 25 Mar 2021 16:07:37 +0530 Subject: [PATCH 08/13] Fix checkpoint callback & Trainer.test(_) issue for TPUs (#6654) * Fix checkpoint callback issue for TPUs * update changelog * add barrier * apply code suggestions * update trainer test * remove spaces * fix tpu tests * Apply suggestions from code review * add comment Co-authored-by: Jirka Borovec --- CHANGELOG.md | 2 ++ .../plugins/training_type/tpu_spawn.py | 12 ++++++------ pytorch_lightning/trainer/trainer.py | 6 ++++-- tests/models/test_tpu.py | 17 +++++++++++++++-- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 524e57ac48e03..6669050a56298 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) +- Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) +- Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) ## [1.2.5] - 2021-03-23 diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 1e951329b22cc..09603f9a22bc2 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -4,7 +4,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union import torch -import torch.distributed as torch_distrib import torch.multiprocessing as mp from pytorch_lightning.core.lightning import LightningModule @@ -96,13 +95,15 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None: # replace trainer save_checkpoint to use `xm.save` trainer.save_checkpoint = self.save_checkpoint - self.barrier() + self.barrier("pre-run-stage") results = trainer.train_or_test_or_predict() self.__save_end_of_training_weights(self.lightning_module) self.transfer_distrib_spawn_state_on_fit_end(results) + self.barrier("end-process") + def __save_end_of_training_weights(self, model: LightningModule) -> None: # when training ends on these platforms dump weights to get out of the main process if on_colab_kaggle(): @@ -113,12 +114,11 @@ def model_to_device(self) -> None: self._model.to(xm.xla_device()) def barrier(self, name: Optional[str] = None) -> None: - if torch_distrib.is_initialized(): - rendezvous(f"pl.Trainer.{name}") + rendezvous(name) def transfer_distrib_spawn_state_on_fit_end(self, results): - # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
- best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path + checkpoint_callback = self.lightning_module.trainer.checkpoint_callback + best_model_path = checkpoint_callback.best_model_path if checkpoint_callback else None if self.mp_queue is not None: rank_zero_warn("cleaning up ddp environment...") diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f378ee830d261..2d5e2504a319f 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -56,7 +56,7 @@ from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.tuner.tuning import Tuner -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities import DeviceType, rank_zero_warn from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.enums import LightningEnum @@ -942,7 +942,9 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): ) return {} - self.training_type_plugin.barrier() + # only one process running at this point for TPUs, as spawn isn't triggered yet + if not self._device_type == DeviceType.TPU: + self.training_type_plugin.barrier() ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt['state_dict']) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 0554d924e6e9f..fbda891f0065f 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -349,13 +349,14 @@ def test_reduce(rank): xmp.spawn(test_reduce, nprocs=8, start_method='fork') -@pytest.mark.parametrize("clip_val", [0, 10]) -@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") +@RunIf(tpu=True) @pl_multi_process_test +@pytest.mark.parametrize("clip_val", [10]) @mock.patch("pytorch_lightning.accelerators.tpu.xla_clip_grad_norm_") def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir): """ Ensure that clip gradients is only called if the value is greater than 0. + TODO: Fix (test fails with parametrize) """ tutils.reset_seed() trainer_options = dict( @@ -375,3 +376,15 @@ def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir): mock_clip_grad_norm.assert_called() else: mock_clip_grad_norm.assert_not_called() + + +@RunIf(tpu=True) +@pl_multi_process_test +def test_if_test_works_with_checkpoint_false(tmpdir): + """Ensure that model trains properly when `checkpoint_callback` is set to False.""" + + # Train a model on TPU + model = BoringModel() + trainer = Trainer(max_epochs=1, tpu_cores=8, default_root_dir=tmpdir, fast_dev_run=True, checkpoint_callback=False) + trainer.fit(model) + assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" From d229429a9ea0fe4c763487cf08015163966387a7 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Tue, 23 Mar 2021 22:07:48 +0000 Subject: [PATCH 09/13] Fix disabled grads after call to predict (#6657) --- CHANGELOG.md | 1 + pytorch_lightning/trainer/trainer.py | 4 ++++ tests/trainer/test_trainer.py | 21 +++++++++++++++++---- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6669050a56298..003c321203a9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) - Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) - Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) +- Fixed a bug where gradients were disabled after calling `Trainer.predict` ([#6657](https://github.com/PyTorchLightning/pytorch-lightning/pull/6657)) ## [1.2.5] - 2021-03-23 diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 2d5e2504a319f..6af34b71f69c2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -834,6 +834,10 @@ def run_predict(self): self.predict_loop.predict(batch, batch_idx, dataloader_idx) results = self.predict_loop.on_predict_epoch_end() + + # re-enable grads + torch.set_grad_enabled(True) + return results def run_sanity_check(self, ref_model): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 6966edc3cbf70..fd2b48a3fa140 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1410,12 +1410,12 @@ def predict_dataloader(self): return self._dataloaders -def predict(tmpdir, accelerator, gpus, num_processes, plugins=None, datamodule=True): +def predict(tmpdir, accelerator, gpus, num_processes, model=None, plugins=None, datamodule=True): dataloaders = [torch.utils.data.DataLoader(RandomDataset(32, 2)), torch.utils.data.DataLoader(RandomDataset(32, 2))] - model = BoringModel() - datamodule = TestLightningDataModule(dataloaders) + model = model or BoringModel() + dm = TestLightningDataModule(dataloaders) trainer = Trainer( default_root_dir=tmpdir, @@ -1428,7 +1428,7 @@ def predict(tmpdir, accelerator, gpus, num_processes, plugins=None, datamodule=T plugins=plugins, ) if datamodule: - results = trainer.predict(model, datamodule=datamodule) + results = trainer.predict(model, datamodule=dm) else: results = trainer.predict(model, dataloaders=dataloaders) @@ -1439,6 +1439,19 @@ def predict(tmpdir, accelerator, gpus, num_processes, plugins=None, datamodule=T assert results[0][0].shape == torch.Size([1, 2]) +def test_trainer_predict_grad(tmpdir): + class CustomBoringModel(BoringModel): + + def predict_step(self, batch, batch_idx, dataloader_idx=None): + assert batch.expand_as(batch).grad_fn is None + return super().predict_step(batch, batch_idx, dataloader_idx) + + predict(tmpdir, None, None, 1, model=CustomBoringModel()) + + x = torch.zeros(1, requires_grad=True) + assert x.expand_as(x).grad_fn is not None + + @pytest.mark.parametrize('datamodule', [False, True]) def test_trainer_predict_cpu(tmpdir, datamodule): predict(tmpdir, None, None, 1, datamodule=datamodule) From 2b59a49eb0c35acfd5a8c9a05b5f7471efc1aad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Mar 2021 14:05:20 +0100 Subject: [PATCH 10/13] Do not add return dict items to callback_metrics (#6682) --- CHANGELOG.md | 4 + docs/source/ecosystem/asr_nlp_tts.rst | 10 +- docs/source/ecosystem/bolts.rst | 4 +- .../callbacks/model_checkpoint.py | 2 +- .../logger_connector/epoch_result_store.py | 1 - .../logger_connector/logger_connector.py | 67 ++----- pytorch_lightning/trainer/logging.py | 31 +-- pytorch_lightning/trainer/trainer.py | 9 - pytorch_lightning/trainer/training_loop.py | 3 +- tests/base/model_valid_epoch_ends.py | 5 +- 
tests/callbacks/test_early_stopping.py | 30 +-- tests/checkpointing/test_model_checkpoint.py | 10 +- tests/models/test_tpu.py | 1 + .../test_eval_loop_dict_return.py | 176 ------------------ .../test_trainer_steps_dict_return.py | 22 --- .../logging_/test_eval_loop_logging_1_0.py | 25 +-- .../trainer/logging_/test_logger_connector.py | 42 +++++ tests/utilities/test_all_gather_grad.py | 1 + 18 files changed, 101 insertions(+), 342 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 003c321203a9c..a2ea14b23d166 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Changed the behavior of `on_epoch_start` to run at the beginning of validation & test epoch ([#6498](https://github.com/PyTorchLightning/pytorch-lightning/pull/6498)) +### Removed + +- Removed legacy code to include `step` dictionary returns in `callback_metrics`. Use `self.log_dict` instead. ([#6682](https://github.com/PyTorchLightning/pytorch-lightning/pull/6682)) + ### Fixed - Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) diff --git a/docs/source/ecosystem/asr_nlp_tts.rst b/docs/source/ecosystem/asr_nlp_tts.rst index 49bed0a981a6e..af9a7084583f2 100644 --- a/docs/source/ecosystem/asr_nlp_tts.rst +++ b/docs/source/ecosystem/asr_nlp_tts.rst @@ -270,12 +270,12 @@ with PyTorch Lightning since every NeMo model is a Lightning Module. log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len ) wer_num, wer_denom = self._wer(predictions, transcript, transcript_len) - tensorboard_logs = { + self.log_dict({ 'train_loss': loss_value, 'training_batch_wer': wer_num / wer_denom, 'learning_rate': self._optimizer.param_groups[0]['lr'], - } - return {'loss': loss_value, 'log': tensorboard_logs} + }) + return loss_value Neural Types in NeMo ASR ------------------------ @@ -539,8 +539,8 @@ since every NeMo model is a Lightning Module. logits = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) loss = self.loss(logits=logits, labels=labels, loss_mask=loss_mask) - tensorboard_logs = {'train_loss': loss, 'lr': self._optimizer.param_groups[0]['lr']} - return {'loss': loss, 'log': tensorboard_logs} + self.log_dict({'train_loss': loss, 'lr': self._optimizer.param_groups[0]['lr']}) + return loss ... Neural Types in NeMo NLP diff --git a/docs/source/ecosystem/bolts.rst b/docs/source/ecosystem/bolts.rst index 9133176cab912..f3a4ab9c858be 100644 --- a/docs/source/ecosystem/bolts.rst +++ b/docs/source/ecosystem/bolts.rst @@ -68,8 +68,8 @@ you can trust the implementations and use them to bootstrap your research much f loss = self.criterion(logits.view(-1, logits.size(-1)), x.view(-1).long()) - logs = {"loss": loss} - return {"loss": loss, "log": logs} + self.log("loss", loss) + return loss ---------- diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 43f7a66dca313..5dc891795af93 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -490,7 +490,7 @@ def _validate_monitor_key(self, trainer): m = ( f"ModelCheckpoint(monitor='{self.monitor}') not found in the returned metrics:" f" {list(metrics.keys())}. " - f"HINT: Did you call self.log('{self.monitor}', tensor) in the LightningModule?" 
+ f"HINT: Did you call self.log('{self.monitor}', value) in the LightningModule?" ) raise MisconfigurationException(m) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index b40d87c2d9664..8979ce7798eaa 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -346,7 +346,6 @@ def update_logger_connector(self) -> Tuple[Dict, Dict]: # update callback_metrics logger_connector._callback_metrics.update(callback_metrics) - logger_connector._callback_metrics.pop("epoch", None) batch_pbar_metrics.pop("debug_epoch", None) return batch_pbar_metrics, batch_log_metrics diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 8ebec3238e276..b106244f06307 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -78,7 +78,7 @@ def progress_bar_metrics(self, progress_bar_metrics: Dict) -> None: @property def cached_results(self) -> Union[EpochResultStore, None]: - return self._cached_results.get(self.trainer._running_stage) # type: ignore + return self._cached_results.get(self.trainer._running_stage) def get_metrics(self, key: str) -> Dict: metrics_holder = getattr(self, f"_{key}", None) @@ -125,8 +125,6 @@ def cache_logged_metrics(self): def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool): # logging self.configure_logger(logger) - # todo: IDE is complaining, these shall be initialized in the Trainer init at leas as placeholders - # and assign here the desired value self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps self.trainer.log_every_n_steps = log_every_n_steps self.trainer.move_metrics_to_cpu = move_metrics_to_cpu @@ -189,9 +187,6 @@ def cache_training_step_metrics(self, opt_closure_result): batch_log_metrics = opt_closure_result.training_step_output.log_metrics logged_metrics_tmp.update(batch_log_metrics) - callback_metrics = opt_closure_result.training_step_output.callback_metrics - callback_metrics_tmp.update(callback_metrics) - batch_pbar_metrics = opt_closure_result.training_step_output.pbar_on_batch_end pbar_metrics_tmp.update(batch_pbar_metrics) @@ -214,9 +209,6 @@ def log_metrics(self, metrics, grad_norm_dic, step=None): metrics (dict): Metric values grad_norm_dic (dict): Gradient norms step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step` - log_train_step_metrics (bool): Used to track if `log_metrics` function is being called in during training - steps. In training steps, we will log metrics on step: `total_nb_idx` (for accumulated gradients) - and global_step for the rest. 
""" # add gpu memory if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory: @@ -350,27 +342,6 @@ def _track_callback_metrics(self, eval_results): if self.trainer.testing: self.trainer.logger_connector.evaluation_callback_metrics.update(flat) - def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics, callback_metrics): - # eval loop returns all metrics - dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics} - - # add metrics to prog bar - self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics) - - # log metrics - if len(log_metrics) > 0: - self.trainer.logger_connector.log_metrics(log_metrics, {}) - - # track metrics for callbacks (all prog bar, logged and callback metrics) - callback_metrics.update(log_metrics) - callback_metrics.update(prog_bar_metrics) - self.trainer.logger_connector.callback_metrics.update(callback_metrics) - if self.trainer.testing: - self.trainer.logger_connector.evaluation_callback_metrics.update(callback_metrics) - - if len(dataloader_result_metrics) > 0: - self.eval_loop_results.append(dataloader_result_metrics) - def __process_eval_epoch_end_results_and_log_legacy(self, eval_results): if self.trainer.running_sanity_check: return @@ -381,21 +352,21 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results): if not isinstance(eval_results, list): eval_results = [eval_results] - num_loaders: int = self.trainer.evaluation_loop.num_dataloaders - prog_bar_metrics, log_metrics, callback_metrics = {}, {}, {} - for result_idx, result in enumerate(eval_results): - _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.trainer.process_dict_result(result) + _, prog_bar_metrics, log_metrics, _ = self.trainer.process_dict_result(result) + + # eval loop returns all metrics + dataloader_result_metrics = {**prog_bar_metrics, **log_metrics} + + # add metrics to prog bar + self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics) - if num_loaders > 1: - self.__process_eval_epoch_end_results_and_log_legacy_update( - prog_bar_metrics, log_metrics, callback_metrics - ) + # log metrics + if len(log_metrics) > 0: + self.trainer.logger_connector.log_metrics(log_metrics, {}) - if num_loaders == 1: - self.__process_eval_epoch_end_results_and_log_legacy_update( - prog_bar_metrics, log_metrics, callback_metrics - ) + if len(dataloader_result_metrics) > 0: + self.eval_loop_results.append(dataloader_result_metrics) def on_train_epoch_end(self): # inform cached logger connector epoch finished @@ -448,10 +419,9 @@ def log_train_epoch_end_metrics( # TODO: deprecate 1.0 else: - out = self.__run_legacy_training_epoch_end( - num_optimizers, epoch_output, model, is_result_obj, epoch_callback_metrics + epoch_log_metrics, epoch_progress_bar_metrics = self.__run_legacy_training_epoch_end( + num_optimizers, epoch_output, model, is_result_obj ) - epoch_log_metrics, epoch_progress_bar_metrics, epoch_callback_metrics = out # it will perform reduction over epoch and return log metrics cached_epoch_log_metrics = self.cached_results.get_epoch_log_metrics() @@ -503,9 +473,7 @@ def training_epoch_end(self, model, epoch_output, num_optimizers): # capture logging self.trainer.logger_connector.cache_logged_metrics() - def __run_legacy_training_epoch_end( - self, num_optimizers, epoch_output, model, is_result_obj, epoch_callback_metrics - ): + def __run_legacy_training_epoch_end(self, num_optimizers, epoch_output, model, is_result_obj): epoch_log_metrics = {} 
epoch_progress_bar_metrics = {} @@ -536,7 +504,6 @@ def __run_legacy_training_epoch_end( _processed_outputs = self.trainer.process_dict_result(epoch_output) epoch_progress_bar_metrics = _processed_outputs[1] epoch_log_metrics = _processed_outputs[2] - epoch_callback_metrics = _processed_outputs[3] # -------------------------- # Structured Result (auto epoch end) @@ -544,7 +511,7 @@ def __run_legacy_training_epoch_end( elif is_result_obj: epoch_log_metrics, epoch_progress_bar_metrics = self.__auto_reduce_results_on_epoch_end(epoch_output) - return epoch_log_metrics, epoch_progress_bar_metrics, epoch_callback_metrics + return epoch_log_metrics, epoch_progress_bar_metrics def __auto_reduce_results_on_epoch_end(self, epoch_output): epoch_log_metrics = {} diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index 16060f863884c..3f97f4adcf1b0 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -21,6 +21,7 @@ from pytorch_lightning.loggers import LightningLoggerBase from pytorch_lightning.utilities import DeviceType, DistributedType from pytorch_lightning.utilities.distributed import rank_zero_warn +from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach @@ -42,8 +43,14 @@ class TrainerLoggingMixin(ABC): def metrics_to_scalars(self, metrics): new_metrics = {} + # TODO: this is duplicated in MetricsHolder. should be unified for k, v in metrics.items(): if isinstance(v, torch.Tensor): + if v.numel() != 1: + raise MisconfigurationException( + f"The metric `{k}` does not contain a single element" + f" thus it cannot be converted to float. Found `{v}`" + ) v = v.item() if isinstance(v, dict): @@ -81,23 +88,8 @@ def process_dict_result(self, output, train=False): if isinstance(output, torch.Tensor): progress_bar_metrics = {} log_metrics = {} - callback_metrics = {} hiddens = None - return output, progress_bar_metrics, log_metrics, callback_metrics, hiddens - - # --------------- - # EXTRACT CALLBACK KEYS - # --------------- - # all keys not progress_bar or log are candidates for callbacks - callback_metrics = {} - if isinstance(output, Mapping): - for k, v in output.items(): - if k not in ['progress_bar', 'log', 'hiddens']: - callback_metrics[k] = v - - if train and self._distrib_type in (DistributedType.DP, DistributedType.DDP2): - num_gpus = self.num_gpus - callback_metrics = self.reduce_distributed_output(callback_metrics, num_gpus) + return output, progress_bar_metrics, log_metrics, hiddens # --------------- # EXTRACT PROGRESS BAR KEYS @@ -159,17 +151,12 @@ def process_dict_result(self, output, train=False): # --------------- hiddens = output.get('hiddens', None) if isinstance(output, Mapping) else None - # use every metric passed in as a candidate for callback - callback_metrics.update(progress_bar_metrics) - callback_metrics.update(log_metrics) - # detach all metrics for callbacks to prevent memory leaks # no .item() because it will slow things down - callback_metrics = recursive_detach(callback_metrics) progress_bar_metrics = recursive_detach(progress_bar_metrics) log_metrics = recursive_detach(log_metrics) - return loss, progress_bar_metrics, log_metrics, callback_metrics, hiddens + return loss, progress_bar_metrics, log_metrics, hiddens def reduce_distributed_output(self, output, num_gpus): if num_gpus <= 1: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 6af34b71f69c2..82bb858ef6c53 
100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -859,15 +859,6 @@ def run_sanity_check(self, ref_model): # run eval step _, eval_results = self.run_evaluation(max_batches=self.num_sanity_val_batches) - # allow no returns from eval - if eval_results is not None and len(eval_results) > 0: - # when we get a list back, used only the last item - if isinstance(eval_results, list): - eval_results = eval_results[-1] - - _, _, _, callback_metrics, _ = self.process_dict_result(eval_results) - self.logger_connector.callback_metrics = callback_metrics - self.on_sanity_check_end() self.running_sanity_check = False diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 36e1f6799437e..17efecaf98a2f 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -360,8 +360,7 @@ def _process_training_step_output(self, training_step_output, split_batch): batch_loss=training_step_output[0], pbar_on_batch_end=training_step_output[1], log_metrics=training_step_output[2], - callback_metrics=training_step_output[3], - hiddens=training_step_output[4], + hiddens=training_step_output[3], ) # if the user decides to finally reduce things in epoch_end, save raw output without graphs if isinstance(training_step_output_for_epoch_end, torch.Tensor): diff --git a/tests/base/model_valid_epoch_ends.py b/tests/base/model_valid_epoch_ends.py index dd29d355a4a98..7b83670acacef 100644 --- a/tests/base/model_valid_epoch_ends.py +++ b/tests/base/model_valid_epoch_ends.py @@ -43,9 +43,8 @@ def _mean(res, key): val_loss_mean = val_loss_mean.item() val_acc_mean = val_acc_mean.item() - metrics_dict = {'early_stop_on': val_loss_mean, 'val_acc': val_acc_mean} - results = {'progress_bar': metrics_dict, 'log': metrics_dict} - return results + self.log('early_stop_on', val_loss_mean, prog_bar=True) + self.log('val_acc', val_acc_mean, prog_bar=True) def validation_epoch_end__multiple_dataloaders(self, outputs): """ diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 7062fe35bbcb7..643bfb90f2fda 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -127,7 +127,7 @@ class ModelOverrideValidationReturn(BoringModel): def validation_epoch_end(self, outputs): loss = self.validation_return_values[self.current_epoch] - return {"test_val_loss": loss} + self.log("test_val_loss", loss) model = ModelOverrideValidationReturn() early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True) @@ -217,7 +217,7 @@ class CurrentModel(BoringModel): def validation_epoch_end(self, outputs): losses = [8, 4, 2, 3, 4, 5, 8, 10] val_loss = losses[self.current_epoch] - self.log('abc', torch.tensor(val_loss)) + self.log('abc', val_loss) model = CurrentModel() @@ -231,28 +231,6 @@ def validation_epoch_end(self, outputs): assert trainer.current_epoch == 5, 'early_stopping failed' -def test_early_stopping_functionality_arbitrary_key(tmpdir): - """Tests whether early stopping works with a custom key and dictionary results on val step.""" - - class CurrentModel(BoringModel): - - def validation_epoch_end(self, outputs): - losses = [8, 4, 2, 3, 4, 5, 8, 10] - val_loss = losses[self.current_epoch] - return {'jiraffe': torch.tensor(val_loss)} - - model = CurrentModel() - - trainer = Trainer( - default_root_dir=tmpdir, - callbacks=[EarlyStopping(monitor='jiraffe')], - overfit_batches=0.20, - max_epochs=20, - ) - 
trainer.fit(model) - assert trainer.current_epoch >= 5, 'early_stopping failed' - - @pytest.mark.parametrize('step_freeze, min_steps, min_epochs', [(5, 1, 1), (5, 1, 3), (3, 15, 1)]) def test_min_steps_override_early_stopping_functionality(tmpdir, step_freeze, min_steps, min_epochs): """Excepted Behaviour: @@ -269,7 +247,7 @@ def test_min_steps_override_early_stopping_functionality(tmpdir, step_freeze, mi when `early_stopping` is being triggered, THEN the highest between `min_epochs * len(train_dataloader)` and `min_steps` would be reached. - Caviat: IF min_steps is divisible by len(train_dataloader), then it will do min_steps + len(train_dataloader) + Caveat: IF min_steps is divisible by len(train_dataloader), then it will do min_steps + len(train_dataloader) This test validate those expected behaviours """ @@ -306,7 +284,7 @@ def validation_epoch_end(self, outputs): self._count_decrease += 1 self._loss_value -= self._eps self._values.append(_mean) - return {"test_val_loss": _mean} + self.log('test_val_loss', _mean) model = Model(step_freeze) model.training_step_end = None diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 3b4ea00ecb0ba..d87beb9a78aac 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -49,7 +49,6 @@ def training_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): outs = torch.stack([x['x'] for x in outputs]).mean() - self.log('epoch', self.current_epoch) self.log('val_acc', outs) @@ -585,12 +584,7 @@ def test_model_checkpoint_topk_all(tmpdir): seed_everything(1000) epochs = 3 - class CustomModel(LogInTwoMethods): - - def validation_epoch_end(self, outputs): - return {'epoch': self.current_epoch} - - model = CustomModel() + model = BoringModel() checkpoint_callback = ModelCheckpoint( dirpath=tmpdir, filename="{epoch}", @@ -754,7 +748,7 @@ class ExtendedBoringModel(BoringModel): def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) - return {"val_loss": loss} + self.log("val_loss", loss) model = ExtendedBoringModel() model.validation_epoch_end = None diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index fbda891f0065f..2befc5bd7dbd2 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -29,6 +29,7 @@ from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel, RandomDataset +from tests.helpers.runif import RunIf from tests.helpers.utils import pl_multi_process_test if _TPU_AVAILABLE: diff --git a/tests/trainer/legacy_deprecate_flow_log/test_eval_loop_dict_return.py b/tests/trainer/legacy_deprecate_flow_log/test_eval_loop_dict_return.py index 87cab653de6aa..a616acf4d65cf 100644 --- a/tests/trainer/legacy_deprecate_flow_log/test_eval_loop_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log/test_eval_loop_dict_return.py @@ -122,49 +122,6 @@ def test_validation_step_arbitrary_dict_return(tmpdir): assert not model.validation_epoch_end_called -def test_validation_step_dict_return(tmpdir): - """ - Test that val step can return a dict with all the expected keys and they end up - in the correct place - """ - - model = DeterministicModel() - model.training_step = model.training_step__dict_return - model.validation_step = model.validation_step__dict_return - model.validation_step_end = None - model.validation_epoch_end = None - - trainer = 
Trainer( - default_root_dir=tmpdir, - weights_summary=None, - limit_train_batches=2, - limit_val_batches=2, - max_epochs=2, - ) - trainer.fit(model) - - # out are the results of the full loop - # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation() - assert len(callback_metrics) == 1 - assert len(callback_metrics[0]) == 5 - assert len(eval_results) == 2 - assert eval_results[0]['log']['log_acc1'] == 12 - assert eval_results[1]['log']['log_acc1'] == 13 - - for k in ['val_loss', 'log', 'progress_bar']: - assert k in eval_results[0] - assert k in eval_results[1] - - # ensure all the keys ended up as candidates for callbacks - assert len(trainer.logger_connector.callback_metrics) in [7, 8] - - # make sure correct steps were called - assert model.validation_step_called - assert not model.validation_step_end_called - assert not model.validation_epoch_end_called - - def test_val_step_step_end_no_return(tmpdir): """ Test that val step + val step end work (with no return in val step end) @@ -195,136 +152,3 @@ def test_val_step_step_end_no_return(tmpdir): assert model.validation_step_called assert model.validation_step_end_called assert not model.validation_epoch_end_called - - -def test_val_step_step_end(tmpdir): - """ - Test that val step + val step end work - """ - - model = DeterministicModel() - model.training_step = model.training_step__dict_return - model.validation_step = model.validation_step__dict_return - model.validation_step_end = model.validation_step_end - model.validation_epoch_end = None - - trainer = Trainer( - default_root_dir=tmpdir, - weights_summary=None, - limit_train_batches=2, - limit_val_batches=2, - max_epochs=2, - ) - trainer.fit(model) - - # out are the results of the full loop - # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation() - assert len(callback_metrics) == 1 - assert len(callback_metrics[0]) == 6 - - callback_metrics = callback_metrics[0] - assert callback_metrics['val_step_end'] == 1802 - assert len(eval_results) == 2 - assert eval_results[0]['log']['log_acc1'] == 12 - assert eval_results[1]['log']['log_acc1'] == 13 - - for k in ['val_loss', 'log', 'progress_bar']: - assert k in eval_results[0] - assert k in eval_results[1] - - # ensure all the keys ended up as candidates for callbacks - assert len(trainer.logger_connector.callback_metrics) in [8, 9] - - # make sure correct steps were called - assert model.validation_step_called - assert model.validation_step_end_called - assert not model.validation_epoch_end_called - - -def test_no_val_step_end(tmpdir): - """ - Test that val step + val epoch end - """ - - model = DeterministicModel() - model.training_step = model.training_step__dict_return - model.validation_step = model.validation_step__dict_return - model.validation_step_end = None - model.validation_epoch_end = model.validation_epoch_end - - trainer = Trainer( - default_root_dir=tmpdir, - weights_summary=None, - limit_train_batches=2, - limit_val_batches=3, - num_sanity_val_steps=0, - max_epochs=2 - ) - trainer.fit(model) - - # out are the results of the full loop - # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation() - assert len(callback_metrics) == 1 - assert len(callback_metrics[0]) == 6 - assert len(eval_results) == 1 - - eval_results = eval_results[0] - assert 'val_step_end' not in eval_results - assert eval_results['val_epoch_end'] == 1233 - - for k in ['val_loss', 'log', 'progress_bar']: - assert k in eval_results - 
- # ensure all the keys ended up as candidates for callbacks - assert len(trainer.logger_connector.callback_metrics) in [8, 9] - - # make sure correct steps were called - assert model.validation_step_called - assert not model.validation_step_end_called - assert model.validation_epoch_end_called - - -def test_full_val_loop(tmpdir): - """ - Test that val step + val step end + val epoch end - """ - - model = DeterministicModel() - model.training_step = model.training_step__dict_return - model.validation_step = model.validation_step__dict_return - model.validation_step_end = model.validation_step_end - model.validation_epoch_end = model.validation_epoch_end - - trainer = Trainer( - default_root_dir=tmpdir, - weights_summary=None, - limit_train_batches=2, - limit_val_batches=3, - num_sanity_val_steps=0, - max_epochs=2 - ) - trainer.fit(model) - - # out are the results of the full loop - # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation() - assert len(callback_metrics) == 1 - assert len(callback_metrics[0]) == 7 - assert len(eval_results) == 1 - - eval_results = eval_results[0] - assert eval_results['val_step_end'] == 1802 - assert eval_results['val_epoch_end'] == 1233 - - for k in ['val_loss', 'log', 'progress_bar']: - assert k in eval_results - - # ensure all the keys ended up as candidates for callbacks - assert len(trainer.logger_connector.callback_metrics) in [9, 10] - - # make sure correct steps were called - assert model.validation_step_called - assert model.validation_step_end_called - assert model.validation_epoch_end_called diff --git a/tests/trainer/legacy_deprecate_flow_log/test_trainer_steps_dict_return.py b/tests/trainer/legacy_deprecate_flow_log/test_trainer_steps_dict_return.py index 9c114f72080d8..3f60e6060d2ae 100644 --- a/tests/trainer/legacy_deprecate_flow_log/test_trainer_steps_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log/test_trainer_steps_dict_return.py @@ -171,28 +171,6 @@ def test_result_obj_lr_scheduler_epoch(tmpdir): assert len(trainer.dev_debugger.saved_lr_scheduler_updates) == 3 -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -def test_result_obj_lr_scheduler_step(tmpdir): - """ - test that the LR scheduler was called at the correct time with the correct metrics - """ - model = DeterministicModel() - model.training_step = model.training_step__for_step_end_dict - model.training_step_end = model.training_step_end__dict - model.training_epoch_end = model.training_epoch_end__dict - model.val_dataloader = None - model.configure_optimizers = model.configure_optimizers__lr_on_plateau_step - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=2, - weights_summary=None, - ) - trainer.fit(model) - - assert len(trainer.dev_debugger.saved_lr_scheduler_updates) == 8 - - def test_train_step_epoch_end(tmpdir): """ Checks train_step + training_epoch_end (NO training_step_end) diff --git a/tests/trainer/logging_/test_eval_loop_logging_1_0.py b/tests/trainer/logging_/test_eval_loop_logging_1_0.py index 79bdecae46424..e480ee7080b59 100644 --- a/tests/trainer/logging_/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_/test_eval_loop_logging_1_0.py @@ -373,11 +373,10 @@ def test_multi_dataloaders_add_suffix_properly(tmpdir): class TestModel(BoringModel): - def test_step(self, batch, batch_idx, dataloader_idx): + def test_step(self, batch, *args): output = self.layer(batch) loss = self.loss(batch, output) self.log("test_loss", loss, on_step=True, on_epoch=True) - return {"y": loss} def 
test_dataloader(self): return [ @@ -398,22 +397,19 @@ def test_dataloader(self): weights_summary=None, ) results = trainer.test(model) - assert "test_loss_epoch/dataloader_idx_0" in results[0] - assert "test_loss_epoch/dataloader_idx_1" in results[1] + + assert {"test_loss/dataloader_idx_0", "test_loss_epoch/dataloader_idx_0"} == set(results[0]) + assert {"test_loss/dataloader_idx_1", "test_loss_epoch/dataloader_idx_1"} == set(results[1]) def test_single_dataloader_no_suffix_added(tmpdir): class TestModel(BoringModel): - def test_step(self, batch, batch_idx): + def test_step(self, batch, *args): output = self.layer(batch) loss = self.loss(batch, output) self.log("test_loss", loss, on_step=True, on_epoch=True) - return {"y": loss} - - def test_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) model = TestModel() model.test_epoch_end = None @@ -428,9 +424,9 @@ def test_dataloader(self): weights_summary=None, ) results = trainer.test(model) + assert len(results) == 1 - # error : It is wrong there. `y` should equal test_loss_epoch - assert results[0]['test_loss'] == results[0]['y'] + assert {"test_loss", "test_loss_epoch"} == set(results[0]) @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @@ -496,7 +492,7 @@ def on_validation_start(self, trainer, pl_module): ) def on_epoch_start(self, trainer, pl_module): - if trainer.validating: + if trainer.evaluating: self.make_logging( pl_module, 'on_epoch_start', @@ -546,7 +542,7 @@ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, self.count += 1 def on_epoch_end(self, trainer, pl_module): - if trainer.validating: + if trainer.evaluating: self.make_logging( pl_module, 'on_epoch_end', 8, on_steps=[False], on_epochs=self.choices, prob_bars=self.choices ) @@ -860,7 +856,7 @@ def validation_step(self, batch, batch_idx): self.log('valid_loss_1', loss, on_step=False, on_epoch=True) self.log('valid_loss_2', loss, on_step=True, on_epoch=False) self.log('valid_loss_3', loss, on_step=False, on_epoch=False) - return {"val_loss": loss} + return {"val_loss": loss} # not added to callback_metrics def test_step(self, batch, batch_idx): output = self.layer(batch) @@ -937,7 +933,6 @@ def get_metrics_at_idx(idx): 'debug_epoch', 'valid_loss_1', 'test_loss', - 'val_loss', } assert set(trainer.callback_metrics) == expected_callback_metrics assert set(results[0]) == {'test_loss', 'debug_epoch'} diff --git a/tests/trainer/logging_/test_logger_connector.py b/tests/trainer/logging_/test_logger_connector.py index 92eb2c76a8c6b..5efb31b9fb608 100644 --- a/tests/trainer/logging_/test_logger_connector.py +++ b/tests/trainer/logging_/test_logger_connector.py @@ -453,6 +453,48 @@ def is_float(value: Any) -> bool: assert excepted_function(metrics["z"]) +def test_metric_holder_raises(tmpdir): + """Check that an error is raised when trying to convert non-scalar tensors""" + + class TestModel(BoringModel): + + def validation_step(self, batch, *args, **kwargs): + output = self(batch) + self.log('test', output) + + def test_step(self, *args, **kwargs): + return self.validation_step(*args, **kwargs) + + model = TestModel() + model.validation_epoch_end = None + model.test_epoch_end = None + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + + match = "The metric `test` does not contain a single element" + with pytest.raises(MisconfigurationException, match=match): + trainer.test(model) + + +def test_can_return_tensor_with_more_than_one_element(tmpdir): + """Ensure {validation,test}_step return values are not 
included as callback metrics. #6623""" + + class TestModel(BoringModel): + + def test_step(self, batch, *args, **kwargs): + return {"test": torch.tensor([0, 1])} + + def test_epoch_end(self, outputs): + assert len(outputs) == 2 + assert all(list(d) == ["test"] for d in outputs) # check keys + assert all(torch.equal(d["test"], torch.tensor([0, 1])) for d in outputs) # check values + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=2, progress_bar_refresh_rate=0) + trainer.fit(model) + trainer.test(model) + + def test_logging_to_progress_bar_with_reserved_key(tmpdir): """ Test that logging a metric with a reserved name to the progress bar raises a warning. """ diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index 86b977cfff029..94b2ad4263cbc 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -8,6 +8,7 @@ from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.utilities import AllGatherGrad from tests.helpers.boring_model import BoringModel +from tests.helpers.runif import RunIf def setup_ddp(rank, world_size): From d91e36fe0476bba668166ef5a0835751eb10ee78 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 29 Mar 2021 23:25:01 +0200 Subject: [PATCH 11/13] Add 1.8.1 to adjust_versions.py --- requirements/adjust_versions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index c1499cd4ea5ee..d0dfbc59e2352 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -11,6 +11,7 @@ "1.7.0": dict(torchvision="0.8.1", torchtext="0.8"), "1.7.1": dict(torchvision="0.8.2", torchtext="0.8.1"), "1.8.0": dict(torchvision="0.9.0", torchtext="0.9"), + "1.8.1": dict(torchvision="0.9.0", torchtext="0.9"), } From 67d4749b401fbe8f40a660371e19bcd2b2d2295b Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 29 Mar 2021 18:59:20 +0100 Subject: [PATCH 12/13] [TPU] update is_tpu_exists utils internal logic to rely on xmp.spawn (#6719) * update_logic * update * Update tests/utilities/test_xla_device_utils.py * Update pytorch_lightning/utilities/xla_device.py Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> * Update pytorch_lightning/utilities/xla_device.py Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> * update test * Update tests/utilities/test_xla_device_utils.py * update * Apply fix * Docstring * flake8 * update Co-authored-by: Your Name Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 1 + pytorch_lightning/utilities/__init__.py | 1 - pytorch_lightning/utilities/xla_device.py | 54 +++++++++++++---------- tests/plugins/test_custom_plugin.py | 4 ++ tests/utilities/test_xla_device_utils.py | 30 ++++++++----- 5 files changed, 54 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2ea14b23d166..8a20ee5914854 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) - Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) - Fixed a bug where gradients were disabled after calling `Trainer.predict` ([#6657](https://github.com/PyTorchLightning/pytorch-lightning/pull/6657)) +- Fixed bug where no TPUs were detected in a TPU pod env ([#6719](https://github.com/PyTorchLightning/pytorch-lightning/pull/6719)) ## [1.2.5] - 2021-03-23 diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index cf3aa06f305b8..e24e4a0db560a 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -14,7 +14,6 @@ """General utilities""" import numpy - from pytorch_lightning.utilities.apply_func import move_data_to_device # noqa: F401 from pytorch_lightning.utilities.distributed import ( # noqa: F401 AllGatherGrad, diff --git a/pytorch_lightning/utilities/xla_device.py b/pytorch_lightning/utilities/xla_device.py index fcf56e9c679f4..294d3d2c5ec40 100644 --- a/pytorch_lightning/utilities/xla_device.py +++ b/pytorch_lightning/utilities/xla_device.py @@ -12,18 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. import functools +import os import queue as q import traceback from multiprocessing import Process, Queue -import torch +import torch.multiprocessing as mp from pytorch_lightning.utilities.imports import _XLA_AVAILABLE if _XLA_AVAILABLE: import torch_xla.core.xla_model as xm + import torch_xla.distributed.xla_multiprocessing as xmp + #: define waiting time got checking TPU available in sec -TPU_CHECK_TIMEOUT = 100 +TPU_CHECK_TIMEOUT = 25 def inner_f(queue, func, *args, **kwargs): # pragma: no cover @@ -55,23 +58,10 @@ def wrapper(*args, **kwargs): class XLADeviceUtils: """Used to detect the type of XLA device""" - TPU_AVAILABLE = None - - @staticmethod - def _fetch_xla_device_type(device: torch.device) -> str: - """ - Returns XLA device type - - Args: - device: (:class:`~torch.device`): Accepts a torch.device type with a XLA device format i.e xla:0 - - Return: - Returns a str of the device hardware type. 
i.e TPU - """ - if _XLA_AVAILABLE: - return xm.xla_device_hw(device) + _TPU_AVAILABLE = False @staticmethod + @pl_multi_process def _is_device_tpu() -> bool: """ Check if device is TPU @@ -79,10 +69,18 @@ def _is_device_tpu() -> bool: Return: A boolean value indicating if the xla device is a TPU device or not """ - if _XLA_AVAILABLE: - device = xm.xla_device() - device_type = XLADeviceUtils._fetch_xla_device_type(device) - return device_type == "TPU" + + def _fn(_: int, mp_queue): + try: + device = xm.xla_device() + mp_queue.put(device.type == 'xla') + except Exception: + mp_queue.put(False) + + smp = mp.get_context("spawn") + queue = smp.SimpleQueue() + xmp.spawn(_fn, args=(queue, ), nprocs=1) + return queue.get() @staticmethod def xla_available() -> bool: @@ -102,6 +100,14 @@ def tpu_device_exists() -> bool: Return: A boolean value indicating if a TPU device exists on the system """ - if XLADeviceUtils.TPU_AVAILABLE is None and _XLA_AVAILABLE: - XLADeviceUtils.TPU_AVAILABLE = pl_multi_process(XLADeviceUtils._is_device_tpu)() - return XLADeviceUtils.TPU_AVAILABLE + if os.getenv("PL_TPU_AVAILABLE", '0') == "1": + XLADeviceUtils._TPU_AVAILABLE = True + + if XLADeviceUtils.xla_available() and not XLADeviceUtils._TPU_AVAILABLE: + + XLADeviceUtils._TPU_AVAILABLE = XLADeviceUtils._is_device_tpu() + + if XLADeviceUtils._TPU_AVAILABLE: + os.environ["PL_TPU_AVAILABLE"] = '1' + + return XLADeviceUtils._TPU_AVAILABLE diff --git a/tests/plugins/test_custom_plugin.py b/tests/plugins/test_custom_plugin.py index 872b49ef48635..b0407d1fca6b2 100644 --- a/tests/plugins/test_custom_plugin.py +++ b/tests/plugins/test_custom_plugin.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import pytest +import torch + from pytorch_lightning import Trainer from pytorch_lightning.plugins import DDPPlugin from tests.helpers import BoringModel @@ -26,6 +29,7 @@ def __init__(self, **kwargs): @RunIf(skip_windows=True) +@pytest.mark.skipif(torch.cuda.is_available(), reason="RuntimeError: Tensors must be CUDA and dense") def test_sync_batchnorm_set(tmpdir): """Tests if sync_batchnorm is automatically set for custom plugin.""" model = BoringModel() diff --git a/tests/utilities/test_xla_device_utils.py b/tests/utilities/test_xla_device_utils.py index 73b11b48267ce..edca2777b578a 100644 --- a/tests/utilities/test_xla_device_utils.py +++ b/tests/utilities/test_xla_device_utils.py @@ -17,29 +17,37 @@ import pytest import pytorch_lightning.utilities.xla_device as xla_utils -from pytorch_lightning.utilities import _TPU_AVAILABLE, _XLA_AVAILABLE -from tests.helpers.utils import pl_multi_process_test +from pytorch_lightning.utilities import _XLA_AVAILABLE +from tests.helpers.runif import RunIf @pytest.mark.skipif(_XLA_AVAILABLE, reason="test requires torch_xla to be absent") def test_tpu_device_absence(): - """Check tpu_device_exists returns None when torch_xla is not available""" - assert xla_utils.XLADeviceUtils.tpu_device_exists() is None + """Check tpu_device_exists returns False when torch_xla is not available""" + assert not xla_utils.XLADeviceUtils.tpu_device_exists() -@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires torch_xla to be installed") -@pl_multi_process_test +@RunIf(tpu=True) def test_tpu_device_presence(): """Check tpu_device_exists returns True when TPU is available""" - assert xla_utils.XLADeviceUtils.tpu_device_exists() is True + assert xla_utils.XLADeviceUtils.tpu_device_exists() -@patch('pytorch_lightning.utilities.xla_device.TPU_CHECK_TIMEOUT', 10) +def sleep_fn(sleep_time: float) -> bool: + time.sleep(sleep_time) + return True + + +@patch('pytorch_lightning.utilities.xla_device.TPU_CHECK_TIMEOUT', 3) +@pytest.mark.skipif(not _XLA_AVAILABLE, reason="test requires torch_xla to be present") def test_result_returns_within_timeout_seconds(): - """Check that pl_multi_process returns within 10 seconds""" + """Check that pl_multi_process returns within 3 seconds""" + fn = xla_utils.pl_multi_process(sleep_fn) + start = time.time() - result = xla_utils.pl_multi_process(time.sleep)(xla_utils.TPU_CHECK_TIMEOUT * 1.25) + result = fn(xla_utils.TPU_CHECK_TIMEOUT * 0.5) end = time.time() elapsed_time = int(end - start) + assert elapsed_time <= xla_utils.TPU_CHECK_TIMEOUT - assert result is False + assert result From fb50a051c6195e16df3a767950e31e5120df4835 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 29 Mar 2021 10:39:06 +0200 Subject: [PATCH 13/13] remake nvidia docker (#6686) * use latest * remake * examples --- azure-pipelines.yml | 5 ++- dockers/nvidia/Dockerfile | 67 ++++++++++++++++++++++++++------ dockers/release/Dockerfile | 8 ++-- pl_examples/run_ddp-example.sh | 12 ------ pl_examples/run_ddp-examples.sh | 13 +++++++ pl_examples/run_examples-args.sh | 15 +++++++ 6 files changed, 91 insertions(+), 29 deletions(-) delete mode 100644 pl_examples/run_ddp-example.sh create mode 100644 pl_examples/run_ddp-examples.sh create mode 100644 pl_examples/run_examples-args.sh diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f944cad2ae09d..376078ee85335 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -98,8 +98,9 @@ jobs: - script: | set -e python -m pytest pl_examples -v --maxfail=2 --durations=0 - python setup.py install --user --quiet 
- bash pl_examples/run_ddp-example.sh + pip install . --user --quiet + bash pl_examples/run_examples-args.sh --gpus 1 --max_epochs 1 --batch_size 64 --limit_train_batches 5 --limit_val_batches 3 + bash pl_examples/run_ddp-examples.sh --max_epochs 1 --batch_size 32 --limit_train_batches 2 --limit_val_batches 2 # cd pl_examples/basic_examples # bash submit_ddp_job.sh # bash submit_ddp2_job.sh diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile index ea567a5306eed..ad1169c4450dd 100644 --- a/dockers/nvidia/Dockerfile +++ b/dockers/nvidia/Dockerfile @@ -12,26 +12,69 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM nvcr.io/nvidia/pytorch:20.12-py3 +FROM nvcr.io/nvidia/cuda:11.1.1-runtime-ubuntu20.04 MAINTAINER PyTorchLightning ARG LIGHTNING_VERSION="" -COPY ./ ./pytorch-lightning/ +SHELL ["/bin/bash", "-c"] +# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ +ENV \ + DEBIAN_FRONTEND=noninteractive \ + TZ=Europe/Prague \ + PATH="$PATH:/root/.local/bin" \ + CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \ + MKL_THREADING_LAYER=GNU + +RUN apt-get update -qq && \ + apt-get install -y --no-install-recommends \ + build-essential \ + python3 \ + python3-distutils \ + python3-dev \ + pkg-config \ + cmake \ + git \ + wget \ + unzip \ + ca-certificates \ + && \ + +# Cleaning + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /root/.cache && \ + rm -rf /var/lib/apt/lists/* && \ + +# Setup PIP + update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ + wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \ + python get-pip.py && \ + rm get-pip.py && \ + pip --version + +COPY ./ /home/pytorch-lightning/ -# install dependencies RUN \ - # Disable cache - #conda install "pip>20.1" && \ - #pip config set global.cache-dir false && \ - if [ -z $LIGHTNING_VERSION ] ; then \ - pip install ./pytorch-lightning --no-cache-dir ; \ - rm -rf pytorch-lightning ; \ - else \ + cd /home && \ + mv pytorch-lightning/notebooks . && \ + mv pytorch-lightning/pl_examples . && \ + # replace by specific version if asked + if [ ! 
-z "$LIGHTNING_VERSION" ] ; then \ rm -rf pytorch-lightning ; \ - pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --no-cache-dir ; \ - fi + wget https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --progress=bar:force:noscroll ; \ + unzip ${LIGHTNING_VERSION}.zip ; \ + mv pytorch-lightning-*/ pytorch-lightning ; \ + rm *.zip ; \ + fi && \ + +# Installations + python -c "fname = './pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + pip install -r ./pytorch-lightning/requirements/extra.txt -U --no-cache-dir && \ + pip install -r ./pytorch-lightning/requirements/examples.txt -U --no-cache-dir && \ + pip install ./pytorch-lightning --no-cache-dir && \ + rm -rf pytorch-lightning RUN python --version && \ pip --version && \ diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index 3584ee02746e3..5cd53385f660b 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -21,12 +21,14 @@ MAINTAINER PyTorchLightning ARG LIGHTNING_VERSION="" -COPY ./ ./pytorch-lightning/ +COPY ./ /home/pytorch-lightning/ # install dependencies RUN \ - # Disable cache - #conda install "pip>20.1" && \ + cd /home && \ + mv pytorch-lightning/notebooks . && \ + mv pytorch-lightning/pl_examples . && \ + # replace by specific version if asked if [ ! -z "$LIGHTNING_VERSION" ] ; then \ rm -rf pytorch-lightning ; \ wget https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --progress=bar:force:noscroll ; \ diff --git a/pl_examples/run_ddp-example.sh b/pl_examples/run_ddp-example.sh deleted file mode 100644 index f0c7695e766f2..0000000000000 --- a/pl_examples/run_ddp-example.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -ARGS_DEFAULT=" --default_root_dir %(tmpdir)s --max_epochs 1 --batch_size 32 --limit_train_batches 2 --limit_val_batches 2" -ARGS_EXTRA_DDP=" --gpus 2 --accelerator ddp" -ARGS_EXTRA_AMP=" --precision 16" - -python pl_examples/basic_examples/simple_image_classifier.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} -python pl_examples/basic_examples/simple_image_classifier.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} -python pl_examples/basic_examples/backbone_image_classifier.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} -python pl_examples/basic_examples/backbone_image_classifier.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} -python pl_examples/basic_examples/autoencoder.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} -python pl_examples/basic_examples/autoencoder.py ${ARGS_DEFAULT} ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} diff --git a/pl_examples/run_ddp-examples.sh b/pl_examples/run_ddp-examples.sh new file mode 100644 index 0000000000000..6cc36364e397d --- /dev/null +++ b/pl_examples/run_ddp-examples.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +ARGS_EXTRA_DDP=" --gpus 2 --accelerator ddp" +ARGS_EXTRA_AMP=" --precision 16" + +python pl_examples/basic_examples/simple_image_classifier.py $@ ${ARGS_EXTRA_DDP} +python pl_examples/basic_examples/simple_image_classifier.py $@ ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} + +python pl_examples/basic_examples/backbone_image_classifier.py $@ ${ARGS_EXTRA_DDP} +python pl_examples/basic_examples/backbone_image_classifier.py $@ ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} + +python pl_examples/basic_examples/autoencoder.py $@ ${ARGS_EXTRA_DDP} +python pl_examples/basic_examples/autoencoder.py $@ ${ARGS_EXTRA_DDP} ${ARGS_EXTRA_AMP} diff --git 
a/pl_examples/run_examples-args.sh b/pl_examples/run_examples-args.sh new file mode 100644 index 0000000000000..352869538cb18 --- /dev/null +++ b/pl_examples/run_examples-args.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +echo $@ + +full_path=$(realpath $0) +echo $full_path + +dir_path=$(dirname $full_path) +echo $dir_path + +python ${dir_path}/basic_examples/simple_image_classifier.py $@ + +python ${dir_path}/basic_examples/backbone_image_classifier.py $@ + +python ${dir_path}/basic_examples/autoencoder.py $@
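
For orientation, the logging convention this series migrates the docs and tests toward (calling self.log / self.log_dict instead of returning {'loss': ..., 'log': {...}} dictionaries from steps) looks roughly like the minimal sketch below. LitSketch is a hypothetical module written for illustration only — it is not part of any patch above — and it assumes batches are plain float tensors of shape (N, 32), in the style of the BoringModel used by the tests.

import torch
from pytorch_lightning import LightningModule


class LitSketch(LightningModule):
    """Hypothetical module illustrating self.log / self.log_dict usage (not part of the patches)."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        loss = self.layer(batch).sum()
        # log metrics directly instead of returning them under a 'log' key
        self.log_dict({"train_loss": loss})
        return loss

    def validation_step(self, batch, batch_idx):
        val_loss = self.layer(batch).sum()
        # logged keys end up in trainer.callback_metrics, which is what
        # EarlyStopping(monitor=...) and ModelCheckpoint(monitor=...) read
        self.log("val_loss", val_loss, prog_bar=True, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

With metrics routed through self.log, the trainer no longer needs to harvest arbitrary keys from step return values, which is the plumbing removed from process_dict_result and the logger connector in #6682.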